summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks13
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/auditsc.c7
-rw-r--r--kernel/bpf/arraymap.c113
-rw-r--r--kernel/bpf/core.c105
-rw-r--r--kernel/bpf/helpers.c105
-rw-r--r--kernel/bpf/syscall.c42
-rw-r--r--kernel/bpf/verifier.c54
-rw-r--r--kernel/cgroup.c273
-rw-r--r--kernel/configs/xen.config48
-rw-r--r--kernel/context_tracking.c67
-rw-r--r--kernel/cpu.c17
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/events/core.c267
-rw-r--r--kernel/events/internal.h19
-rw-r--r--kernel/events/ring_buffer.c39
-rw-r--r--kernel/exit.c6
-rw-r--r--kernel/fork.c71
-rw-r--r--kernel/futex.c72
-rw-r--r--kernel/gcov/base.c6
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/irq/chip.c104
-rw-r--r--kernel/irq/dummychip.c1
-rw-r--r--kernel/irq/generic-chip.c5
-rw-r--r--kernel/irq/internals.h21
-rw-r--r--kernel/irq/irqdesc.c13
-rw-r--r--kernel/irq/irqdomain.c25
-rw-r--r--kernel/irq/manage.c33
-rw-r--r--kernel/irq/migration.c15
-rw-r--r--kernel/irq/msi.c2
-rw-r--r--kernel/irq/pm.c4
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/jump_label.c10
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kthread.c6
-rw-r--r--kernel/livepatch/core.c78
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lglock.c22
-rw-r--r--kernel/locking/lockdep.c180
-rw-r--r--kernel/locking/locktorture.c14
-rw-r--r--kernel/locking/mcs_spinlock.h1
-rw-r--r--kernel/locking/qrwlock.c30
-rw-r--r--kernel/locking/qspinlock.c473
-rw-r--r--kernel/locking/qspinlock_paravirt.h334
-rw-r--r--kernel/locking/rtmutex.c105
-rw-r--r--kernel/locking/rtmutex_common.h3
-rw-r--r--kernel/locking/rwsem-xadd.c44
-rw-r--r--kernel/module.c336
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/params.c127
-rw-r--r--kernel/power/Kconfig278
-rw-r--r--kernel/power/Makefile35
-rw-r--r--kernel/power/block_io.c103
-rw-r--r--kernel/power/hibernate.c38
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/power.h47
-rw-r--r--kernel/power/snapshot.c317
-rw-r--r--kernel/power/suspend.c8
-rw-r--r--kernel/power/swap.c159
-rw-r--r--kernel/power/tuxonice.h260
-rw-r--r--kernel/power/tuxonice_alloc.c308
-rw-r--r--kernel/power/tuxonice_alloc.h54
-rw-r--r--kernel/power/tuxonice_atomic_copy.c469
-rw-r--r--kernel/power/tuxonice_atomic_copy.h25
-rw-r--r--kernel/power/tuxonice_bio.h78
-rw-r--r--kernel/power/tuxonice_bio_chains.c1126
-rw-r--r--kernel/power/tuxonice_bio_core.c1933
-rw-r--r--kernel/power/tuxonice_bio_internal.h101
-rw-r--r--kernel/power/tuxonice_bio_signature.c403
-rw-r--r--kernel/power/tuxonice_builtin.c498
-rw-r--r--kernel/power/tuxonice_builtin.h41
-rw-r--r--kernel/power/tuxonice_checksum.c392
-rw-r--r--kernel/power/tuxonice_checksum.h31
-rw-r--r--kernel/power/tuxonice_cluster.c1058
-rw-r--r--kernel/power/tuxonice_cluster.h18
-rw-r--r--kernel/power/tuxonice_compress.c452
-rw-r--r--kernel/power/tuxonice_copy_before_write.c240
-rw-r--r--kernel/power/tuxonice_extent.c144
-rw-r--r--kernel/power/tuxonice_extent.h45
-rw-r--r--kernel/power/tuxonice_file.c484
-rw-r--r--kernel/power/tuxonice_highlevel.c1413
-rw-r--r--kernel/power/tuxonice_incremental.c402
-rw-r--r--kernel/power/tuxonice_io.c1932
-rw-r--r--kernel/power/tuxonice_io.h72
-rw-r--r--kernel/power/tuxonice_modules.c520
-rw-r--r--kernel/power/tuxonice_modules.h212
-rw-r--r--kernel/power/tuxonice_netlink.c324
-rw-r--r--kernel/power/tuxonice_netlink.h62
-rw-r--r--kernel/power/tuxonice_pagedir.c345
-rw-r--r--kernel/power/tuxonice_pagedir.h50
-rw-r--r--kernel/power/tuxonice_pageflags.c18
-rw-r--r--kernel/power/tuxonice_pageflags.h106
-rw-r--r--kernel/power/tuxonice_power_off.c286
-rw-r--r--kernel/power/tuxonice_power_off.h24
-rw-r--r--kernel/power/tuxonice_prepare_image.c1080
-rw-r--r--kernel/power/tuxonice_prepare_image.h38
-rw-r--r--kernel/power/tuxonice_prune.c406
-rw-r--r--kernel/power/tuxonice_storage.c282
-rw-r--r--kernel/power/tuxonice_storage.h45
-rw-r--r--kernel/power/tuxonice_swap.c474
-rw-r--r--kernel/power/tuxonice_sysfs.c333
-rw-r--r--kernel/power/tuxonice_sysfs.h137
-rw-r--r--kernel/power/tuxonice_ui.c247
-rw-r--r--kernel/power/tuxonice_ui.h97
-rw-r--r--kernel/power/tuxonice_userui.c658
-rw-r--r--kernel/printk/printk.c245
-rw-r--r--kernel/rcu/rcutorture.c103
-rw-r--r--kernel/rcu/srcu.c10
-rw-r--r--kernel/rcu/tiny.c35
-rw-r--r--kernel/rcu/tiny_plugin.h12
-rw-r--r--kernel/rcu/tree.c367
-rw-r--r--kernel/rcu/tree.h35
-rw-r--r--kernel/rcu/tree_plugin.h232
-rw-r--r--kernel/rcu/tree_trace.c6
-rw-r--r--kernel/rcu/update.c30
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c6
-rw-r--r--kernel/sched/Makefile13
-rw-r--r--kernel/sched/auto_group.c6
-rw-r--r--kernel/sched/auto_group.h2
-rw-r--r--kernel/sched/bfs.c7420
-rw-r--r--kernel/sched/bfs_sched.h172
-rw-r--r--kernel/sched/core.c741
-rw-r--r--kernel/sched/cputime.c2
-rw-r--r--kernel/sched/deadline.c299
-rw-r--r--kernel/sched/debug.c53
-rw-r--r--kernel/sched/fair.c481
-rw-r--r--kernel/sched/idle.c118
-rw-r--r--kernel/sched/loadavg.c (renamed from kernel/sched/proc.c)236
-rw-r--r--kernel/sched/rt.c108
-rw-r--r--kernel/sched/sched.h60
-rw-r--r--kernel/sched/stats.c4
-rw-r--r--kernel/sched/stats.h19
-rw-r--r--kernel/sched/wait.c8
-rw-r--r--kernel/seccomp.c70
-rw-r--r--kernel/signal.c19
-rw-r--r--kernel/smpboot.c62
-rw-r--r--kernel/stop_machine.c45
-rw-r--r--kernel/sys.c166
-rw-r--r--kernel/sysctl.c56
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/Makefile17
-rw-r--r--kernel/time/alarmtimer.c17
-rw-r--r--kernel/time/clockevents.c91
-rw-r--r--kernel/time/clocksource.c24
-rw-r--r--kernel/time/hrtimer.c699
-rw-r--r--kernel/time/ntp.c61
-rw-r--r--kernel/time/ntp_internal.h1
-rw-r--r--kernel/time/posix-cpu-timers.c97
-rw-r--r--kernel/time/posix-timers.c17
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c18
-rw-r--r--kernel/time/tick-broadcast.c257
-rw-r--r--kernel/time/tick-common.c57
-rw-r--r--kernel/time/tick-internal.h31
-rw-r--r--kernel/time/tick-oneshot.c22
-rw-r--r--kernel/time/tick-sched.c320
-rw-r--r--kernel/time/tick-sched.h12
-rw-r--r--kernel/time/time.c78
-rw-r--r--kernel/time/timeconst.bc3
-rw-r--r--kernel/time/timekeeping.c182
-rw-r--r--kernel/time/timekeeping.h11
-rw-r--r--kernel/time/timer.c363
-rw-r--r--kernel/time/timer_list.c51
-rw-r--r--kernel/time/timer_stats.c10
-rw-r--r--kernel/torture.c26
-rw-r--r--kernel/trace/blktrace.c10
-rw-r--r--kernel/trace/bpf_trace.c42
-rw-r--r--kernel/trace/ring_buffer.c221
-rw-r--r--kernel/trace/ring_buffer_benchmark.c23
-rw-r--r--kernel/trace/trace.c23
-rw-r--r--kernel/trace/trace.h42
-rw-r--r--kernel/trace/trace_branch.c4
-rw-r--r--kernel/trace/trace_clock.c3
-rw-r--r--kernel/trace/trace_event_perf.c20
-rw-r--r--kernel/trace/trace_events.c304
-rw-r--r--kernel/trace/trace_events_filter.c91
-rw-r--r--kernel/trace/trace_events_trigger.c70
-rw-r--r--kernel/trace/trace_export.c10
-rw-r--r--kernel/trace/trace_functions_graph.c8
-rw-r--r--kernel/trace/trace_kprobe.c70
-rw-r--r--kernel/trace/trace_mmiotrace.c4
-rw-r--r--kernel/trace/trace_output.c78
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_probe.h8
-rw-r--r--kernel/trace/trace_sched_wakeup.c4
-rw-r--r--kernel/trace/trace_selftest.c5
-rw-r--r--kernel/trace/trace_syscalls.c72
-rw-r--r--kernel/trace/trace_uprobe.c46
-rw-r--r--kernel/watchdog.c67
-rw-r--r--kernel/workqueue.c493
192 files changed, 7059 insertions, 30427 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1ac..ebdb00432 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
def_bool y
depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
bool
-config QUEUE_RWLOCK
- def_bool y if ARCH_USE_QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
+ def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+ depends on SMP
+
+config ARCH_USE_QUEUED_RWLOCKS
+ bool
+
+config QUEUED_RWLOCKS
+ def_bool y if ARCH_USE_QUEUED_RWLOCKS
depends on SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 60c302cfb..43c4c920f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -137,7 +137,7 @@ endif
ifneq ($(wildcard $(obj)/.x509.list),)
ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(info X.509 certificate list changed)
+$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
$(shell rm $(obj)/.x509.list)
endif
endif
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c13e4267..f9e606534 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1904,7 +1904,7 @@ EXPORT_SYMBOL(audit_log_task_info);
/**
* audit_log_link_denied - report a link restriction denial
- * @operation: specific link opreation
+ * @operation: specific link operation
* @link: the path that triggered the restriction
*/
void audit_log_link_denied(const char *operation, struct path *link)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9fb9d1cb8..e85bdfd15 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -599,9 +599,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = match_tree_refs(ctx, rule->tree);
break;
case AUDIT_LOGINUID:
- result = 0;
- if (ctx)
- result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
break;
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
@@ -1023,8 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* for strings that are too long, we should not have created
* any.
*/
- if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) {
- WARN_ON(1);
+ if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
send_sig(SIGKILL, current, 0);
return -1;
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a6616583..cb31229a6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
-
-struct bpf_array {
- struct bpf_map map;
- u32 elem_size;
- char value[0] __aligned(8);
-};
+#include <linux/filter.h>
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
return 0;
}
late_initcall(register_array_map);
+
+/* Allocate a prog_array map; the generic array allocator does the real work. */
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	/* values are bpf_prog file descriptors, so they must be u32-sized */
+	if (attr->value_size == sizeof(u32))
+		return array_map_alloc(attr);
+
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Free a prog_array map.  Every slot is expected to be empty by the time
+ * this runs; a leftover program pointer trips BUG_ON().
+ */
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int slot = 0;
+
+	/* wait for an RCU grace period before checking and freeing the array */
+	synchronize_rcu();
+
+	while (slot < array->map.max_entries) {
+		BUG_ON(array->prog[slot]);
+		slot++;
+	}
+
+	kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL; /* unconditionally NULL: @map and @key are not consulted */
+}
+
+/*
+ * Store the program named by a user-supplied fd in one slot of the array.
+ * Only called from syscall.
+ *
+ * @key points to the u32 slot index, @value to the u32 program fd.
+ * Returns 0 on success, -EINVAL for @map_flags other than BPF_ANY or for
+ * an incompatible program, -E2BIG for an out-of-range index, or the error
+ * encoded in the pointer returned by bpf_prog_get().
+ */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+				      void *value, u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 slot = *(u32 *)key;
+	struct bpf_prog *new_prog;
+	struct bpf_prog *displaced;
+
+	if (map_flags != BPF_ANY)
+		return -EINVAL;
+	if (slot >= array->map.max_entries)
+		return -E2BIG;
+
+	new_prog = bpf_prog_get(*(u32 *)value);
+	if (IS_ERR(new_prog))
+		return PTR_ERR(new_prog);
+
+	if (bpf_prog_array_compatible(array, new_prog)) {
+		/* swap the new program in and release whatever it replaced */
+		displaced = xchg(array->prog + slot, new_prog);
+		if (displaced)
+			bpf_prog_put_rcu(displaced);
+		return 0;
+	}
+
+	/* incompatible: give back the reference taken by bpf_prog_get() */
+	bpf_prog_put(new_prog);
+	return -EINVAL;
+}
+
+/*
+ * Empty one slot of the array.  Returns 0 when a program was removed,
+ * -ENOENT when the slot was already empty and -E2BIG when the index that
+ * @key points to is out of range.
+ */
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 slot = *(u32 *)key;
+	struct bpf_prog *removed;
+
+	if (slot >= array->map.max_entries)
+		return -E2BIG;
+
+	removed = xchg(array->prog + slot, NULL);
+	if (!removed)
+		return -ENOENT;
+
+	bpf_prog_put_rcu(removed);
+	return 0;
+}
+
+/* drop the reference held on every bpf_prog that is stored in this map */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int slot = 0;
+
+	/* the result is ignored: an already empty slot only yields -ENOENT */
+	while (slot < array->map.max_entries) {
+		prog_array_map_delete_elem(map, &slot);
+		slot++;
+	}
+}
+
+/* Operations table for BPF_MAP_TYPE_PROG_ARRAY maps. */
+static const struct bpf_map_ops prog_array_ops = {
+	/* lifetime */
+	.map_alloc		= prog_array_map_alloc,
+	.map_free		= prog_array_map_free,
+	/* element access */
+	.map_lookup_elem	= prog_array_map_lookup_elem,
+	.map_update_elem	= prog_array_map_update_elem,
+	.map_delete_elem	= prog_array_map_delete_elem,
+	/* key iteration reuses array_map_get_next_key() */
+	.map_get_next_key	= array_map_get_next_key,
+};
+
+/* Pairs BPF_MAP_TYPE_PROG_ARRAY with its operations table. */
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+	.type	= BPF_MAP_TYPE_PROG_ARRAY,
+	.ops	= &prog_array_ops,
+};
+
+static int __init register_prog_array_map(void)
+{
+ bpf_register_map_type(&prog_array_type); /* hand the type/ops pair to the map core */
+ return 0;
+}
+late_initcall(register_prog_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7fcd..c5bedc82b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -26,9 +26,10 @@
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
-#include <asm/unaligned.h>
#include <linux/bpf.h>
+#include <asm/unaligned.h>
+
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
#define BPF_R1 regs[BPF_REG_1]
@@ -62,6 +63,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
else if (k >= SKF_LL_OFF)
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
@@ -244,6 +246,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +289,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
};
+ u32 tail_call_cnt = 0;
void *ptr;
int off;
@@ -431,6 +435,30 @@ select_insn:
BPF_R4, BPF_R5);
CONT;
+ JMP_TAIL_CALL: {
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog;
+ u64 index = BPF_R3;
+
+ if (unlikely(index >= array->map.max_entries))
+ goto out;
+
+ if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+ goto out;
+
+ tail_call_cnt++;
+
+ prog = READ_ONCE(array->prog[index]);
+ if (unlikely(!prog))
+ goto out;
+
+ ARG1 = BPF_R1;
+ insn = prog->insnsi;
+ goto select_insn;
+out:
+ CONT;
+ }
/* JMP */
JMP_JA:
insn += insn->off;
@@ -615,25 +643,63 @@ load_byte:
return 0;
}
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+bool bpf_prog_array_compatible(struct bpf_array *array,
+ const struct bpf_prog *fp)
{
+ if (!array->owner_prog_type) {
+ /* There's no owner yet where we could check for
+ * compatibility.
+ */
+ array->owner_prog_type = fp->type;
+ array->owner_jited = fp->jited;
+
+ return true;
+ }
+
+ return array->owner_prog_type == fp->type &&
+ array->owner_jited == fp->jited;
+}
+
+static int bpf_check_tail_call(const struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++) {
+ struct bpf_map *map = aux->used_maps[i];
+ struct bpf_array *array;
+
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ continue;
+
+ array = container_of(map, struct bpf_array, map);
+ if (!bpf_prog_array_compatible(array, fp))
+ return -EINVAL;
+ }
+
+ return 0;
}
/**
- * bpf_prog_select_runtime - select execution runtime for BPF program
+ * bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
*
- * try to JIT internal BPF program, if JIT is not available select interpreter
- * BPF program will be executed via BPF_PROG_RUN() macro
+ * Try to JIT eBPF program, if JIT is not available, use interpreter.
+ * The BPF program will be executed via BPF_PROG_RUN() macro.
*/
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
{
fp->bpf_func = (void *) __bpf_prog_run;
- /* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp);
- /* Lock whole bpf_prog as read-only */
bpf_prog_lock_ro(fp);
+
+ /* The tail call compatibility check can only be done at
+ * this late stage as we need to determine, if we deal
+ * with JITed or non JITed program concatenations and not
+ * all eBPF JITs might immediately support all features.
+ */
+ return bpf_check_tail_call(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
@@ -663,6 +729,29 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
+const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
+{
+ return NULL; /* weak fallback definition: hands back no prototype */
+}
+
+/* Always built-in helper functions.
+ *
+ * bpf_tail_call() has no C implementation, hence .func is NULL:
+ * fixup_bpf_calls() turns a call to it into a BPF_JMP | BPF_CALL | BPF_X
+ * instruction, which the interpreter dispatches to its JMP_TAIL_CALL label.
+ */
+const struct bpf_func_proto bpf_tail_call_proto = {
+ .func = NULL,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd7f5988e..1447ec094 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -13,6 +13,9 @@
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/uidgid.h>
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -44,11 +47,11 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
const struct bpf_func_proto bpf_map_lookup_elem_proto = {
- .func = bpf_map_lookup_elem,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
};
static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -63,13 +66,13 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
const struct bpf_func_proto bpf_map_update_elem_proto = {
- .func = bpf_map_update_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- .arg3_type = ARG_PTR_TO_MAP_VALUE,
- .arg4_type = ARG_ANYTHING,
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
};
static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -83,11 +86,11 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
const struct bpf_func_proto bpf_map_delete_elem_proto = {
- .func = bpf_map_delete_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
};
static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -111,3 +114,71 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
};
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* NMI safe access to clock monotonic */
+ return ktime_get_mono_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+};
+
+/*
+ * Helper: pack the ids of the current task into one u64, tgid shifted
+ * into the upper 32 bits and pid OR-ed into the result.  Yields -EINVAL
+ * (converted to u64) when there is no current task.  The register
+ * arguments are unused.
+ */
+static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+	u64 packed;
+
+	if (!task)
+		return -EINVAL;
+
+	packed = (u64) task->tgid << 32;
+	packed |= task->pid;
+	return packed;
+}
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
+ .func = bpf_get_current_pid_tgid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+/*
+ * Helper: pack the credentials of the current task into one u64, the gid
+ * shifted into the upper 32 bits and the uid OR-ed into the result, both
+ * translated relative to init_user_ns.  Yields -EINVAL (converted to u64)
+ * when there is no current task.  The register arguments are unused.
+ */
+static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+	kuid_t kuid;
+	kgid_t kgid;
+	u64 packed;
+
+	if (!task)
+		return -EINVAL;
+
+	current_uid_gid(&kuid, &kgid);
+
+	packed = (u64) from_kgid(&init_user_ns, kgid) << 32;
+	packed |= from_kuid(&init_user_ns, kuid);
+	return packed;
+}
+
+const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
+ .func = bpf_get_current_uid_gid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+/*
+ * Helper: copy the command name of the current task into a caller buffer.
+ *
+ * r1 carries the destination buffer and @size its length in bytes.  On
+ * success all @size bytes are written: at most sizeof(task->comm) bytes
+ * of the name are copied, the remainder is zeroed and the last byte is
+ * forced to NUL.  A plain bounded memcpy() is not enough here: a buffer
+ * shorter than the name would end up without a terminator, and a buffer
+ * longer than task->comm would keep whatever it held before the call.
+ *
+ * Returns 0 on success.  Without a current task the buffer is zero-filled
+ * and -EINVAL (converted to u64) is returned.
+ */
+static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+	char *buf = (char *) (long) r1;
+	size_t copied;
+
+	if (!task) {
+		/* never leave stale bytes behind for the caller to misread */
+		memset(buf, 0, size);
+		return -EINVAL;
+	}
+
+	copied = min_t(size_t, size, sizeof(task->comm));
+	memcpy(buf, task->comm, copied);
+	memset(buf + copied, 0, size - copied);
+
+	/* guarantee termination even when the name had to be truncated */
+	if (size)
+		buf[size - 1] = 0;
+
+	return 0;
+}
+
+const struct bpf_func_proto bpf_get_current_comm_proto = {
+ .func = bpf_get_current_comm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c591..a1b14d197 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
+ if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+ /* prog_array stores refcnt-ed bpf_prog pointers
+ * release them all when user space closes prog_array_fd
+ */
+ bpf_prog_array_map_clear(map);
+
bpf_map_put(map);
return 0;
}
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
*/
BUG_ON(!prog->aux->ops->get_func_proto);
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* mark bpf_tail_call as different opcode
+ * to avoid conditional branch in
+ * interpreter for every normal call
+ * and to prevent accidental JITing by
+ * JIT compiler that doesn't support
+ * bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+ continue;
+ }
+
fn = prog->aux->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
@@ -413,6 +432,23 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+/*
+ * RCU callback queued by bpf_prog_put_rcu(): releases the maps used by
+ * the program and then frees the program itself.
+ */
+static void __prog_put_rcu(struct rcu_head *rcu)
+{
+	struct bpf_prog_aux *aux;
+
+	aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+	free_used_maps(aux);
+	bpf_prog_free(aux->prog);
+}
+
+/*
+ * Variant of bpf_prog_put() that defers the actual release until after a
+ * grace period, via call_rcu().
+ */
+void bpf_prog_put_rcu(struct bpf_prog *prog)
+{
+	struct bpf_prog_aux *aux = prog->aux;
+
+	/* other references remain: nothing to release yet */
+	if (!atomic_dec_and_test(&aux->refcnt))
+		return;
+
+	/* the callback only receives aux->rcu, so record the owner first */
+	aux->prog = prog;
+	call_rcu(&aux->rcu, __prog_put_rcu);
+}
+
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
@@ -426,7 +462,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put(prog);
+ bpf_prog_put_rcu(prog);
return 0;
}
@@ -532,7 +568,9 @@ static int bpf_prog_load(union bpf_attr *attr)
fixup_bpf_calls(prog);
/* eBPF program is ready to be JITed */
- bpf_prog_select_runtime(prog);
+ err = bpf_prog_select_runtime(prog);
+ if (err < 0)
+ goto free_used_maps;
err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3aa6..039d866fd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
fn->ret_type, func_id);
return -EINVAL;
}
+
+ if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+ func_id != BPF_FUNC_tail_call)
+ /* prog_array map type needs extra care:
+ * only allow to pass it into bpf_tail_call() for now.
+ * bpf_map_delete_elem() can be allowed in the future,
+ * while bpf_map_update_elem() must only be done via syscall
+ */
+ return -EINVAL;
+
+ if (func_id == BPF_FUNC_tail_call &&
+ map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ /* don't allow any other map type to be passed into
+ * bpf_tail_call()
+ */
+ return -EINVAL;
+
return 0;
}
@@ -1675,6 +1692,8 @@ static int do_check(struct verifier_env *env)
}
} else if (class == BPF_STX) {
+ enum bpf_reg_type dst_reg_type;
+
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, insn);
if (err)
@@ -1683,11 +1702,6 @@ static int do_check(struct verifier_env *env)
continue;
}
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0) {
- verbose("BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
/* check src1 operand */
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
@@ -1697,6 +1711,8 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ dst_reg_type = regs[insn->dst_reg].type;
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -1704,6 +1720,15 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (insn->imm == 0) {
+ insn->imm = dst_reg_type;
+ } else if (dst_reg_type != insn->imm &&
+ (dst_reg_type == PTR_TO_CTX ||
+ insn->imm == PTR_TO_CTX)) {
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
@@ -1822,12 +1847,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0)) {
+ (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
verbose("BPF_LDX uses reserved fields\n");
return -EINVAL;
}
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ ((BPF_MODE(insn->code) != BPF_MEM &&
+ BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_map *map;
struct fd f;
@@ -1950,12 +1981,17 @@ static int convert_ctx_accesses(struct verifier_env *env)
struct bpf_prog *new_prog;
u32 cnt;
int i;
+ enum bpf_access_type type;
if (!env->prog->aux->ops->convert_ctx_access)
return 0;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+ type = BPF_READ;
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+ type = BPF_WRITE;
+ else
continue;
if (insn->imm != PTR_TO_CTX) {
@@ -1965,7 +2001,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
}
cnt = env->prog->aux->ops->
- convert_ctx_access(insn->dst_reg, insn->src_reg,
+ convert_ctx_access(type, insn->dst_reg, insn->src_reg,
insn->off, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e8a5491be..f89d9292e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(release_agent_path_lock);
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
#define cgroup_assert_mutex_or_rcu_locked() \
rcu_lockdep_assert(rcu_read_lock_held() || \
lockdep_is_held(&cgroup_mutex), \
@@ -156,7 +159,7 @@ static bool cgrp_dfl_root_visible;
static bool cgroup_legacy_files_on_dfl;
/* some controllers are not supported in the default hierarchy */
-static unsigned int cgrp_dfl_root_inhibit_ss_mask;
+static unsigned long cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
@@ -175,18 +178,19 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
*/
static u64 css_serial_nr_next = 1;
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
*/
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned int ss_mask);
+ unsigned long ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
bool visible);
@@ -261,7 +265,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
- * Similar to cgroup_css() but returns the effctive css, which is defined
+ * Similar to cgroup_css() but returns the effective css, which is defined
* as the matching css of the nearest ancestor including self which has @ss
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
@@ -409,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * the mask pointed to by @ss_maskp is set to 1.
+ *
+ * The inner "if (... && false) break; else" never takes the break: its
+ * condition exists only to assign @ss for the current @ssid, and the
+ * statement following the macro becomes the body of the trailing else.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp) \
+ if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
+ (ssid) = 0; \
+ else \
+ for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
+ if (((ss) = cgroup_subsys[ssid]) && false) \
+ break; \
+ else
+
/* iterate across the hierarchies */
#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
@@ -882,7 +904,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
static void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
- /* hierarhcy ID shoulid already have been released */
+ /* hierarchy ID should already have been released */
WARN_ON_ONCE(root->hierarchy_id);
idr_destroy(&root->cgroup_idr);
@@ -998,7 +1020,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
@@ -1068,11 +1090,11 @@ static void cgroup_put(struct cgroup *cgrp)
* @subtree_control is to be applied to @cgrp. The returned mask is always
* a superset of @subtree_control and follows the usual hierarchy rules.
*/
-static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
- unsigned int subtree_control)
+static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+ unsigned long subtree_control)
{
struct cgroup *parent = cgroup_parent(cgrp);
- unsigned int cur_ss_mask = subtree_control;
+ unsigned long cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
@@ -1082,11 +1104,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
return cur_ss_mask;
while (true) {
- unsigned int new_ss_mask = cur_ss_mask;
+ unsigned long new_ss_mask = cur_ss_mask;
- for_each_subsys(ss, ssid)
- if (cur_ss_mask & (1 << ssid))
- new_ss_mask |= ss->depends_on;
+ for_each_subsys_which(ss, ssid, &cur_ss_mask)
+ new_ss_mask |= ss->depends_on;
/*
* Mask out subsystems which aren't available. This can
@@ -1200,7 +1221,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
int i;
@@ -1215,18 +1236,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
}
}
-static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned long ss_mask)
{
struct cgroup_subsys *ss;
- unsigned int tmp_ss_mask;
+ unsigned long tmp_ss_mask;
int ssid, i, ret;
lockdep_assert_held(&cgroup_mutex);
- for_each_subsys(ss, ssid) {
- if (!(ss_mask & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(ss, ssid, &ss_mask) {
/* if @ss has non-root csses attached to it, can't move */
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
return -EBUSY;
@@ -1253,7 +1272,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
* Just warn about it and continue.
*/
if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
ret, ss_mask);
pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
@@ -1263,18 +1282,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
- for_each_subsys(ss, ssid)
- if (ss_mask & (1 << ssid))
- cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+ for_each_subsys_which(ss, ssid, &ss_mask)
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- for_each_subsys(ss, ssid) {
+ for_each_subsys_which(ss, ssid, &ss_mask) {
struct cgroup_root *src_root;
struct cgroup_subsys_state *css;
struct css_set *cset;
- if (!(ss_mask & (1 << ssid)))
- continue;
-
src_root = ss->root;
css = cgroup_css(&src_root->cgrp, ss);
@@ -1338,7 +1353,7 @@ static int cgroup_show_options(struct seq_file *seq,
}
struct cgroup_sb_opts {
- unsigned int subsys_mask;
+ unsigned long subsys_mask;
unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
@@ -1351,7 +1366,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned int mask = -1U;
+ unsigned long mask = -1UL;
struct cgroup_subsys *ss;
int nr_opts = 0;
int i;
@@ -1495,7 +1510,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
int ret = 0;
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
- unsigned int added_mask, removed_mask;
+ unsigned long added_mask, removed_mask;
if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n");
@@ -1641,7 +1656,7 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -2050,9 +2065,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
lockdep_assert_held(&css_set_rwsem);
/*
- * We are synchronized through threadgroup_lock() against PF_EXITING
- * setting such that we can't race against cgroup_exit() changing the
- * css_set to init_css_set and dropping the old one.
+ * We are synchronized through cgroup_threadgroup_rwsem against
+ * PF_EXITING setting such that we can't race against cgroup_exit()
+ * changing the css_set to init_css_set and dropping the old one.
*/
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
@@ -2109,10 +2124,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* @src_cset and add it to @preloaded_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
- * This function may be called without holding threadgroup_lock even if the
- * target is a process. Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process. Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
*/
static void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *dst_cgrp,
@@ -2215,7 +2231,7 @@ err:
* @threadgroup: whether @leader points to the whole process or a single task
*
* Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding threadgroup_lock of @leader. The
+ * process, the caller must be holding cgroup_threadgroup_rwsem. The
* caller is also responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
@@ -2343,7 +2359,7 @@ out_release_tset:
* @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *leader, bool threadgroup)
@@ -2374,6 +2390,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
return ret;
}
+static int cgroup_procs_write_permission(struct task_struct *task,
+ struct cgroup *dst_cgrp,
+ struct kernfs_open_file *of)
+{
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = get_task_cred(task);
+ int ret = 0; /* 0 means the write is allowed */
+
+ /*
+ * Checking one task is enough even when attaching a whole thread group:
+ * the writer's euid must be root or match @task's uid or suid.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+
+ if (!ret && cgroup_on_dfl(dst_cgrp)) {
+ struct super_block *sb = of->file->f_path.dentry->d_sb;
+ struct cgroup *cgrp;
+ struct inode *inode;
+
+ down_read(&css_set_rwsem);
+ cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ up_read(&css_set_rwsem);
+
+ while (!cgroup_is_descendant(dst_cgrp, cgrp))
+ cgrp = cgroup_parent(cgrp); /* walk up the tree */
+
+ ret = -ENOMEM; /* kept if no inode is obtained */
+ inode = kernfs_get_inode(sb, cgrp->procs_kn);
+ if (inode) {
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ }
+ }
+
+ put_cred(tcred); /* pairs with get_task_cred() */
+ return ret;
+}
+
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
@@ -2383,7 +2440,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
- const struct cred *cred = current_cred(), *tcred;
struct cgroup *cgrp;
pid_t pid;
int ret;
@@ -2395,29 +2451,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
if (!cgrp)
return -ENODEV;
-retry_find_task:
+ percpu_down_write(&cgroup_threadgroup_rwsem);
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- rcu_read_unlock();
ret = -ESRCH;
- goto out_unlock_cgroup;
+ goto out_unlock_rcu;
}
- /*
- * even if we're attaching all tasks in the thread group, we
- * only need to check permissions on one of them.
- */
- tcred = __task_cred(tsk);
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
- rcu_read_unlock();
- ret = -EACCES;
- goto out_unlock_cgroup;
- }
- } else
+ } else {
tsk = current;
+ }
if (threadgroup)
tsk = tsk->group_leader;
@@ -2429,35 +2473,23 @@ retry_find_task:
*/
if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
- rcu_read_unlock();
- goto out_unlock_cgroup;
+ goto out_unlock_rcu;
}
get_task_struct(tsk);
rcu_read_unlock();
- threadgroup_lock(tsk);
- if (threadgroup) {
- if (!thread_group_leader(tsk)) {
- /*
- * a race with de_thread from another thread's exec()
- * may strip us of our leadership, if this happens,
- * there is no choice but to throw this task away and
- * try again; this is
- * "double-double-toil-and-trouble-check locking".
- */
- threadgroup_unlock(tsk);
- put_task_struct(tsk);
- goto retry_find_task;
- }
- }
-
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
-
- threadgroup_unlock(tsk);
+ ret = cgroup_procs_write_permission(tsk, cgrp, of);
+ if (!ret)
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
put_task_struct(tsk);
-out_unlock_cgroup:
+ goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+ rcu_read_unlock();
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
@@ -2540,19 +2572,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0;
}
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
int ssid;
- for_each_subsys(ss, ssid) {
- if (ss_mask & (1 << ssid)) {
- if (printed)
- seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
- printed = true;
- }
+ for_each_subsys_which(ss, ssid, &ss_mask) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
}
if (printed)
seq_putc(seq, '\n');
@@ -2604,6 +2634,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* look up all csses currently attached to @cgrp's subtree */
down_read(&css_set_rwsem);
css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2659,17 +2691,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
goto out_finish;
last_task = task;
- threadgroup_lock(task);
- /* raced against de_thread() from another thread? */
- if (!thread_group_leader(task)) {
- threadgroup_unlock(task);
- put_task_struct(task);
- continue;
- }
-
ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
- threadgroup_unlock(task);
put_task_struct(task);
if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2679,6 +2702,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
out_finish:
cgroup_migrate_finish(&preloaded_csets);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -2687,8 +2711,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- unsigned int enable = 0, disable = 0;
- unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+ unsigned long enable = 0, disable = 0;
+ unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2700,11 +2724,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
*/
buf = strstrip(buf);
while ((tok = strsep(&buf, " "))) {
+ unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
+
if (tok[0] == '\0')
continue;
- for_each_subsys(ss, ssid) {
- if (ss->disabled || strcmp(tok + 1, ss->name) ||
- ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ if (ss->disabled || strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
@@ -2791,10 +2816,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* still around. In such cases, wait till it's gone using
* offline_waitq.
*/
- for_each_subsys(ss, ssid) {
- if (!(css_enable & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(ss, ssid, &css_enable) {
cgroup_for_each_live_child(child, cgrp) {
DEFINE_WAIT(wait);
@@ -3085,7 +3107,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
return ret;
}
- if (cft->seq_show == cgroup_populated_show)
+ if (cft->write == cgroup_procs_write)
+ cgrp->procs_kn = kn;
+ else if (cft->seq_show == cgroup_populated_show)
cgrp->populated_kn = kn;
return 0;
}
@@ -4320,7 +4344,7 @@ static struct cftype cgroup_legacy_base_files[] = {
*
* On failure, no file is added.
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
int i, ret = 0;
@@ -4929,7 +4953,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
- need_forkexit_callback |= ss->fork || ss->exit;
+ have_fork_callback |= (bool)ss->fork << ss->id;
+ have_exit_callback |= (bool)ss->exit << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -4987,6 +5012,7 @@ int __init cgroup_init(void)
unsigned long key;
int ssid, err;
+ BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
@@ -5239,11 +5265,8 @@ void cgroup_post_fork(struct task_struct *child)
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
- if (need_forkexit_callback) {
- for_each_subsys(ss, i)
- if (ss->fork)
- ss->fork(child);
- }
+ for_each_subsys_which(ss, i, &have_fork_callback)
+ ss->fork(child);
}
/**
@@ -5287,16 +5310,12 @@ void cgroup_exit(struct task_struct *tsk)
cset = task_css_set(tsk);
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- if (need_forkexit_callback) {
- /* see cgroup_post_fork() for details */
- for_each_subsys(ss, i) {
- if (ss->exit) {
- struct cgroup_subsys_state *old_css = cset->subsys[i];
- struct cgroup_subsys_state *css = task_css(tsk, i);
+ /* see cgroup_post_fork() for details */
+ for_each_subsys_which(ss, i, &have_exit_callback) {
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
- ss->exit(css, old_css, tsk);
- }
- }
+ ss->exit(css, old_css, tsk);
}
if (put_cset)
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
new file mode 100644
index 000000000..ff756221f
--- /dev/null
+++ b/kernel/configs/xen.config
@@ -0,0 +1,48 @@
+# global stuff - these enable us to allow some
+# of the not so generic stuff below for xen
+CONFIG_PARAVIRT=y
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_WATCHDOG=y
+CONFIG_TARGET_CORE=y
+CONFIG_SCSI=y
+CONFIG_FB=y
+CONFIG_INPUT_MISC=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_TTY=y
+# Technically not required but otherwise produces
+# pretty useless systems starting from allnoconfig
+# You want TCP/IP and ELF binaries right?
+CONFIG_INET=y
+CONFIG_BINFMT_ELF=y
+# generic config
+CONFIG_XEN=y
+CONFIG_XEN_DOM0=y
+# backend drivers
+CONFIG_XEN_BACKEND=y
+CONFIG_XEN_BLKDEV_BACKEND=m
+CONFIG_XEN_NETDEV_BACKEND=m
+CONFIG_HVC_XEN=y
+CONFIG_XEN_WDT=m
+CONFIG_XEN_SCSI_BACKEND=m
+# frontend drivers
+CONFIG_XEN_FBDEV_FRONTEND=m
+CONFIG_HVC_XEN_FRONTEND=y
+CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
+CONFIG_XEN_SCSI_FRONTEND=m
+# others
+CONFIG_XEN_BALLOON=y
+CONFIG_XEN_SCRUB_PAGES=y
+CONFIG_XEN_DEV_EVTCHN=m
+CONFIG_XEN_BLKDEV_FRONTEND=m
+CONFIG_XEN_NETDEV_FRONTEND=m
+CONFIG_XENFS=m
+CONFIG_XEN_COMPAT_XENFS=y
+CONFIG_XEN_SYS_HYPERVISOR=y
+CONFIG_XEN_XENBUS_FRONTEND=y
+CONFIG_XEN_GNTDEV=m
+CONFIG_XEN_GRANT_DEV_ALLOC=m
+CONFIG_SWIOTLB_XEN=y
+CONFIG_XEN_PRIVCMD=m
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 72d59a1a6..0a495ab35 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -30,12 +30,23 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
-void context_tracking_cpu_set(int cpu)
+static bool context_tracking_recursion_enter(void)
{
- if (!per_cpu(context_tracking.active, cpu)) {
- per_cpu(context_tracking.active, cpu) = true;
- static_key_slow_inc(&context_tracking_enabled);
- }
+ int recursion;
+
+ recursion = __this_cpu_inc_return(context_tracking.recursion);
+ if (recursion == 1) /* outermost entry on this CPU */
+ return true;
+
+ WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
+ __this_cpu_dec(context_tracking.recursion); /* undo the increment */
+
+ return false;
+}
+
+static void context_tracking_recursion_exit(void)
+{
+ __this_cpu_dec(context_tracking.recursion); /* pairs with _enter() */
}
/**
@@ -75,6 +86,9 @@ void context_tracking_enter(enum ctx_state state)
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
+ if (!context_tracking_recursion_enter())
+ goto out_irq_restore;
+
if ( __this_cpu_read(context_tracking.state) != state) {
if (__this_cpu_read(context_tracking.active)) {
/*
@@ -105,6 +119,8 @@ void context_tracking_enter(enum ctx_state state)
*/
__this_cpu_write(context_tracking.state, state);
}
+ context_tracking_recursion_exit();
+out_irq_restore:
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_enter);
@@ -139,6 +155,9 @@ void context_tracking_exit(enum ctx_state state)
return;
local_irq_save(flags);
+ if (!context_tracking_recursion_enter())
+ goto out_irq_restore;
+
if (__this_cpu_read(context_tracking.state) == state) {
if (__this_cpu_read(context_tracking.active)) {
/*
@@ -153,6 +172,8 @@ void context_tracking_exit(enum ctx_state state)
}
__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
+ context_tracking_recursion_exit();
+out_irq_restore:
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_exit);
@@ -164,24 +185,26 @@ void context_tracking_user_exit(void)
}
NOKPROBE_SYMBOL(context_tracking_user_exit);
-/**
- * __context_tracking_task_switch - context switch the syscall callbacks
- * @prev: the task that is being switched out
- * @next: the task that is being switched in
- *
- * The context tracking uses the syscall slow path to implement its user-kernel
- * boundaries probes on syscalls. This way it doesn't impact the syscall fast
- * path on CPUs that don't do context tracking.
- *
- * But we need to clear the flag on the previous task because it may later
- * migrate to some CPU that doesn't do the context tracking. As such the TIF
- * flag may not be desired there.
- */
-void __context_tracking_task_switch(struct task_struct *prev,
- struct task_struct *next)
+void __init context_tracking_cpu_set(int cpu)
{
- clear_tsk_thread_flag(prev, TIF_NOHZ);
- set_tsk_thread_flag(next, TIF_NOHZ);
+ static __initdata bool initialized = false;
+
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+
+ if (initialized)
+ return;
+
+ /*
+ * Set TIF_NOHZ on init/0 and let it propagate to all tasks through fork.
+ * This assumes that init is the only task at this early boot stage.
+ */
+ set_tsk_thread_flag(&init_task, TIF_NOHZ);
+ WARN_ON_ONCE(!tasklist_empty());
+
+ initialized = true;
}
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 94bbe4695..5644ec558 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,7 @@
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
+#include <linux/irq.h>
#include <trace/events/power.h>
#include "smpboot.h"
@@ -392,14 +393,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
smpboot_park_threads(cpu);
/*
- * So now all preempt/rcu users must observe !cpu_active().
+ * Prevent irq alloc/free while the dying cpu reorganizes the
+ * interrupt affinities.
*/
+ irq_lock_sparse();
+ /*
+ * So now all preempt/rcu users must observe !cpu_active().
+ */
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
- smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+ irq_unlock_sparse();
goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -416,6 +422,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
per_cpu(cpu_dead_idle, cpu) = false;
+ /* Interrupts are moved away from the dying cpu, reenable alloc/free */
+ irq_unlock_sparse();
+
hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);
@@ -463,6 +472,7 @@ static int smpboot_thread_call(struct notifier_block *nfb,
switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
case CPU_ONLINE:
smpboot_unpark_threads(cpu);
break;
@@ -479,7 +489,7 @@ static struct notifier_block smpboot_thread_notifier = {
.priority = CPU_PRI_SMPBOOT,
};
-void __cpuinit smpboot_thread_init(void)
+void smpboot_thread_init(void)
{
register_cpu_notifier(&smpboot_thread_notifier);
}
@@ -519,6 +529,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
+
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee14e3a35..f0acff0f6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
- update_nodemasks_hier(cs, &cs->mems_allowed);
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
return retval;
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index d12807d40..ef90b04d7 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
- t3 = tsk_seruntime(tsk);
+ t3 = tsk->se.sum_exec_runtime;
d->cpu_count += t1;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0ceb38677..e6feb5114 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,7 +36,7 @@
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
@@ -51,9 +51,11 @@
static struct workqueue_struct *perf_wq;
+typedef int (*remote_function_f)(void *);
+
struct remote_function_call {
struct task_struct *p;
- int (*func)(void *info);
+ remote_function_f func;
void *info;
int ret;
};
@@ -86,7 +88,7 @@ static void remote_function(void *data)
* -EAGAIN - when the process moved away
*/
static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = p,
@@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
*
* returns: @func return value or -ENXIO when the cpu is offline
*/
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = NULL,
@@ -747,62 +749,31 @@ perf_cgroup_mark_enabled(struct perf_event *event,
/*
* function must be called with interrupts disbled
*/
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
int rotations = 0;
WARN_ON(!irqs_disabled());
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
rotations = perf_rotate_context(cpuctx);
- /*
- * arm timer if needed
- */
- if (rotations) {
+ raw_spin_lock(&cpuctx->hrtimer_lock);
+ if (rotations)
hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
- ret = HRTIMER_RESTART;
- }
-
- return ret;
-}
-
-/* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
-
- if (WARN_ON(cpu != smp_processor_id()))
- return;
-
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- if (pmu->task_ctx_nr == perf_sw_context)
- continue;
-
- hrtimer_cancel(&cpuctx->hrtimer);
- }
-
- rcu_read_unlock();
+ else
+ cpuctx->hrtimer_active = 0;
+ raw_spin_unlock(&cpuctx->hrtimer_lock);
- local_irq_restore(flags);
+ return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
+ struct hrtimer *timer = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
- int timer;
+ u64 interval;
/* no multiplexing needed for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
@@ -812,31 +783,36 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
* check default is sane, if not set then force to
* default interval (1/tick)
*/
- timer = pmu->hrtimer_interval_ms;
- if (timer < 1)
- timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+ interval = pmu->hrtimer_interval_ms;
+ if (interval < 1)
+ interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- hr->function = perf_cpu_hrtimer_handler;
+ raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ timer->function = perf_mux_hrtimer_handler;
}
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
+ struct hrtimer *timer = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
+ unsigned long flags;
/* not for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
- return;
+ return 0;
- if (hrtimer_active(hr))
- return;
+ raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+ if (!cpuctx->hrtimer_active) {
+ cpuctx->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ }
+ raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
- if (!hrtimer_callback_running(hr))
- __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
- 0, HRTIMER_MODE_REL_PINNED, 0);
+ return 0;
}
void perf_pmu_disable(struct pmu *pmu)
@@ -1526,11 +1502,17 @@ static int __init perf_workqueue_init(void)
core_initcall(perf_workqueue_init);
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu; /* no ->filter_match hook means match */
+ return pmu->filter_match ? pmu->filter_match(event) : 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event);
+ && perf_cgroup_match(event) && pmu_filter_match(event);
}
static void
@@ -1886,8 +1868,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
perf_set_shadow_time(event, ctx, tstamp);
perf_log_itrace_start(event);
@@ -1899,6 +1879,8 @@ event_sched_in(struct perf_event *event,
goto out;
}
+ event->tstamp_running += tstamp - event->tstamp_stopped;
+
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -1935,7 +1917,7 @@ group_sched_in(struct perf_event *group_event,
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -1982,7 +1964,7 @@ group_error:
pmu->cancel_txn(pmu);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
return -EAGAIN;
}
@@ -2255,7 +2237,7 @@ static int __perf_event_enable(void *info)
*/
if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
- perf_cpu_hrtimer_restart(cpuctx);
+ perf_mux_hrtimer_restart(cpuctx);
}
if (leader->attr.pinned) {
update_group_times(leader);
@@ -3976,28 +3958,21 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
-{
- struct perf_event_context *ctx = event->ctx;
- int ret = 0, active;
+struct period_event {
+ struct perf_event *event;
u64 value;
+};
- if (!is_sampling_event(event))
- return -EINVAL;
-
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
-
- if (!value)
- return -EINVAL;
+static int __perf_event_period(void *info)
+{
+ struct period_event *pe = info;
+ struct perf_event *event = pe->event;
+ struct perf_event_context *ctx = event->ctx;
+ u64 value = pe->value;
+ bool active;
- raw_spin_lock_irq(&ctx->lock);
+ raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
- if (value > sysctl_perf_event_sample_rate) {
- ret = -EINVAL;
- goto unlock;
- }
-
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
@@ -4016,11 +3991,73 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
+ raw_spin_unlock(&ctx->lock);
-unlock:
+ return 0;
+}
+
+/*
+ * Set a new sample period (or sample frequency, for attr.freq events) on
+ * @event from the u64 that @arg points to in user space.
+ *
+ * Returns 0 on success, -EINVAL for a non-sampling event, a zero value or a
+ * frequency above sysctl_perf_event_sample_rate, and -EFAULT if @arg cannot
+ * be copied.
+ */
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+ struct period_event pe = { .event = event, };
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task;
+ u64 value;
+
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ if (copy_from_user(&value, arg, sizeof(value)))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ if (event->attr.freq && value > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+
+ task = ctx->task;
+ pe.value = value;
+
+ if (!task) { /* ctx->task is NULL: run on event->cpu */
+ cpu_function_call(event->cpu, __perf_event_period, &pe);
+ return 0;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_event_period, &pe))
+ return 0;
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ task = ctx->task;
+ goto retry;
+ }
+
+ /*
+ * The context is inactive and ctx->lock is held, so update the event
+ * here. Calling __perf_event_period() would take ctx->lock a second
+ * time and deadlock.
+ */
+ if (event->attr.freq) {
+ event->attr.sample_freq = value;
+ } else {
+ event->attr.sample_period = value;
+ event->hw.sample_period = value;
+ }
+
+ local64_set(&event->hw.period_left, 0);
raw_spin_unlock_irq(&ctx->lock);
- return ret;
+ return 0;
}
static const struct file_operations perf_fops;
@@ -4376,14 +4393,6 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_unlock();
}
-static void rb_free_rcu(struct rcu_head *rcu_head)
-{
- struct ring_buffer *rb;
-
- rb = container_of(rcu_head, struct ring_buffer, rcu_head);
- rb_free(rb);
-}
-
struct ring_buffer *ring_buffer_get(struct perf_event *event)
{
struct ring_buffer *rb;
@@ -4766,12 +4775,20 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+ /* only the parent has fasync state, so redirect child events to it */
+ if (event->parent)
+ event = event->parent;
+ return &event->fasync;
+}
+
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
if (event->pending_kill) {
- kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+ kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
event->pending_kill = 0;
}
}
@@ -5381,9 +5398,9 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-static void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_output_handle handle;
struct perf_event_header header;
@@ -5812,7 +5829,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+ name = file_path(file, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
@@ -5975,6 +5992,39 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
}
/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 lost;
+ } lost_samples_event = {
+ .header = {
+ .type = PERF_RECORD_LOST_SAMPLES,
+ .misc = 0,
+ .size = sizeof(lost_samples_event),
+ },
+ .lost = lost,
+ };
+
+ perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ lost_samples_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, lost_samples_event);
+ perf_event__output_id_sample(event, &handle, &sample);
+ perf_output_end(&handle);
+}
+
+/*
* IRQ throttle logging
*/
@@ -6117,7 +6167,7 @@ static int __perf_event_overflow(struct perf_event *event,
else
perf_event_output(event, data, regs);
- if (event->fasync && event->pending_kill) {
+ if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
irq_work_queue(&event->pending);
}
@@ -6864,9 +6914,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
} else {
period = max_t(u64, 10000, hwc->sample_period);
}
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+ HRTIMER_MODE_REL_PINNED);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -7167,6 +7216,8 @@ perf_event_mux_interval_ms_show(struct device *dev,
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}
+static DEFINE_MUTEX(mux_interval_mutex);
+
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
struct device_attribute *attr,
@@ -7186,17 +7237,21 @@ perf_event_mux_interval_ms_store(struct device *dev,
if (timer == pmu->hrtimer_interval_ms)
return count;
+ mutex_lock(&mux_interval_mutex);
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
- for_each_possible_cpu(cpu) {
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- if (hrtimer_active(&cpuctx->hrtimer))
- hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ cpu_function_call(cpu,
+ (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
}
+ put_online_cpus();
+ mutex_unlock(&mux_interval_mutex);
return count;
}
@@ -7301,7 +7356,7 @@ skip_type:
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
- __perf_cpu_hrtimer_init(cpuctx, cpu);
+ __perf_mux_hrtimer_init(cpuctx, cpu);
cpuctx->unique_pmu = pmu;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 9f6ce9ba4..2bbad9c12 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,6 +11,7 @@
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
+ struct irq_work irq_work;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
@@ -55,6 +56,15 @@ struct ring_buffer {
};
extern void rb_free(struct ring_buffer *rb);
+
+static inline void rb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct ring_buffer *rb;
+
+ rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+ rb_free(rb);
+}
+
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
@@ -72,15 +82,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb)
void perf_event_aux_event(struct perf_event *event, unsigned long head,
unsigned long size, u64 flags);
-extern void
-perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event);
-extern void
-perf_event__output_id_sample(struct perf_event *event,
- struct perf_output_handle *handle,
- struct perf_sample_data *sample);
-
extern struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 725c41608..c8aa3f75b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
do {
- tail = ACCESS_ONCE(rb->user_page->data_tail);
+ tail = READ_ONCE_CTRL(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
if (!rb->overwrite &&
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
@@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
+static void rb_irq_work(struct irq_work *work);
+
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
@@ -241,6 +243,16 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
+ init_irq_work(&rb->irq_work, rb_irq_work);
+}
+
+static void ring_buffer_put_async(struct ring_buffer *rb)
+{
+ if (!atomic_dec_and_test(&rb->refcount))
+ return;
+
+ rb->rcu_head.next = (void *)rb;
+ irq_work_queue(&rb->irq_work);
}
/*
@@ -319,7 +331,7 @@ err_put:
rb_free_aux(rb);
err:
- ring_buffer_put(rb);
+ ring_buffer_put_async(rb);
handle->event = NULL;
return NULL;
@@ -370,7 +382,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
local_set(&rb->aux_nest, 0);
rb_free_aux(rb);
- ring_buffer_put(rb);
+ ring_buffer_put_async(rb);
}
/*
@@ -547,17 +559,30 @@ static void __rb_free_aux(struct ring_buffer *rb)
rb->aux_priv = NULL;
}
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
}
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
+ irq_work_queue(&rb->irq_work);
+}
+
+static void rb_irq_work(struct irq_work *work)
+{
+ struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
+
+ if (!atomic_read(&rb->aux_refcount))
__rb_free_aux(rb);
+
+ if (rb->rcu_head.next == (void *)rb)
+ call_rcu(&rb->rcu_head, rb_free_rcu);
}
#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/exit.c b/kernel/exit.c
index 490a707c7..031325e9a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -135,7 +135,7 @@ static void __exit_signal(struct task_struct *tsk)
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk_seruntime(tsk);
+ sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
write_sequnlock(&sig->stats_lock);
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- unmark_oom_victim();
+ exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -711,10 +711,10 @@ void do_exit(long code)
current->comm, task_pid_nr(current),
preempt_count());
- acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
+ acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index e37f372d3..d6dfe2c23 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -138,7 +138,7 @@ static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
{
- return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node);
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}
static inline void free_task_struct(struct task_struct *tsk)
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
void __init fork_init(void)
{
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -295,7 +300,7 @@ void __init fork_init(void)
#endif
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
- kmem_cache_create("task_struct", sizeof(struct task_struct),
+ kmem_cache_create("task_struct", arch_task_struct_size,
ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif
@@ -456,7 +461,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
- vma_get_file(tmp);
+ get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
i_mmap_lock_write(mapping);
@@ -1091,10 +1096,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
unsigned long cpu_limit;
- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
sig->cputimer.running = 1;
@@ -1144,10 +1146,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
-#ifdef CONFIG_CGROUPS
- init_rwsem(&sig->group_rwsem);
-#endif
-
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1241,7 +1239,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
- int trace)
+ int trace,
+ unsigned long tls)
{
int retval;
struct task_struct *p;
@@ -1396,6 +1395,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
+
+ p->pagefault_disabled = 0;
+
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
@@ -1447,7 +1449,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p);
+ retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -1659,7 +1661,7 @@ static inline void init_idle_pids(struct pid_link *links)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1674,11 +1676,12 @@ struct task_struct *fork_idle(int cpu)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long do_fork(unsigned long clone_flags,
+long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
- int __user *child_tidptr)
+ int __user *child_tidptr,
+ unsigned long tls)
{
struct task_struct *p;
int trace = 0;
@@ -1703,7 +1706,7 @@ long do_fork(unsigned long clone_flags,
}
p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace);
+ child_tidptr, NULL, trace, tls);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -1744,20 +1747,34 @@ long do_fork(unsigned long clone_flags,
return nr;
}
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+/* For compatibility with architectures that call do_fork directly rather than
+ * using the syscall entry points below. */
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ return _do_fork(clone_flags, stack_start, stack_size,
+ parent_tidptr, child_tidptr, 0);
+}
+#endif
+
/*
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL);
+ return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL, 0);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+ return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -1768,8 +1785,8 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL);
+ return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL, 0);
}
#endif
@@ -1777,27 +1794,27 @@ SYSCALL_DEFINE0(vfork)
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
- int, tls_val,
+ unsigned long, tls,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#endif
{
- return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+ return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407f..c4a182f53 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
/*
* The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must later call wake_up_q() for the actual wakeups to
+ * occur.
*/
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
{
struct task_struct *p = q->task;
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
return;
/*
- * We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non-futex wake up happens on another CPU then the task
- * might exit and p would dereference a non-existing task
- * struct. Prevent this by holding a reference on p across the
- * wake up.
+ * Queue the task for a later wakeup, after we've released
+ * the hb->lock. wake_q_add() grabs a reference to p.
*/
- get_task_struct(p);
-
+ wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
@@ -1117,16 +1115,16 @@ static void wake_futex(struct futex_q *q)
*/
smp_wmb();
q->lock_ptr = NULL;
-
- wake_up_state(p, TASK_NORMAL);
- put_task_struct(p);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+ struct futex_hash_bucket *hb)
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ WAKE_Q(wake_q);
+ bool deboost;
int ret = 0;
if (!pi_state)
@@ -1178,7 +1176,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
raw_spin_unlock_irq(&new_owner->pi_lock);
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
+
+ deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+
+ /*
+ * First, unlock HB so the waiter does not spin on it once it has been
+ * woken up. Second, wake up the waiter before the priority is adjusted.
+ * If we deboost first (and lose our higher priority), then the task might
+ * get scheduled away before the wakeup can take place.
+ */
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
+ if (deboost)
+ rt_mutex_adjust_prio(current);
return 0;
}
@@ -1217,6 +1227,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
+ WAKE_Q(wake_q);
if (!bitset)
return -EINVAL;
@@ -1244,13 +1255,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
}
spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
out_put_key:
put_futex_key(&key);
out:
@@ -1269,6 +1281,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
+ WAKE_Q(wake_q);
retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1333,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++ret >= nr_wake)
break;
}
@@ -1334,7 +1347,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
if (++op_ret >= nr_wake2)
break;
}
@@ -1344,6 +1357,7 @@ retry_private:
out_unlock:
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
out_put_keys:
put_futex_key(&key2);
out_put_key1:
@@ -1503,6 +1517,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
+ WAKE_Q(wake_q);
if (requeue_pi) {
/*
@@ -1679,7 +1694,7 @@ retry_private:
* woken by futex_unlock_pi().
*/
if (++task_count <= nr_wake && !requeue_pi) {
- wake_futex(this);
+ mark_wake_futex(&wake_q, this);
continue;
}
@@ -1719,6 +1734,7 @@ retry_private:
out_unlock:
free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
+ wake_up_q(&wake_q);
hb_waiters_dec(hb2);
/*
@@ -2055,7 +2071,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
{
/*
* The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using set_mb() and
+ * wake it. set_current_state() is implemented using smp_store_mb() and
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
@@ -2063,11 +2079,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
queue_me(q, hb);
/* Arm the timer */
- if (timeout) {
+ if (timeout)
hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
/*
* If we have been removed from the hash list, then another task
@@ -2412,13 +2425,23 @@ retry:
*/
match = futex_top_waiter(hb, &key);
if (match) {
- ret = wake_futex_pi(uaddr, uval, match);
+ ret = wake_futex_pi(uaddr, uval, match, hb);
+ /*
+ * In case of success wake_futex_pi dropped the hash
+ * bucket lock.
+ */
+ if (!ret)
+ goto out_putkey;
/*
* The atomic access to the futex value generated a
* pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
goto out_unlock;
}
@@ -2439,6 +2462,7 @@ retry:
out_unlock:
spin_unlock(&hb->lock);
+out_putkey:
put_futex_key(&key);
return ret;
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index a744098e4..7080ae1eb 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -92,6 +92,12 @@ void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_time_profile);
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 826ba9fb5..e25e92fb4 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1
+#define GCOV_COUNTERS 10
+#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
#else
#define GCOV_COUNTERS 8
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eb9a4ea39..ae216824e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -719,15 +719,9 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
}
void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
- const char *name)
+__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
+ int is_chained, const char *name)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
if (!handle) {
handle = handle_bad_irq;
} else {
@@ -749,13 +743,13 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
* right away.
*/
if (WARN_ON(is_chained))
- goto out;
+ return;
/* Try the parent */
irq_data = irq_data->parent_data;
}
#endif
if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
- goto out;
+ return;
}
/* Uninstall? */
@@ -774,12 +768,41 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
irq_settings_set_nothread(desc);
irq_startup(desc, true);
}
-out:
+}
+
+void
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+ if (!desc)
+ return;
+
+ __irq_do_set_handler(desc, handle, is_chained, name);
irq_put_desc_busunlock(desc, flags);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
void
+irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+ void *data)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+ if (!desc)
+ return;
+
+ __irq_do_set_handler(desc, handle, 1, NULL);
+ desc->irq_data.handler_data = data;
+
+ irq_put_desc_busunlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
+
+void
irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
@@ -876,6 +899,34 @@ void irq_cpu_offline(void)
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
/**
+ * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
+ * NULL)
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_enable_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_enable)
+ data->chip->irq_enable(data);
+ else
+ data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if
+ * NULL)
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_disable_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_disable)
+ data->chip->irq_disable(data);
+ else
+ data->chip->irq_mask(data);
+}
+
+/**
* irq_chip_ack_parent - Acknowledge the parent interrupt
* @data: Pointer to interrupt specific data
*/
@@ -934,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
}
/**
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_set_type)
+ return data->chip->irq_set_type(data, type);
+
+ return -ENOSYS;
+}
+
+/**
* irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
* @data: Pointer to interrupt specific data
*
@@ -946,6 +1014,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
if (data->chip && data->chip->irq_retrigger)
return data->chip->irq_retrigger(data);
+ return 0;
+}
+
+/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @vcpu_info: The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_vcpu_affinity)
+ return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
return -ENOSYS;
}
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 2feb6feca..326a67f24 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -42,6 +42,7 @@ struct irq_chip no_irq_chip = {
.irq_enable = noop,
.irq_disable = noop,
.irq_ack = ack_bad,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
/*
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 61024e8ab..15b370daf 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -360,7 +360,7 @@ static struct lock_class_key irq_nested_lock_class;
int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_hw_number_t hw_irq)
{
- struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
struct irq_domain_chip_generic *dgc = d->gc;
struct irq_chip_generic *gc;
struct irq_chip_type *ct;
@@ -405,8 +405,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
else
data->mask = 1 << idx;
- irq_set_chip_and_handler(virq, chip, ct->handler);
- irq_set_chip_data(virq, gc);
+ irq_domain_set_info(d, virq, hw_irq, chip, gc, ct->handler, NULL, NULL);
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af..61008b843 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,8 +59,6 @@ enum {
#include "debug.h"
#include "settings.h"
-#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
-
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
@@ -78,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
-extern void irq_lock_sparse(void);
-extern void irq_unlock_sparse(void);
#else
extern void irq_mark_irq(unsigned int irq);
-static inline void irq_lock_sparse(void) { }
-static inline void irq_unlock_sparse(void) { }
#endif
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
@@ -170,27 +164,27 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
*/
static inline void irqd_set_move_pending(struct irq_data *d)
{
- d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+ __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING;
}
static inline void irqd_clr_move_pending(struct irq_data *d)
{
- d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+ __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
}
static inline void irqd_clear(struct irq_data *d, unsigned int mask)
{
- d->state_use_accessors &= ~mask;
+ __irqd_to_state(d) &= ~mask;
}
static inline void irqd_set(struct irq_data *d, unsigned int mask)
{
- d->state_use_accessors |= mask;
+ __irqd_to_state(d) |= mask;
}
static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
{
- return d->state_use_accessors & mask;
+ return __irqd_to_state(d) & mask;
}
static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
@@ -199,6 +193,11 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
__this_cpu_inc(kstat.irqs_sum);
}
+static inline int irq_desc_get_node(struct irq_desc *desc)
+{
+ return irq_data_get_node(&desc->irq_data);
+}
+
#ifdef CONFIG_PM_SLEEP
bool irq_pm_check_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6..4afc45761 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -59,16 +59,10 @@ static void desc_smp_init(struct irq_desc *desc, int node)
#endif
}
-static inline int desc_node(struct irq_desc *desc)
-{
- return desc->irq_data.node;
-}
-
#else
static inline int
alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
static inline void desc_smp_init(struct irq_desc *desc, int node) { }
-static inline int desc_node(struct irq_desc *desc) { return 0; }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
@@ -76,6 +70,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
{
int cpu;
+ desc->irq_data.common = &desc->irq_common_data;
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
@@ -299,7 +294,7 @@ static void free_desc(unsigned int irq)
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, desc_node(desc), NULL);
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
@@ -619,7 +614,7 @@ unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
int cpu;
- int sum = 0;
+ unsigned int sum = 0;
if (!desc || !desc->kstat_irqs)
return 0;
@@ -639,7 +634,7 @@ unsigned int kstat_irqs(unsigned int irq)
*/
unsigned int kstat_irqs_usr(unsigned int irq)
{
- int sum;
+ unsigned int sum;
irq_lock_sparse();
sum = kstat_irqs(irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7fac31105..8c3577fef 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -830,10 +830,12 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
{
struct irq_data *irq_data;
- irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+ irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL,
+ irq_data_get_node(child));
if (irq_data) {
child->parent_data = irq_data;
irq_data->irq = child->irq;
+ irq_data->common = child->common;
irq_data->node = child->node;
irq_data->domain = domain;
}
@@ -1232,6 +1234,27 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
}
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hardware interrupt number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated interrupt chip data
+ * @handler: The interrupt flow handler
+ * @handler_data: The interrupt flow handler data
+ * @handler_name: The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data, irq_flow_handler_t handler,
+ void *handler_data, const char *handler_name)
+{
+ irq_set_chip_and_handler_name(virq, chip, handler, handler_name);
+ irq_set_chip_data(virq, chip_data);
+ irq_set_handler_data(virq, handler_data);
+}
+
static void irq_domain_check_hierarchy(struct irq_domain *domain)
{
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e68932bb3..f9744853b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,37 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+/**
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data
+ *
+ * This function uses the vCPU specific data to set the vCPU
+ * affinity for an irq. The vCPU specific data is passed from
+ * outside, such as KVM. One example code path is as below:
+ * KVM -> IOMMU -> irq_set_vcpu_affinity().
+ *
+ * The chip callback is invoked between irq_get_desc_lock() and
+ * irq_put_desc_unlock() on the descriptor of @irq.
+ *
+ * Return: -EINVAL if no descriptor could be obtained for @irq, -ENOSYS if
+ * the chip is missing or provides no irq_set_vcpu_affinity callback,
+ * otherwise the value returned by that callback.
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ struct irq_data *data;
+ struct irq_chip *chip;
+ int ret = -ENOSYS; /* reported when no callback is available */
+
+ if (!desc)
+ return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+ irq_put_desc_unlock(desc, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
static void irq_affinity_notify(struct work_struct *work)
{
struct irq_affinity_notify *notify =
@@ -332,7 +363,7 @@ static int
setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
struct cpumask *set = irq_default_affinity;
- int node = desc->irq_data.node;
+ int node = irq_desc_get_node(desc);
/* Excludes PER_CPU and NO_BALANCE interrupts */
if (!irq_can_set_affinity(irq))
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ca3f4aaff..37ddb7bda 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,21 +7,21 @@
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
- struct irq_chip *chip = idata->chip;
+ struct irq_chip *chip = desc->irq_data.chip;
if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
return;
+ irqd_clr_move_pending(&desc->irq_data);
+
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (!irqd_can_balance(&desc->irq_data)) {
+ if (irqd_is_per_cpu(&desc->irq_data)) {
WARN_ON(1);
return;
}
- irqd_clr_move_pending(&desc->irq_data);
-
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
@@ -52,6 +52,13 @@ void irq_move_irq(struct irq_data *idata)
{
bool masked;
+ /*
+ * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled,
+ * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is
+ * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here.
+ */
+ idata = irq_desc_get_irq_data(irq_data_to_desc(idata));
+
if (likely(!irqd_is_setaffinity_pending(idata)))
return;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 474de5cb3..7bf1f1bbb 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -124,7 +124,7 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-static struct irq_domain_ops msi_domain_ops = {
+static const struct irq_domain_ops msi_domain_ops = {
.alloc = msi_domain_alloc,
.free = msi_domain_free,
.activate = msi_domain_activate,
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 5204a6d1b..d22786a6d 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -123,6 +123,8 @@ void suspend_device_irqs(void)
unsigned long flags;
bool sync;
+ if (irq_settings_is_nested_thread(desc))
+ continue;
raw_spin_lock_irqsave(&desc->lock, flags);
sync = suspend_device_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -163,6 +165,8 @@ static void resume_irqs(bool want_early)
if (!is_early && want_early)
continue;
+ if (irq_settings_is_nested_thread(desc))
+ continue;
raw_spin_lock_irqsave(&desc->lock, flags);
resume_irq(desc, irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index df2f4642d..0e97c142c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -241,7 +241,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long) m->private);
- seq_printf(m, "%d\n", desc->irq_data.node);
+ seq_printf(m, "%d\n", irq_desc_get_node(desc));
return 0;
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 9019f15de..52ebaca1b 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -302,7 +302,7 @@ static int jump_label_add_module(struct module *mod)
continue;
key = iterk;
- if (__module_address(iter->key) == mod) {
+ if (within_module(iter->key, mod)) {
/*
* Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
*/
@@ -339,7 +339,7 @@ static void jump_label_del_module(struct module *mod)
key = (struct static_key *)(unsigned long)iter->key;
- if (__module_address(iter->key) == mod)
+ if (within_module(iter->key, mod))
continue;
prev = &key->next;
@@ -443,14 +443,16 @@ static void jump_label_update(struct static_key *key, int enable)
{
struct jump_entry *stop = __stop___jump_table;
struct jump_entry *entry = jump_label_get_entries(key);
-
#ifdef CONFIG_MODULES
- struct module *mod = __module_address((unsigned long)key);
+ struct module *mod;
__jump_label_mod_update(key, enable);
+ preempt_disable();
+ mod = __module_address((unsigned long)key);
if (mod)
stop = mod->jump_entries + mod->num_jump_entries;
+ preempt_enable();
#endif
/* if there are no users, entry can be NULL */
if (entry)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7a36fdcca..a785c1015 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -84,6 +84,17 @@ struct resource crashk_low_res = {
int kexec_should_crash(struct task_struct *p)
{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in do_exit() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
return 1;
return 0;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c4237f12c..fdea0bee7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -97,6 +97,7 @@ bool kthread_should_park(void)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
}
+EXPORT_SYMBOL_GPL(kthread_should_park);
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
@@ -171,6 +172,7 @@ void kthread_parkme(void)
{
__kthread_parkme(to_kthread(current));
}
+EXPORT_SYMBOL_GPL(kthread_parkme);
static int kthread(void *_create)
{
@@ -272,7 +274,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
DECLARE_COMPLETION_ONSTACK(done);
struct task_struct *task;
struct kthread_create_info *create = kmalloc(sizeof(*create),
- GFP_KERNEL | ___GFP_TOI_NOTRACK);
+ GFP_KERNEL);
if (!create)
return ERR_PTR(-ENOMEM);
@@ -411,6 +413,7 @@ void kthread_unpark(struct task_struct *k)
if (kthread)
__kthread_unpark(k, kthread);
}
+EXPORT_SYMBOL_GPL(kthread_unpark);
/**
* kthread_park - park a thread created by kthread_create().
@@ -441,6 +444,7 @@ int kthread_park(struct task_struct *k)
}
return ret;
}
+EXPORT_SYMBOL_GPL(kthread_park);
/**
* kthread_stop - stop a thread created by kthread_create().
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 9ec555732..c40ebcca0 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -128,7 +128,7 @@ static bool klp_is_patch_registered(struct klp_patch *patch)
static bool klp_initialized(void)
{
- return klp_root_kobj;
+ return !!klp_root_kobj;
}
struct klp_find_arg {
@@ -242,8 +242,9 @@ static int klp_find_verify_func_addr(struct klp_object *obj,
int ret;
#if defined(CONFIG_RANDOMIZE_BASE)
- /* KASLR is enabled, disregard old_addr from user */
- func->old_addr = 0;
+ /* If KASLR has been enabled, adjust old_addr accordingly */
+ if (kaslr_enabled() && func->old_addr)
+ func->old_addr += kaslr_offset();
#endif
if (!func->old_addr || klp_is_module(obj))
@@ -430,7 +431,7 @@ static void klp_disable_object(struct klp_object *obj)
{
struct klp_func *func;
- for (func = obj->funcs; func->old_name; func++)
+ klp_for_each_func(obj, func)
if (func->state == KLP_ENABLED)
klp_disable_func(func);
@@ -448,7 +449,7 @@ static int klp_enable_object(struct klp_object *obj)
if (WARN_ON(!klp_is_object_loaded(obj)))
return -EINVAL;
- for (func = obj->funcs; func->old_name; func++) {
+ klp_for_each_func(obj, func) {
ret = klp_enable_func(func);
if (ret) {
klp_disable_object(obj);
@@ -471,7 +472,7 @@ static int __klp_disable_patch(struct klp_patch *patch)
pr_notice("disabling patch '%s'\n", patch->mod->name);
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_object(patch, obj) {
if (obj->state == KLP_ENABLED)
klp_disable_object(obj);
}
@@ -531,7 +532,7 @@ static int __klp_enable_patch(struct klp_patch *patch)
pr_notice("enabling patch '%s'\n", patch->mod->name);
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_object(patch, obj) {
if (!klp_is_object_loaded(obj))
continue;
@@ -659,6 +660,15 @@ static struct kobj_type klp_ktype_patch = {
.default_attrs = klp_patch_attrs,
};
+static void klp_kobj_release_object(struct kobject *kobj)
+{
+}
+
+static struct kobj_type klp_ktype_object = {
+ .release = klp_kobj_release_object,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
static void klp_kobj_release_func(struct kobject *kobj)
{
}
@@ -688,7 +698,7 @@ static void klp_free_object_loaded(struct klp_object *obj)
obj->mod = NULL;
- for (func = obj->funcs; func->old_name; func++)
+ klp_for_each_func(obj, func)
func->old_addr = 0;
}
@@ -703,7 +713,7 @@ static void klp_free_objects_limited(struct klp_patch *patch,
for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
klp_free_funcs_limited(obj, NULL);
- kobject_put(obj->kobj);
+ kobject_put(&obj->kobj);
}
}
@@ -721,7 +731,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
func->state = KLP_DISABLED;
return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- obj->kobj, "%s", func->old_name);
+ &obj->kobj, "%s", func->old_name);
}
/* parts of the initialization that is done only when the object is loaded */
@@ -737,7 +747,7 @@ static int klp_init_object_loaded(struct klp_patch *patch,
return ret;
}
- for (func = obj->funcs; func->old_name; func++) {
+ klp_for_each_func(obj, func) {
ret = klp_find_verify_func_addr(obj, func);
if (ret)
return ret;
@@ -761,11 +771,12 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
klp_find_object_module(obj);
name = klp_is_module(obj) ? obj->name : "vmlinux";
- obj->kobj = kobject_create_and_add(name, &patch->kobj);
- if (!obj->kobj)
- return -ENOMEM;
+ ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object,
+ &patch->kobj, "%s", name);
+ if (ret)
+ return ret;
- for (func = obj->funcs; func->old_name; func++) {
+ klp_for_each_func(obj, func) {
ret = klp_init_func(obj, func);
if (ret)
goto free;
@@ -781,7 +792,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
free:
klp_free_funcs_limited(obj, func);
- kobject_put(obj->kobj);
+ kobject_put(&obj->kobj);
return ret;
}
@@ -802,7 +813,7 @@ static int klp_init_patch(struct klp_patch *patch)
if (ret)
goto unlock;
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_object(patch, obj) {
ret = klp_init_object(patch, obj);
if (ret)
goto free;
@@ -891,7 +902,7 @@ int klp_register_patch(struct klp_patch *patch)
}
EXPORT_SYMBOL_GPL(klp_register_patch);
-static void klp_module_notify_coming(struct klp_patch *patch,
+static int klp_module_notify_coming(struct klp_patch *patch,
struct klp_object *obj)
{
struct module *pmod = patch->mod;
@@ -899,22 +910,23 @@ static void klp_module_notify_coming(struct klp_patch *patch,
int ret;
ret = klp_init_object_loaded(patch, obj);
- if (ret)
- goto err;
+ if (ret) {
+ pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+ return ret;
+ }
if (patch->state == KLP_DISABLED)
- return;
+ return 0;
pr_notice("applying patch '%s' to loading module '%s'\n",
pmod->name, mod->name);
ret = klp_enable_object(obj);
- if (!ret)
- return;
-
-err:
- pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
- pmod->name, mod->name, ret);
+ if (ret)
+ pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+ return ret;
}
static void klp_module_notify_going(struct klp_patch *patch,
@@ -938,6 +950,7 @@ disabled:
static int klp_module_notify(struct notifier_block *nb, unsigned long action,
void *data)
{
+ int ret;
struct module *mod = data;
struct klp_patch *patch;
struct klp_object *obj;
@@ -957,13 +970,18 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action,
mod->klp_alive = false;
list_for_each_entry(patch, &klp_patches, list) {
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
if (action == MODULE_STATE_COMING) {
obj->mod = mod;
- klp_module_notify_coming(patch, obj);
+ ret = klp_module_notify_coming(patch, obj);
+ if (ret) {
+ obj->mod = NULL;
+ pr_warn("patch '%s' is in an inconsistent state!\n",
+ patch->mod->name);
+ }
} else /* MODULE_STATE_GOING */
klp_module_notify_going(patch, obj);
@@ -981,7 +999,7 @@ static struct notifier_block klp_module_nb = {
.priority = INT_MIN+1, /* called late but before ftrace notifier */
};
-static int klp_init(void)
+static int __init klp_init(void)
{
int ret;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cc..7dd5c9918 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
@@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf..951cfcd10 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
}
EXPORT_SYMBOL(lg_local_unlock_cpu);
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+ BUG_ON(cpu1 == cpu2); /* the two CPUs must be distinct */
+
+ /*
+ * Lock in cpu order, just like lg_global_lock: after the swap below
+ * cpu1 < cpu2, so the lower-numbered CPU's lock is always taken first
+ * regardless of the order in which the caller passed the two CPUs.
+ */
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
+
+ preempt_disable(); /* not re-enabled here; lg_double_unlock() does that */
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_); /* pairs with lock_acquire_shared() in lg_double_lock() */
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); /* released in the order passed; no cpu-order swap here */
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+ preempt_enable(); /* pairs with preempt_disable() in lg_double_lock() */
+}
+
void lg_global_lock(struct lglock *lg)
{
int i;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index aaeae885d..8acfbf773 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = lockstat_clock();
#endif
+ hlock->pin_count = 0;
if (check && !mark_irqflags(curr, hlock))
return 0;
@@ -3260,26 +3261,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
}
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
-{
- if (unlikely(!debug_locks))
- return 0;
- /*
- * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
- */
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return 0;
-
- if (curr->lockdep_depth <= 0)
- return print_unlock_imbalance_bug(curr, lock, ip);
-
- return 1;
-}
-
static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
{
if (hlock->instance == lock)
@@ -3376,31 +3357,35 @@ found_it:
}
/*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
-lock_release_non_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
+ struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
unsigned int depth;
int i;
- /*
- * Check whether the lock exists in the current stack
- * of held locks:
- */
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(!depth))
- return 0;
+ if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+ return print_unlock_imbalance_bug(curr, lock, ip);
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
prev_hlock = NULL;
for (i = depth-1; i >= 0; i--) {
hlock = curr->held_locks + i;
@@ -3419,6 +3404,8 @@ found_it:
if (hlock->instance == lock)
lock_release_holdtime(hlock);
+ WARN(hlock->pin_count, "releasing a pinned lock\n");
+
if (hlock->references) {
hlock->references--;
if (hlock->references) {
@@ -3456,91 +3443,66 @@ found_it:
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
return 0;
+
return 1;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+static int __lock_is_held(struct lockdep_map *lock)
{
- struct held_lock *hlock;
- unsigned int depth;
-
- /*
- * Pop off the top of the lock stack:
- */
- depth = curr->lockdep_depth - 1;
- hlock = curr->held_locks + depth;
-
- /*
- * Is the unlock non-nested:
- */
- if (hlock->instance != lock || hlock->references)
- return lock_release_non_nested(curr, lock, ip);
- curr->lockdep_depth--;
-
- /*
- * No more locks, but somehow we've got hash left over, who left it?
- */
- if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
- return 0;
+ struct task_struct *curr = current;
+ int i;
- curr->curr_chain_key = hlock->prev_chain_key;
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
- lock_release_holdtime(hlock);
+ if (match_held_lock(hlock, lock))
+ return 1;
+ }
-#ifdef CONFIG_DEBUG_LOCKDEP
- hlock->prev_chain_key = 0;
- hlock->class_idx = 0;
- hlock->acquire_ip = 0;
- hlock->irq_context = 0;
-#endif
- return 1;
+ return 0;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+static void __lock_pin_lock(struct lockdep_map *lock)
{
struct task_struct *curr = current;
+ int i;
- if (!check_unlock(curr, lock, ip))
+ if (unlikely(!debug_locks))
return;
- if (nested) {
- if (!lock_release_nested(curr, lock, ip))
- return;
- } else {
- if (!lock_release_non_nested(curr, lock, ip))
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ hlock->pin_count++;
return;
+ }
}
- check_chain_key(curr);
+ WARN(1, "pinning an unheld lock\n");
}
-static int __lock_is_held(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock)
{
struct task_struct *curr = current;
int i;
+ if (unlikely(!debug_locks))
+ return;
+
for (i = 0; i < curr->lockdep_depth; i++) {
struct held_lock *hlock = curr->held_locks + i;
- if (match_held_lock(hlock, lock))
- return 1;
+ if (match_held_lock(hlock, lock)) {
+ if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+ return;
+
+ hlock->pin_count--;
+ return;
+ }
}
- return 0;
+ WARN(1, "unpinning an unheld lock\n");
}
/*
@@ -3639,7 +3601,8 @@ void lock_release(struct lockdep_map *lock, int nested,
check_flags(flags);
current->lockdep_recursion = 1;
trace_lock_release(lock, ip);
- __lock_release(lock, nested, ip);
+ if (__lock_release(lock, nested, ip))
+ check_chain_key(current);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
@@ -3665,6 +3628,40 @@ int lock_is_held(struct lockdep_map *lock)
}
EXPORT_SYMBOL_GPL(lock_is_held);
+void lock_pin_lock(struct lockdep_map *lock)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion)) /* lockdep_recursion already set for current: do nothing */
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1; /* set only around the __lock_pin_lock() call */
+ __lock_pin_lock(lock); /* with debug_locks set: bumps pin_count of the matching held lock, WARNs if none matches */
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion)) /* lockdep_recursion already set for current: do nothing */
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1; /* set only around the __lock_unpin_lock() call */
+ __lock_unpin_lock(lock); /* with debug_locks set: drops one pin_count on the matching held lock, WARNs if unpinned or unheld */
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
current->lockdep_reclaim_gfp = gfp_mask;
@@ -4067,8 +4064,7 @@ void __init lockdep_info(void)
#ifdef CONFIG_DEBUG_LOCKDEP
if (lockdep_init_error) {
- printk("WARNING: lockdep init error! lock-%s was acquired"
- "before lockdep_init\n", lock_init_error);
+ printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
printk("Call stack leading to lockdep invocation was:\n");
print_stack_trace(&lockdep_init_trace, 0);
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ec8cce259..32244186f 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -122,12 +122,12 @@ static int torture_lock_busted_write_lock(void)
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_us = 100;
+ const unsigned long longdelay_ms = 100;
/* We want a long delay occasionally to force massive contention. */
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
preempt_schedule(); /* Allow test to be preempted. */
@@ -160,14 +160,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_us = 100;
+ const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
if (!(torture_random(trsp) %
(cxt.nrealwriters_stress * 2 * shortdelay_us)))
udelay(shortdelay_us);
@@ -309,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
static void torture_rwlock_read_unlock_irq(void)
__releases(torture_rwlock)
{
- write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+ read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
static struct lock_torture_ops rw_lock_irq_ops = {
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 75e114bdf..fd91aaa45 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
+ int count; /* nesting count, see qspinlock.c */
};
#ifndef arch_mcs_spin_lock_contended
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f..6c5da4839 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,5 +1,5 @@
/*
- * Queue read/write lock
+ * Queued read/write locks
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -22,6 +22,26 @@
#include <linux/hardirq.h>
#include <asm/qrwlock.h>
+/*
+ * This internal data structure is used for optimizing access to some of
+ * the subfields within the atomic_t cnts.
+ *
+ * The anonymous struct overlays cnts byte-wise: wmode is the first byte on
+ * little-endian and the last byte on big-endian, with rcnts[] filling the
+ * other three. NOTE(review): this presumably makes wmode the low-order
+ * byte of a 32-bit cnts and mirrors the layout of struct qrwlock -- confirm
+ * against the qrwlock header. queue_write_lock_slowpath() casts a
+ * struct qrwlock pointer to this type so it can cmpxchg() wmode on its own.
+ */
+struct __qrwlock {
+ union {
+ atomic_t cnts;
+ struct {
+#ifdef __LITTLE_ENDIAN
+ u8 wmode; /* Writer mode */
+ u8 rcnts[3]; /* Reader counts */
+#else
+ u8 rcnts[3]; /* Reader counts */
+ u8 wmode; /* Writer mode */
+#endif
+ };
+ };
+ arch_spinlock_t lock;
+};
+
/**
* rspin_until_writer_unlock - inc reader count & spin until writer is gone
* @lock : Pointer to queue rwlock structure
@@ -107,10 +127,10 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
* or wait for a previous writer to go away.
*/
for (;;) {
- cnts = atomic_read(&lock->cnts);
- if (!(cnts & _QW_WMASK) &&
- (atomic_cmpxchg(&lock->cnts, cnts,
- cnts | _QW_WAITING) == cnts))
+ struct __qrwlock *l = (struct __qrwlock *)lock;
+
+ if (!READ_ONCE(l->wmode) &&
+ (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
break;
cpu_relax_lowlatency();
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000..38c49202d
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES 8
+#else
+#define MAX_NODES 4
+#endif
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+
+/*
+ * encode_tail - pack a (cpu, nesting index) pair into a tail code word
+ * @cpu: CPU number that owns the queue node
+ * @idx: index of that CPU's node in mcs_nodes[] (nesting level, 0..3)
+ *
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ *
+ * Return: the tail code, already shifted into its position within the
+ * lock word. decode_tail() performs the inverse mapping.
+ */
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+ BUG_ON(idx > 3);
+#endif
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail) /* inverse of encode_tail() */
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; /* undo the +1 applied by encode_tail() */
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; /* nesting level */
+
+ return per_cpu_ptr(&mcs_nodes[idx], cpu); /* that CPU's queue node for this level */
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ *
+ * The union provides three views of the same lock word: the full atomic_t
+ * val, the individual locked and pending bytes, and two 16-bit halves
+ * (locked_pending and tail). The big-endian variant lists the members in
+ * the opposite order so that each name refers to the same bits of val on
+ * either byte order. In this file the 16-bit views are only used by the
+ * _Q_PENDING_BITS == 8 helpers below.
+ */
+struct __qspinlock {
+ union {
+ atomic_t val;
+#ifdef __LITTLE_ENDIAN
+ struct {
+ u8 locked;
+ u8 pending;
+ };
+ struct {
+ u16 locked_pending;
+ u16 tail;
+ };
+#else
+ struct {
+ u16 tail;
+ u16 locked_pending;
+ };
+ struct {
+ u8 reserved[2];
+ u8 pending;
+ u8 locked;
+ };
+#endif
+ };
+};
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Implemented as a single WRITE_ONCE() of _Q_LOCKED_VAL to the combined
+ * 16-bit locked_pending field, which overwrites the pending and locked
+ * bytes together and leaves the tail half-word untouched.
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ *
+ * Only the 16-bit tail half-word is exchanged: @tail is shifted down by
+ * _Q_TAIL_OFFSET for the xchg() and the old value is shifted back up, so
+ * the value returned here carries no locked/pending bits.
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * A single atomic_add() of (_Q_LOCKED_VAL - _Q_PENDING_VAL) performs both
+ * changes at once. This only yields the transition above when the pending
+ * bit is set and the locked field is clear on entry, which is the state the
+ * pending waiter in the slowpath has reached before calling this.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ *
+ * Implemented as a cmpxchg loop over the whole lock word: the current
+ * locked and pending bits are preserved and everything else is replaced
+ * by @tail. Note that the value returned is the entire previous lock word
+ * (including its locked/pending bits); the caller masks it with
+ * _Q_TAIL_MASK before using it.
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new, val = atomic_read(&lock->val);
+
+ for (;;) {
+ new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old; /* lost the race; retry with the value we just observed */
+ }
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ *
+ * Only the locked byte is written, with WRITE_ONCE(); this store does not
+ * itself modify the pending or tail bits. The queue head that calls this
+ * has already waited for both locked and pending to read as zero.
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+
+/*
+ * Generate the native code for queued_spin_lock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ *
+ * With pv_enabled() defined as false and every pv_*() hook bound to an empty
+ * inline function, the hook calls in the slowpath below have no effect in
+ * this (native) pass. The paravirt pass at the end of this file #undefs
+ * these names before including qspinlock_paravirt.h, which defines the
+ * real pv_*() functions.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+
+#define pv_enabled() false
+
+#define pv_init_node __pv_init_node
+#define pv_wait_node __pv_wait_node
+#define pv_kick_node __pv_kick_node
+#define pv_wait_head __pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ u32 new, old, tail;
+ int idx;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); /* cpu + 1 must fit in the tail's cpu field */
+
+ if (pv_enabled())
+ goto queue; /* paravirt pass: skip the pending-bit path and always queue */
+
+ if (virt_queued_spin_lock(lock)) /* NOTE(review): helper defined elsewhere; presumably true means it took the lock -- confirm */
+ return;
+
+ /*
+ * wait for in-progress pending->locked hand-overs
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+ cpu_relax();
+ }
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,0 -> 0,0,1 ; trylock
+ * 0,0,1 -> 0,1,1 ; pending
+ */
+ for (;;) {
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ new = _Q_LOCKED_VAL;
+ if (val == new)
+ new |= _Q_PENDING_VAL;
+
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old; /* word changed under us; re-evaluate with the fresh value */
+ }
+
+ /*
+ * we won the trylock
+ */
+ if (new == _Q_LOCKED_VAL)
+ return;
+
+ /*
+ * we're pending, wait for the owner to go away.
+ *
+ * *,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all clear_pending_set_locked()
+ * implementations imply full barriers.
+ */
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+ cpu_relax();
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * *,1,0 -> *,0,1
+ */
+ clear_pending_set_locked(lock);
+ return;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ node = this_cpu_ptr(&mcs_nodes[0]);
+ idx = node->count++; /* nesting level on this CPU; undone at release: */
+ tail = encode_tail(smp_processor_id(), idx);
+
+ node += idx; /* switch to the node for this nesting level */
+ node->locked = 0;
+ node->next = NULL;
+ pv_init_node(node);
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ prev = decode_tail(old);
+ WRITE_ONCE(prev->next, node);
+
+ pv_wait_node(node);
+ arch_mcs_spin_lock_contended(&node->locked);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ */
+ pv_wait_head(lock, node);
+ while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+ cpu_relax();
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,0,0 -> *,0,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail),
+ * clear the tail code and grab the lock. Otherwise, we only need
+ * to grab the lock.
+ */
+ for (;;) {
+ if (val != tail) {
+ set_locked(lock);
+ break;
+ }
+ old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ if (old == val)
+ goto release; /* No contention */
+
+ val = old; /* someone queued behind us meanwhile; re-check against tail */
+ }
+
+ /*
+ * contended path; wait for next, release.
+ */
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+ pv_kick_node(next);
+
+release:
+ /*
+ * release the node
+ */
+ this_cpu_dec(mcs_nodes[0].count); /* pairs with node->count++ at queue: */
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_lock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef pv_enabled
+#define pv_enabled() true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+
+#undef queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000..df19ae4de
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,334 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+#include <linux/debug_locks.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ * pv_kick(cpu) -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+
+enum vcpu_state {
+ vcpu_running = 0, /* vCPU is spinning/running; no kick is needed */
+ vcpu_halted, /* set just before pv_wait(); a waker should pv_kick() */
+};
+
+struct pv_node {
+ struct mcs_spinlock mcs; /* first member: mcs_spinlock pointers are cast to pv_node */
+ struct mcs_spinlock __res[3]; /* padding; presumably skips the other MCS nodes -- confirm against MAX_NODES */
+
+ int cpu; /* CPU to pv_kick(); recorded by pv_init_node() */
+ u8 state; /* an enum vcpu_state value */
+};
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+ struct qspinlock *lock; /* key; NULL marks the slot as free (see pv_hash()) */
+ struct pv_node *node; /* value; the node passed to pv_hash() for @lock */
+};
+
+#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ *
+ * The requested size is 4 entries per possible CPU, rounded up to a
+ * multiple of PV_HE_PER_LINE and never less than PV_HE_MIN (one page worth
+ * of entries). The resulting table pointer and its log2 size are stored in
+ * pv_lock_hash and pv_lock_hash_bits.
+ *
+ * NOTE(review): pv_hash_size is passed three times to
+ * alloc_large_system_hash(); the last two look like lower/upper size
+ * limits, pinning the table to exactly this size -- confirm against that
+ * function's prototype.
+ */
+void __init __pv_init_lock_hash(void)
+{
+ int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+ if (pv_hash_size < PV_HE_MIN)
+ pv_hash_size = PV_HE_MIN;
+
+ /*
+ * Allocate space from bootmem which should be page-size aligned
+ * and hence cacheline aligned.
+ */
+ pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+ sizeof(struct pv_hash_entry),
+ pv_hash_size, 0, HASH_EARLY,
+ &pv_lock_hash_bits, NULL,
+ pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash) /* probe every slot once, starting at the first entry of hash's cacheline and wrapping; modifies hash */ \
+ for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \
+ offset < (1 << pv_lock_hash_bits); \
+ offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) /* insert lock -> node; returns the slot's key field */
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (!cmpxchg(&he->lock, NULL, lock)) { /* atomically claim a free slot */
+ WRITE_ONCE(he->node, node);
+ return &he->lock; /* storing NULL through this pointer frees the slot again */
+ }
+ }
+ /*
+ * Hard assume there is a free entry for us.
+ *
+ * This is guaranteed by ensuring every blocked lock only ever consumes
+ * a single entry, and since we only have 4 nesting levels per CPU
+ * and allocated 4*nr_possible_cpus(), this must be so.
+ *
+ * The single entry is guaranteed by having the lock owner unhash
+ * before it releases.
+ */
+ BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock) /* remove lock's entry and return the node stored with it */
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+ struct pv_node *node;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (READ_ONCE(he->lock) == lock) {
+ node = READ_ONCE(he->node); /* read the node before giving up the slot */
+ WRITE_ONCE(he->lock, NULL); /* slot can now be claimed by pv_hash() */
+ return node;
+ }
+ }
+ /*
+ * Hard assume we'll find an entry.
+ *
+ * This guarantees a limited lookup time and is itself guaranteed by
+ * having the lock owner do the unhash -- IFF the unlock sees the
+ * SLOW flag, there MUST be a hash entry.
+ */
+ BUG();
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ *
+ * Records the current CPU (later used as the pv_kick() target) and marks
+ * the vCPU as running. The BUILD_BUG_ON checks at compile time that a
+ * pv_node is no larger than five mcs_spinlock entries.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+ pn->cpu = smp_processor_id();
+ pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ *
+ * Each round polls node->locked up to SPIN_THRESHOLD times; if it is still
+ * clear, the state is published as vcpu_halted and pv_wait() is called.
+ * The function only returns from inside the polling loop, i.e. once
+ * node->locked has been observed set.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ int loop;
+
+ for (;;) {
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (READ_ONCE(node->locked))
+ return;
+ cpu_relax();
+ }
+
+ /*
+ * Order pn->state vs pn->locked thusly:
+ *
+ * [S] pn->state = vcpu_halted [S] next->locked = 1
+ * MB MB
+ * [L] pn->locked [RmW] pn->state = vcpu_running
+ *
+ * Matches the xchg() from pv_kick_node().
+ */
+ smp_store_mb(pn->state, vcpu_halted);
+
+ if (!READ_ONCE(node->locked))
+ pv_wait(&pn->state, vcpu_halted);
+
+ /*
+ * Reset the vCPU state to avoid unnecessary CPU kicking
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * If the locked flag is still not set after wakeup, it is a
+ * spurious wakeup and the vCPU should wait again. However,
+ * there is a pretty high overhead for CPU halting and kicking.
+ * So it is better to spin for a while in the hope that the
+ * MCS lock will be released soon.
+ */
+ }
+ /*
+ * By now our node->locked should be 1 and our caller will not actually
+ * spin-wait for it. We do however rely on our caller to do a
+ * load-acquire for us.
+ */
+}
+
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ *
+ * The node's state is atomically swapped to vcpu_running, and pv_kick() is
+ * issued only if the previous state was vcpu_halted.
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ /*
+ * Note that because node->locked is already set, this actual
+ * mcs_spinlock entry could be re-used already.
+ *
+ * This should be fine however, kicking people for no reason is
+ * harmless.
+ *
+ * See the comment in pv_wait_node().
+ */
+ if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+ pv_kick(pn->cpu);
+}
+
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ *
+ * After SPIN_THRESHOLD polls of the locked byte, the lock is hashed to this
+ * node (once per call) and the locked byte is switched from _Q_LOCKED_VAL
+ * to _Q_SLOW_VAL, so that the unlocker takes its slow path, finds this
+ * node through the hash and kicks it.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
+ struct qspinlock **lp = NULL;
+ int loop;
+
+ for (;;) {
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (!READ_ONCE(l->locked))
+ return;
+ cpu_relax();
+ }
+
+ WRITE_ONCE(pn->state, vcpu_halted); /* tested by __pv_queued_spin_unlock() before it kicks */
+ if (!lp) { /* ONCE */
+ lp = pv_hash(lock, pn);
+ /*
+ * lp must be set before setting _Q_SLOW_VAL
+ *
+ * [S] lp = lock [RmW] l = l->locked = 0
+ * MB MB
+ * [S] l->locked = _Q_SLOW_VAL [L] lp
+ *
+ * Matches the cmpxchg() in __pv_queued_spin_unlock().
+ */
+ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ /*
+ * The lock is free and _Q_SLOW_VAL has never
+ * been set. Therefore we need to unhash before
+ * getting the lock.
+ */
+ WRITE_ONCE(*lp, NULL);
+ return;
+ }
+ }
+ pv_wait(&l->locked, _Q_SLOW_VAL); /* suspend while the locked byte still reads _Q_SLOW_VAL */
+
+ /*
+ * The unlocker should have freed the lock before kicking the
+ * CPU. So if the lock is still not free, it is a spurious
+ * wakeup and so the vCPU should wait again after spinning for
+ * a while.
+ */
+ }
+
+ /*
+ * Lock is unlocked now; the caller will acquire it without waiting.
+ * As with pv_wait_node() we rely on the caller to do a load-acquire
+ * for us.
+ */
+}
+
+/*
+ * PV version of the unlock function to be used instead of
+ * queued_spin_unlock().
+ *
+ * Fast path: a cmpxchg() of the locked byte from _Q_LOCKED_VAL to 0.
+ * If that fails because the byte is _Q_SLOW_VAL, a queue head has hashed
+ * itself in pv_wait_head(); unhash it, release the lock, then kick its
+ * vCPU if it is halted. Any other value is reported as corruption (unless
+ * debug_locks_silent is set) and the lock is left untouched.
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ struct pv_node *node;
+ u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); /* old value of the locked byte */
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ if (likely(lockval == _Q_LOCKED_VAL))
+ return;
+
+ if (unlikely(lockval != _Q_SLOW_VAL)) {
+ if (debug_locks_silent)
+ return;
+ WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+ return;
+ }
+
+ /*
+ * Since the above failed to release, this must be the SLOW path.
+ * Therefore start by looking up the blocked node and unhashing it.
+ */
+ node = pv_unhash(lock);
+
+ /*
+ * Now that we have a reference to the (likely) blocked pv_node,
+ * release the lock.
+ */
+ smp_store_release(&l->locked, 0);
+
+ /*
+ * At this point the memory pointed at by lock can be freed/reused,
+ * however we can still use the pv_node to kick the CPU.
+ */
+ if (READ_ONCE(node->state) == vcpu_halted)
+ pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
+
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b025295f4..5674b0734 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
*/
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
@@ -300,7 +300,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
* of task. We do not use the spin_xx_mutex() variants here as we are
* outside of the debug path.)
*/
-static void rt_mutex_adjust_prio(struct task_struct *task)
+void rt_mutex_adjust_prio(struct task_struct *task)
{
unsigned long flags;
@@ -624,7 +624,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
prerequeue_top_waiter = rt_mutex_top_waiter(lock);
- /* [7] Requeue the waiter in the lock waiter list. */
+ /* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
waiter->prio = task->prio;
rt_mutex_enqueue(lock, waiter);
@@ -662,7 +662,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* The waiter became the new top (highest priority)
* waiter on the lock. Replace the previous top waiter
- * in the owner tasks pi waiters list with this waiter
+ * in the owner tasks pi waiters tree with this waiter
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
@@ -673,7 +673,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* The waiter was the top waiter on the lock, but is
* no longer the top prority waiter. Replace waiter in
- * the owner tasks pi waiters list with the new top
+ * the owner tasks pi waiters tree with the new top
* (highest priority) waiter and adjust the priority
* of the owner.
* The new top waiter is stored in @waiter so that
@@ -747,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
- * @waiter: The waiter that is queued to the lock's wait list if the
+ * @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
*/
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
@@ -782,7 +782,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/*
* If @waiter != NULL, @task has already enqueued the waiter
- * into @lock waiter list. If @waiter == NULL then this is a
+ * into @lock waiter tree. If @waiter == NULL then this is a
* trylock attempt.
*/
if (waiter) {
@@ -795,7 +795,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/*
* We can acquire the lock. Remove the waiter from the
- * lock waiters list.
+ * lock waiters tree.
*/
rt_mutex_dequeue(lock, waiter);
@@ -827,7 +827,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* No waiters. Take the lock without the
* pi_lock dance.@task->pi_blocked_on is NULL
* and we have no waiters to enqueue in @task
- * pi waiters list.
+ * pi waiters tree.
*/
goto takeit;
}
@@ -844,7 +844,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/*
* Finish the lock acquisition. @task is the new owner. If
* other waiters exist we have to insert the highest priority
- * waiter into @task->pi_waiters list.
+ * waiter into @task->pi_waiters tree.
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
@@ -955,14 +955,13 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
}
/*
- * Wake up the next waiter on the lock.
- *
- * Remove the top waiter from the current tasks pi waiter list and
- * wake it up.
+ * Remove the top waiter from the current tasks pi waiter tree and
+ * queue it up.
*
* Called with lock->wait_lock held.
*/
-static void wakeup_next_waiter(struct rt_mutex *lock)
+static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
+ struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
unsigned long flags;
@@ -991,12 +990,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- /*
- * It's safe to dereference waiter as it cannot go away as
- * long as we hold lock->wait_lock. The waiter task needs to
- * acquire it in order to dequeue the waiter.
- */
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
/*
@@ -1182,11 +1176,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state);
/* Setup the timer, when timeout != NULL */
- if (unlikely(timeout)) {
+ if (unlikely(timeout))
hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
@@ -1253,10 +1244,11 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
}
/*
- * Slow path to release a rt-mutex:
+ * Slow path to release a rt-mutex.
+ * Return whether the current task needs to undo a potential priority boosting.
*/
-static void __sched
-rt_mutex_slowunlock(struct rt_mutex *lock)
+static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
{
raw_spin_lock(&lock->wait_lock);
@@ -1298,7 +1290,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
if (unlock_rt_mutex_safe(lock) == true)
- return;
+ return false;
/* Relock the rtmutex and try again */
raw_spin_lock(&lock->wait_lock);
}
@@ -1306,13 +1298,15 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
/*
* The wakeup next waiter path does not suffer from the above
* race. See the comments there.
+ *
+ * Queue the next waiter for wakeup once we release the wait_lock.
*/
- wakeup_next_waiter(lock);
+ mark_wakeup_next_waiter(wake_q, lock);
raw_spin_unlock(&lock->wait_lock);
- /* Undo pi boosting if necessary: */
- rt_mutex_adjust_prio(current);
+ /* check PI boosting */
+ return true;
}
/*
@@ -1363,12 +1357,23 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
- void (*slowfn)(struct rt_mutex *lock))
+ bool (*slowfn)(struct rt_mutex *lock,
+ struct wake_q_head *wqh))
{
- if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ WAKE_Q(wake_q); /* local wake queue handed to slowfn */
+
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
- else
- slowfn(lock);
+
+ } else {
+ bool deboost = slowfn(lock, &wake_q); /* true: caller must re-check PI boosting */
+
+ wake_up_q(&wake_q); /* issue the wakeups slowfn queued, now that it has returned */
+
+ /* Undo pi boosting if necessary: */
+ if (deboost)
+ rt_mutex_adjust_prio(current);
+ }
}
/**
@@ -1443,10 +1448,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*
* @lock: the rt_mutex to be locked
*
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
* Returns 1 on success and 0 on contention
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
+ if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+ return 0; /* warn and report contention instead of locking outside thread context */
+
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
}
EXPORT_SYMBOL_GPL(rt_mutex_trylock);
@@ -1463,6 +1475,23 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
+ * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+ * @lock: the rt_mutex to be unlocked
+ * @wqh: wake queue handed to the slow path, which queues the waiter to be
+ * woken on it instead of waking it directly
+ *
+ * Unlike rt_mutex_fastunlock(), this function neither runs wake_up_q() on
+ * @wqh nor calls rt_mutex_adjust_prio(); both are left to the caller.
+ *
+ * Returns: true/false indicating whether priority adjustment is
+ * required or not.
+ */
+bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh)
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ rt_mutex_deadlock_account_unlock(current);
+ return false; /* fast path: nothing queued on @wqh */
+ }
+ return rt_mutex_slowunlock(lock, wqh);
+}
+
+/**
* rt_mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 855212501..7844f8f0e 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -131,6 +131,9 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3417d0172..0f189714e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -409,11 +409,24 @@ done:
return taken;
}
+/*
+ * Return true if the rwsem has an active spinner, i.e. if the semaphore's
+ * optimistic-spin queue (sem->osq) is currently locked.
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return osq_is_locked(&sem->osq);
+}
+
#else
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
return false;
}
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return false; /* stub for the #else branch, where rwsem_optimistic_spin() is a stub as well */
+}
#endif
/*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ /*
+ * If a spinner is present, it is not necessary to do the wakeup.
+ * Try to do wakeup only if the trylock succeeds to minimize
+ * spinlock contention which may introduce too much delay in the
+ * unlock operation.
+ *
+ * spinning writer up_write/up_read caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * MB RMB
+ * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+ *
+ * Here, it is important to make sure that there won't be a missed
+ * wakeup while the rwsem is free and the only spinning writer goes
+ * to sleep without taking the rwsem. Even when the spinning writer
+ * is just going to break out of the waiting loop, it will still do
+ * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+ * rwsem_has_spinner() is true, it will guarantee at least one
+ * trylock attempt on the rwsem later on.
+ */
+ if (rwsem_has_spinner(sem)) {
+ /*
+ * The smp_rmb() here is to make sure that the spinner
+ * state is consulted before reading the wait_lock.
+ */
+ smp_rmb();
+ if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+ return sem;
+ goto locked;
+ }
raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
diff --git a/kernel/module.c b/kernel/module.c
index cfc9e843a..b86b7bf1b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,7 +18,7 @@
*/
#include <linux/export.h>
#include <linux/moduleloader.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/file.h>
@@ -101,48 +101,201 @@
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-#ifdef CONFIG_MODULE_SIG
-#ifdef CONFIG_MODULE_SIG_FORCE
-static bool sig_enforce = true;
-#else
-static bool sig_enforce = false;
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * Because modules have two address ranges: init and core, we need two
+ * latch_tree_nodes entries. Therefore we need the back-pointer from
+ * mod_tree_node.
+ *
+ * Because init ranges are short lived we mark them unlikely and have placed
+ * them outside the critical cacheline in struct module.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
+ */
-static int param_set_bool_enable_only(const char *val,
- const struct kernel_param *kp)
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
{
- int err;
- bool test;
- struct kernel_param dummy_kp = *kp;
+ struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+ struct module *mod = mtn->mod;
- dummy_kp.arg = &test;
+ if (unlikely(mtn == &mod->mtn_init))
+ return (unsigned long)mod->module_init;
- err = param_set_bool(val, &dummy_kp);
- if (err)
- return err;
+ return (unsigned long)mod->module_core;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+ struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+ struct module *mod = mtn->mod;
+
+ if (unlikely(mtn == &mod->mtn_init))
+ return (unsigned long)mod->init_size;
- /* Don't let them unset it once it's set! */
- if (!test && sig_enforce)
- return -EROFS;
+ return (unsigned long)mod->core_size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+ return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long start, end;
+
+ start = __mod_tree_val(n);
+ if (val < start)
+ return -1;
+
+ end = start + __mod_tree_size(n);
+ if (val >= end)
+ return 1;
- if (test)
- sig_enforce = true;
return 0;
}
-static const struct kernel_param_ops param_ops_bool_enable_only = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = param_set_bool_enable_only,
- .get = param_get_bool,
+static const struct latch_tree_ops mod_tree_ops = {
+ .less = mod_tree_less,
+ .comp = mod_tree_comp,
};
-#define param_check_bool_enable_only param_check_bool
+static struct mod_tree_root {
+ struct latch_tree_root root;
+ unsigned long addr_min;
+ unsigned long addr_max;
+} mod_tree __cacheline_aligned = {
+ .addr_min = -1UL,
+};
+
+#define module_addr_min mod_tree.addr_min
+#define module_addr_max mod_tree.addr_max
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node)
+{
+ latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node)
+{
+ latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+/*
+ * These modifications (insert, remove_init and remove) are serialized by the
+ * module_mutex.
+ */
+static void mod_tree_insert(struct module *mod)
+{
+ mod->mtn_core.mod = mod;
+ mod->mtn_init.mod = mod;
+
+ __mod_tree_insert(&mod->mtn_core);
+ if (mod->init_size)
+ __mod_tree_insert(&mod->mtn_init);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+ if (mod->init_size)
+ __mod_tree_remove(&mod->mtn_init);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+ __mod_tree_remove(&mod->mtn_core);
+ mod_tree_remove_init(mod);
+}
+
+static struct module *mod_find(unsigned long addr)
+{
+ struct latch_tree_node *ltn;
+
+ ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
+ if (!ltn)
+ return NULL;
+
+ return container_of(ltn, struct mod_tree_node, node)->mod;
+}
+
+#else /* MODULES_TREE_LOOKUP */
+
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
+static void mod_tree_insert(struct module *mod) { }
+static void mod_tree_remove_init(struct module *mod) { }
+static void mod_tree_remove(struct module *mod) { }
+
+static struct module *mod_find(unsigned long addr)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (within_module(addr, mod))
+ return mod;
+ }
+
+ return NULL;
+}
+
+#endif /* MODULES_TREE_LOOKUP */
+
+/*
+ * Bounds of module text, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static void __mod_update_bounds(void *base, unsigned int size)
+{
+ unsigned long min = (unsigned long)base;
+ unsigned long max = min + size;
+
+ if (min < module_addr_min)
+ module_addr_min = min;
+ if (max > module_addr_max)
+ module_addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+ __mod_update_bounds(mod->module_core, mod->core_size);
+ if (mod->init_size)
+ __mod_update_bounds(mod->module_init, mod->init_size);
+}
+
+#ifdef CONFIG_KGDB_KDB
+struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+#endif /* CONFIG_KGDB_KDB */
+
+static void module_assert_mutex(void)
+{
+ lockdep_assert_held(&module_mutex);
+}
+
+static void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return;
+
+ WARN_ON(!rcu_read_lock_sched_held() &&
+ !lockdep_is_held(&module_mutex));
+#endif
+}
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+#ifndef CONFIG_MODULE_SIG_FORCE
module_param(sig_enforce, bool_enable_only, 0644);
#endif /* !CONFIG_MODULE_SIG_FORCE */
-#endif /* CONFIG_MODULE_SIG */
/* Block module loading/unloading? */
int modules_disabled = 0;
@@ -153,10 +306,6 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-/* Bounds of module allocation, for speeding __module_address.
- * Protected by module_mutex. */
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
int register_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -318,6 +467,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
#endif
};
+ module_assert_mutex_or_preempt();
+
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
return true;
@@ -451,12 +602,17 @@ const struct kernel_symbol *find_symbol(const char *name,
}
EXPORT_SYMBOL_GPL(find_symbol);
-/* Search for module by name: must hold module_mutex. */
+/*
+ * Search for module by name: must hold module_mutex (or preempt disabled
+ * for read-only access).
+ */
static struct module *find_module_all(const char *name, size_t len,
bool even_unformed)
{
struct module *mod;
+ module_assert_mutex_or_preempt();
+
list_for_each_entry(mod, &modules, list) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
@@ -468,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len,
struct module *find_module(const char *name)
{
+ module_assert_mutex();
return find_module_all(name, strlen(name), false);
}
EXPORT_SYMBOL_GPL(find_module);
@@ -1169,11 +1326,17 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
- /* Since this should be found in kernel (which can't be removed),
- * no locking is necessary. */
+ /*
+ * Since this should be found in kernel (which can't be removed), no
+ * locking is necessary -- use preempt_disable() to placate lockdep.
+ */
+ preempt_disable();
if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
- &crc, true, false))
+ &crc, true, false)) {
+ preempt_enable();
BUG();
+ }
+ preempt_enable();
return check_version(sechdrs, versindex,
VMLINUX_SYMBOL_STR(module_layout), mod, crc,
NULL);
@@ -1661,6 +1824,10 @@ static void mod_sysfs_fini(struct module *mod)
mod_kobject_put(mod);
}
+static void init_param_lock(struct module *mod)
+{
+ mutex_init(&mod->param_lock);
+}
#else /* !CONFIG_SYSFS */
static int mod_sysfs_setup(struct module *mod,
@@ -1683,6 +1850,9 @@ static void del_usage_links(struct module *mod)
{
}
+static void init_param_lock(struct module *mod)
+{
+}
#endif /* CONFIG_SYSFS */
static void mod_sysfs_teardown(struct module *mod)
@@ -1852,10 +2022,11 @@ static void free_module(struct module *mod)
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
/* Remove this module from bug list, this uses list_del_rcu */
module_bug_cleanup(mod);
- /* Wait for RCU synchronizing before releasing mod->list and buglist. */
- synchronize_rcu();
+ /* Wait for an RCU-sched grace period before releasing mod->list and buglist. */
+ synchronize_sched();
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
@@ -2384,22 +2555,6 @@ void * __weak module_alloc(unsigned long size)
return vmalloc_exec(size);
}
-static void *module_alloc_update_bounds(unsigned long size)
-{
- void *ret = module_alloc(size);
-
- if (ret) {
- mutex_lock(&module_mutex);
- /* Update module bounds. */
- if ((unsigned long)ret < module_addr_min)
- module_addr_min = (unsigned long)ret;
- if ((unsigned long)ret + size > module_addr_max)
- module_addr_max = (unsigned long)ret + size;
- mutex_unlock(&module_mutex);
- }
- return ret;
-}
-
#ifdef CONFIG_DEBUG_KMEMLEAK
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
@@ -2805,7 +2960,7 @@ static int move_module(struct module *mod, struct load_info *info)
void *ptr;
/* Do the allocs. */
- ptr = module_alloc_update_bounds(mod->core_size);
+ ptr = module_alloc(mod->core_size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. Just mark it as not being a
@@ -2819,7 +2974,7 @@ static int move_module(struct module *mod, struct load_info *info)
mod->module_core = ptr;
if (mod->init_size) {
- ptr = module_alloc_update_bounds(mod->init_size);
+ ptr = module_alloc(mod->init_size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. This block doesn't need to be
@@ -3107,7 +3262,7 @@ static noinline int do_init_module(struct module *mod)
*
* http://thread.gmane.org/gmane.linux.kernel/1420814
*/
- if (current->flags & PF_USED_ASYNC)
+ if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
async_synchronize_full();
mutex_lock(&module_mutex);
@@ -3119,6 +3274,7 @@ static noinline int do_init_module(struct module *mod)
mod->symtab = mod->core_symtab;
mod->strtab = mod->core_strtab;
#endif
+ mod_tree_remove_init(mod);
unset_module_init_ro_nx(mod);
module_arch_freeing_init(mod);
mod->module_init = NULL;
@@ -3127,11 +3283,11 @@ static noinline int do_init_module(struct module *mod)
mod->init_text_size = 0;
/*
* We want to free module_init, but be aware that kallsyms may be
- * walking this with preempt disabled. In all the failure paths,
- * we call synchronize_rcu/synchronize_sched, but we don't want
- * to slow down the success path, so use actual RCU here.
+ * walking this with preempt disabled. In all the failure paths, we
+ * call synchronize_sched(), but we don't want to slow down the success
+ * path, so defer the free with an RCU-sched callback instead.
*/
- call_rcu(&freeinit->rcu, do_free_init);
+ call_rcu_sched(&freeinit->rcu, do_free_init);
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
@@ -3188,7 +3344,9 @@ again:
err = -EEXIST;
goto out;
}
+ mod_update_bounds(mod);
list_add_rcu(&mod->list, &modules);
+ mod_tree_insert(mod);
err = 0;
out:
@@ -3237,10 +3395,19 @@ out:
return err;
}
-static int unknown_module_param_cb(char *param, char *val, const char *modname)
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+ void *arg)
{
+ struct module *mod = arg;
+ int ret;
+
+ if (strcmp(param, "async_probe") == 0) {
+ mod->async_probe_requested = true;
+ return 0;
+ }
+
/* Check for magic 'dyndbg' arg */
- int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ ret = ddebug_dyndbg_module_param_cb(param, val, modname);
if (ret != 0)
pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
return 0;
@@ -3295,6 +3462,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto unlink_mod;
+ init_param_lock(mod);
+
/* Now we've got everything in the final locations, we can
* find optional sections. */
err = find_module_sections(mod, info);
@@ -3342,7 +3511,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, unknown_module_param_cb);
+ -32768, 32767, mod, /* arg for the unknown-param callback */
+ unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
goto bug_cleanup;
@@ -3391,9 +3561,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
wake_up_all(&module_wq);
- /* Wait for RCU synchronizing before releasing mod->list. */
- synchronize_rcu();
+ /* Wait for an RCU-sched grace period before releasing mod->list. */
+ synchronize_sched();
mutex_unlock(&module_mutex);
free_module:
/* Free lock-classes; relies on the preceding sync_rcu() */
@@ -3517,19 +3688,15 @@ const char *module_address_lookup(unsigned long addr,
char **modname,
char *namebuf)
{
- struct module *mod;
const char *ret = NULL;
+ struct module *mod;
preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- if (modname)
- *modname = mod->name;
- ret = get_ksymbol(mod, addr, size, offset);
- break;
- }
+ mod = __module_address(addr);
+ if (mod) {
+ if (modname)
+ *modname = mod->name;
+ ret = get_ksymbol(mod, addr, size, offset);
}
/* Make a copy in here where it's safe */
if (ret) {
@@ -3537,6 +3704,7 @@ const char *module_address_lookup(unsigned long addr,
ret = namebuf;
}
preempt_enable();
+
return ret;
}
@@ -3660,6 +3828,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
unsigned int i;
int ret;
+ module_assert_mutex();
+
list_for_each_entry(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
@@ -3834,13 +4004,15 @@ struct module *__module_address(unsigned long addr)
if (addr < module_addr_min || addr > module_addr_max)
return NULL;
- list_for_each_entry_rcu(mod, &modules, list) {
+ module_assert_mutex_or_preempt();
+
+ mod = mod_find(addr);
+ if (mod) {
+ BUG_ON(!within_module(addr, mod));
if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod))
- return mod;
+ mod = NULL;
}
- return NULL;
+ return mod;
}
EXPORT_SYMBOL_GPL(__module_address);
diff --git a/kernel/panic.c b/kernel/panic.c
index 8136ad76e..04e91ff75 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,7 +32,7 @@ static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
-static bool crash_kexec_post_notifiers;
+bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
@@ -142,7 +142,8 @@ void panic(const char *fmt, ...)
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
*/
- crash_kexec(NULL);
+ if (crash_kexec_post_notifiers)
+ crash_kexec(NULL);
bust_spinlocks(0);
diff --git a/kernel/params.c b/kernel/params.c
index a22d6a759..b6554aa71 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,15 +25,34 @@
#include <linux/slab.h>
#include <linux/ctype.h>
-/* Protects all parameters, and incidentally kmalloced_param list. */
+#ifdef CONFIG_SYSFS
+/* Protects all built-in parameters, modules use their own param_lock */
static DEFINE_MUTEX(param_lock);
+/* Use the module's mutex, or if built-in use the built-in mutex */
+#ifdef CONFIG_MODULES
+#define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : &param_lock)
+#else
+#define KPARAM_MUTEX(mod) (&param_lock)
+#endif
+
+static inline void check_kparam_locked(struct module *mod)
+{
+ BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod)));
+}
+#else
+static inline void check_kparam_locked(struct module *mod)
+{
+}
+#endif /* !CONFIG_SYSFS */
+
/* This just allows us to keep track of which parameters are kmalloced. */
struct kmalloced_param {
struct list_head list;
char val[];
};
static LIST_HEAD(kmalloced_params);
+static DEFINE_SPINLOCK(kmalloced_params_lock);
static void *kmalloc_parameter(unsigned int size)
{
@@ -43,7 +62,10 @@ static void *kmalloc_parameter(unsigned int size)
if (!p)
return NULL;
+ spin_lock(&kmalloced_params_lock);
list_add(&p->list, &kmalloced_params);
+ spin_unlock(&kmalloced_params_lock);
+
return p->val;
}
@@ -52,6 +74,7 @@ static void maybe_kfree_parameter(void *param)
{
struct kmalloced_param *p;
+ spin_lock(&kmalloced_params_lock);
list_for_each_entry(p, &kmalloced_params, list) {
if (p->val == param) {
list_del(&p->list);
@@ -59,6 +82,7 @@ static void maybe_kfree_parameter(void *param)
break;
}
}
+ spin_unlock(&kmalloced_params_lock);
}
static char dash2underscore(char c)
@@ -100,8 +124,9 @@ static int parse_one(char *param,
unsigned num_params,
s16 min_level,
s16 max_level,
+ void *arg,
int (*handle_unknown)(char *param, char *val,
- const char *doing))
+ const char *doing, void *arg))
{
unsigned int i;
int err;
@@ -118,17 +143,17 @@ static int parse_one(char *param,
return -EINVAL;
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
- mutex_lock(&param_lock);
+ kernel_param_lock(params[i].mod);
param_check_unsafe(&params[i]);
err = params[i].ops->set(val, &params[i]);
- mutex_unlock(&param_lock);
+ kernel_param_unlock(params[i].mod);
return err;
}
}
if (handle_unknown) {
pr_debug("doing %s: %s='%s'\n", doing, param, val);
- return handle_unknown(param, val, doing);
+ return handle_unknown(param, val, doing, arg);
}
pr_debug("Unknown argument '%s'\n", param);
@@ -194,7 +219,9 @@ char *parse_args(const char *doing,
unsigned num,
s16 min_level,
s16 max_level,
- int (*unknown)(char *param, char *val, const char *doing))
+ void *arg,
+ int (*unknown)(char *param, char *val,
+ const char *doing, void *arg))
{
char *param, *val;
@@ -214,7 +241,7 @@ char *parse_args(const char *doing,
return args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
- min_level, max_level, unknown);
+ min_level, max_level, arg, unknown);
if (irq_was_disabled && !irqs_disabled())
pr_warn("%s: option '%s' enabled irq's!\n",
doing, param);
@@ -251,7 +278,7 @@ char *parse_args(const char *doing,
return scnprintf(buffer, PAGE_SIZE, format, \
*((type *)kp->arg)); \
} \
- struct kernel_param_ops param_ops_##name = { \
+ const struct kernel_param_ops param_ops_##name = { \
.set = param_set_##name, \
.get = param_get_##name, \
}; \
@@ -303,7 +330,7 @@ static void param_free_charp(void *arg)
maybe_kfree_parameter(*((char **)arg));
}
-struct kernel_param_ops param_ops_charp = {
+const struct kernel_param_ops param_ops_charp = {
.set = param_set_charp,
.get = param_get_charp,
.free = param_free_charp,
@@ -328,13 +355,44 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_bool);
-struct kernel_param_ops param_ops_bool = {
+const struct kernel_param_ops param_ops_bool = {
.flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
EXPORT_SYMBOL(param_ops_bool);
+int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
+{
+ int err = 0;
+ bool new_value;
+ bool orig_value = *(bool *)kp->arg;
+ struct kernel_param dummy_kp = *kp;
+
+ dummy_kp.arg = &new_value;
+
+ err = param_set_bool(val, &dummy_kp);
+ if (err)
+ return err;
+
+ /* Don't let them unset it once it's set! */
+ if (!new_value && orig_value)
+ return -EROFS;
+
+ if (new_value)
+ err = param_set_bool(val, kp);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(param_set_bool_enable_only);
+
+const struct kernel_param_ops param_ops_bool_enable_only = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = param_set_bool_enable_only,
+ .get = param_get_bool,
+};
+EXPORT_SYMBOL_GPL(param_ops_bool_enable_only);
+
/* This one must be bool. */
int param_set_invbool(const char *val, const struct kernel_param *kp)
{
@@ -356,7 +414,7 @@ int param_get_invbool(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_invbool);
-struct kernel_param_ops param_ops_invbool = {
+const struct kernel_param_ops param_ops_invbool = {
.set = param_set_invbool,
.get = param_get_invbool,
};
@@ -364,12 +422,11 @@ EXPORT_SYMBOL(param_ops_invbool);
int param_set_bint(const char *val, const struct kernel_param *kp)
{
- struct kernel_param boolkp;
+ /* Match bool exactly, by re-using it. */
+ struct kernel_param boolkp = *kp;
bool v;
int ret;
- /* Match bool exactly, by re-using it. */
- boolkp = *kp;
boolkp.arg = &v;
ret = param_set_bool(val, &boolkp);
@@ -379,7 +436,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_set_bint);
-struct kernel_param_ops param_ops_bint = {
+const struct kernel_param_ops param_ops_bint = {
.flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
@@ -387,7 +444,8 @@ struct kernel_param_ops param_ops_bint = {
EXPORT_SYMBOL(param_ops_bint);
/* We break the rule and mangle the string. */
-static int param_array(const char *name,
+static int param_array(struct module *mod,
+ const char *name,
const char *val,
unsigned int min, unsigned int max,
void *elem, int elemsize,
@@ -418,7 +476,7 @@ static int param_array(const char *name,
/* nul-terminate and parse */
save = val[len];
((char *)val)[len] = '\0';
- BUG_ON(!mutex_is_locked(&param_lock));
+ check_kparam_locked(mod);
ret = set(val, &kp);
if (ret != 0)
@@ -440,7 +498,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
const struct kparam_array *arr = kp->arr;
unsigned int temp_num;
- return param_array(kp->name, val, 1, arr->max, arr->elem,
+ return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem,
arr->elemsize, arr->ops->set, kp->level,
arr->num ?: &temp_num);
}
@@ -449,14 +507,13 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
{
int i, off, ret;
const struct kparam_array *arr = kp->arr;
- struct kernel_param p;
+ struct kernel_param p = *kp;
- p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
if (i)
buffer[off++] = ',';
p.arg = arr->elem + arr->elemsize * i;
- BUG_ON(!mutex_is_locked(&param_lock));
+ check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
if (ret < 0)
return ret;
@@ -476,7 +533,7 @@ static void param_array_free(void *arg)
arr->ops->free(arr->elem + arr->elemsize * i);
}
-struct kernel_param_ops param_array_ops = {
+const struct kernel_param_ops param_array_ops = {
.set = param_array_set,
.get = param_array_get,
.free = param_array_free,
@@ -504,7 +561,7 @@ int param_get_string(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_string);
-struct kernel_param_ops param_ops_string = {
+const struct kernel_param_ops param_ops_string = {
.set = param_set_copystring,
.get = param_get_string,
};
@@ -539,9 +596,9 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
if (!attribute->param->ops->get)
return -EPERM;
- mutex_lock(&param_lock);
+ kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
- mutex_unlock(&param_lock);
+ kernel_param_unlock(mk->mod);
if (count > 0) {
strcat(buf, "\n");
++count;
@@ -551,7 +608,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
static ssize_t param_attr_store(struct module_attribute *mattr,
- struct module_kobject *km,
+ struct module_kobject *mk,
const char *buf, size_t len)
{
int err;
@@ -560,10 +617,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
if (!attribute->param->ops->set)
return -EPERM;
- mutex_lock(&param_lock);
+ kernel_param_lock(mk->mod);
param_check_unsafe(attribute->param);
err = attribute->param->ops->set(buf, attribute->param);
- mutex_unlock(&param_lock);
+ kernel_param_unlock(mk->mod);
if (!err)
return len;
return err;
@@ -577,17 +634,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
#endif
#ifdef CONFIG_SYSFS
-void __kernel_param_lock(void)
+void kernel_param_lock(struct module *mod)
{
- mutex_lock(&param_lock);
+ mutex_lock(KPARAM_MUTEX(mod));
}
-EXPORT_SYMBOL(__kernel_param_lock);
-void __kernel_param_unlock(void)
+void kernel_param_unlock(struct module *mod)
{
- mutex_unlock(&param_lock);
+ mutex_unlock(KPARAM_MUTEX(mod));
}
-EXPORT_SYMBOL(__kernel_param_unlock);
+
+EXPORT_SYMBOL(kernel_param_lock);
+EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
@@ -853,6 +911,7 @@ static void __init version_sysfs_builtin(void)
mk = locate_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+ WARN_ON_ONCE(err);
kobject_uevent(&mk->kobj, KOBJ_ADD);
kobject_put(&mk->kobj);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 89a46f3ff..9e302315e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -91,284 +91,6 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
-menuconfig TOI_CORE
- bool "Enhanced Hibernation (TuxOnIce)"
- depends on HIBERNATION
- default y
- ---help---
- TuxOnIce is the 'new and improved' suspend support.
-
- See the TuxOnIce home page (tuxonice.net)
- for FAQs, HOWTOs and other documentation.
-
- comment "Image Storage (you need at least one allocator)"
- depends on TOI_CORE
-
- config TOI_FILE
- bool "File Allocator"
- depends on TOI_CORE
- default y
- ---help---
- This option enables support for storing an image in a
- simple file. You might want this if your swap is
- sometimes full enough that you don't have enough spare
- space to store an image.
-
- config TOI_SWAP
- bool "Swap Allocator"
- depends on TOI_CORE && SWAP
- default y
- ---help---
- This option enables support for storing an image in your
- swap space.
-
- comment "General Options"
- depends on TOI_CORE
-
- config TOI_PRUNE
- bool "Image pruning support"
- depends on TOI_CORE && CRYPTO && BROKEN
- default y
- ---help---
- This option adds support for using cryptoapi hashing
- algorithms to identify pages with the same content. We
- then write a much smaller pointer to the first copy of
- the data instead of a complete (perhaps compressed)
- additional copy.
-
- You probably want this, so say Y here.
-
- comment "No image pruning support available without Cryptoapi support."
- depends on TOI_CORE && !CRYPTO
-
- config TOI_CRYPTO
- bool "Compression support"
- depends on TOI_CORE && CRYPTO
- default y
- ---help---
- This option adds support for using cryptoapi compression
- algorithms. Compression is particularly useful as it can
- more than double your suspend and resume speed (depending
- upon how well your image compresses).
-
- You probably want this, so say Y here.
-
- comment "No compression support available without Cryptoapi support."
- depends on TOI_CORE && !CRYPTO
-
- config TOI_USERUI
- bool "Userspace User Interface support"
- depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
- default y
- ---help---
- This option enabled support for a userspace based user interface
- to TuxOnIce, which allows you to have a nice display while suspending
- and resuming, and also enables features such as pressing escape to
- cancel a cycle or interactive debugging.
-
- config TOI_USERUI_DEFAULT_PATH
- string "Default userui program location"
- default "/usr/local/sbin/tuxoniceui_text"
- depends on TOI_USERUI
- ---help---
- This entry allows you to specify a default path to the userui binary.
-
- config TOI_DEFAULT_IMAGE_SIZE_LIMIT
- int "Default image size limit"
- range -2 65536
- default "-2"
- depends on TOI_CORE
- ---help---
- This entry allows you to specify a default image size limit. It can
- be overridden at run-time using /sys/power/tuxonice/image_size_limit.
-
- config TOI_KEEP_IMAGE
- bool "Allow Keep Image Mode"
- depends on TOI_CORE
- ---help---
- This option allows you to keep and image and reuse it. It is intended
- __ONLY__ for use with systems where all filesystems are mounted read-
- only (kiosks, for example). To use it, compile this option in and boot
- normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
- When you resume, the image will not be removed. You will be unable to turn
- off swap partitions (assuming you are using the swap allocator), but future
- suspends simply do a power-down. The image can be updated using the
- kernel command line parameter suspend_act= to turn off the keep image
- bit. Keep image mode is a little less user friendly on purpose - it
- should not be used without thought!
-
- config TOI_INCREMENTAL
- bool "Incremental Image Support"
- depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE
- default n
- ---help---
- This option enables the work in progress toward using the dirty page
- tracking to record changes to pages. It is hoped that
- this will be an initial step toward implementing storing just
- the differences between consecutive images, which will
- increase the amount of storage needed for the image, but also
- increase the speed at which writing an image occurs and
- reduce the wear and tear on drives.
-
- At the moment, all that is implemented is the first step of keeping
- an existing image and then comparing it to the contents in memory
- (by setting /sys/power/tuxonice/verify_image to 1 and triggering a
- (fake) resume) to see what the page change tracking should find to be
- different. If you have verify_image set to 1, TuxOnIce will automatically
- invalidate the old image when you next try to hibernate, so there's no
- greater chance of disk corruption than normal.
-
- comment "No incremental image support available without Keep Image support."
- depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT
-
- config TOI_REPLACE_SWSUSP
- bool "Replace swsusp by default"
- default y
- depends on TOI_CORE
- ---help---
- TuxOnIce can replace swsusp. This option makes that the default state,
- requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
- to use the vanilla kernel functionality. Note that your initrd/ramfs will
- need to do this before trying to resume, too.
- With overriding swsusp enabled, echoing disk to /sys/power/state will
- start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
- the swap and file allocators are compiled in, the swap allocator will be
- used by default.
-
- config TOI_IGNORE_LATE_INITCALL
- bool "Wait for initrd/ramfs to run, by default"
- default n
- depends on TOI_CORE
- ---help---
- When booting, TuxOnIce can check for an image and start to resume prior
- to any initrd/ramfs running (via a late initcall).
-
- If you don't have an initrd/ramfs, this is what you want to happen -
- otherwise you won't be able to safely resume. You should set this option
- to 'No'.
-
- If, however, you want your initrd/ramfs to run anyway before resuming,
- you need to tell TuxOnIce to ignore that earlier opportunity to resume.
- This can be done either by using this compile time option, or by
- overriding this option with the boot-time parameter toi_initramfs_resume_only=1.
-
- Note that if TuxOnIce can't resume at the earlier opportunity, the
- value of this option won't matter - the initramfs/initrd (if any) will
- run anyway.
-
- menuconfig TOI_CLUSTER
- bool "Cluster support"
- default n
- depends on TOI_CORE && NET && BROKEN
- ---help---
- Support for linking multiple machines in a cluster so that they suspend
- and resume together.
-
- config TOI_DEFAULT_CLUSTER_INTERFACE
- string "Default cluster interface"
- depends on TOI_CLUSTER
- ---help---
- The default interface on which to communicate with other nodes in
- the cluster.
-
- If no value is set here, cluster support will be disabled by default.
-
- config TOI_DEFAULT_CLUSTER_KEY
- string "Default cluster key"
- default "Default"
- depends on TOI_CLUSTER
- ---help---
- The default key used by this node. All nodes in the same cluster
- have the same key. Multiple clusters may coexist on the same lan
- by using different values for this key.
-
- config TOI_CLUSTER_IMAGE_TIMEOUT
- int "Timeout when checking for image"
- default 15
- depends on TOI_CLUSTER
- ---help---
- Timeout (seconds) before continuing to boot when waiting to see
- whether other nodes might have an image. Set to -1 to wait
- indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue
- booting sooner than this timeout.
-
- config TOI_CLUSTER_WAIT_UNTIL_NODES
- int "Nodes without image before continuing"
- default 0
- depends on TOI_CLUSTER
- ---help---
- When booting and no image is found, we wait to see if other nodes
- have an image before continuing to boot. This value lets us
- continue after seeing a certain number of nodes without an image,
- instead of continuing to wait for the timeout. Set to 0 to only
- use the timeout.
-
- config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
- string "Default pre-hibernate script"
- depends on TOI_CLUSTER
- ---help---
- The default script to be called when starting to hibernate.
-
- config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
- string "Default post-hibernate script"
- depends on TOI_CLUSTER
- ---help---
- The default script to be called after resuming from hibernation.
-
- config TOI_DEFAULT_WAIT
- int "Default waiting time for emergency boot messages"
- default "25"
- range -1 32768
- depends on TOI_CORE
- help
- TuxOnIce can display warnings very early in the process of resuming,
- if (for example) it appears that you have booted a kernel that doesn't
- match an image on disk. It can then give you the opportunity to either
- continue booting that kernel, or reboot the machine. This option can be
- used to control how long to wait in such circumstances. -1 means wait
- forever. 0 means don't wait at all (do the default action, which will
- generally be to continue booting and remove the image). Values of 1 or
- more indicate a number of seconds (up to 255) to wait before doing the
- default.
-
- config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE
- int "Default extra pages allowance"
- default "2000"
- range 500 32768
- depends on TOI_CORE
- help
- This value controls the default for the allowance TuxOnIce makes for
- drivers to allocate extra memory during the atomic copy. The default
- value of 2000 will be okay in most cases. If you are using
- DRI, the easiest way to find what value to use is to try to hibernate
- and look at how many pages were actually needed in the sysfs entry
- /sys/power/tuxonice/debug_info (first number on the last line), adding
- a little extra because the value is not always the same.
-
- config TOI_CHECKSUM
- bool "Checksum pageset2"
- default n
- depends on TOI_CORE
- select CRYPTO
- select CRYPTO_ALGAPI
- select CRYPTO_MD4
- ---help---
- Adds support for checksumming pageset2 pages, to ensure you really get an
- atomic copy. Since some filesystems (XFS especially) change metadata even
- when there's no other activity, we need this to check for pages that have
- been changed while we were saving the page cache. If your debugging output
- always says no pages were resaved, you may be able to safely disable this
- option.
-
-config TOI
- bool
- depends on TOI_CORE!=n
- default y
-
-config TOI_ZRAM_SUPPORT
- def_bool y
- depends on TOI && ZRAM!=n
-
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index b8d7b68f7..cb880a14c 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,46 +1,13 @@
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
-tuxonice_core-y := tuxonice_modules.o
-
-obj-$(CONFIG_TOI) += tuxonice_builtin.o
-obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \
- tuxonice_copy_before_write.o
-
-tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o
-
-# Compile these in after allocation debugging, if used.
-
-tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \
- tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
- tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
- tuxonice_power_off.o tuxonice_atomic_copy.o
-
-tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o
-
-tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o
-
-obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
-obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o
-obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
-
-tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \
- tuxonice_bio_signature.o
-
-obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o
-obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o
-obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o
-
-obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o
-
obj-y += qos.o
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
-obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
- block_io.o
+obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o
obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
deleted file mode 100644
index 9a58bc258..000000000
--- a/kernel/power/block_io.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * This file provides functions for block I/O operations on swap/file.
- *
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
- * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/bio.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include "power.h"
-
-/**
- * submit - submit BIO request.
- * @rw: READ or WRITE.
- * @off physical offset of page.
- * @page: page we're reading or writing.
- * @bio_chain: list of pending biod (for async reading)
- *
- * Straight from the textbook - allocate and initialize the bio.
- * If we're reading, make sure the page is marked as dirty.
- * Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, struct block_device *bdev, sector_t sector,
- struct page *page, struct bio **bio_chain)
-{
- const int bio_rw = rw | REQ_SYNC;
- struct bio *bio;
-
- bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
- bio->bi_iter.bi_sector = sector;
- bio->bi_bdev = bdev;
- bio->bi_end_io = end_swap_bio_read;
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
- (unsigned long long)sector);
- bio_put(bio);
- return -EFAULT;
- }
-
- lock_page(page);
- bio_get(bio);
-
- if (bio_chain == NULL) {
- submit_bio(bio_rw, bio);
- wait_on_page_locked(page);
- if (rw == READ)
- bio_set_pages_dirty(bio);
- bio_put(bio);
- } else {
- if (rw == READ)
- get_page(page); /* These pages are freed later */
- bio->bi_private = *bio_chain;
- *bio_chain = bio;
- submit_bio(bio_rw, bio);
- }
- return 0;
-}
-
-int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
- return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
- virt_to_page(addr), bio_chain);
-}
-
-int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
- return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
- virt_to_page(addr), bio_chain);
-}
-
-int hib_wait_on_bio_chain(struct bio **bio_chain)
-{
- struct bio *bio;
- struct bio *next_bio;
- int ret = 0;
-
- if (bio_chain == NULL)
- return 0;
-
- bio = *bio_chain;
- if (bio == NULL)
- return 0;
- while (bio) {
- struct page *page;
-
- next_bio = bio->bi_private;
- page = bio->bi_io_vec[0].bv_page;
- wait_on_page_locked(page);
- if (!PageUptodate(page) || PageError(page))
- ret = -EIO;
- put_page(page);
- bio_put(bio);
- bio = next_bio;
- }
- *bio_chain = NULL;
- return ret;
-}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b7d3bc724..690f78f21 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -31,7 +31,7 @@
#include <linux/ktime.h>
#include <trace/events/power.h>
-#include "tuxonice.h"
+#include "power.h"
static int nocompress;
@@ -39,7 +39,7 @@ static int noresume;
static int nohibernate;
static int resume_wait;
static unsigned int resume_delay;
-char resume_file[256] = CONFIG_PM_STD_PARTITION;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
__visible int in_suspend __nosavedata;
@@ -123,7 +123,7 @@ static int hibernation_test(int level) { return 0; }
* platform_begin - Call platform to start hibernation.
* @platform_mode: Whether or not to use the platform driver.
*/
-int platform_begin(int platform_mode)
+static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->begin() : 0;
@@ -133,7 +133,7 @@ int platform_begin(int platform_mode)
* platform_end - Call platform to finish transition to the working state.
* @platform_mode: Whether or not to use the platform driver.
*/
-void platform_end(int platform_mode)
+static void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->end();
@@ -147,7 +147,7 @@ void platform_end(int platform_mode)
* if so configured, and return an error code if that fails.
*/
-int platform_pre_snapshot(int platform_mode)
+static int platform_pre_snapshot(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_snapshot() : 0;
@@ -162,7 +162,7 @@ int platform_pre_snapshot(int platform_mode)
*
* This routine is called on one CPU with interrupts disabled.
*/
-void platform_leave(int platform_mode)
+static void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->leave();
@@ -177,7 +177,7 @@ void platform_leave(int platform_mode)
*
* This routine must be called after platform_prepare().
*/
-void platform_finish(int platform_mode)
+static void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->finish();
@@ -193,7 +193,7 @@ void platform_finish(int platform_mode)
* If the restore fails after this function has been called,
* platform_restore_cleanup() must be called.
*/
-int platform_pre_restore(int platform_mode)
+static int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_restore() : 0;
@@ -210,7 +210,7 @@ int platform_pre_restore(int platform_mode)
* function must be called too, regardless of the result of
* platform_pre_restore().
*/
-void platform_restore_cleanup(int platform_mode)
+static void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->restore_cleanup();
@@ -220,7 +220,7 @@ void platform_restore_cleanup(int platform_mode)
* platform_recover - Recover from a failure to suspend devices.
* @platform_mode: Whether or not to use the platform driver.
*/
-void platform_recover(int platform_mode)
+static void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
hibernation_ops->recover();
@@ -552,7 +552,7 @@ int hibernation_platform_enter(void)
error = disable_nonboot_cpus();
if (error)
- goto Platform_finish;
+ goto Enable_cpus;
local_irq_disable();
syscore_suspend();
@@ -568,6 +568,8 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
local_irq_enable();
+
+ Enable_cpus:
enable_nonboot_cpus();
Platform_finish:
@@ -646,9 +648,6 @@ int hibernate(void)
{
int error;
- if (test_action_state(TOI_REPLACE_SWSUSP))
- return try_tuxonice_hibernate();
-
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
return -EPERM;
@@ -738,19 +737,11 @@ int hibernate(void)
* attempts to recover gracefully and make the kernel return to the normal mode
* of operation.
*/
-int software_resume(void)
+static int software_resume(void)
{
int error;
unsigned int flags;
- resume_attempted = 1;
-
- /*
- * We can't know (until an image header - if any - is loaded), whether
- * we did override swsusp. We therefore ensure that both are tried.
- */
- try_tuxonice_resume();
-
/*
* If the user said "noresume".. bail out early.
*/
@@ -1137,7 +1128,6 @@ static int __init hibernate_setup(char *str)
static int __init noresume_setup(char *str)
{
noresume = 1;
- set_toi_state(TOI_NORESUME_SPECIFIED);
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 86e8157a4..63d395b5d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,7 +272,7 @@ static inline void pm_print_times_init(void)
{
pm_print_times_enabled = !!initcall_debug;
}
-#else /* !CONFIG_PP_SLEEP_DEBUG */
+#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 095ed9f03..caadb566e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -36,12 +36,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
}
-#else
-extern char *check_image_kernel(struct swsusp_info *info);
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-extern int init_header(struct swsusp_info *info);
-extern char resume_file[256];
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -81,8 +77,6 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
-extern struct pbe *restore_pblist;
-
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
@@ -169,15 +163,6 @@ extern void swsusp_close(fmode_t);
extern int swsusp_unmark(void);
#endif
-/* kernel/power/block_io.c */
-extern struct block_device *hib_resume_bdev;
-
-extern int hib_bio_read_page(pgoff_t page_off, void *addr,
- struct bio **bio_chain);
-extern int hib_bio_write_page(pgoff_t page_off, void *addr,
- struct bio **bio_chain);
-extern int hib_wait_on_bio_chain(struct bio **bio_chain);
-
struct timeval;
/* kernel/power/swsusp.c */
extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
@@ -275,31 +260,6 @@ static inline void suspend_thaw_processes(void)
}
#endif
-extern struct page *saveable_page(struct zone *z, unsigned long p);
-#ifdef CONFIG_HIGHMEM
-struct page *saveable_highmem_page(struct zone *z, unsigned long p);
-#else
-static
-inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
- return NULL;
-}
-#endif
-
-#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
-extern struct list_head nosave_regions;
-
-/**
- * This structure represents a range of page frames the contents of which
- * should not be saved during the suspend.
- */
-
-struct nosave_region {
- struct list_head list;
- unsigned long start_pfn;
- unsigned long end_pfn;
-};
-
#ifdef CONFIG_PM_AUTOSLEEP
/* kernel/power/autosleep.c */
@@ -326,10 +286,3 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
-
-#ifdef CONFIG_TOI
-unsigned long toi_get_nonconflicting_page(void);
-#define BM_END_OF_MAP (~0UL)
-#else
-#define toi_get_nonconflicting_page() (0)
-#endif
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ba9d20ebc..5235dd4e1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,9 +36,6 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_alloc.h"
#include "power.h"
static int swsusp_page_is_free(struct page *);
@@ -101,9 +98,6 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
- if (toi_running)
- return (void *) toi_get_nonconflicting_page();
-
res = (void *)get_zeroed_page(gfp_mask);
if (safe_needed)
while (res && swsusp_page_is_free(virt_to_page(res))) {
@@ -149,11 +143,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
page = virt_to_page(addr);
- if (toi_running) {
- toi__free_page(29, page);
- return;
- }
-
swsusp_unset_page_forbidden(page);
if (clear_nosave_free)
swsusp_unset_page_free(page);
@@ -313,15 +302,13 @@ struct bm_position {
int node_bit;
};
-#define BM_POSITION_SLOTS (NR_CPUS * 2)
-
struct memory_bitmap {
struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
* bitmap objects and bitmap block
* objects
*/
- struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */
+ struct bm_position cur; /* most recently used bit position */
};
/* Functions that operate on memory bitmaps */
@@ -486,39 +473,16 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
free_image_page(node->data, clear_nosave_free);
}
-void memory_bm_position_reset(struct memory_bitmap *bm)
+static void memory_bm_position_reset(struct memory_bitmap *bm)
{
- int index;
-
- for (index = 0; index < BM_POSITION_SLOTS; index++) {
- bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
list);
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
- }
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
}
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index);
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
-
-/**
- * memory_bm_clear
- * @param bm - The bitmap to clear
- *
- * Only run while single threaded - locking not needed
- */
-void memory_bm_clear(struct memory_bitmap *bm)
-{
- memory_bm_position_reset(bm);
-
- while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) {
- memory_bm_clear_current(bm, 0);
- }
-
- memory_bm_position_reset(bm);
-}
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
struct mem_extent {
@@ -631,8 +595,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
bm->p_list = ca.chain;
-
- memory_bm_position_reset(bm);
+ memory_bm_position_reset(bm);
Exit:
free_mem_extents(&mem_extents);
return error;
@@ -668,24 +631,14 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
* It walks the radix tree to find the page which contains the bit for
* pfn and returns the bit position in **addr and *bit_nr.
*/
-int memory_bm_find_bit(struct memory_bitmap *bm, int index,
- unsigned long pfn, void **addr, unsigned int *bit_nr)
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+ void **addr, unsigned int *bit_nr)
{
struct mem_zone_bm_rtree *curr, *zone;
struct rtree_node *node;
int i, block_nr;
- if (!bm->cur[index].zone) {
- // Reset
- bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
- list);
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
- struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
- }
-
- zone = bm->cur[index].zone;
+ zone = bm->cur.zone;
if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
goto zone_found;
@@ -709,8 +662,8 @@ zone_found:
* node for our pfn.
*/
- node = bm->cur[index].node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn)
+ node = bm->cur.node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
@@ -727,9 +680,9 @@ zone_found:
node_found:
/* Update last position */
- bm->cur[index].zone = zone;
- bm->cur[index].node = node;
- bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+ bm->cur.zone = zone;
+ bm->cur.node = node;
+ bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
/* Set return values */
*addr = node->data;
@@ -738,66 +691,66 @@ node_found:
return 0;
}
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
set_bit(bit, addr);
}
-int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
if (!error)
set_bit(bit, addr);
return error;
}
-void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
clear_bit(bit, addr);
}
-static void memory_bm_clear_current(struct memory_bitmap *bm, int index)
+static void memory_bm_clear_current(struct memory_bitmap *bm)
{
int bit;
- bit = max(bm->cur[index].node_bit - 1, 0);
- clear_bit(bit, bm->cur[index].node->data);
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
}
-int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
int error;
- error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
BUG_ON(error);
return test_bit(bit, addr);
}
-static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn)
+static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
- return !memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
/*
@@ -810,25 +763,25 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned
*
* Returns true if there is a next node, false otherwise.
*/
-static bool rtree_next_node(struct memory_bitmap *bm, int index)
+static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur[index].node = list_entry(bm->cur[index].node->list.next,
+ bm->cur.node = list_entry(bm->cur.node->list.next,
struct rtree_node, list);
- if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) {
- bm->cur[index].node_pfn += BM_BITS_PER_BLOCK;
- bm->cur[index].node_bit = 0;
+ if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur.node_bit = 0;
touch_softlockup_watchdog();
return true;
}
/* No more nodes, goto next zone */
- bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next,
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur[index].zone->list != &bm->zones) {
- bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ if (&bm->cur.zone->list != &bm->zones) {
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
- bm->cur[index].node_pfn = 0;
- bm->cur[index].node_bit = 0;
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
return true;
}
@@ -846,29 +799,38 @@ static bool rtree_next_node(struct memory_bitmap *bm, int index)
* It is required to run memory_bm_position_reset() before the
* first call to this function.
*/
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index)
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
unsigned long bits, pfn, pages;
int bit;
- index += NR_CPUS; /* Iteration state is separated from get/set/test */
-
do {
- pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn;
- bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK);
- bit = find_next_bit(bm->cur[index].node->data, bits,
- bm->cur[index].node_bit);
+ pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+ bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur.node->data, bits,
+ bm->cur.node_bit);
if (bit < bits) {
- pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit;
- bm->cur[index].node_bit = bit + 1;
+ pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+ bm->cur.node_bit = bit + 1;
return pfn;
}
- } while (rtree_next_node(bm, index));
+ } while (rtree_next_node(bm));
return BM_END_OF_MAP;
}
-LIST_HEAD(nosave_regions);
+/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static LIST_HEAD(nosave_regions);
/**
* register_nosave_region - register a range of page frames the contents
@@ -927,37 +889,37 @@ static struct memory_bitmap *free_pages_map;
void swsusp_set_page_free(struct page *page)
{
if (free_pages_map)
- memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page));
+ memory_bm_set_bit(free_pages_map, page_to_pfn(page));
}
static int swsusp_page_is_free(struct page *page)
{
return free_pages_map ?
- memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0;
+ memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
}
void swsusp_unset_page_free(struct page *page)
{
if (free_pages_map)
- memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page));
+ memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
}
static void swsusp_set_page_forbidden(struct page *page)
{
if (forbidden_pages_map)
- memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page));
+ memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
}
int swsusp_page_is_forbidden(struct page *page)
{
return forbidden_pages_map ?
- memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0;
+ memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
}
static void swsusp_unset_page_forbidden(struct page *page)
{
if (forbidden_pages_map)
- memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page));
+ memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
}
/**
@@ -988,7 +950,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
* touch the PFNs for which the error is
* returned anyway.
*/
- mem_bm_set_bit_check(bm, 0, pfn);
+ mem_bm_set_bit_check(bm, pfn);
}
}
}
@@ -1116,7 +1078,7 @@ static unsigned int count_free_highmem_pages(void)
* We should save the page if it isn't Nosave or NosaveFree, or Reserved,
* and it isn't a part of a free chunk of pages.
*/
-struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -1163,6 +1125,11 @@ static unsigned int count_highmem_pages(void)
}
return n;
}
+#else
+static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
+{
+ return NULL;
+}
#endif /* CONFIG_HIGHMEM */
/**
@@ -1173,7 +1140,7 @@ static unsigned int count_highmem_pages(void)
* of pages statically defined as 'unsaveable', and it isn't a part of
* a free chunk of pages.
*/
-struct page *saveable_page(struct zone *zone, unsigned long pfn)
+static struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -1311,15 +1278,15 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (page_is_saveable(zone, pfn))
- memory_bm_set_bit(orig_bm, 0, pfn);
+ memory_bm_set_bit(orig_bm, pfn);
}
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
for(;;) {
- pfn = memory_bm_next_pfn(orig_bm, 0);
+ pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
- copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn);
+ copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
}
}
@@ -1365,8 +1332,8 @@ void swsusp_free(void)
memory_bm_position_reset(free_pages_map);
loop:
- fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
- fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
/*
* Find the next bit set in both bitmaps. This is guaranteed to
@@ -1374,16 +1341,16 @@ loop:
*/
do {
if (fb_pfn < fr_pfn)
- fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
if (fr_pfn < fb_pfn)
- fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
} while (fb_pfn != fr_pfn);
if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
struct page *page = pfn_to_page(fr_pfn);
- memory_bm_clear_current(forbidden_pages_map, 0);
- memory_bm_clear_current(free_pages_map, 0);
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
__free_page(page);
goto loop;
}
@@ -1418,7 +1385,7 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
page = alloc_image_page(mask);
if (!page)
break;
- memory_bm_set_bit(&copy_bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(&copy_bm, page_to_pfn(page));
if (PageHighMem(page))
alloc_highmem++;
else
@@ -1514,7 +1481,7 @@ static unsigned long free_unnecessary_pages(void)
memory_bm_position_reset(&copy_bm);
while (to_free_normal > 0 || to_free_highmem > 0) {
- unsigned long pfn = memory_bm_next_pfn(&copy_bm, 0);
+ unsigned long pfn = memory_bm_next_pfn(&copy_bm);
struct page *page = pfn_to_page(pfn);
if (PageHighMem(page)) {
@@ -1528,7 +1495,7 @@ static unsigned long free_unnecessary_pages(void)
to_free_normal--;
alloc_normal--;
}
- memory_bm_clear_bit(&copy_bm, 0, pfn);
+ memory_bm_clear_bit(&copy_bm, pfn);
swsusp_unset_page_forbidden(page);
swsusp_unset_page_free(page);
__free_page(page);
@@ -1813,7 +1780,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
struct page *page;
page = alloc_image_page(__GFP_HIGHMEM);
- memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(bm, page_to_pfn(page));
}
return nr_highmem;
}
@@ -1856,7 +1823,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
if (!page)
goto err_out;
- memory_bm_set_bit(copy_bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(copy_bm, page_to_pfn(page));
}
}
@@ -1871,9 +1838,6 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- if (toi_running)
- return toi_post_context_save();
-
printk(KERN_INFO "PM: Creating hibernation image:\n");
drain_local_pages(NULL);
@@ -1921,7 +1885,7 @@ static int init_header_complete(struct swsusp_info *info)
return 0;
}
-char *check_image_kernel(struct swsusp_info *info)
+static char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
@@ -1942,7 +1906,7 @@ unsigned long snapshot_get_image_size(void)
return nr_copy_pages + nr_meta_pages + 1;
}
-int init_header(struct swsusp_info *info)
+static int init_header(struct swsusp_info *info)
{
memset(info, 0, sizeof(struct swsusp_info));
info->num_physpages = get_num_physpages();
@@ -1964,7 +1928,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
- buf[j] = memory_bm_next_pfn(bm, 0);
+ buf[j] = memory_bm_next_pfn(bm);
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
/* Save page key for data page (s390 only). */
@@ -2015,7 +1979,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
} else {
struct page *page;
- page = pfn_to_page(memory_bm_next_pfn(&copy_bm, 0));
+ page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
if (PageHighMem(page)) {
/* Highmem pages are copied to the buffer,
* because we can't return with a kmapped
@@ -2057,7 +2021,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
/* Mark pages that correspond to the "original" pfns as "unsafe" */
memory_bm_position_reset(bm);
do {
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
if (likely(pfn_valid(pfn)))
swsusp_set_page_free(pfn_to_page(pfn));
@@ -2077,10 +2041,10 @@ duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
unsigned long pfn;
memory_bm_position_reset(src);
- pfn = memory_bm_next_pfn(src, 0);
+ pfn = memory_bm_next_pfn(src);
while (pfn != BM_END_OF_MAP) {
- memory_bm_set_bit(dst, 0, pfn);
- pfn = memory_bm_next_pfn(src, 0);
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
}
@@ -2131,8 +2095,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
/* Extract and buffer page key for data page (s390 only). */
page_key_memorize(buf + j);
- if (memory_bm_pfn_present(bm, 0, buf[j]))
- memory_bm_set_bit(bm, 0, buf[j]);
+ if (memory_bm_pfn_present(bm, buf[j]))
+ memory_bm_set_bit(bm, buf[j]);
else
return -EFAULT;
}
@@ -2175,12 +2139,12 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
unsigned int cnt = 0;
memory_bm_position_reset(bm);
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
while (pfn != BM_END_OF_MAP) {
if (PageHighMem(pfn_to_page(pfn)))
cnt++;
- pfn = memory_bm_next_pfn(bm, 0);
+ pfn = memory_bm_next_pfn(bm);
}
return cnt;
}
@@ -2225,7 +2189,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
page = alloc_page(__GFP_HIGHMEM);
if (!swsusp_page_is_free(page)) {
/* The page is "safe", set its bit the bitmap */
- memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ memory_bm_set_bit(bm, page_to_pfn(page));
safe_highmem_pages++;
}
/* Mark the page as allocated */
@@ -2283,7 +2247,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
/* Copy of the page will be stored in high memory */
kaddr = buffer;
- tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0));
+ tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
safe_highmem_pages--;
last_highmem_page = tmp;
pbe->copy_page = tmp;
@@ -2454,7 +2418,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
struct pbe *pbe;
struct page *page;
- unsigned long pfn = memory_bm_next_pfn(bm, 0);
+ unsigned long pfn = memory_bm_next_pfn(bm);
if (pfn == BM_END_OF_MAP)
return ERR_PTR(-EFAULT);
@@ -2641,82 +2605,3 @@ int restore_highmem(void)
return 0;
}
#endif /* CONFIG_HIGHMEM */
-
-struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map,
- *pageset1_copy_map, *io_map, *page_resave_map, *compare_map;
-
-int resume_attempted;
-
-int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
- int result;
-
- memory_bm_position_reset(bm);
-
- do {
- result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
- if (result)
- return result;
- } while (rtree_next_node(bm, 0));
- return 0;
-}
-
-int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
-{
- int result;
-
- memory_bm_position_reset(bm);
-
- do {
- result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
-
- if (result)
- return result;
-
- } while (rtree_next_node(bm, 0));
- return 0;
-}
-
-int memory_bm_space_needed(struct memory_bitmap *bm)
-{
- unsigned long bytes = 0;
-
- memory_bm_position_reset(bm);
- do {
- bytes += PAGE_SIZE;
- } while (rtree_next_node(bm, 0));
- return bytes;
-}
-
-int toi_alloc_bitmap(struct memory_bitmap **bm)
-{
- int error;
- struct memory_bitmap *bm1;
-
- bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
- if (!bm1)
- return -ENOMEM;
-
- error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
- if (error) {
- printk("Error returned - %d.\n", error);
- kfree(bm1);
- return -ENOMEM;
- }
-
- *bm = bm1;
- return 0;
-}
-
-void toi_free_bitmap(struct memory_bitmap **bm)
-{
- if (!*bm)
- return;
-
- memory_bm_free(*bm, 0);
- kfree(*bm);
- *bm = NULL;
-}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8d7a1ef72..53266b729 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -366,6 +366,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
trace_suspend_resume(TPS("machine_suspend"),
state, false);
events_check_enabled = false;
+ } else if (*wakeup) {
+ error = -EBUSY;
}
syscore_resume();
}
@@ -468,7 +470,7 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warning("PM: Unsupported test mode for freeze state,"
+ pr_warning("PM: Unsupported test mode for suspend to idle,"
"please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
@@ -488,7 +490,7 @@ static int enter_state(suspend_state_t state)
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -497,7 +499,7 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ pr_debug("PM: Suspending system (%s)\n", pm_states[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 570aff817..2f30ca91e 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -212,7 +212,84 @@ int swsusp_swap_in_use(void)
*/
static unsigned short root_swap = 0xffff;
-struct block_device *hib_resume_bdev;
+static struct block_device *hib_resume_bdev;
+
+struct hib_bio_batch {
+ atomic_t count;
+ wait_queue_head_t wait;
+ int error;
+};
+
+static void hib_init_batch(struct hib_bio_batch *hb)
+{
+ atomic_set(&hb->count, 0);
+ init_waitqueue_head(&hb->wait);
+ hb->error = 0;
+}
+
+static void hib_end_io(struct bio *bio, int error)
+{
+ struct hib_bio_batch *hb = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ if (!uptodate || error) {
+ printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
+
+ if (!error)
+ error = -EIO;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ put_page(page);
+
+ if (error && !hb->error)
+ hb->error = error;
+ if (atomic_dec_and_test(&hb->count))
+ wake_up(&hb->wait);
+
+ bio_put(bio);
+}
+
+static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+ struct hib_bio_batch *hb)
+{
+ struct page *page = virt_to_page(addr);
+ struct bio *bio;
+ int error = 0;
+
+ bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
+ bio->bi_bdev = hib_resume_bdev;
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
+ (unsigned long long)bio->bi_iter.bi_sector);
+ bio_put(bio);
+ return -EFAULT;
+ }
+
+ if (hb) {
+ bio->bi_end_io = hib_end_io;
+ bio->bi_private = hb;
+ atomic_inc(&hb->count);
+ submit_bio(rw, bio);
+ } else {
+ error = submit_bio_wait(rw, bio);
+ bio_put(bio);
+ }
+
+ return error;
+}
+
+static int hib_wait_io(struct hib_bio_batch *hb)
+{
+ wait_event(hb->wait, atomic_read(&hb->count) == 0);
+ return hb->error;
+}
/*
* Saving part
@@ -222,7 +299,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -231,7 +308,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_bio_write_page(swsusp_resume_block,
+ error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Swap header not found!\n");
@@ -271,10 +348,10 @@ static int swsusp_swap_check(void)
* write_page - Write one page to given swap location.
* @buf: Address we're writing.
* @offset: Offset of the swap page we're writing to.
- * @bio_chain: Link the next write BIO here
+ * @hb: bio completion batch
*/
-static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
{
void *src;
int ret;
@@ -282,13 +359,13 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
if (!offset)
return -ENOSPC;
- if (bio_chain) {
+ if (hb) {
src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
- ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+ ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
src = (void *)__get_free_page(__GFP_WAIT |
@@ -298,14 +375,14 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
copy_page(src, buf);
} else {
WARN_ON_ONCE(1);
- bio_chain = NULL; /* Go synchronous */
+ hb = NULL; /* Go synchronous */
src = buf;
}
}
} else {
src = buf;
}
- return hib_bio_write_page(offset, src, bio_chain);
+ return hib_submit_io(WRITE_SYNC, offset, src, hb);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -348,7 +425,7 @@ err_close:
}
static int swap_write_page(struct swap_map_handle *handle, void *buf,
- struct bio **bio_chain)
+ struct hib_bio_batch *hb)
{
int error = 0;
sector_t offset;
@@ -356,7 +433,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
if (!handle->cur)
return -EINVAL;
offset = alloc_swapdev_block(root_swap);
- error = write_page(buf, offset, bio_chain);
+ error = write_page(buf, offset, hb);
if (error)
return error;
handle->cur->entries[handle->k++] = offset;
@@ -365,15 +442,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
- error = write_page(handle->cur, handle->cur_swap, bio_chain);
+ error = write_page(handle->cur, handle->cur_swap, hb);
if (error)
goto out;
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
- if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
- error = hib_wait_on_bio_chain(bio_chain);
+ if (hb && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_io(hb);
if (error)
goto out;
/*
@@ -445,23 +522,24 @@ static int save_image(struct swap_map_handle *handle,
int ret;
int nr_pages;
int err2;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
+ hib_init_batch(&hb);
+
printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
while (1) {
ret = snapshot_read_next(snapshot);
if (ret <= 0)
break;
- ret = swap_write_page(handle, data_of(*snapshot), &bio);
+ ret = swap_write_page(handle, data_of(*snapshot), &hb);
if (ret)
break;
if (!(nr_pages % m))
@@ -469,7 +547,7 @@ static int save_image(struct swap_map_handle *handle,
nr_pages / m * 10);
nr_pages++;
}
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -580,7 +658,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
int ret = 0;
int nr_pages;
int err2;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
size_t off;
@@ -589,6 +667,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
struct cmp_data *data = NULL;
struct crc_data *crc = NULL;
+ hib_init_batch(&hb);
+
/*
* We'll limit the number of threads for compression to limit memory
* footprint.
@@ -674,7 +754,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
@@ -748,7 +827,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
off += PAGE_SIZE) {
memcpy(page, data[thr].cmp + off, PAGE_SIZE);
- ret = swap_write_page(handle, page, &bio);
+ ret = swap_write_page(handle, page, &hb);
if (ret)
goto out_finish;
}
@@ -759,7 +838,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
out_finish:
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -906,7 +985,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_bio_read_page(offset, tmp->map, NULL);
+ error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
if (error) {
release_swap_reader(handle);
return error;
@@ -919,7 +998,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
}
static int swap_read_page(struct swap_map_handle *handle, void *buf,
- struct bio **bio_chain)
+ struct hib_bio_batch *hb)
{
sector_t offset;
int error;
@@ -930,7 +1009,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_bio_read_page(offset, buf, bio_chain);
+ error = hib_submit_io(READ_SYNC, offset, buf, hb);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -968,27 +1047,28 @@ static int load_image(struct swap_map_handle *handle,
int ret = 0;
ktime_t start;
ktime_t stop;
- struct bio *bio;
+ struct hib_bio_batch hb;
int err2;
unsigned nr_pages;
+ hib_init_batch(&hb);
+
printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
for ( ; ; ) {
ret = snapshot_write_next(snapshot);
if (ret <= 0)
break;
- ret = swap_read_page(handle, data_of(*snapshot), &bio);
+ ret = swap_read_page(handle, data_of(*snapshot), &hb);
if (ret)
break;
if (snapshot->sync_read)
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
break;
if (!(nr_pages % m))
@@ -996,7 +1076,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages / m * 10);
nr_pages++;
}
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
@@ -1067,7 +1147,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
unsigned int m;
int ret = 0;
int eof = 0;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
unsigned nr_pages;
@@ -1080,6 +1160,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
struct dec_data *data = NULL;
struct crc_data *crc = NULL;
+ hib_init_batch(&hb);
+
/*
* We'll limit the number of threads for decompression to limit memory
* footprint.
@@ -1190,7 +1272,6 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
ret = snapshot_write_next(snapshot);
@@ -1199,7 +1280,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
for(;;) {
for (i = 0; !eof && i < want; i++) {
- ret = swap_read_page(handle, page[ring], &bio);
+ ret = swap_read_page(handle, page[ring], &hb);
if (ret) {
/*
* On real read error, finish. On end of data,
@@ -1226,7 +1307,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!asked)
break;
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
have += asked;
@@ -1281,7 +1362,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
* Wait for more data while we are decompressing.
*/
if (have < LZO_CMP_PAGES && asked) {
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
have += asked;
@@ -1430,7 +1511,7 @@ int swsusp_check(void)
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
- error = hib_bio_read_page(swsusp_resume_block,
+ error = hib_submit_io(READ_SYNC, swsusp_resume_block,
swsusp_header, NULL);
if (error)
goto put;
@@ -1438,7 +1519,7 @@ int swsusp_check(void)
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
- error = hib_bio_write_page(swsusp_resume_block,
+ error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
swsusp_header, NULL);
} else {
error = -EINVAL;
@@ -1482,10 +1563,10 @@ int swsusp_unmark(void)
{
int error;
- hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_bio_write_page(swsusp_resume_block,
+ error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
swsusp_header, NULL);
} else {
printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
deleted file mode 100644
index 1aff98026..000000000
--- a/kernel/power/tuxonice.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * kernel/power/tuxonice.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations used throughout swsusp.
- *
- */
-
-#ifndef KERNEL_POWER_TOI_H
-#define KERNEL_POWER_TOI_H
-
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/suspend.h>
-#include <linux/fs.h>
-#include <asm/setup.h>
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-#define TOI_CORE_VERSION "3.3"
-#define TOI_HEADER_VERSION 3
-#define MY_BOOT_KERNEL_DATA_VERSION 4
-
-struct toi_boot_kernel_data {
- int version;
- int size;
- unsigned long toi_action;
- unsigned long toi_debug_state;
- u32 toi_default_console_level;
- int toi_io_time[2][2];
- char toi_nosave_commandline[COMMAND_LINE_SIZE];
- unsigned long pages_used[33];
- unsigned long incremental_bytes_in;
- unsigned long incremental_bytes_out;
- unsigned long compress_bytes_in;
- unsigned long compress_bytes_out;
- unsigned long pruned_pages;
-};
-
-extern struct toi_boot_kernel_data toi_bkd;
-
-/* Location of book kernel data struct in kernel being resumed */
-extern unsigned long boot_kernel_data_buffer;
-
-/* == Action states == */
-
-enum {
- TOI_REBOOT,
- TOI_PAUSE,
- TOI_LOGALL,
- TOI_CAN_CANCEL,
- TOI_KEEP_IMAGE,
- TOI_FREEZER_TEST,
- TOI_SINGLESTEP,
- TOI_PAUSE_NEAR_PAGESET_END,
- TOI_TEST_FILTER_SPEED,
- TOI_TEST_BIO,
- TOI_NO_PAGESET2,
- TOI_IGNORE_ROOTFS,
- TOI_REPLACE_SWSUSP,
- TOI_PAGESET2_FULL,
- TOI_ABORT_ON_RESAVE_NEEDED,
- TOI_NO_MULTITHREADED_IO,
- TOI_NO_DIRECT_LOAD, /* Obsolete */
- TOI_LATE_CPU_HOTPLUG, /* Obsolete */
- TOI_GET_MAX_MEM_ALLOCD,
- TOI_NO_FLUSHER_THREAD,
- TOI_NO_PS2_IF_UNNEEDED,
- TOI_POST_RESUME_BREAKPOINT,
- TOI_NO_READAHEAD,
- TOI_TRACE_DEBUG_ON,
- TOI_INCREMENTAL_IMAGE,
-};
-
-extern unsigned long toi_bootflags_mask;
-
-#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
-
-/* == Result states == */
-
-enum {
- TOI_ABORTED,
- TOI_ABORT_REQUESTED,
- TOI_NOSTORAGE_AVAILABLE,
- TOI_INSUFFICIENT_STORAGE,
- TOI_FREEZING_FAILED,
- TOI_KEPT_IMAGE,
- TOI_WOULD_EAT_MEMORY,
- TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
- TOI_PM_SEM,
- TOI_DEVICE_REFUSED,
- TOI_SYSDEV_REFUSED,
- TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
- TOI_UNABLE_TO_PREPARE_IMAGE,
- TOI_FAILED_MODULE_INIT,
- TOI_FAILED_MODULE_CLEANUP,
- TOI_FAILED_IO,
- TOI_OUT_OF_MEMORY,
- TOI_IMAGE_ERROR,
- TOI_PLATFORM_PREP_FAILED,
- TOI_CPU_HOTPLUG_FAILED,
- TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */
- TOI_RESAVE_NEEDED,
- TOI_CANT_SUSPEND,
- TOI_NOTIFIERS_PREPARE_FAILED,
- TOI_PRE_SNAPSHOT_FAILED,
- TOI_PRE_RESTORE_FAILED,
- TOI_USERMODE_HELPERS_ERR,
- TOI_CANT_USE_ALT_RESUME,
- TOI_HEADER_TOO_BIG,
- TOI_WAKEUP_EVENT,
- TOI_SYSCORE_REFUSED,
- TOI_DPM_PREPARE_FAILED,
- TOI_DPM_SUSPEND_FAILED,
- TOI_NUM_RESULT_STATES /* Used in printing debug info only */
-};
-
-extern unsigned long toi_result;
-
-#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
-#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
- test_and_set_bit(bit, &toi_result))
-#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
-#define test_result_state(bit) (test_bit(bit, &toi_result))
-
-/* == Debug sections and levels == */
-
-/* debugging levels. */
-enum {
- TOI_STATUS = 0,
- TOI_ERROR = 2,
- TOI_LOW,
- TOI_MEDIUM,
- TOI_HIGH,
- TOI_VERBOSE,
-};
-
-enum {
- TOI_ANY_SECTION,
- TOI_EAT_MEMORY,
- TOI_IO,
- TOI_HEADER,
- TOI_WRITER,
- TOI_MEMORY,
- TOI_PAGEDIR,
- TOI_COMPRESS,
- TOI_BIO,
-};
-
-#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state))
-#define clear_debug_state(bit) \
- (test_and_clear_bit(bit, &toi_bkd.toi_debug_state))
-#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state))
-
-/* == Steps in hibernating == */
-
-enum {
- STEP_HIBERNATE_PREPARE_IMAGE,
- STEP_HIBERNATE_SAVE_IMAGE,
- STEP_HIBERNATE_POWERDOWN,
- STEP_RESUME_CAN_RESUME,
- STEP_RESUME_LOAD_PS1,
- STEP_RESUME_DO_RESTORE,
- STEP_RESUME_READ_PS2,
- STEP_RESUME_GO,
- STEP_RESUME_ALT_IMAGE,
- STEP_CLEANUP,
- STEP_QUIET_CLEANUP
-};
-
-/* == TuxOnIce states ==
- (see also include/linux/suspend.h) */
-
-#define get_toi_state() (toi_state)
-#define restore_toi_state(saved_state) \
- do { toi_state = saved_state; } while (0)
-
-/* == Module support == */
-
-struct toi_core_fns {
- int (*post_context_save)(void);
- unsigned long (*get_nonconflicting_page)(void);
- int (*try_hibernate)(void);
- void (*try_resume)(void);
-};
-
-extern struct toi_core_fns *toi_core_fns;
-
-/* == All else == */
-#define KB(x) ((x) << (PAGE_SHIFT - 10))
-#define MB(x) ((x) >> (20 - PAGE_SHIFT))
-
-extern int toi_start_anything(int toi_or_resume);
-extern void toi_finish_anything(int toi_or_resume);
-
-extern int save_image_part1(void);
-extern int toi_atomic_restore(void);
-
-extern int toi_try_hibernate(void);
-extern void toi_try_resume(void);
-
-extern int __toi_post_context_save(void);
-
-extern unsigned int nr_hibernates;
-extern char alt_resume_param[256];
-
-extern void copyback_post(void);
-extern int toi_hibernate(void);
-extern unsigned long extra_pd1_pages_used;
-
-#define SECTOR_SIZE 512
-
-extern void toi_early_boot_message(int can_erase_image, int default_answer,
- char *warning_reason, ...);
-
-extern int do_check_can_resume(void);
-extern int do_toi_step(int step);
-extern int toi_launch_userspace_program(char *command, int channel_no,
- int wait, int debug);
-
-extern char tuxonice_signature[9];
-
-extern int toi_start_other_threads(void);
-extern void toi_stop_other_threads(void);
-
-extern int toi_trace_index;
-#define TOI_TRACE_DEBUG(PFN, DESC, ...) \
- do { \
- if (test_action_state(TOI_TRACE_DEBUG_ON)) { \
- printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \
- } \
- } while(0)
-
-#ifdef CONFIG_TOI_KEEP_IMAGE
-#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE))
-#else
-#define toi_keeping_image (0)
-#endif
-
-#ifdef CONFIG_TOI_INCREMENTAL
-extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose);
-extern int toi_reset_dirtiness(int verbose);
-extern void toi_cbw_write(void);
-extern void toi_cbw_restore(void);
-extern int toi_allocate_cbw_data(void);
-extern void toi_free_cbw_data(void);
-extern int toi_cbw_init(void);
-extern void toi_mark_tasks_cbw(void);
-#else
-static inline int toi_reset_dirtiness(int verbose) { return 0; }
-#define toi_cbw_write() do { } while(0)
-#define toi_cbw_restore() do { } while(0)
-#define toi_allocate_cbw_data() do { } while(0)
-#define toi_free_cbw_data() do { } while(0)
-static inline int toi_cbw_init(void) { return 0; }
-#endif
-#endif
diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c
deleted file mode 100644
index 5729240d8..000000000
--- a/kernel/power/tuxonice_alloc.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- * kernel/power/tuxonice_alloc.c
- *
- * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <linux/export.h>
-#include <linux/slab.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-
-#define TOI_ALLOC_PATHS 41
-
-static DEFINE_MUTEX(toi_alloc_mutex);
-
-static struct toi_module_ops toi_alloc_ops;
-
-static int toi_fail_num;
-
-static atomic_t toi_alloc_count[TOI_ALLOC_PATHS],
- toi_free_count[TOI_ALLOC_PATHS],
- toi_test_count[TOI_ALLOC_PATHS],
- toi_fail_count[TOI_ALLOC_PATHS];
-static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS];
-static int cur_allocd, max_allocd;
-
-static char *toi_alloc_desc[TOI_ALLOC_PATHS] = {
- "", /* 0 */
- "get_io_info_struct",
- "extent",
- "extent (loading chain)",
- "userui channel",
- "userui arg", /* 5 */
- "attention list metadata",
- "extra pagedir memory metadata",
- "bdev metadata",
- "extra pagedir memory",
- "header_locations_read", /* 10 */
- "bio queue",
- "prepare_readahead",
- "i/o buffer",
- "writer buffer in bio_init",
- "checksum buffer", /* 15 */
- "compression buffer",
- "filewriter signature op",
- "set resume param alloc1",
- "set resume param alloc2",
- "debugging info buffer", /* 20 */
- "check can resume buffer",
- "write module config buffer",
- "read module config buffer",
- "write image header buffer",
- "read pageset1 buffer", /* 25 */
- "get_have_image_data buffer",
- "checksum page",
- "worker rw loop",
- "get nonconflicting page",
- "ps1 load addresses", /* 30 */
- "remove swap image",
- "swap image exists",
- "swap parse sig location",
- "sysfs kobj",
- "swap mark resume attempted buffer", /* 35 */
- "cluster member",
- "boot kernel data buffer",
- "setting swap signature",
- "block i/o bdev struct",
- "copy before write", /* 40 */
-};
-
-#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \
- do { \
- BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \
- \
- if (FAIL_NUM == toi_fail_num) { \
- atomic_inc(&toi_test_count[FAIL_NUM]); \
- toi_fail_num = 0; \
- return FAIL_VAL; \
- } \
- } while (0)
-
-static void alloc_update_stats(int fail_num, void *result, int size)
-{
- if (!result) {
- atomic_inc(&toi_fail_count[fail_num]);
- return;
- }
-
- atomic_inc(&toi_alloc_count[fail_num]);
- if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
- mutex_lock(&toi_alloc_mutex);
- toi_cur_allocd[fail_num]++;
- cur_allocd += size;
- if (unlikely(cur_allocd > max_allocd)) {
- int i;
-
- for (i = 0; i < TOI_ALLOC_PATHS; i++)
- toi_max_allocd[i] = toi_cur_allocd[i];
- max_allocd = cur_allocd;
- }
- mutex_unlock(&toi_alloc_mutex);
- }
-}
-
-static void free_update_stats(int fail_num, int size)
-{
- BUG_ON(fail_num >= TOI_ALLOC_PATHS);
- atomic_inc(&toi_free_count[fail_num]);
- if (unlikely(atomic_read(&toi_free_count[fail_num]) >
- atomic_read(&toi_alloc_count[fail_num])))
- dump_stack();
- if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
- mutex_lock(&toi_alloc_mutex);
- cur_allocd -= size;
- toi_cur_allocd[fail_num]--;
- mutex_unlock(&toi_alloc_mutex);
- }
-}
-
-void *toi_kzalloc(int fail_num, size_t size, gfp_t flags)
-{
- void *result;
-
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, NULL);
- result = kzalloc(size, flags);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, result, size);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
- unsigned int order)
-{
- unsigned long result;
-
- mask |= ___GFP_TOI_NOTRACK;
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, 0);
- result = __get_free_pages(mask, order);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, (void *) result,
- PAGE_SIZE << order);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-struct page *toi_alloc_page(int fail_num, gfp_t mask)
-{
- struct page *result;
-
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, NULL);
- mask |= ___GFP_TOI_NOTRACK;
- result = alloc_page(mask);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask)
-{
- unsigned long result;
-
- if (toi_alloc_ops.enabled)
- MIGHT_FAIL(fail_num, 0);
- mask |= ___GFP_TOI_NOTRACK;
- result = get_zeroed_page(mask);
- if (toi_alloc_ops.enabled)
- alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
- if (fail_num == toi_trace_allocs)
- dump_stack();
- return result;
-}
-
-void toi_kfree(int fail_num, const void *arg, int size)
-{
- if (arg && toi_alloc_ops.enabled)
- free_update_stats(fail_num, size);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- kfree(arg);
-}
-
-void toi_free_page(int fail_num, unsigned long virt)
-{
- if (virt && toi_alloc_ops.enabled)
- free_update_stats(fail_num, PAGE_SIZE);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- free_page(virt);
-}
-
-void toi__free_page(int fail_num, struct page *page)
-{
- if (page && toi_alloc_ops.enabled)
- free_update_stats(fail_num, PAGE_SIZE);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- __free_page(page);
-}
-
-void toi_free_pages(int fail_num, struct page *page, int order)
-{
- if (page && toi_alloc_ops.enabled)
- free_update_stats(fail_num, PAGE_SIZE << order);
-
- if (fail_num == toi_trace_allocs)
- dump_stack();
- __free_pages(page, order);
-}
-
-void toi_alloc_print_debug_stats(void)
-{
- int i, header_done = 0;
-
- if (!toi_alloc_ops.enabled)
- return;
-
- for (i = 0; i < TOI_ALLOC_PATHS; i++)
- if (atomic_read(&toi_alloc_count[i]) !=
- atomic_read(&toi_free_count[i])) {
- if (!header_done) {
- printk(KERN_INFO "Idx Allocs Frees Tests "
- " Fails Max Description\n");
- header_done = 1;
- }
-
- printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i,
- atomic_read(&toi_alloc_count[i]),
- atomic_read(&toi_free_count[i]),
- atomic_read(&toi_test_count[i]),
- atomic_read(&toi_fail_count[i]),
- toi_max_allocd[i],
- toi_alloc_desc[i]);
- }
-}
-
-static int toi_alloc_initialise(int starting_cycle)
-{
- int i;
-
- if (!starting_cycle)
- return 0;
-
- if (toi_trace_allocs)
- dump_stack();
-
- for (i = 0; i < TOI_ALLOC_PATHS; i++) {
- atomic_set(&toi_alloc_count[i], 0);
- atomic_set(&toi_free_count[i], 0);
- atomic_set(&toi_test_count[i], 0);
- atomic_set(&toi_fail_count[i], 0);
- toi_cur_allocd[i] = 0;
- toi_max_allocd[i] = 0;
- };
-
- max_allocd = 0;
- cur_allocd = 0;
- return 0;
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL),
- SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0,
- NULL),
- SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action,
- TOI_GET_MAX_MEM_ALLOCD, 0),
- SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0,
- NULL)
-};
-
-static struct toi_module_ops toi_alloc_ops = {
- .type = MISC_HIDDEN_MODULE,
- .name = "allocation debugging",
- .directory = "alloc",
- .module = THIS_MODULE,
- .early = 1,
- .initialise = toi_alloc_initialise,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-int toi_alloc_init(void)
-{
- int result = toi_register_module(&toi_alloc_ops);
- return result;
-}
-
-void toi_alloc_exit(void)
-{
- toi_unregister_module(&toi_alloc_ops);
-}
diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h
deleted file mode 100644
index 28c5af193..000000000
--- a/kernel/power/tuxonice_alloc.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * kernel/power/tuxonice_alloc.h
- *
- * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <linux/slab.h>
-#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN)
-#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN)
-
-#ifdef CONFIG_PM_DEBUG
-extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags);
-extern void toi_kfree(int fail_num, const void *arg, int size);
-
-extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
- unsigned int order);
-#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0)
-extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask);
-extern void toi_free_page(int fail_num, unsigned long buf);
-extern void toi__free_page(int fail_num, struct page *page);
-extern void toi_free_pages(int fail_num, struct page *page, int order);
-extern struct page *toi_alloc_page(int fail_num, gfp_t mask);
-extern int toi_alloc_init(void);
-extern void toi_alloc_exit(void);
-
-extern void toi_alloc_print_debug_stats(void);
-
-#else /* CONFIG_PM_DEBUG */
-
-#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS))
-#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN))
-
-#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER)
-#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS)
-#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS)
-#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0)
-#define toi__free_page(FAIL, PAGE) __free_page(PAGE)
-#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER)
-#define toi_alloc_page(FAIL, MASK) alloc_page(MASK)
-static inline int toi_alloc_init(void)
-{
- return 0;
-}
-
-static inline void toi_alloc_exit(void) { }
-
-static inline void toi_alloc_print_debug_stats(void) { }
-
-#endif
-
-extern int toi_trace_allocs;
diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c
deleted file mode 100644
index 7b9886f54..000000000
--- a/kernel/power/tuxonice_atomic_copy.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * kernel/power/tuxonice_atomic_copy.c
- *
- * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * Routines for doing the atomic save/restore.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/console.h>
-#include <linux/syscore_ops.h>
-#include <linux/ftrace.h>
-#include <asm/suspend.h>
-#include "tuxonice.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_io.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_modules.h"
-
-unsigned long extra_pd1_pages_used;
-
-/**
- * free_pbe_list - free page backup entries used by the atomic copy code.
- * @list: List to free.
- * @highmem: Whether the list is in highmem.
- *
- * Normally, this function isn't used. If, however, we need to abort before
- * doing the atomic copy, we use this to free the pbes previously allocated.
- **/
-static void free_pbe_list(struct pbe **list, int highmem)
-{
- while (*list) {
- int i;
- struct pbe *free_pbe, *next_page = NULL;
- struct page *page;
-
- if (highmem) {
- page = (struct page *) *list;
- free_pbe = (struct pbe *) kmap(page);
- } else {
- page = virt_to_page(*list);
- free_pbe = *list;
- }
-
- for (i = 0; i < PBES_PER_PAGE; i++) {
- if (!free_pbe)
- break;
- if (highmem)
- toi__free_page(29, free_pbe->address);
- else
- toi_free_page(29,
- (unsigned long) free_pbe->address);
- free_pbe = free_pbe->next;
- }
-
- if (highmem) {
- if (free_pbe)
- next_page = free_pbe;
- kunmap(page);
- } else {
- if (free_pbe)
- next_page = free_pbe;
- }
-
- toi__free_page(29, page);
- *list = (struct pbe *) next_page;
- };
-}
-
-/**
- * copyback_post - post atomic-restore actions
- *
- * After doing the atomic restore, we have a few more things to do:
- * 1) We want to retain some values across the restore, so we now copy
- * these from the nosave variables to the normal ones.
- * 2) Set the status flags.
- * 3) Resume devices.
- * 4) Tell userui so it can redraw & restore settings.
- * 5) Reread the page cache.
- **/
-void copyback_post(void)
-{
- struct toi_boot_kernel_data *bkd =
- (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
- if (toi_activate_storage(1))
- panic("Failed to reactivate our storage.");
-
- toi_post_atomic_restore_modules(bkd);
-
- toi_cond_pause(1, "About to reload secondary pagedir.");
-
- if (read_pageset2(0))
- panic("Unable to successfully reread the page cache.");
-
- /*
- * If the user wants to sleep again after resuming from full-off,
- * it's most likely to be in order to suspend to ram, so we'll
- * do this check after loading pageset2, to give them the fastest
- * wakeup when they are ready to use the computer again.
- */
- toi_check_resleep();
-
- if (test_action_state(TOI_INCREMENTAL_IMAGE))
- toi_reset_dirtiness(1);
-}
-
-/**
- * toi_copy_pageset1 - do the atomic copy of pageset1
- *
- * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
- * because we can't be sure what side effects it has. On my old Duron, with
- * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
- * count at resume time 4 instead of 3.
- *
- * We don't want to call kmap_atomic unconditionally because it has the side
- * effect of incrementing the preempt count, which will leave it one too high
- * post resume (the page containing the preempt count will be copied after
- * its incremented. This is essentially the same problem.
- **/
-void toi_copy_pageset1(void)
-{
- int i;
- unsigned long source_index, dest_index;
-
- memory_bm_position_reset(pageset1_map);
- memory_bm_position_reset(pageset1_copy_map);
-
- source_index = memory_bm_next_pfn(pageset1_map, 0);
- dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
-
- for (i = 0; i < pagedir1.size; i++) {
- unsigned long *origvirt, *copyvirt;
- struct page *origpage, *copypage;
- int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
- was_present1, was_present2;
-
- origpage = pfn_to_page(source_index);
- copypage = pfn_to_page(dest_index);
-
- origvirt = PageHighMem(origpage) ?
- kmap_atomic(origpage) :
- page_address(origpage);
-
- copyvirt = PageHighMem(copypage) ?
- kmap_atomic(copypage) :
- page_address(copypage);
-
- was_present1 = kernel_page_present(origpage);
- if (!was_present1)
- kernel_map_pages(origpage, 1, 1);
-
- was_present2 = kernel_page_present(copypage);
- if (!was_present2)
- kernel_map_pages(copypage, 1, 1);
-
- while (loop >= 0) {
- *(copyvirt + loop) = *(origvirt + loop);
- loop--;
- }
-
- if (!was_present1)
- kernel_map_pages(origpage, 1, 0);
-
- if (!was_present2)
- kernel_map_pages(copypage, 1, 0);
-
- if (PageHighMem(origpage))
- kunmap_atomic(origvirt);
-
- if (PageHighMem(copypage))
- kunmap_atomic(copyvirt);
-
- source_index = memory_bm_next_pfn(pageset1_map, 0);
- dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
- }
-}
-
-/**
- * __toi_post_context_save - steps after saving the cpu context
- *
- * Steps taken after saving the CPU state to make the actual
- * atomic copy.
- *
- * Called from swsusp_save in snapshot.c via toi_post_context_save.
- **/
-int __toi_post_context_save(void)
-{
- unsigned long old_ps1_size = pagedir1.size;
-
- check_checksums();
-
- free_checksum_pages();
-
- toi_recalculate_image_contents(1);
-
- extra_pd1_pages_used = pagedir1.size > old_ps1_size ?
- pagedir1.size - old_ps1_size : 0;
-
- if (extra_pd1_pages_used > extra_pd1_pages_allowance) {
- printk(KERN_INFO "Pageset1 has grown by %lu pages. "
- "extra_pages_allowance is currently only %lu.\n",
- pagedir1.size - old_ps1_size,
- extra_pd1_pages_allowance);
-
- /*
- * Highlevel code will see this, clear the state and
- * retry if we haven't already done so twice.
- */
- if (any_to_free(1)) {
- set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
- return 1;
- }
- if (try_allocate_extra_memory()) {
- printk(KERN_INFO "Failed to allocate the extra memory"
- " needed. Restarting the process.");
- set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
- return 1;
- }
- printk(KERN_INFO "However it looks like there's enough"
- " free ram and storage to handle this, so "
- " continuing anyway.");
- /*
- * What if try_allocate_extra_memory above calls
- * toi_allocate_extra_pagedir_memory and it allocs a new
- * slab page via toi_kzalloc which should be in ps1? So...
- */
- toi_recalculate_image_contents(1);
- }
-
- if (!test_action_state(TOI_TEST_FILTER_SPEED) &&
- !test_action_state(TOI_TEST_BIO))
- toi_copy_pageset1();
-
- return 0;
-}
-
-/**
- * toi_hibernate - high level code for doing the atomic copy
- *
- * High-level code which prepares to do the atomic copy. Loosely based
- * on the swsusp version, but with the following twists:
- * - We set toi_running so the swsusp code uses our code paths.
- * - We give better feedback regarding what goes wrong if there is a
- * problem.
- * - We use an extra function to call the assembly, just in case this code
- * is in a module (return address).
- **/
-int toi_hibernate(void)
-{
- int error;
-
- error = toi_lowlevel_builtin();
-
- if (!error) {
- struct toi_boot_kernel_data *bkd =
- (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
-
- /*
- * The boot kernel's data may be larger (newer version) or
- * smaller (older version) than ours. Copy the minimum
- * of the two sizes, so that we don't overwrite valid values
- * from pre-atomic copy.
- */
-
- memcpy(&toi_bkd, (char *) boot_kernel_data_buffer,
- min_t(int, sizeof(struct toi_boot_kernel_data),
- bkd->size));
- }
-
- return error;
-}
-
-/**
- * toi_atomic_restore - prepare to do the atomic restore
- *
- * Get ready to do the atomic restore. This part gets us into the same
- * state we are in prior to do calling do_toi_lowlevel while
- * hibernating: hot-unplugging secondary cpus and freeze processes,
- * before starting the thread that will do the restore.
- **/
-int toi_atomic_restore(void)
-{
- int error;
-
- toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore.");
-
- memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line,
- strlen(saved_command_line));
-
- toi_pre_atomic_restore_modules(&toi_bkd);
-
- if (add_boot_kernel_data_pbe())
- goto Failed;
-
- toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
- if (toi_go_atomic(PMSG_QUIESCE, 0))
- goto Failed;
-
- /* We'll ignore saved state, but this gets preempt count (etc) right */
- save_processor_state();
-
- error = swsusp_arch_resume();
- /*
- * Code below is only ever reached in case of failure. Otherwise
- * execution continues at place where swsusp_arch_suspend was called.
- *
- * We don't know whether it's safe to continue (this shouldn't happen),
- * so lets err on the side of caution.
- */
- BUG();
-
-Failed:
- free_pbe_list(&restore_pblist, 0);
-#ifdef CONFIG_HIGHMEM
- free_pbe_list(&restore_highmem_pblist, 1);
-#endif
- return 1;
-}
-
-/**
- * toi_go_atomic - do the actual atomic copy/restore
- * @state: The state to use for dpm_suspend_start & power_down calls.
- * @suspend_time: Whether we're suspending or resuming.
- **/
-int toi_go_atomic(pm_message_t state, int suspend_time)
-{
- if (suspend_time) {
- if (platform_begin(1)) {
- set_abort_result(TOI_PLATFORM_PREP_FAILED);
- toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
- return 1;
- }
-
- if (dpm_prepare(PMSG_FREEZE)) {
- set_abort_result(TOI_DPM_PREPARE_FAILED);
- dpm_complete(PMSG_RECOVER);
- toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
- return 1;
- }
- }
-
- suspend_console();
- pm_restrict_gfp_mask();
-
- if (suspend_time) {
- if (dpm_suspend(state)) {
- set_abort_result(TOI_DPM_SUSPEND_FAILED);
- toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
- return 1;
- }
- } else {
- if (dpm_suspend_start(state)) {
- set_abort_result(TOI_DPM_SUSPEND_FAILED);
- toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
- return 1;
- }
- }
-
- /* At this point, dpm_suspend_start() has been called, but *not*
- * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now.
- * Otherwise, drivers for some devices (e.g. interrupt controllers)
- * become desynchronized with the actual state of the hardware
- * at resume time, and evil weirdness ensues.
- */
-
- if (dpm_suspend_end(state)) {
- set_abort_result(TOI_DEVICE_REFUSED);
- toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1);
- return 1;
- }
-
- if (suspend_time) {
- if (platform_pre_snapshot(1))
- set_abort_result(TOI_PRE_SNAPSHOT_FAILED);
- } else {
- if (platform_pre_restore(1))
- set_abort_result(TOI_PRE_RESTORE_FAILED);
- }
-
- if (test_result_state(TOI_ABORTED)) {
- toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1);
- return 1;
- }
-
- if (disable_nonboot_cpus()) {
- set_abort_result(TOI_CPU_HOTPLUG_FAILED);
- toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG,
- suspend_time, 1);
- return 1;
- }
-
- local_irq_disable();
-
- if (syscore_suspend()) {
- set_abort_result(TOI_SYSCORE_REFUSED);
- toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1);
- return 1;
- }
-
- if (suspend_time && pm_wakeup_pending()) {
- set_abort_result(TOI_WAKEUP_EVENT);
- toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1);
- return 1;
- }
- return 0;
-}
-
-/**
- * toi_end_atomic - post atomic copy/restore routines
- * @stage: What step to start at.
- * @suspend_time: Whether we're suspending or resuming.
- * @error: Whether we're recovering from an error.
- **/
-void toi_end_atomic(int stage, int suspend_time, int error)
-{
- pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) :
- PMSG_RESTORE;
-
- switch (stage) {
- case ATOMIC_ALL_STEPS:
- if (!suspend_time) {
- events_check_enabled = false;
- }
- platform_leave(1);
- case ATOMIC_STEP_SYSCORE_RESUME:
- syscore_resume();
- case ATOMIC_STEP_IRQS:
- local_irq_enable();
- case ATOMIC_STEP_CPU_HOTPLUG:
- enable_nonboot_cpus();
- case ATOMIC_STEP_PLATFORM_FINISH:
- if (!suspend_time && error & 2)
- platform_restore_cleanup(1);
- else
- platform_finish(1);
- dpm_resume_start(msg);
- case ATOMIC_STEP_DEVICE_RESUME:
- if (suspend_time && (error & 2))
- platform_recover(1);
- dpm_resume(msg);
- if (!toi_in_suspend()) {
- dpm_resume_end(PMSG_RECOVER);
- }
- if (error || !toi_in_suspend()) {
- pm_restore_gfp_mask();
- }
- resume_console();
- case ATOMIC_STEP_DPM_COMPLETE:
- dpm_complete(msg);
- case ATOMIC_STEP_PLATFORM_END:
- platform_end(1);
-
- toi_prepare_status(DONT_CLEAR_BAR, "Post atomic.");
- }
-}
diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h
deleted file mode 100644
index 2de0e3b49..000000000
--- a/kernel/power/tuxonice_atomic_copy.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * kernel/power/tuxonice_atomic_copy.h
- *
- * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * Routines for doing the atomic save/restore.
- */
-
-enum {
- ATOMIC_ALL_STEPS,
- ATOMIC_STEP_SYSCORE_RESUME,
- ATOMIC_STEP_IRQS,
- ATOMIC_STEP_CPU_HOTPLUG,
- ATOMIC_STEP_PLATFORM_FINISH,
- ATOMIC_STEP_DEVICE_RESUME,
- ATOMIC_STEP_DPM_COMPLETE,
- ATOMIC_STEP_PLATFORM_END,
-};
-
-int toi_go_atomic(pm_message_t state, int toi_time);
-void toi_end_atomic(int stage, int toi_time, int error);
-
-extern void platform_recover(int platform_mode);
diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h
deleted file mode 100644
index 201e3cd47..000000000
--- a/kernel/power/tuxonice_bio.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * kernel/power/tuxonice_bio.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains declarations for functions exported from
- * tuxonice_bio.c, which contains low level io functions.
- */
-
-#include <linux/buffer_head.h>
-#include "tuxonice_extent.h"
-
-void toi_put_extent_chain(struct hibernate_extent_chain *chain);
-int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
- unsigned long start, unsigned long end);
-
-struct hibernate_extent_saved_state {
- int extent_num;
- struct hibernate_extent *extent_ptr;
- unsigned long offset;
-};
-
-struct toi_bdev_info {
- struct toi_bdev_info *next;
- struct hibernate_extent_chain blocks;
- struct block_device *bdev;
- struct toi_module_ops *allocator;
- int allocator_index;
- struct hibernate_extent_chain allocations;
- char name[266]; /* "swap on " or "file " + up to 256 chars */
-
- /* Saved in header */
- char uuid[17];
- dev_t dev_t;
- int prio;
- int bmap_shift;
- int blocks_per_page;
- unsigned long pages_used;
- struct hibernate_extent_saved_state saved_state[4];
-};
-
-struct toi_extent_iterate_state {
- struct toi_bdev_info *current_chain;
- int num_chains;
- int saved_chain_number[4];
- struct toi_bdev_info *saved_chain_ptr[4];
-};
-
-/*
- * Our exported interface so the swapwriter and filewriter don't
- * need these functions duplicated.
- */
-struct toi_bio_ops {
- int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
- struct page *page);
- int (*register_storage)(struct toi_bdev_info *new);
- void (*free_storage)(void);
-};
-
-struct toi_allocator_ops {
- unsigned long (*toi_swap_storage_available) (void);
-};
-
-extern struct toi_bio_ops toi_bio_ops;
-
-extern char *toi_writer_buffer;
-extern int toi_writer_buffer_posn;
-
-struct toi_bio_allocator_ops {
- int (*register_storage) (void);
- unsigned long (*storage_available)(void);
- int (*allocate_storage) (struct toi_bdev_info *, unsigned long);
- int (*bmap) (struct toi_bdev_info *);
- void (*free_storage) (struct toi_bdev_info *);
- unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used);
-};
diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c
deleted file mode 100644
index 364fae9db..000000000
--- a/kernel/power/tuxonice_bio_chains.c
+++ /dev/null
@@ -1,1126 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_devinfo.c
- *
- * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- */
-
-#include <linux/mm_types.h>
-#include "tuxonice_bio.h"
-#include "tuxonice_bio_internal.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-#include "tuxonice.h"
-#include "tuxonice_io.h"
-
-static struct toi_bdev_info *prio_chain_head;
-static int num_chains;
-
-/* Pointer to current entry being loaded/saved. */
-struct toi_extent_iterate_state toi_writer_posn;
-
-#define metadata_size (sizeof(struct toi_bdev_info) - \
- offsetof(struct toi_bdev_info, uuid))
-
-/*
- * After section 0 (header) comes 2 => next_section[0] = 2
- */
-static int next_section[3] = { 2, 3, 1 };
-
-/**
- * dump_block_chains - print the contents of the bdev info array.
- **/
-void dump_block_chains(void)
-{
- int i = 0;
- int j;
- struct toi_bdev_info *cur_chain = prio_chain_head;
-
- while (cur_chain) {
- struct hibernate_extent *this = cur_chain->blocks.first;
-
- printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio);
-
- while (this) {
- printk(KERN_CONT " [%lu-%lu]%s", this->start,
- this->end, this->next ? "," : "");
- this = this->next;
- }
-
- printk("\n");
- cur_chain = cur_chain->next;
- i++;
- }
-
- printk(KERN_DEBUG "Saved states:\n");
- for (i = 0; i < 4; i++) {
- printk(KERN_DEBUG "Slot %d: Chain %d.\n",
- i, toi_writer_posn.saved_chain_number[i]);
-
- cur_chain = prio_chain_head;
- j = 0;
- while (cur_chain) {
- printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n",
- j, cur_chain->saved_state[i].extent_num,
- cur_chain->saved_state[i].offset);
- cur_chain = cur_chain->next;
- j++;
- }
- printk(KERN_CONT "\n");
- }
-}
-
-/**
- *
- **/
-static void toi_extent_chain_next(void)
-{
- struct toi_bdev_info *this = toi_writer_posn.current_chain;
-
- if (!this->blocks.current_extent)
- return;
-
- if (this->blocks.current_offset == this->blocks.current_extent->end) {
- if (this->blocks.current_extent->next) {
- this->blocks.current_extent =
- this->blocks.current_extent->next;
- this->blocks.current_offset =
- this->blocks.current_extent->start;
- } else {
- this->blocks.current_extent = NULL;
- this->blocks.current_offset = 0;
- }
- } else
- this->blocks.current_offset++;
-}
-
-/**
- *
- */
-
-static struct toi_bdev_info *__find_next_chain_same_prio(void)
-{
- struct toi_bdev_info *start_chain = toi_writer_posn.current_chain;
- struct toi_bdev_info *this = start_chain;
- int orig_prio = this->prio;
-
- do {
- this = this->next;
-
- if (!this)
- this = prio_chain_head;
-
- /* Back on original chain? Use it again. */
- if (this == start_chain)
- return start_chain;
-
- } while (!this->blocks.current_extent || this->prio != orig_prio);
-
- return this;
-}
-
-static void find_next_chain(void)
-{
- struct toi_bdev_info *this;
-
- this = __find_next_chain_same_prio();
-
- /*
- * If we didn't get another chain of the same priority that we
- * can use, look for the next priority.
- */
- while (this && !this->blocks.current_extent)
- this = this->next;
-
- toi_writer_posn.current_chain = this;
-}
-
-/**
- * toi_extent_state_next - go to the next extent
- * @blocks: The number of values to progress.
- * @stripe_mode: Whether to spread usage across all chains.
- *
- * Given a state, progress to the next valid entry. We may begin in an
- * invalid state, as we do when invoked after extent_state_goto_start below.
- *
- * When using compression and expected_compression > 0, we let the image size
- * be larger than storage, so we can validly run out of data to return.
- **/
-static unsigned long toi_extent_state_next(int blocks, int current_stream)
-{
- int i;
-
- if (!toi_writer_posn.current_chain)
- return -ENOSPC;
-
- /* Assume chains always have lengths that are multiples of @blocks */
- for (i = 0; i < blocks; i++)
- toi_extent_chain_next();
-
- /* The header stream is not striped */
- if (current_stream ||
- !toi_writer_posn.current_chain->blocks.current_extent)
- find_next_chain();
-
- return toi_writer_posn.current_chain ? 0 : -ENOSPC;
-}
-
-static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this)
-{
- struct toi_bdev_info **prev_ptr;
- struct toi_bdev_info *cur;
-
- /* Loop through the existing chain, finding where to insert it */
- prev_ptr = &prio_chain_head;
- cur = prio_chain_head;
-
- while (cur && cur->prio >= this->prio) {
- prev_ptr = &cur->next;
- cur = cur->next;
- }
-
- this->next = *prev_ptr;
- *prev_ptr = this;
-
- this = prio_chain_head;
- while (this)
- this = this->next;
- num_chains++;
-}
-
-/**
- * toi_extent_state_goto_start - reinitialize an extent chain iterator
- * @state: Iterator to reinitialize
- **/
-void toi_extent_state_goto_start(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
-
- while (this) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Setting current extent to %p.", this->blocks.first);
- this->blocks.current_extent = this->blocks.first;
- if (this->blocks.current_extent) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Setting current offset to %lu.",
- this->blocks.current_extent->start);
- this->blocks.current_offset =
- this->blocks.current_extent->start;
- }
-
- this = this->next;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.",
- prio_chain_head);
- toi_writer_posn.current_chain = prio_chain_head;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start.");
-}
-
-/**
- * toi_extent_state_save - save state of the iterator
- * @state: Current state of the chain
- * @saved_state: Iterator to populate
- *
- * Given a state and a struct hibernate_extent_state_store, save the current
- * position in a format that can be used with relocated chains (at
- * resume time).
- **/
-void toi_extent_state_save(int slot)
-{
- struct toi_bdev_info *cur_chain = prio_chain_head;
- struct hibernate_extent *extent;
- struct hibernate_extent_saved_state *chain_state;
- int i = 0;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.",
- slot);
-
- if (!toi_writer_posn.current_chain) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => "
- "chain_num = -1.");
- toi_writer_posn.saved_chain_number[slot] = -1;
- return;
- }
-
- while (cur_chain) {
- i++;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) "
- "state, slot %d.", i, cur_chain, slot);
-
- chain_state = &cur_chain->saved_state[slot];
-
- chain_state->offset = cur_chain->blocks.current_offset;
-
- if (toi_writer_posn.current_chain == cur_chain) {
- toi_writer_posn.saved_chain_number[slot] = i;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain "
- "we were on => chain_num is %d.", i);
- }
-
- if (!cur_chain->blocks.current_extent) {
- chain_state->extent_num = 0;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent "
- "for this chain => extent_num %d is 0.",
- i);
- cur_chain = cur_chain->next;
- continue;
- }
-
- extent = cur_chain->blocks.first;
- chain_state->extent_num = 1;
-
- while (extent != cur_chain->blocks.current_extent) {
- chain_state->extent_num++;
- extent = extent->next;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i,
- chain_state->extent_num);
-
- cur_chain = cur_chain->next;
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Completed saving extent state slot %d.", slot);
-}
-
-/**
- * toi_extent_state_restore - restore the position saved by extent_state_save
- * @state: State to populate
- * @saved_state: Iterator saved to restore
- **/
-void toi_extent_state_restore(int slot)
-{
- int i = 0;
- struct toi_bdev_info *cur_chain = prio_chain_head;
- struct hibernate_extent_saved_state *chain_state;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "toi_extent_state_restore - slot %d.", slot);
-
- if (toi_writer_posn.saved_chain_number[slot] == -1) {
- toi_writer_posn.current_chain = NULL;
- return;
- }
-
- while (cur_chain) {
- int posn;
- int j;
- i++;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) "
- "state, slot %d.", i, cur_chain, slot);
-
- chain_state = &cur_chain->saved_state[slot];
-
- posn = chain_state->extent_num;
-
- cur_chain->blocks.current_extent = cur_chain->blocks.first;
- cur_chain->blocks.current_offset = chain_state->offset;
-
- if (i == toi_writer_posn.saved_chain_number[slot]) {
- toi_writer_posn.current_chain = cur_chain;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Found current chain.");
- }
-
- for (j = 0; j < 4; j++)
- if (i == toi_writer_posn.saved_chain_number[j]) {
- toi_writer_posn.saved_chain_ptr[j] = cur_chain;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Found saved chain ptr %d (%p) (offset"
- " %d).", j, cur_chain,
- cur_chain->saved_state[j].offset);
- }
-
- if (posn) {
- while (--posn)
- cur_chain->blocks.current_extent =
- cur_chain->blocks.current_extent->next;
- } else
- cur_chain->blocks.current_extent = NULL;
-
- cur_chain = cur_chain->next;
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done.");
- if (test_action_state(TOI_LOGALL))
- dump_block_chains();
-}
-
-/*
- * Storage needed
- *
- * Returns amount of space in the image header required
- * for the chain data. This ignores the links between
- * pages, which we factor in when allocating the space.
- */
-int toi_bio_devinfo_storage_needed(void)
-{
- int result = sizeof(num_chains);
- struct toi_bdev_info *chain = prio_chain_head;
-
- while (chain) {
- result += metadata_size;
-
- /* Chain size */
- result += sizeof(int);
-
- /* Extents */
- result += (2 * sizeof(unsigned long) *
- chain->blocks.num_extents);
-
- chain = chain->next;
- }
-
- result += 4 * sizeof(int);
- return result;
-}
-
-static unsigned long chain_pages_used(struct toi_bdev_info *chain)
-{
- struct hibernate_extent *this = chain->blocks.first;
- struct hibernate_extent_saved_state *state = &chain->saved_state[3];
- unsigned long size = 0;
- int extent_idx = 1;
-
- if (!state->extent_num) {
- if (!this)
- return 0;
- else
- return chain->blocks.size;
- }
-
- while (extent_idx < state->extent_num) {
- size += (this->end - this->start + 1);
- this = this->next;
- extent_idx++;
- }
-
- /* We didn't use the one we're sitting on, so don't count it */
- return size + state->offset - this->start;
-}
-
-void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain)
-{
- unsigned long used = chain_pages_used(chain);
-
- /* Free the storage */
- unsigned long first_freed = 0;
-
- if (chain->allocator->bio_allocator_ops->free_unused_storage)
- first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used);
-
- printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed);
-
- /* Adjust / free the extents. */
- toi_put_extent_chain_from(&chain->blocks, first_freed);
-
- {
- struct hibernate_extent *this = chain->blocks.first;
- while (this) {
- printk("Extent %lx-%lx.\n", this->start, this->end);
- this = this->next;
- }
- }
-}
-
-/**
- * toi_serialise_extent_chain - write a chain in the image
- * @chain: Chain to write.
- **/
-static int toi_serialise_extent_chain(struct toi_bdev_info *chain)
-{
- struct hibernate_extent *this;
- int ret;
- int i = 1;
-
- chain->pages_used = chain_pages_used(chain);
-
- if (test_action_state(TOI_LOGALL))
- dump_block_chains();
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).",
- chain->dev_t);
- /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */
- ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
- (char *) &chain->uuid, metadata_size);
- if (ret)
- return ret;
-
- /* Num extents */
- ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
- (char *) &chain->blocks.num_extents, sizeof(int));
- if (ret)
- return ret;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
- chain->blocks.num_extents);
-
- this = chain->blocks.first;
- while (this) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i);
- ret = toiActiveAllocator->rw_header_chunk(WRITE,
- &toi_blockwriter_ops,
- (char *) this, 2 * sizeof(this->start));
- if (ret)
- return ret;
- this = this->next;
- i++;
- }
-
- return ret;
-}
-
-int toi_serialise_extent_chains(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
- int result;
-
- /* Write the number of chains */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)",
- num_chains);
- result = toiActiveAllocator->rw_header_chunk(WRITE,
- &toi_blockwriter_ops, (char *) &num_chains,
- sizeof(int));
- if (result)
- return result;
-
- /* Then the chains themselves */
- while (this) {
- result = toi_serialise_extent_chain(this);
- if (result)
- return result;
- this = this->next;
- }
-
- /*
- * Finally, the chain we should be on at the start of each
- * section.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers.");
- result = toiActiveAllocator->rw_header_chunk(WRITE,
- &toi_blockwriter_ops,
- (char *) &toi_writer_posn.saved_chain_number[0],
- 4 * sizeof(int));
-
- return result;
-}
-
-int toi_register_storage_chain(struct toi_bdev_info *new)
-{
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.",
- new);
- toi_insert_chain_in_prio_list(new);
- return 0;
-}
-
-static void free_bdev_info(struct toi_bdev_info *chain)
-{
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain);
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents.");
- toi_put_extent_chain(&chain->blocks);
-
- /*
- * The allocator may need to do more than just free the chains
- * (swap_free, for example). Don't call from boot kernel.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents.");
- if (chain->allocator)
- chain->allocator->bio_allocator_ops->free_storage(chain);
-
- /*
- * Dropping out of reading atomic copy? Need to undo
- * toi_open_by_devnum.
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev.");
- if (chain->bdev && !IS_ERR(chain->bdev) &&
- chain->bdev != resume_block_device &&
- chain->bdev != header_block_device &&
- test_toi_state(TOI_TRYING_TO_RESUME))
- toi_close_bdev(chain->bdev);
-
- /* Poison */
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct.");
- toi_kfree(39, chain, sizeof(*chain));
-
- if (prio_chain_head == chain)
- prio_chain_head = NULL;
-
- num_chains--;
-}
-
-void free_all_bdev_info(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
-
- while (this) {
- struct toi_bdev_info *next = this->next;
- free_bdev_info(this);
- this = next;
- }
-
- memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn));
- prio_chain_head = NULL;
-}
-
-static void set_up_start_position(void)
-{
- toi_writer_posn.current_chain = prio_chain_head;
- go_next_page(0, 0);
-}
-
-/**
- * toi_load_extent_chain - read back a chain saved in the image
- * @chain: Chain to load
- *
- * The linked list of extents is reconstructed from the disk. chain will point
- * to the first entry.
- **/
-int toi_load_extent_chain(int index, int *num_loaded)
-{
- struct toi_bdev_info *chain = toi_kzalloc(39,
- sizeof(struct toi_bdev_info), GFP_ATOMIC);
- struct hibernate_extent *this, *last = NULL;
- int i, ret;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index);
- /* Get dev_t, prio, bmap_shift, blocks per page, positions */
- ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
- (char *) &chain->uuid, metadata_size);
-
- if (ret) {
- printk(KERN_ERR "Failed to read the size of extent chain.\n");
- toi_kfree(39, chain, sizeof(*chain));
- return 1;
- }
-
- toi_bkd.pages_used[index] = chain->pages_used;
-
- ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
- (char *) &chain->blocks.num_extents, sizeof(int));
- if (ret) {
- printk(KERN_ERR "Failed to read the size of extent chain.\n");
- toi_kfree(39, chain, sizeof(*chain));
- return 1;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
- chain->blocks.num_extents);
-
- for (i = 0; i < chain->blocks.num_extents; i++) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1);
-
- this = toi_kzalloc(2, sizeof(struct hibernate_extent),
- TOI_ATOMIC_GFP);
- if (!this) {
- printk(KERN_INFO "Failed to allocate a new extent.\n");
- free_bdev_info(chain);
- return -ENOMEM;
- }
- this->next = NULL;
- /* Get the next page */
- ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
- NULL, (char *) this, 2 * sizeof(this->start));
- if (ret) {
- printk(KERN_INFO "Failed to read an extent.\n");
- toi_kfree(2, this, sizeof(struct hibernate_extent));
- free_bdev_info(chain);
- return 1;
- }
-
- if (last)
- last->next = this;
- else {
- char b1[32], b2[32], b3[32];
- /*
- * Open the bdev
- */
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Chain dev_t is %s. Resume dev t is %s. Header"
- " bdev_t is %s.\n",
- format_dev_t(b1, chain->dev_t),
- format_dev_t(b2, resume_dev_t),
- format_dev_t(b3, toi_sig_data->header_dev_t));
-
- if (chain->dev_t == resume_dev_t)
- chain->bdev = resume_block_device;
- else if (chain->dev_t == toi_sig_data->header_dev_t)
- chain->bdev = header_block_device;
- else {
- chain->bdev = toi_open_bdev(chain->uuid,
- chain->dev_t, 1);
- if (IS_ERR(chain->bdev)) {
- free_bdev_info(chain);
- return -ENODEV;
- }
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift "
- "is %d and blocks per page is %d.",
- chain->bmap_shift,
- chain->blocks_per_page);
-
- chain->blocks.first = this;
-
- /*
- * Couldn't do this earlier, but can't do
- * goto_start now - we may have already used blocks
- * in the first chain.
- */
- chain->blocks.current_extent = this;
- chain->blocks.current_offset = this->start;
-
- /*
- * Can't wait until we've read the whole chain
- * before we insert it in the list. We might need
- * this chain to read the next page in the header
- */
- toi_insert_chain_in_prio_list(chain);
- }
-
- /*
- * We have to wait until 2 extents are loaded before setting up
- * properly because if the first extent has only one page, we
- * will need to put the position on the second extent. Sounds
- * obvious, but it wasn't!
- */
- (*num_loaded)++;
- if ((*num_loaded) == 2)
- set_up_start_position();
- last = this;
- }
-
- /*
- * Shouldn't get empty chains, but it's not impossible. Link them in so
- * they get freed properly later.
- */
- if (!chain->blocks.num_extents)
- toi_insert_chain_in_prio_list(chain);
-
- if (!chain->blocks.current_extent) {
- chain->blocks.current_extent = chain->blocks.first;
- if (chain->blocks.current_extent)
- chain->blocks.current_offset =
- chain->blocks.current_extent->start;
- }
- return 0;
-}
-
-int toi_load_extent_chains(void)
-{
- int result;
- int to_load;
- int i;
- int extents_loaded = 0;
-
- result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
- (char *) &to_load,
- sizeof(int));
- if (result)
- return result;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load);
-
- for (i = 0; i < to_load; i++) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.",
- i, to_load);
- result = toi_load_extent_chain(i, &extents_loaded);
- if (result)
- return result;
- }
-
- /* If we never got to a second extent, we still need to do this. */
- if (extents_loaded == 1)
- set_up_start_position();
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers.");
- result = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
- &toi_blockwriter_ops,
- (char *) &toi_writer_posn.saved_chain_number[0],
- 4 * sizeof(int));
-
- return result;
-}
-
-static int toi_end_of_stream(int writing, int section_barrier)
-{
- struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
- int compare_to = next_section[current_stream];
- struct toi_bdev_info *compare_chain =
- toi_writer_posn.saved_chain_ptr[compare_to];
- int compare_offset = compare_chain ?
- compare_chain->saved_state[compare_to].offset : 0;
-
- if (!section_barrier)
- return 0;
-
- if (!cur_chain)
- return 1;
-
- if (cur_chain == compare_chain &&
- cur_chain->blocks.current_offset == compare_offset) {
- if (writing) {
- if (!current_stream) {
- debug_broken_header();
- return 1;
- }
- } else {
- more_readahead = 0;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Reached the end of stream %d "
- "(not an error).", current_stream);
- return 1;
- }
- }
-
- return 0;
-}
-
-/**
- * go_next_page - skip blocks to the start of the next page
- * @writing: Whether we're reading or writing the image.
- *
- * Go forward one page.
- **/
-int go_next_page(int writing, int section_barrier)
-{
- struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
- int max = cur_chain ? cur_chain->blocks_per_page : 1;
-
- /* Nope. Go foward a page - or maybe two. Don't stripe the header,
- * so that bad fragmentation doesn't put the extent data containing
- * the location of the second page out of the first header page.
- */
- if (toi_extent_state_next(max, current_stream)) {
- /* Don't complain if readahead falls off the end */
- if (writing && section_barrier) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
- "Expected compression ratio too optimistic?");
- if (test_action_state(TOI_LOGALL))
- dump_block_chains();
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
- "read/write. (Not necessarily a fatal error.");
- return -ENOSPC;
- }
-
- return 0;
-}
-
-int devices_of_same_priority(struct toi_bdev_info *this)
-{
- struct toi_bdev_info *check = prio_chain_head;
- int i = 0;
-
- while (check) {
- if (check->prio == this->prio)
- i++;
- check = check->next;
- }
-
- return i;
-}
-
-/**
- * toi_bio_rw_page - do i/o on the next disk page in the image
- * @writing: Whether reading or writing.
- * @page: Page to do i/o on.
- * @is_readahead: Whether we're doing readahead
- * @free_group: The group used in allocating the page
- *
- * Submit a page for reading or writing, possibly readahead.
- * Pass the group used in allocating the page as well, as it should
- * be freed on completion of the bio if we're writing the page.
- **/
-int toi_bio_rw_page(int writing, struct page *page,
- int is_readahead, int free_group)
-{
- int result = toi_end_of_stream(writing, 1);
- struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
-
- if (result) {
- if (writing)
- abort_hibernate(TOI_INSUFFICIENT_STORAGE,
- "Insufficient storage for your image.");
- else
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
- "read/write another page when stream has "
- "ended.");
- return -ENOSPC;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "%s %lx:%ld",
- writing ? "Write" : "Read",
- dev_info->dev_t, dev_info->blocks.current_offset);
-
- result = toi_do_io(writing, dev_info->bdev,
- dev_info->blocks.current_offset << dev_info->bmap_shift,
- page, is_readahead, 0, free_group);
-
- /* Ignore the result here - will check end of stream if come in again */
- go_next_page(writing, 1);
-
- if (result)
- printk(KERN_ERR "toi_do_io returned %d.\n", result);
- return result;
-}
-
-dev_t get_header_dev_t(void)
-{
- return prio_chain_head->dev_t;
-}
-
-struct block_device *get_header_bdev(void)
-{
- return prio_chain_head->bdev;
-}
-
-unsigned long get_headerblock(void)
-{
- return prio_chain_head->blocks.first->start <<
- prio_chain_head->bmap_shift;
-}
-
-int get_main_pool_phys_params(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
- int result;
-
- while (this) {
- result = this->allocator->bio_allocator_ops->bmap(this);
- if (result)
- return result;
- this = this->next;
- }
-
- return 0;
-}
-
-static int apply_header_reservation(void)
-{
- int i;
-
- if (!header_pages_reserved) {
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "No header pages reserved at the moment.");
- return 0;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation.");
-
- /* Apply header space reservation */
- toi_extent_state_goto_start();
-
- for (i = 0; i < header_pages_reserved; i++)
- if (go_next_page(1, 0))
- return -ENOSPC;
-
- /* The end of header pages will be the start of pageset 2 */
- toi_extent_state_save(2);
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Finished applying header reservation.");
- return 0;
-}
-
-static int toi_bio_register_storage(void)
-{
- int result = 0;
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- this_module->type != BIO_ALLOCATOR_MODULE)
- continue;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Registering storage from %s.",
- this_module->name);
- result = this_module->bio_allocator_ops->register_storage();
- if (result)
- break;
- }
-
- return result;
-}
-
-void toi_bio_free_unused_storage(void)
-{
- struct toi_bdev_info *this = prio_chain_head;
-
- while (this) {
- toi_bio_free_unused_storage_chain(this);
- this = this->next;
- }
-}
-
-int toi_bio_allocate_storage(unsigned long request)
-{
- struct toi_bdev_info *chain = prio_chain_head;
- unsigned long to_get = request;
- unsigned long extra_pages, needed;
- int no_free = 0;
-
- if (!chain) {
- int result = toi_bio_register_storage();
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
- "Registering storage.");
- if (result)
- return 0;
- chain = prio_chain_head;
- if (!chain) {
- printk("TuxOnIce: No storage was registered.\n");
- return 0;
- }
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
- "Request is %lu pages.", request);
- extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
- + sizeof(int)), PAGE_SIZE);
- needed = request + extra_pages + header_pages_reserved;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu "
- "for header => %lu.",
- extra_pages, header_pages_reserved, needed);
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.",
- raw_pages_allocd);
-
- to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get);
-
- if (!to_get)
- return apply_header_reservation();
-
- while (to_get && chain) {
- int num_group = devices_of_same_priority(chain);
- int divisor = num_group - no_free;
- int i;
- unsigned long portion = DIV_ROUND_UP(to_get, divisor);
- unsigned long got = 0;
- unsigned long got_this_round = 0;
- struct toi_bdev_info *top = chain;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- " Start of loop. To get is %lu. Divisor is %d.",
- to_get, divisor);
- no_free = 0;
-
- /*
- * We're aiming to spread the allocated storage as evenly
- * as possible, but we also want to get all the storage we
- * can off this priority.
- */
- for (i = 0; i < num_group; i++) {
- struct toi_bio_allocator_ops *ops =
- chain->allocator->bio_allocator_ops;
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- " Asking for %lu pages from chain %p.",
- portion, chain);
- got = ops->allocate_storage(chain, portion);
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- " Got %lu pages from allocator %p.",
- got, chain);
- if (!got)
- no_free++;
- got_this_round += got;
- chain = chain->next;
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a "
- "total of %lu pages from %d allocators.",
- got_this_round, divisor - no_free);
-
- raw_pages_allocd += got_this_round;
- to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd :
- 0;
-
- /*
- * If we got anything from chains of this priority and we
- * still have storage to allocate, go over this priority
- * again.
- */
- if (got_this_round && to_get)
- chain = top;
- else
- no_free = 0;
- }
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling "
- "get_main_pool_phys_params");
- /* Now let swap allocator bmap the pages */
- get_main_pool_phys_params();
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. Reserving header.");
- return apply_header_reservation();
-}
-
-void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd)
-{
- int i = 0;
- struct toi_bdev_info *cur_chain = prio_chain_head;
-
- while (cur_chain) {
- cur_chain->pages_used = bkd->pages_used[i];
- cur_chain = cur_chain->next;
- i++;
- }
-}
-
-int toi_bio_chains_debug_info(char *buffer, int size)
-{
- /* Show what we actually used */
- struct toi_bdev_info *cur_chain = prio_chain_head;
- int len = 0;
-
- while (cur_chain) {
- len += scnprintf(buffer + len, size - len, " Used %lu pages "
- "from %s.\n", cur_chain->pages_used,
- cur_chain->name);
- cur_chain = cur_chain->next;
- }
-
- return len;
-}
-
-void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
-{
- struct toi_bdev_info *this = toi_writer_posn.current_chain,
- *cmp = prio_chain_head;
-
- ptr->save.chain = 1;
- while (this != cmp) {
- ptr->save.chain++;
- cmp = cmp->next;
- }
- ptr->save.block = this->blocks.current_offset;
-
- /* Save the raw info internally for quicker access when updating pointers */
- ptr->bdev = this->bdev;
- ptr->block = this->blocks.current_offset << this->bmap_shift;
-}
-
-void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
-{
- int i = ptr->save.chain - 1;
- struct toi_bdev_info *this;
- struct hibernate_extent *hib;
-
- /* Find chain by stored index */
- this = prio_chain_head;
- while (i) {
- this = this->next;
- i--;
- }
- toi_writer_posn.current_chain = this;
-
- /* Restore block */
- this->blocks.current_offset = ptr->save.block;
-
- /* Find current offset from block number */
- hib = this->blocks.first;
-
- while (hib->start > ptr->save.block) {
- hib = hib->next;
- }
-
- this->blocks.last_touched = this->blocks.current_extent = hib;
-}
diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c
deleted file mode 100644
index d18f2751c..000000000
--- a/kernel/power/tuxonice_bio_core.c
+++ /dev/null
@@ -1,1933 +0,0 @@
-/*
- * kernel/power/tuxonice_bio.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains block io functions for TuxOnIce. These are
- * used by the swapwriter and it is planned that they will also
- * be used by the NFSwriter.
- *
- */
-
-#include <linux/blkdev.h>
-#include <linux/syscalls.h>
-#include <linux/suspend.h>
-#include <linux/ctype.h>
-#include <linux/fs_uuid.h>
-#include <linux/mount.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_bio_internal.h"
-
-#define MEMORY_ONLY 1
-#define THROTTLE_WAIT 2
-
-/* #define MEASURE_MUTEX_CONTENTION */
-#ifndef MEASURE_MUTEX_CONTENTION
-#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
-#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
-#else
-unsigned long mutex_times[2][2][NR_CPUS];
-#define my_mutex_lock(index, the_lock) do { \
- int have_mutex; \
- have_mutex = mutex_trylock(the_lock); \
- if (!have_mutex) { \
- mutex_lock(the_lock); \
- mutex_times[index][0][smp_processor_id()]++; \
- } else { \
- mutex_times[index][1][smp_processor_id()]++; \
- }
-
-#define my_mutex_unlock(index, the_lock) \
- mutex_unlock(the_lock); \
-} while (0)
-#endif
-
-static int page_idx, reset_idx;
-
-static int target_outstanding_io = 1024;
-static int max_outstanding_writes, max_outstanding_reads;
-
-static struct page *bio_queue_head, *bio_queue_tail;
-static atomic_t toi_bio_queue_size;
-static DEFINE_SPINLOCK(bio_queue_lock);
-
-static int free_mem_throttle, throughput_throttle;
-int more_readahead = 1;
-static struct page *readahead_list_head, *readahead_list_tail;
-
-static struct page *waiting_on;
-
-static atomic_t toi_io_in_progress, toi_io_done;
-static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
-
-int current_stream;
-/* Not static, so that the allocators can setup and complete
- * writing the header */
-char *toi_writer_buffer;
-int toi_writer_buffer_posn;
-
-static DEFINE_MUTEX(toi_bio_mutex);
-static DEFINE_MUTEX(toi_bio_readahead_mutex);
-
-static struct task_struct *toi_queue_flusher;
-static int toi_bio_queue_flush_pages(int dedicated_thread);
-
-struct toi_module_ops toi_blockwriter_ops;
-
-struct toi_incremental_image_pointer toi_inc_ptr[2][2];
-
-#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \
- atomic_read(&toi_bio_queue_size))
-
-unsigned long raw_pages_allocd, header_pages_reserved;
-
-static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
- int no_readahead);
-
-/**
- * set_free_mem_throttle - set the point where we pause to avoid oom.
- *
- * Initially, this value is zero, but when we first fail to allocate memory,
- * we set it (plus a buffer) and thereafter throttle i/o once that limit is
- * reached.
- **/
-static void set_free_mem_throttle(void)
-{
- int new_throttle = nr_free_buffer_pages() + 256;
-
- if (new_throttle > free_mem_throttle)
- free_mem_throttle = new_throttle;
-}
-
-#define NUM_REASONS 7
-static atomic_t reasons[NUM_REASONS];
-static char *reason_name[NUM_REASONS] = {
- "readahead not ready",
- "bio allocation",
- "synchronous I/O",
- "toi_bio_get_new_page",
- "memory low",
- "readahead buffer allocation",
- "throughput_throttle",
-};
-
-/* User Specified Parameters. */
-unsigned long resume_firstblock;
-dev_t resume_dev_t;
-struct block_device *resume_block_device;
-static atomic_t resume_bdev_open_count;
-
-struct block_device *header_block_device;
-
-/**
- * toi_open_bdev: Open a bdev at resume time.
- *
- * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t
- * (the user can have resume= pointing at a swap partition/file that isn't
- * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the
- * header. It will be from a swap partition that was enabled when we hibernated,
- * but we don't know it's real index until we read that first page.
- * dev_t: The device major/minor.
- * display_errs: Whether to try to do this quietly.
- *
- * We stored a dev_t in the image header. Open the matching device without
- * requiring /dev/<whatever> in most cases and record the details needed
- * to close it later and avoid duplicating work.
- */
-struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
- int display_errs)
-{
- struct block_device *bdev;
- dev_t device = default_device;
- char buf[32];
- int retried = 0;
-
-retry:
- if (uuid) {
- struct fs_info seek;
- strncpy((char *) &seek.uuid, uuid, 16);
- seek.dev_t = 0;
- seek.last_mount_size = 0;
- device = blk_lookup_fs_info(&seek);
- if (!device) {
- device = default_device;
- printk(KERN_DEBUG "Unable to resolve uuid. Falling back"
- " to dev_t.\n");
- } else
- printk(KERN_DEBUG "Resolved uuid to device %s.\n",
- format_dev_t(buf, device));
- }
-
- if (!device) {
- printk(KERN_ERR "TuxOnIce attempting to open a "
- "blank dev_t!\n");
- dump_stack();
- return NULL;
- }
- bdev = toi_open_by_devnum(device);
-
- if (IS_ERR(bdev) || !bdev) {
- if (!retried) {
- retried = 1;
- wait_for_device_probe();
- goto retry;
- }
- if (display_errs)
- toi_early_boot_message(1, TOI_CONTINUE_REQ,
- "Failed to get access to block device "
- "\"%x\" (error %d).\n Maybe you need "
- "to run mknod and/or lvmsetup in an "
- "initrd/ramfs?", device, bdev);
- return ERR_PTR(-EINVAL);
- }
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "TuxOnIce got bdev %p for dev_t %x.",
- bdev, device);
-
- return bdev;
-}
-
-static void toi_bio_reserve_header_space(unsigned long request)
-{
- header_pages_reserved = request;
-}
-
-/**
- * do_bio_wait - wait for some TuxOnIce I/O to complete
- * @reason: The array index of the reason we're waiting.
- *
- * Wait for a particular page of I/O if we're after a particular page.
- * If we're not after a particular page, wait instead for all in flight
- * I/O to be completed or for us to have enough free memory to be able
- * to submit more I/O.
- *
- * If we wait, we also update our statistics regarding why we waited.
- **/
-static void do_bio_wait(int reason)
-{
- struct page *was_waiting_on = waiting_on;
-
- /* On SMP, waiting_on can be reset, so we make a copy */
- if (was_waiting_on) {
- wait_on_page_locked(was_waiting_on);
- atomic_inc(&reasons[reason]);
- } else {
- atomic_inc(&reasons[reason]);
-
- wait_event(num_in_progress_wait,
- !atomic_read(&toi_io_in_progress) ||
- nr_free_buffer_pages() > free_mem_throttle);
- }
-}
-
-/**
- * throttle_if_needed - wait for I/O completion if throttle points are reached
- * @flags: What to check and how to act.
- *
- * Check whether we need to wait for some I/O to complete. We always check
- * whether we have enough memory available, but may also (depending upon
- * @reason) check if the throughput throttle limit has been reached.
- **/
-static int throttle_if_needed(int flags)
-{
- int free_pages = nr_free_buffer_pages();
-
- /* Getting low on memory and I/O is in progress? */
- while (unlikely(free_pages < free_mem_throttle) &&
- atomic_read(&toi_io_in_progress) &&
- !test_result_state(TOI_ABORTED)) {
- if (!(flags & THROTTLE_WAIT))
- return -ENOMEM;
- do_bio_wait(4);
- free_pages = nr_free_buffer_pages();
- }
-
- while (!(flags & MEMORY_ONLY) && throughput_throttle &&
- TOTAL_OUTSTANDING_IO >= throughput_throttle &&
- !test_result_state(TOI_ABORTED)) {
- int result = toi_bio_queue_flush_pages(0);
- if (result)
- return result;
- atomic_inc(&reasons[6]);
- wait_event(num_in_progress_wait,
- !atomic_read(&toi_io_in_progress) ||
- TOTAL_OUTSTANDING_IO < throughput_throttle);
- }
-
- return 0;
-}
-
-/**
- * update_throughput_throttle - update the raw throughput throttle
- * @jif_index: The number of times this function has been called.
- *
- * This function is called four times per second by the core, and used to limit
- * the amount of I/O we submit at once, spreading out our waiting through the
- * whole job and letting userui get an opportunity to do its work.
- *
- * We don't start limiting I/O until 1/4s has gone so that we get a
- * decent sample for our initial limit, and keep updating it because
- * throughput may vary (on rotating media, eg) with our block number.
- *
- * We throttle to 1/10s worth of I/O.
- **/
-static void update_throughput_throttle(int jif_index)
-{
- int done = atomic_read(&toi_io_done);
- throughput_throttle = done * 2 / 5 / jif_index;
-}
-
-/**
- * toi_finish_all_io - wait for all outstanding i/o to complete
- *
- * Flush any queued but unsubmitted I/O and wait for it all to complete.
- **/
-static int toi_finish_all_io(void)
-{
- int result = toi_bio_queue_flush_pages(0);
- toi_bio_queue_flusher_should_finish = 1;
- wake_up(&toi_io_queue_flusher);
- wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO);
- return result;
-}
-
-/**
- * toi_end_bio - bio completion function.
- * @bio: bio that has completed.
- * @err: Error value. Yes, like end_swap_bio_read, we ignore it.
- *
- * Function called by the block driver from interrupt context when I/O is
- * completed. If we were writing the page, we want to free it and will have
- * set bio->bi_private to the parameter we should use in telling the page
- * allocation accounting code what the page was allocated for. If we're
- * reading the page, it will be in the singly linked list made from
- * page->private pointers.
- **/
-static void toi_end_bio(struct bio *bio, int err)
-{
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
-
- unlock_page(page);
- bio_put(bio);
-
- if (waiting_on == page)
- waiting_on = NULL;
-
- put_page(page);
-
- if (bio->bi_private)
- toi__free_page((int) ((unsigned long) bio->bi_private) , page);
-
- bio_put(bio);
-
- atomic_dec(&toi_io_in_progress);
- atomic_inc(&toi_io_done);
-
- wake_up(&num_in_progress_wait);
-}
-
-/**
- * submit - submit BIO request
- * @writing: READ or WRITE.
- * @dev: The block device we're using.
- * @first_block: The first sector we're using.
- * @page: The page being used for I/O.
- * @free_group: If writing, the group that was used in allocating the page
- * and which will be used in freeing the page from the completion
- * routine.
- *
- * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the
- * textbook - allocate and initialize the bio. If we're writing, make sure
- * the page is marked as dirty. Then submit it and carry on."
- *
- * If we're just testing the speed of our own code, we fake having done all
- * the hard work and all toi_end_bio immediately.
- **/
-static int submit(int writing, struct block_device *dev, sector_t first_block,
- struct page *page, int free_group)
-{
- struct bio *bio = NULL;
- int cur_outstanding_io, result;
-
- /*
- * Shouldn't throttle if reading - can deadlock in the single
- * threaded case as pages are only freed when we use the
- * readahead.
- */
- if (writing) {
- result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT);
- if (result)
- return result;
- }
-
- while (!bio) {
- bio = bio_alloc(TOI_ATOMIC_GFP, 1);
- if (!bio) {
- set_free_mem_throttle();
- do_bio_wait(1);
- }
- }
-
- bio->bi_bdev = dev;
- bio->bi_iter.bi_sector = first_block;
- bio->bi_private = (void *) ((unsigned long) free_group);
- bio->bi_end_io = toi_end_bio;
- bio->bi_flags |= (1 << BIO_TOI);
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n",
- (unsigned long long) first_block);
- bio_put(bio);
- return -EFAULT;
- }
-
- bio_get(bio);
-
- cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
- if (writing) {
- if (cur_outstanding_io > max_outstanding_writes)
- max_outstanding_writes = cur_outstanding_io;
- } else {
- if (cur_outstanding_io > max_outstanding_reads)
- max_outstanding_reads = cur_outstanding_io;
- }
-
- /* Still read the header! */
- if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) {
- /* Fake having done the hard work */
- set_bit(BIO_UPTODATE, &bio->bi_flags);
- toi_end_bio(bio, 0);
- } else
- submit_bio(writing | REQ_SYNC, bio);
-
- return 0;
-}
-
-/**
- * toi_do_io: Prepare to do some i/o on a page and submit or batch it.
- *
- * @writing: Whether reading or writing.
- * @bdev: The block device which we're using.
- * @block0: The first sector we're reading or writing.
- * @page: The page on which I/O is being done.
- * @readahead_index: If doing readahead, the index (reset this flag when done).
- * @syncio: Whether the i/o is being done synchronously.
- *
- * Prepare and start a read or write operation.
- *
- * Note that we always work with our own page. If writing, we might be given a
- * compression buffer that will immediately be used to start compressing the
- * next page. For reading, we do readahead and therefore don't know the final
- * address where the data needs to go.
- **/
-int toi_do_io(int writing, struct block_device *bdev, long block0,
- struct page *page, int is_readahead, int syncio, int free_group)
-{
- page->private = 0;
-
- /* Do here so we don't race against toi_bio_get_next_page_read */
- lock_page(page);
-
- if (is_readahead) {
- if (readahead_list_head)
- readahead_list_tail->private = (unsigned long) page;
- else
- readahead_list_head = page;
-
- readahead_list_tail = page;
- }
-
- /* Done before submitting to avoid races. */
- if (syncio)
- waiting_on = page;
-
- /* Submit the page */
- get_page(page);
-
- if (submit(writing, bdev, block0, page, free_group))
- return -EFAULT;
-
- if (syncio)
- do_bio_wait(2);
-
- return 0;
-}
-
-/**
- * toi_bdev_page_io - simpler interface to do directly i/o on a single page
- * @writing: Whether reading or writing.
- * @bdev: Block device on which we're operating.
- * @pos: Sector at which page to read or write starts.
- * @page: Page to be read/written.
- *
- * A simple interface to submit a page of I/O and wait for its completion.
- * The caller must free the page used.
- **/
-static int toi_bdev_page_io(int writing, struct block_device *bdev,
- long pos, struct page *page)
-{
- return toi_do_io(writing, bdev, pos, page, 0, 1, 0);
-}
-
-/**
- * toi_bio_memory_needed - report the amount of memory needed for block i/o
- *
- * We want to have at least enough memory so as to have target_outstanding_io
- * or more transactions on the fly at once. If we can do more, fine.
- **/
-static int toi_bio_memory_needed(void)
-{
- return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) +
- sizeof(struct bio));
-}
-
-/**
- * toi_bio_print_debug_stats - put out debugging info in the buffer provided
- * @buffer: A buffer of size @size into which text should be placed.
- * @size: The size of @buffer.
- *
- * Fill a buffer with debugging info. This is used for both our debug_info sysfs
- * entry and for recording the same info in dmesg.
- **/
-static int toi_bio_print_debug_stats(char *buffer, int size)
-{
- int len = 0;
-
- if (toiActiveAllocator != &toi_blockwriter_ops) {
- len = scnprintf(buffer, size,
- "- Block I/O inactive.\n");
- return len;
- }
-
- len = scnprintf(buffer, size, "- Block I/O active.\n");
-
- len += toi_bio_chains_debug_info(buffer + len, size - len);
-
- len += scnprintf(buffer + len, size - len,
- "- Max outstanding reads %d. Max writes %d.\n",
- max_outstanding_reads, max_outstanding_writes);
-
- len += scnprintf(buffer + len, size - len,
- " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
- target_outstanding_io,
- PAGE_SIZE, (unsigned int) sizeof(struct request),
- (unsigned int) sizeof(struct bio), toi_bio_memory_needed());
-
-#ifdef MEASURE_MUTEX_CONTENTION
- {
- int i;
-
- len += scnprintf(buffer + len, size - len,
- " Mutex contention while reading:\n Contended Free\n");
-
- for_each_online_cpu(i)
- len += scnprintf(buffer + len, size - len,
- " %9lu %9lu\n",
- mutex_times[0][0][i], mutex_times[0][1][i]);
-
- len += scnprintf(buffer + len, size - len,
- " Mutex contention while writing:\n Contended Free\n");
-
- for_each_online_cpu(i)
- len += scnprintf(buffer + len, size - len,
- " %9lu %9lu\n",
- mutex_times[1][0][i], mutex_times[1][1][i]);
-
- }
-#endif
-
- return len + scnprintf(buffer + len, size - len,
- " Free mem throttle point reached %d.\n", free_mem_throttle);
-}
-
-static int total_header_bytes;
-static int unowned;
-
-/*
- * debug_broken_header - report a header that outgrew its reservation
- *
- * Logs the per-module header usage, the running header totals and the
- * block chains at KERN_DEBUG, then aborts the hibernation cycle with
- * TOI_HEADER_TOO_BIG.
- */
-void debug_broken_header(void)
-{
-	printk(KERN_DEBUG "Image header too big for size allocated!\n");
-
-	/* What each module asked for and used. */
-	print_toi_header_storage_for_modules();
-
-	/* Space taken by everything that is not a module. */
-	printk(KERN_DEBUG "Page flags : %d.\n",
-			toi_pageflags_space_needed());
-	printk(KERN_DEBUG "toi_header : %zu.\n",
-			sizeof(struct toi_header));
-	printk(KERN_DEBUG "Total unowned : %d.\n",
-			unowned);
-
-	/* Totals so far versus what is needed now. */
-	printk(KERN_DEBUG "Total used : %d (%ld pages).\n",
-			total_header_bytes,
-			DIV_ROUND_UP(total_header_bytes, PAGE_SIZE));
-	printk(KERN_DEBUG "Space needed now : %ld.\n",
-			get_header_storage_needed());
-
-	dump_block_chains();
-	abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small.");
-}
-
-/*
- * toi_bio_update_previous_inc_img_ptr - link the previous pageset to this one
- * @stream: Index into toi_inc_ptr of the stream being written.
- *
- * Reads the block recorded in the stream's "previous" pointer slot, overwrites
- * the start of that block with the "current" slot's save data and writes the
- * block back.
- *
- * Returns 0 on success, -ENOMEM if no bounce page could be allocated, or the
- * error returned by toi_do_io().
- */
-static int toi_bio_update_previous_inc_img_ptr(int stream)
-{
-	struct toi_incremental_image_pointer *prev = &toi_inc_ptr[stream][0];
-	struct toi_incremental_image_pointer *this = &toi_inc_ptr[stream][1];
-	char *buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP);
-	struct page *page;
-	int result;
-
-	/* We're at the start of writing a pageset. Memory should not be that scarce. */
-	if (!buffer)
-		return -ENOMEM;
-
-	page = virt_to_page(buffer);
-
-	result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0);
-	if (!result) {
-		memcpy(buffer, (char *) this, sizeof(this->save));
-		result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12);
-	}
-
-	/*
-	 * A successfully submitted write frees the page asynchronously on
-	 * completion, so only the failure paths release it here.
-	 */
-	if (result)
-		toi__free_page(12, page);
-
-	return result;
-}
-
-/**
- * toi_write_init_incremental - incremental image part of setting up to write
- * @stream: Index into toi_inc_ptr of the stream about to be written.
- *
- * Returns 0 on success or the first error encountered.
- */
-static int toi_write_init_incremental(int stream)
-{
-	struct toi_incremental_image_pointer *prev = &toi_inc_ptr[stream][0];
-	struct toi_incremental_image_pointer *cur = &toi_inc_ptr[stream][1];
-	int result;
-
-	/* Remember the location of this block so we can link to it. */
-	toi_bio_store_inc_image_ptr(cur);
-
-	/* Point the last pageset with the same stream number at this one. */
-	result = toi_bio_update_previous_inc_img_ptr(stream);
-	if (result)
-		return result;
-
-	/* What was current becomes previous... */
-	memcpy(prev, cur, sizeof(*cur));
-
-	/* ...and this pageset starts with a blank pointer. */
-	memset(cur, 0, sizeof(*cur));
-	result = toi_rw_buffer(WRITE, (char *) cur, sizeof(*cur), 0);
-	if (result)
-		return result;
-
-	/* Serialise extent chains if this is an incremental pageset */
-	return toi_serialise_extent_chains();
-}
-
-/**
- * toi_read_init_incremental - incremental image part of setting up to read
- * @stream: Index into toi_inc_ptr of the stream about to be read.
- *
- * Returns 0 on success, or the error from reading the pointer or loading
- * the extent chains.
- */
-static int toi_read_init_incremental(int stream)
-{
-	struct toi_incremental_image_pointer *next = &toi_inc_ptr[stream][1];
-	int result;
-
-	/* Set our position to the start of the next pageset */
-	toi_bio_restore_inc_image_ptr(next);
-
-	/* Read the start of the next incremental pageset (if any) */
-	result = toi_rw_buffer(READ, (char *) next, sizeof(*next), 0);
-	if (result)
-		return result;
-
-	return toi_load_extent_chains();
-}
-
-/**
- * toi_rw_init - prepare to read or write a stream in the image
- * @writing: Whether reading or writing.
- * @stream_number: Section of the image being processed.
- *
- * Prepare to read or write a section ('stream') in the image: position the
- * extent state, reset the page index bookkeeping, make sure the writer buffer
- * exists and, when an image is being kept (TOI_KEPT_IMAGE), run the
- * incremental-image setup for the stream.
- *
- * Returns 0 on success, -ENOMEM if the writer buffer could not be allocated,
- * or the error from the incremental setup.
- **/
-static int toi_rw_init(int writing, int stream_number)
-{
-	if (stream_number)
-		toi_extent_state_restore(stream_number);
-	else
-		toi_extent_state_goto_start();
-
-	if (writing) {
-		reset_idx = 0;
-		/* current_stream still holds the previous stream at this point. */
-		if (!current_stream)
-			page_idx = 0;
-	} else {
-		reset_idx = 1;
-	}
-
-	atomic_set(&toi_io_done, 0);
-	if (!toi_writer_buffer)
-		toi_writer_buffer = (char *) toi_get_zeroed_page(11,
-				TOI_ATOMIC_GFP);
-	/* A reader starts "at the end" of the buffer so the first read fetches a page. */
-	toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
-
-	current_stream = stream_number;
-
-	more_readahead = 1;
-
-	/*
-	 * The incremental setup below goes through toi_rw_buffer(), which
-	 * copies to/from toi_writer_buffer. Bail out before that if the
-	 * allocation failed rather than dereferencing a NULL buffer.
-	 */
-	if (!toi_writer_buffer)
-		return -ENOMEM;
-
-	if (test_result_state(TOI_KEPT_IMAGE))
-		return writing ? toi_write_init_incremental(stream_number) :
-				 toi_read_init_incremental(stream_number);
-
-	return 0;
-}
-
-/**
- * toi_bio_queue_write - queue a page for writing
- * @full_buffer: Address of the pointer to the filled page. The pointer is
- *		 cleared; the page is handed to the queue.
- *
- * Append the page to the singly linked write queue (chained through
- * page->private) under bio_queue_lock, then wake the queue flusher. The
- * actual submission happens in toi_bio_queue_flush_pages().
- **/
-static void toi_bio_queue_write(char **full_buffer)
-{
-	struct page *queued = virt_to_page(*full_buffer);
-	unsigned long flags;
-
-	/* The caller no longer owns the buffer. */
-	*full_buffer = NULL;
-
-	/* The new entry is always the end of the list. */
-	queued->private = 0;
-
-	spin_lock_irqsave(&bio_queue_lock, flags);
-
-	if (bio_queue_head)
-		bio_queue_tail->private = (unsigned long) queued;
-	else
-		bio_queue_head = queued;
-	bio_queue_tail = queued;
-
-	atomic_inc(&toi_bio_queue_size);
-
-	spin_unlock_irqrestore(&bio_queue_lock, flags);
-
-	wake_up(&toi_io_queue_flusher);
-}
-
-/**
- * toi_rw_cleanup - Cleanup after i/o.
- * @writing: Whether we were reading or writing.
- *
- * Flush all I/O and clean everything up after reading or writing a
- * section of the image.
- *
- * Returns the error from flushing the write queue if there was one (in
- * which case the remaining cleanup is skipped), otherwise the result of
- * toi_finish_all_io().
- **/
-static int toi_rw_cleanup(int writing)
-{
- int i, result = 0;
-
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup.");
- if (writing) {
- /* Queue the partially filled last page, unless the cycle was aborted. */
- if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
- toi_bio_queue_write(&toi_writer_buffer);
-
- /* Drain the write queue; stop at the first error. */
- while (bio_queue_head && !result)
- result = toi_bio_queue_flush_pages(0);
-
- if (result)
- return result;
-
- /* Save the extent position reached at the end of this stream. */
- if (current_stream == 2)
- toi_extent_state_save(1);
- else if (current_stream == 1)
- toi_extent_state_save(3);
- }
-
- result = toi_finish_all_io();
-
- /* Free any readahead pages that were never consumed. */
- while (readahead_list_head) {
- void *next = (void *) readahead_list_head->private;
- toi__free_page(12, readahead_list_head);
- readahead_list_head = next;
- }
-
- readahead_list_tail = NULL;
-
- /* The wait statistics below are only reported for non-zero streams. */
- if (!current_stream)
- return result;
-
- /* Report and reset the per-reason wait counters. */
- for (i = 0; i < NUM_REASONS; i++) {
- if (!atomic_read(&reasons[i]))
- continue;
- printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
- reason_name[i], atomic_read(&reasons[i]));
- atomic_set(&reasons[i], 0);
- }
-
- current_stream = 0;
- return result;
-}
-
-/**
- * toi_start_one_readahead - start one page of readahead
- * @dedicated_thread: Is this a thread dedicated to doing readahead?
- *
- * Start one new page of readahead. If this is being called by a thread
- * whose only job is to submit readahead, don't quit because we failed
- * to allocate a page.
- *
- * Returns 0 on success, -ENOMEM if a non-dedicated caller fails to get a
- * page twice in a row, or the error from throttle_if_needed() or
- * toi_bio_rw_page() (-ENOSPC from the latter is reported as "Last
- * readahead page submitted").
- **/
-static int toi_start_one_readahead(int dedicated_thread)
-{
- char *buffer = NULL;
- int oom = 0, result;
-
- result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
- if (result) {
- printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result);
- return result;
- }
-
- mutex_lock(&toi_bio_readahead_mutex);
-
- /* Keep trying until a page is available (or we are allowed to give up). */
- while (!buffer) {
- buffer = (char *) toi_get_zeroed_page(12,
- TOI_ATOMIC_GFP);
- if (!buffer) {
- /* Second consecutive failure: only a dedicated thread keeps waiting. */
- if (oom && !dedicated_thread) {
- mutex_unlock(&toi_bio_readahead_mutex);
- /* NOTE(review): result is always 0 here, so the %d is not informative. */
- printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result);
- return -ENOMEM;
- }
-
- oom = 1;
- set_free_mem_throttle();
- do_bio_wait(5);
- }
- }
-
- result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
- if (result) {
- printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result);
- }
- /*
-  * The page is only released here for -ENOSPC.
-  * NOTE(review): presumably toi_bio_rw_page() disposes of the page on
-  * other errors - confirm, otherwise it leaks.
-  */
- if (result == -ENOSPC)
- toi__free_page(12, virt_to_page(buffer));
- mutex_unlock(&toi_bio_readahead_mutex);
- if (result) {
- if (result == -ENOSPC)
- toi_message(TOI_BIO, TOI_VERBOSE, 0,
- "Last readahead page submitted.");
- else
- printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n",
- result);
- }
- return result;
-}
-
-/**
- * toi_start_new_readahead - start new readahead
- * @dedicated_thread: Are we dedicated to this task?
- *
- * Submit readahead for image pages, one page at a time.
- *
- * A dedicated thread keeps submitting until there is no more readahead.
- * Any other caller tops up the queue and stops once there is no more
- * readahead, it has submitted target_outstanding_io pages itself, or the
- * amount of I/O in progress reaches target_outstanding_io.
- *
- * Running out of memory or image (-ENOMEM / -ENOSPC) is not treated as an
- * error and yields 0; any other failure is logged and returned.
- *
- * No mutex needed because this is only ever called by the first cpu.
- **/
-static int toi_start_new_readahead(int dedicated_thread)
-{
-	int last_result, num_submitted = 0;
-
-	/* Start a new readahead? */
-	if (!more_readahead)
-		return 0;
-
-	for (;;) {
-		last_result = toi_start_one_readahead(dedicated_thread);
-
-		if (last_result == -ENOMEM || last_result == -ENOSPC)
-			return 0;
-
-		if (last_result) {
-			printk(KERN_DEBUG
-				"Begin read chunk returned %d.\n",
-				last_result);
-			break;
-		}
-
-		num_submitted++;
-
-		if (!more_readahead)
-			break;
-
-		/* Dedicated threads are not bound by the outstanding I/O target. */
-		if (dedicated_thread)
-			continue;
-
-		if (num_submitted >= target_outstanding_io ||
-		    atomic_read(&toi_io_in_progress) >= target_outstanding_io)
-			break;
-	}
-
-	return last_result;
-}
-
-/**
- * bio_io_flusher - run the dedicated I/O routine for the current direction
- * @writing: Whether we're writing the image.
- *
- * When writing, act as the dedicated write-queue flusher; when reading,
- * act as the dedicated readahead submitter. Returns that routine's result.
- **/
-static int bio_io_flusher(int writing)
-{
-	return writing ? toi_bio_queue_flush_pages(1) :
-			 toi_start_new_readahead(1);
-}
-
-/**
- * toi_bio_get_next_page_read - read a disk page, perhaps with readahead
- * @no_readahead: Whether we can use readahead
- *
- * Read a page from disk, submitting readahead and cleaning up finished i/o
- * while we wait for the page we're after.
- *
- * The page at the head of the readahead list is copied into
- * toi_writer_buffer and then freed.
- *
- * Returns 0 on success, -ENOSPC when there is nothing left to read, or
- * -EIO if a read could not be started.
- **/
-static int toi_bio_get_next_page_read(int no_readahead)
-{
- char *virt;
- struct page *old_readahead_list_head;
-
- /*
- * When reading the second page of the header, we have to
- * delay submitting the read until after we've gotten the
- * extents out of the first page.
- */
- if (unlikely(no_readahead)) {
- int result = toi_start_one_readahead(0);
- if (result) {
- printk(KERN_EMERG "No readahead and toi_start_one_readahead "
- "returned non-zero.\n");
- return -EIO;
- }
- }
-
- if (unlikely(!readahead_list_head)) {
- /*
- * If the last page finishes exactly on the page
- * boundary, we will be called one extra time and
- * have no data to return. In this case, we should
- * not BUG(), like we used to!
- */
- if (!more_readahead) {
- printk(KERN_EMERG "No more readahead.\n");
- return -ENOSPC;
- }
- /* Nothing queued yet: submit a read for the page we need. */
- if (unlikely(toi_start_one_readahead(0))) {
- printk(KERN_EMERG "No readahead and "
- "toi_start_one_readahead returned non-zero.\n");
- return -EIO;
- }
- }
-
- /* A locked page is treated as still in flight: wait for it first. */
- if (PageLocked(readahead_list_head)) {
- waiting_on = readahead_list_head;
- do_bio_wait(0);
- }
-
- virt = page_address(readahead_list_head);
- memcpy(toi_writer_buffer, virt, PAGE_SIZE);
-
- /* Unlink the consumed page under the readahead mutex, then free it. */
- mutex_lock(&toi_bio_readahead_mutex);
- old_readahead_list_head = readahead_list_head;
- readahead_list_head = (struct page *) readahead_list_head->private;
- mutex_unlock(&toi_bio_readahead_mutex);
- toi__free_page(12, old_readahead_list_head);
- return 0;
-}
-
-/**
- * toi_bio_queue_flush_pages - flush the queue of pages queued for writing
- * @dedicated_thread: Whether we're a dedicated thread
- *
- * Flush the queue of pages ready to be written to disk.
- *
- * If we're a dedicated thread, stay in here until told to leave,
- * sleeping in wait_event.
- *
- * The first thread is normally the only one to come in here. Another
- * thread can enter this routine too, though, via throttle_if_needed.
- * Since that's the case, we must be careful to only have one thread
- * doing this work at a time. Otherwise we have a race and could save
- * pages out of order.
- *
- * If an error occurs, free all remaining pages without submitting them
- * for I/O.
- *
- * Returns 0 if another thread is already flushing or everything was
- * submitted, otherwise the first error from toi_bio_rw_page().
- **/
-
-int toi_bio_queue_flush_pages(int dedicated_thread)
-{
- unsigned long flags;
- int result = 0;
- /* Serialises flushers; a second caller simply returns. */
- static DEFINE_MUTEX(busy);
-
- if (!mutex_trylock(&busy))
- return 0;
-
-top:
- spin_lock_irqsave(&bio_queue_lock, flags);
- while (bio_queue_head) {
- /* Pop the head of the queue (linked through page->private). */
- struct page *page = bio_queue_head;
- bio_queue_head = (struct page *) page->private;
- if (bio_queue_tail == page)
- bio_queue_tail = NULL;
- atomic_dec(&toi_bio_queue_size);
- /* Submit with the spinlock dropped so producers can keep queueing. */
- spin_unlock_irqrestore(&bio_queue_lock, flags);
-
- /* Don't generate more error messages if already had one */
- if (!result)
- result = toi_bio_rw_page(WRITE, page, 0, 11);
- /*
- * If writing the page failed, don't drop out.
- * Flush the rest of the queue too.
- */
- if (result)
- toi__free_page(11 , page);
- spin_lock_irqsave(&bio_queue_lock, flags);
- }
- spin_unlock_irqrestore(&bio_queue_lock, flags);
-
- if (dedicated_thread) {
- /* Sleep until there is more to write or we are asked to finish. */
- wait_event(toi_io_queue_flusher, bio_queue_head ||
- toi_bio_queue_flusher_should_finish);
- if (likely(!toi_bio_queue_flusher_should_finish))
- goto top;
- /* Acknowledge the request to finish. */
- toi_bio_queue_flusher_should_finish = 0;
- }
-
- mutex_unlock(&busy);
- return result;
-}
-
-/**
- * toi_bio_get_new_page - get a new page for I/O
- * @full_buffer: Address of the pointer to fill in with a zeroed page.
- *
- * Throttle first, then retry the allocation until it succeeds, waiting
- * for I/O between attempts. Nothing is allocated if *@full_buffer is
- * already set.
- *
- * Returns 0 on success or the error from throttle_if_needed().
- **/
-static int toi_bio_get_new_page(char **full_buffer)
-{
-	int throttled = throttle_if_needed(THROTTLE_WAIT);
-
-	if (throttled)
-		return throttled;
-
-	while (!*full_buffer) {
-		*full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
-		if (*full_buffer)
-			break;
-
-		/* Out of memory: note the throttle point and let some I/O complete. */
-		set_free_mem_throttle();
-		do_bio_wait(3);
-	}
-
-	return 0;
-}
-
-/**
- * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O
- * @writing: Bool - whether writing (or reading).
- * @buffer: The start of the buffer to write or fill.
- * @buffer_size: The size of the buffer to write or fill.
- * @no_readahead: Don't try to start readhead (when getting extents).
- *
- * Data is staged through toi_writer_buffer, with toi_writer_buffer_posn
- * tracking the current offset in that page. Whenever the page is used up
- * it is either queued for writing and replaced, or refilled from the next
- * page read.
- *
- * Returns 0 on success or the error from fetching/allocating the next page.
- **/
-static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
- int no_readahead)
-{
- int bytes_left = buffer_size, result = 0;
-
- while (bytes_left) {
- /* Caller's side of the copy, and our side within the staging page. */
- char *source_start = buffer + buffer_size - bytes_left;
- char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
- int capacity = PAGE_SIZE - toi_writer_buffer_posn;
- char *to = writing ? dest_start : source_start;
- char *from = writing ? source_start : dest_start;
-
- /* Everything that remains fits in the current page: done. */
- if (bytes_left <= capacity) {
- memcpy(to, from, bytes_left);
- toi_writer_buffer_posn += bytes_left;
- return 0;
- }
-
- /* Complete this page and start a new one */
- memcpy(to, from, capacity);
- bytes_left -= capacity;
-
- if (!writing) {
- /*
- * Perform actual I/O:
- * read readahead_list_head into toi_writer_buffer
- */
- /* Note: this declaration shadows the function-level result. */
- int result = toi_bio_get_next_page_read(no_readahead);
- if (result && bytes_left) {
- printk("toi_bio_get_next_page_read "
- "returned %d. Expecting to read %d bytes.\n", result, bytes_left);
- return result;
- }
- } else {
- /* Hand the full page to the write queue and get a fresh one. */
- toi_bio_queue_write(&toi_writer_buffer);
- result = toi_bio_get_new_page(&toi_writer_buffer);
- if (result) {
- printk(KERN_ERR "toi_bio_get_new_page returned "
- "%d.\n", result);
- return result;
- }
- }
-
- toi_writer_buffer_posn = 0;
- toi_cond_pause(0, NULL);
- }
-
- return 0;
-}
-
-/**
- * toi_bio_read_page - read a page of the image
- * @pfn: The pfn where the data belongs.
- * @buf_type: Passed to TOI_MAP/TOI_UNMAP to map @buffer_page.
- * @buffer_page: The page containing the (possibly compressed) data.
- * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE).
- *
- * Read a (possibly compressed) page from the image, into buffer_page,
- * returning its pfn and the buffer size.
- *
- * Returns 0 on success, -EIO if readahead could not be started, or
- * -ENODATA if there was nothing (more) to read.
- **/
-static int toi_bio_read_page(unsigned long *pfn, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int result = 0;
- int this_idx;
- char *buffer_virt = TOI_MAP(buf_type, buffer_page);
-
- /*
- * Only call start_new_readahead if we don't have a dedicated thread
- * and we're the queue flusher.
- */
- if (current == toi_queue_flusher && more_readahead &&
- !test_action_state(TOI_NO_READAHEAD)) {
- int result2 = toi_start_new_readahead(0);
- if (result2) {
- printk(KERN_DEBUG "Queue flusher and "
- "toi_start_one_readahead returned non-zero.\n");
- result = -EIO;
- goto out;
- }
- }
-
- my_mutex_lock(0, &toi_bio_mutex);
-
- /*
- * Structure in the image:
- * [page index|destination pfn|page size|page data]
- * buf_size is PAGE_SIZE
- * We can validly find there's nothing to read in a multithreaded
- * situation.
- */
- /* NOTE(review): *buf_size comes from the image and is not range-checked here. */
- if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) ||
- toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
- toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
- toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
- result = -ENODATA;
- goto out_unlock;
- }
-
- if (reset_idx) {
- /* First page after toi_rw_init(READ): adopt the stored index. */
- page_idx = this_idx;
- reset_idx = 0;
- } else {
- page_idx++;
- /* A zero index is treated as "no data"; a mismatch is only logged. */
- if (!this_idx)
- result = -ENODATA;
- else if (page_idx != this_idx)
- printk(KERN_ERR "Got page index %d, expected %d.\n",
- this_idx, page_idx);
- }
-
-out_unlock:
- my_mutex_unlock(0, &toi_bio_mutex);
-out:
- TOI_UNMAP(buf_type, buffer_page);
- return result;
-}
-
-/**
- * toi_bio_write_page - write a page of the image
- * @pfn: The pfn where the data belongs.
- * @buf_type: Passed to TOI_MAP/TOI_UNMAP to map @buffer_page.
- * @buffer_page: The page containing the (possibly compressed) data.
- * @buf_size: The number of bytes on @buffer_page used.
- *
- * Write a (possibly compressed) page to the image from the buffer, together
- * with its index and buffer size.
- *
- * Returns 0 on success (also when filter speed is being tested or the
- * cycle has been aborted, in which case nothing is written), -EIO if
- * buffering the data failed, or the error from flushing the write queue.
- **/
-static int toi_bio_write_page(unsigned long pfn, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- char *buffer_virt;
- int result = 0, result2 = 0;
-
- if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
- return 0;
-
- my_mutex_lock(1, &toi_bio_mutex);
-
- if (test_result_state(TOI_ABORTED)) {
- my_mutex_unlock(1, &toi_bio_mutex);
- return 0;
- }
-
- buffer_virt = TOI_MAP(buf_type, buffer_page);
- page_idx++;
-
- /*
- * Structure in the image:
- * [page index|destination pfn|page size|page data]
- * buf_size is PAGE_SIZE
- */
- if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) ||
- toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
- toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
- toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
- printk(KERN_DEBUG "toi_rw_buffer returned non-zero to "
- "toi_bio_write_page.\n");
- result = -EIO;
- }
-
- TOI_UNMAP(buf_type, buffer_page);
- my_mutex_unlock(1, &toi_bio_mutex);
-
- /* The queue flusher submits queued pages itself, outside toi_bio_mutex. */
- if (current == toi_queue_flusher)
- result2 = toi_bio_queue_flush_pages(0);
-
- /* A buffering error takes precedence over a flush error. */
- return result ? result : result2;
-}
-
-/**
- * _toi_rw_header_chunk - read or write a portion of the image header
- * @writing: Whether reading or writing.
- * @owner: The module for which we're writing.
- * Used for confirming that modules
- * don't use more header space than they asked for.
- * @buffer: Address of the data to write.
- * @buffer_size: Size of the data buffer.
- * @no_readahead: Don't try to start readhead (when getting extents).
- *
- * Perform PAGE_SIZE I/O. Start readahead if needed.
- *
- * Returns 0 on success, @buffer_size (a positive value) if a module writes
- * more header than it requested, or the error from starting readahead or
- * from toi_rw_buffer().
- **/
-static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
- char *buffer, int buffer_size, int no_readahead)
-{
- int result = 0;
-
- if (owner) {
- /* Account the chunk against its owner before doing any I/O. */
- owner->header_used += buffer_size;
- toi_message(TOI_HEADER, TOI_LOW, 1,
- "Header: %s : %d bytes (%d/%d) from offset %d.",
- owner->name,
- buffer_size, owner->header_used,
- owner->header_requested,
- toi_writer_buffer_posn);
- /* Only enforced when writing; nothing is written for this chunk. */
- if (owner->header_used > owner->header_requested && writing) {
- printk(KERN_EMERG "TuxOnIce module %s is using more "
- "header space (%u) than it requested (%u).\n",
- owner->name,
- owner->header_used,
- owner->header_requested);
- return buffer_size;
- }
- } else {
- unowned += buffer_size;
- toi_message(TOI_HEADER, TOI_LOW, 1,
- "Header: (No owner): %d bytes (%d total so far) from "
- "offset %d.", buffer_size, unowned,
- toi_writer_buffer_posn);
- }
-
- if (!writing && !no_readahead && more_readahead) {
- result = toi_start_new_readahead(0);
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead "
- "returned %d.", result);
- }
-
- if (!result) {
- result = toi_rw_buffer(writing, buffer, buffer_size,
- no_readahead);
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned "
- "%d.", result);
- }
-
- /* Counted even if the transfer above failed. */
- total_header_bytes += buffer_size;
- toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning "
- "%d.", result);
- return result;
-}
-
-/*
- * toi_rw_header_chunk - read or write part of the header for @owner.
- *
- * Thin wrapper around _toi_rw_header_chunk().
- * NOTE(review): this passes no_readahead = 1, exactly like
- * toi_rw_header_chunk_noreadahead() - confirm that is intentional.
- */
-static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
- char *buffer, int size)
-{
- return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
-}
-
-/*
- * toi_rw_header_chunk_noreadahead - read or write part of the header for
- * @owner without starting new readahead (no_readahead = 1).
- */
-static int toi_rw_header_chunk_noreadahead(int writing,
- struct toi_module_ops *owner, char *buffer, int size)
-{
- return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
-}
-
-/**
- * toi_bio_storage_needed - header space this module needs, in bytes
- *
- * One int, one page, plus whatever the device info code asks for.
- * (Presumably the int is the saved config and the page the signature
- * page written by toi_bio_write_header_init() - confirm.)
- **/
-static int toi_bio_storage_needed(void)
-{
-	int needed = sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed();
-
-	return needed;
-}
-
-/**
- * toi_bio_save_config_info - save block I/O config to image header
- * @buf: PAGE_SIZE'd buffer into which data should be saved.
- *
- * Stores target_outstanding_io as a single int at the start of @buf and
- * returns the number of bytes used.
- **/
-static int toi_bio_save_config_info(char *buf)
-{
-	*(int *) buf = target_outstanding_io;
-
-	return sizeof(int);
-}
-
-/**
- * toi_bio_load_config_info - restore block I/O config
- * @buf: Data to be reloaded.
- * @size: Size of the buffer saved (not consulted).
- *
- * Restores target_outstanding_io from the int at the start of @buf.
- **/
-static void toi_bio_load_config_info(char *buf, int size)
-{
-	target_outstanding_io = *(int *) buf;
-}
-
-/*
- * close_resume_dev_t - drop a reference to the resume block device
- * @force: Non-zero to discard every reference instead of just one.
- *
- * The device is closed, and resume_block_device cleared, once the open
- * count reaches zero. Does nothing if the device is not open.
- */
-void close_resume_dev_t(int force)
-{
-	if (!resume_block_device)
-		return;
-
-	if (force)
-		atomic_set(&resume_bdev_open_count, 0);
-	else
-		atomic_dec(&resume_bdev_open_count);
-
-	/* Somebody still holds a reference. */
-	if (atomic_read(&resume_bdev_open_count))
-		return;
-
-	toi_close_bdev(resume_block_device);
-	resume_block_device = NULL;
-}
-
-/*
- * open_resume_dev_t - take a reference to the resume block device
- * @force: Non-zero to close any existing handle and start again with a
- *	   count of one.
- * @quiet: Non-zero to suppress the early boot message on failure.
- *
- * Opens resume_dev_t if it is not already open.
- *
- * Returns 0 on success, 1 if the device could not be opened (in which case
- * the open count is reset to zero).
- */
-int open_resume_dev_t(int force, int quiet)
-{
-	if (!force) {
-		atomic_inc(&resume_bdev_open_count);
-	} else {
-		close_resume_dev_t(1);
-		atomic_set(&resume_bdev_open_count, 1);
-	}
-
-	/* Already open: the reference count is all that needed updating. */
-	if (resume_block_device)
-		return 0;
-
-	resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0);
-	if (!IS_ERR(resume_block_device))
-		return 0;
-
-	if (!quiet)
-		toi_early_boot_message(1, TOI_CONTINUE_REQ,
-			"Failed to open device %x, where"
-			" the header should be found.",
-			resume_dev_t);
-
-	/* Never leave an error pointer behind. */
-	resume_block_device = NULL;
-	atomic_set(&resume_bdev_open_count, 0);
-	return 1;
-}
-
-/**
- * toi_bio_initialise - initialise bio code at start of some action
- * @starting_cycle: Whether starting a hibernation cycle, or just reading or
- * writing a sysfs value.
- *
- * Nothing is done unless a cycle is starting and resume_dev_t is set.
- * Otherwise the I/O statistics are reset, the calling task becomes the
- * queue flusher, the resume device is opened and the signature page read.
- *
- * Returns 0 on success or the error from open_resume_dev_t() or
- * get_signature_page().
- **/
-static int toi_bio_initialise(int starting_cycle)
-{
-	int error;
-
-	if (!(starting_cycle && resume_dev_t))
-		return 0;
-
-	current_stream = 0;
-	max_outstanding_reads = 0;
-	max_outstanding_writes = 0;
-	toi_queue_flusher = current;
-
-#ifdef MEASURE_MUTEX_CONTENTION
-	{
-		int rw, state, cpu;
-
-		/* Clear every contention counter for each online cpu. */
-		for (rw = 0; rw < 2; rw++)
-			for (state = 0; state < 2; state++)
-				for_each_online_cpu(cpu)
-					mutex_times[rw][state][cpu] = 0;
-	}
-#endif
-
-	error = open_resume_dev_t(0, 1);
-
-	return error ? error : get_signature_page();
-}
-
-/*
- * raw_to_real - convert raw storage pages into usable data pages
- * @raw: Number of raw pages of storage.
- *
- * Every stored page carries an unsigned long and an int alongside its
- * PAGE_SIZE of data. Work out how many of the raw pages that overhead
- * consumes (rounded up) and return what is left, or 0 if nothing is.
- */
-static unsigned long raw_to_real(unsigned long raw)
-{
-	const unsigned long per_page_meta = sizeof(unsigned long) + sizeof(int);
-	const unsigned long stored_size = PAGE_SIZE + per_page_meta;
-	unsigned long overhead;
-
-	overhead = (raw * per_page_meta + (stored_size + 1)) / stored_size;
-
-	if (raw > overhead)
-		return raw - overhead;
-
-	return 0;
-}
-
-/*
- * toi_bio_storage_available - usable image pages across all bio allocators
- *
- * Sums storage_available() over every enabled BIO_ALLOCATOR_MODULE,
- * subtracts the reserved header pages and converts the remainder from raw
- * to real pages. Returns 0 if the header reservation uses everything.
- */
-static unsigned long toi_bio_storage_available(void)
-{
-	unsigned long total = 0;
-	struct toi_module_ops *mod;
-
-	list_for_each_entry(mod, &toi_modules, module_list) {
-		if (mod->enabled && mod->type == BIO_ALLOCATOR_MODULE) {
-			toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage "
-					"available from %s.", mod->name);
-			total += mod->bio_allocator_ops->storage_available();
-		}
-	}
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu "
-			"pages (%d header pages).", total, header_pages_reserved);
-
-	if (total <= header_pages_reserved)
-		return 0;
-
-	return raw_to_real(total - header_pages_reserved);
-}
-
-/*
- * toi_bio_storage_allocated - usable image pages already allocated
- *
- * The raw pages allocated, less the reserved header pages, converted from
- * raw to real pages. Returns 0 if the header reservation uses everything.
- */
-static unsigned long toi_bio_storage_allocated(void)
-{
-	if (raw_pages_allocd <= header_pages_reserved)
-		return 0;
-
-	return raw_to_real(raw_pages_allocd - header_pages_reserved);
-}
-
-/*
- * toi_bio_noresume_reset - undo read-side state when we are not resuming
- *
- * Part of the image may already have been read. Run the read-side I/O
- * cleanup (its result is not checked) and drop all block device info.
- */
-static void toi_bio_noresume_reset(void)
-{
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset.");
-
-	toi_rw_cleanup(READ);
-
-	free_all_bdev_info();
-}
-
-/**
- * toi_bio_cleanup - cleanup after some action
- * @finishing_cycle: Whether completing a cycle.
- *
- * When a cycle is finishing: free the writer buffer, forget the signature
- * page, close the header device if it is distinct from the resume device,
- * and drop our reference to the resume device.
- **/
-static void toi_bio_cleanup(int finishing_cycle)
-{
-	int separate_header_dev;
-
-	if (!finishing_cycle)
-		return;
-
-	if (toi_writer_buffer) {
-		toi_free_page(11, (unsigned long) toi_writer_buffer);
-		toi_writer_buffer = NULL;
-	}
-
-	forget_signature_page();
-
-	/* Only close the header device when it isn't just the resume device. */
-	separate_header_dev = header_block_device && toi_sig_data &&
-		toi_sig_data->header_dev_t != resume_dev_t;
-	if (separate_header_dev)
-		toi_close_bdev(header_block_device);
-
-	header_block_device = NULL;
-
-	close_resume_dev_t(0);
-}
-
-/*
- * toi_bio_write_header_init - start writing the image header
- *
- * Sets up stream 0 for writing, serialises the extent chains and then
- * stores the (still unmodified) signature page so it can be restored later.
- *
- * Returns 0 on success or the first error encountered.
- */
-static int toi_bio_write_header_init(void)
-{
-	int result;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init");
-
-	/*
-	 * toi_rw_init() can fail (e.g. -ENOMEM when the writer buffer can't
-	 * be allocated). Everything below writes through that buffer, so
-	 * don't carry on if the setup failed.
-	 */
-	result = toi_rw_init(WRITE, 0);
-	if (result)
-		return result;
-
-	toi_writer_buffer_posn = 0;
-
-	/* Info needed to bootstrap goes at the start of the header.
-	 * First we save the positions and devinfo, including the number
-	 * of header pages. Then we save the structs containing data needed
-	 * for reading the header pages back.
-	 * Note that even if header pages take more than one page, when we
-	 * read back the info, we will have restored the location of the
-	 * next header page by the time we go to use it.
-	 */
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains.");
-	result = toi_serialise_extent_chains();
-
-	if (result)
-		return result;
-
-	/*
-	 * Signature page hasn't been modified at this point. Write it in
-	 * the header so we can restore it later.
-	 */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page.");
-	return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops,
-			(char *) toi_cur_sig_page,
-			PAGE_SIZE);
-}
-
-/*
- * toi_bio_write_header_cleanup - finish writing the image header
- *
- * Queues the final, partially filled header page, waits for all I/O,
- * resets the header byte counters and, if the I/O succeeded, marks the
- * signature to say we have an image.
- *
- * Returns 0 on success, or the error from toi_finish_all_io() or
- * toi_bio_mark_have_image().
- */
-static int toi_bio_write_header_cleanup(void)
-{
-	int result;
-
-	if (toi_writer_buffer_posn)
-		toi_bio_queue_write(&toi_writer_buffer);
-
-	result = toi_finish_all_io();
-
-	total_header_bytes = 0;
-	unowned = 0;
-
-	if (result)
-		return result;
-
-	/* Set signature to say we have an image */
-	return toi_bio_mark_have_image();
-}
-
-/*
- * toi_bio_read_header_init()
- *
- * Description:
- * 1. Attempt to read the device specified with resume=.
- * 2. Check the contents of the swap header for our signature.
- * 3. Warn, ignore, reset and/or continue as appropriate.
- * 4. If continuing, read the toi_swap configuration section
- *    of the header and set up block device info so we can read
- *    the rest of the header & image.
- *
- * Returns:
- * May not return if user choose to reboot at a warning.
- * -EINVAL if cannot resume at this time. Booting should continue
- * normally.
- * -ENOMEM if a working page could not be allocated, or the error from
- * opening the header device, reading the first header block or loading
- * the extent chains.
- */
-
-static int toi_bio_read_header_init(void)
-{
-	int result = 0;
-	char buf[32];
-
-	toi_writer_buffer_posn = 0;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init");
-
-	if (!toi_sig_data) {
-		printk(KERN_INFO "toi_bio_read_header_init called when we "
-				"haven't verified there is an image!\n");
-		return -EINVAL;
-	}
-
-	/*
-	 * If the header is not on the resume_swap_dev_t, get the resume device
-	 * first.
-	 */
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.",
-			toi_sig_data->header_dev_t);
-	if (toi_sig_data->have_uuid) {
-		struct fs_info seek;
-		dev_t device;
-
-		/*
-		 * NOTE(review): if header_uuid holds raw UUID bytes (as
-		 * uuid_from_commandline() produces), strncpy stops at the
-		 * first zero byte - confirm and switch to memcpy if so.
-		 */
-		strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16);
-		seek.dev_t = toi_sig_data->header_dev_t;
-		seek.last_mount_size = 0;
-		device = blk_lookup_fs_info(&seek);
-		if (device) {
-			printk("Using dev_t %s, returned by blk_lookup_fs_info.\n",
-					format_dev_t(buf, device));
-			toi_sig_data->header_dev_t = device;
-		}
-	}
-	if (toi_sig_data->header_dev_t != resume_dev_t) {
-		header_block_device = toi_open_bdev(NULL,
-				toi_sig_data->header_dev_t, 1);
-
-		if (IS_ERR(header_block_device)) {
-			/*
-			 * Don't leave an error pointer in the global:
-			 * toi_bio_cleanup() would try to close it.
-			 */
-			result = PTR_ERR(header_block_device);
-			header_block_device = NULL;
-			return result;
-		}
-	} else
-		header_block_device = resume_block_device;
-
-	if (!toi_writer_buffer)
-		toi_writer_buffer = (char *) toi_get_zeroed_page(11,
-				TOI_ATOMIC_GFP);
-	/* The read below targets this page, so it has to exist. */
-	if (!toi_writer_buffer) {
-		printk(KERN_ERR "Failed to allocate memory for the header"
-				" buffer.\n");
-		return -ENOMEM;
-	}
-	more_readahead = 1;
-
-	/*
-	 * Read toi_swap configuration.
-	 * Headerblock size taken into account already.
-	 */
-	result = toi_bio_ops.bdev_page_io(READ, header_block_device,
-			toi_sig_data->first_header_block,
-			virt_to_page((unsigned long) toi_writer_buffer));
-	if (result)
-		return result;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains.");
-	result = toi_load_extent_chains();
-	/* Without the extent chains the rest of the header can't be located. */
-	if (result)
-		return result;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page.");
-	toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
-	if (!toi_orig_sig_page) {
-		printk(KERN_ERR "Failed to allocate memory for the current"
-				" image signature.\n");
-		return -ENOMEM;
-	}
-
-	return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops,
-			(char *) toi_orig_sig_page,
-			PAGE_SIZE);
-}
-
-/*
- * toi_bio_read_header_cleanup - finish reading the image header
- *
- * Runs the read-side I/O cleanup and returns its result.
- */
-static int toi_bio_read_header_cleanup(void)
-{
-	int result;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup.");
-
-	result = toi_rw_cleanup(READ);
-	return result;
-}
-
-/* Lower-cases ASCII letters by setting bit 5; digits are left unchanged. */
-#define TOLOWER(x) ((x) | 0x20)
-
-/*
- * uuid_from_commandline - parse "UUID=<hex>" into 16 raw bytes
- * @commandline: The resume= value to examine.
- *
- * The value must contain exactly 32 hex digits. Dashes may appear among
- * them and are skipped; any other character, or anything following the
- * final digit, makes the value invalid.
- *
- * Returns a kzalloc'd 17 byte buffer holding the 16 UUID bytes (the caller
- * frees it), or NULL if @commandline doesn't start with "UUID=", the value
- * is invalid, or the allocation fails.
- */
-char *uuid_from_commandline(char *commandline)
-{
-	char *uuid, *out, *in;
-	int have_high_nibble = 0;
-
-	if (strncmp(commandline, "UUID=", 5))
-		return NULL;
-
-	uuid = kzalloc(17, GFP_KERNEL);
-	if (!uuid) {
-		printk("Failed to kzalloc UUID text memory.\n");
-		return NULL;
-	}
-
-	out = uuid;
-
-	for (in = commandline + 5; *in && (out - uuid) < 16; in++) {
-		int nibble;
-
-		if (*in == '-')
-			continue;
-
-		if (!isxdigit(*in))
-			break;
-
-		nibble = isdigit(*in) ? *in - '0' : TOLOWER(*in) - 'a' + 10;
-
-		/* Two hex digits make one output byte: high nibble first. */
-		if (have_high_nibble) {
-			*out += nibble;
-			out++;
-		} else {
-			*out = nibble << 4;
-		}
-		have_high_nibble = !have_high_nibble;
-	}
-
-	/* Valid only if all 16 bytes were filled and the input is used up. */
-	if ((out - uuid) >= 16 && !*in)
-		return uuid;
-
-	printk(KERN_DEBUG "Found resume=UUID=, but the value looks "
-			"invalid.\n");
-	kfree(uuid);
-	return NULL;
-}
-
-/*
- * retry_if_fails - run @command, retrying once after device probing
- *
- * Runs @command. If resume_dev_t is still unset afterwards and we have not
- * yet waited for device probing, wait for it and run @command a second
- * time. Relies on a local int named waited_for_device_probe in the calling
- * function. Note that @command is expanded (and may be evaluated) twice.
- */
-#define retry_if_fails(command) \
-do { \
- command; \
- if (!resume_dev_t && !waited_for_device_probe) { \
- wait_for_device_probe(); \
- command; \
- waited_for_device_probe = 1; \
- } \
-} while(0)
-
-/**
- * try_to_open_resume_device: Try to parse and open resume=
- * @commandline: Device specification, with any "swap:" or "file:" prefix
- *		 already stripped.
- * @quiet: Non-zero to suppress messages on failure.
- *
- * Work out resume_dev_t by trying, in order: a scan for an image (empty
- * @commandline only), a UUID lookup, name_to_dev_t() and finally
- * opening/stat'ing @commandline as a path. Then open the device.
- *
- * Returns 0 on success, 1 if no device id could be determined, or the
- * result of open_resume_dev_t().
- */
-static int try_to_open_resume_device(char *commandline, int quiet)
-{
-	struct kstat stat;
-	int error = 0;
-	char *uuid = uuid_from_commandline(commandline);
-	int waited_for_device_probe = 0;
-
-	resume_dev_t = MKDEV(0, 0);
-
-	if (!strlen(commandline))
-		retry_if_fails(toi_bio_scan_for_image(quiet));
-
-	if (uuid) {
-		struct fs_info seek;
-
-		/*
-		 * uuid holds 16 raw bytes, any of which may be zero, so it
-		 * must be copied as binary data rather than as a string.
-		 */
-		memcpy(&seek.uuid, uuid, 16);
-		seek.dev_t = resume_dev_t;
-		seek.last_mount_size = 0;
-		retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek));
-		kfree(uuid);
-	}
-
-	if (!resume_dev_t)
-		retry_if_fails(resume_dev_t = name_to_dev_t(commandline));
-
-	if (!resume_dev_t) {
-		struct file *file = filp_open(commandline,
-				O_RDONLY|O_LARGEFILE, 0);
-
-		/* Only trust stat.rdev if one of the lookups actually succeeded. */
-		if (!IS_ERR(file) && file) {
-			error = vfs_getattr(&file->f_path, &stat);
-			filp_close(file, NULL);
-		} else
-			error = vfs_stat(commandline, &stat);
-		if (!error)
-			resume_dev_t = stat.rdev;
-	}
-
-	if (!resume_dev_t) {
-		if (quiet)
-			return 1;
-
-		if (test_toi_state(TOI_TRYING_TO_RESUME))
-			toi_early_boot_message(1, toi_translate_err_default,
-				"Failed to translate \"%s\" into a device id.\n",
-				commandline);
-		else
-			printk("TuxOnIce: Can't translate \"%s\" into a device "
-				"id yet.\n", commandline);
-		return 1;
-	}
-
-	return open_resume_dev_t(1, quiet);
-}
-
-/*
- * Parse Image Location
- *
- * Attempt to parse a resume= parameter.
- * Swap Writer accepts:
- * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
- *
- * Where:
- * DEVNAME is convertible to a dev_t by name_to_dev_t
- * FIRSTBLOCK is the location of the first block in the swap file
- * (specifying for a swap partition is nonsensical but not prohibited).
- * Data is validated by attempting to read a swap header from the
- * location given. Failure will result in toi_swap refusing to
- * save an image, and a reboot with correct parameters will be
- * necessary.
- *
- * Returns 0 if the location was opened and checked, 1 if the value is not
- * one this allocator handles (or an empty value found nothing), and
- * -EINVAL otherwise.
- */
-static int toi_bio_parse_sig_location(char *commandline,
- int only_allocator, int quiet)
-{
- char *thischar, *devstart, *colon = NULL;
- int signature_found, result = -EINVAL, temp_result = 0;
-
- if (strncmp(commandline, "swap:", 5) &&
- strncmp(commandline, "file:", 5)) {
- /*
- * Failing swap:, we'll take a simple resume=/dev/hda2, or a
- * blank value (scan) but fall through to other allocators
- * if /dev/ or UUID= isn't matched.
- */
- if (strncmp(commandline, "/dev/", 5) &&
- strncmp(commandline, "UUID=", 5) &&
- strlen(commandline))
- return 1;
- } else
- commandline += 5;
-
- devstart = commandline;
- thischar = commandline;
- /* Find the end of DEVNAME (':' or '@'), looking at most 250 chars in. */
- while ((*thischar != ':') && (*thischar != '@') &&
- ((thischar - commandline) < 250) && (*thischar))
- thischar++;
-
- /* Temporarily terminate DEVNAME at the colon; restored below. */
- if (*thischar == ':') {
- colon = thischar;
- *colon = 0;
- thischar++;
- }
-
- /* Advances thischar only; its value is not used after this loop. */
- while ((thischar - commandline) < 250 && *thischar)
- thischar++;
-
- if (colon) {
- unsigned long block;
- temp_result = kstrtoul(colon + 1, 0, &block);
- if (!temp_result)
- resume_firstblock = (int) block;
- } else
- resume_firstblock = 0;
-
- /* Assume the worst until the location has been checked. */
- clear_toi_state(TOI_CAN_HIBERNATE);
- clear_toi_state(TOI_CAN_RESUME);
-
- if (!temp_result)
- temp_result = try_to_open_resume_device(devstart, quiet);
-
- /* Put the caller's string back the way we found it. */
- if (colon)
- *colon = ':';
-
- /* No error if we only scanned */
- if (temp_result)
- return strlen(commandline) ? -EINVAL : 1;
-
- signature_found = toi_bio_image_exists(quiet);
-
- /* Any result other than -1 means the location could be read. */
- if (signature_found != -1) {
- result = 0;
- /*
- * TODO: If only file storage, CAN_HIBERNATE should only be
- * set if file allocator's target is valid.
- */
- set_toi_state(TOI_CAN_HIBERNATE);
- set_toi_state(TOI_CAN_RESUME);
- } else
- if (!quiet)
- printk(KERN_ERR "TuxOnIce: Block I/O: No "
- "signature found at %s.\n", devstart);
-
- return result;
-}
-
-/* Zero the page accounting counters, then free all block device info. */
-static void toi_bio_release_storage(void)
-{
-	raw_pages_allocd = 0;
-	header_pages_reserved = 0;
-	free_all_bdev_info();
-}
-
-/*
- * toi_bio_remove_image - restore the original signature, release storage
- *
- * We don't do a sanity check here: we want to restore the swap
- * whatever version of kernel made the hibernate image.
- *
- * We need to write swap, but swap may not be enabled so
- * we write the device directly.
- *
- * If we don't have a current_signature_page, we didn't
- * read an image header, so don't change anything.
- *
- * Returns the result of toi_bio_restore_original_signature().
- */
-static int toi_bio_remove_image(void)
-{
-	int ret;
-
-	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image.");
-
-	ret = toi_bio_restore_original_signature();
-	toi_bio_release_storage();
-
-	return ret;
-}
-
-/*
- * Function table for the block I/O layer. tuxonice_bio_signature.c, for
- * example, reads and writes the signature page through .bdev_page_io.
- */
-struct toi_bio_ops toi_bio_ops = {
- .bdev_page_io = toi_bdev_page_io,
- .register_storage = toi_register_storage_chain,
- .free_storage = toi_bio_release_storage,
-};
-
-/*
- * sysfs entries for this module: one read/write integer backed by
- * target_outstanding_io. NOTE(review): 0 and 16384 look like the allowed
- * minimum and maximum - confirm against the SYSFS_INT() definition.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io,
- 0, 16384, 0, NULL),
-};
-
-/*
- * Module description handed to toi_register_module() by
- * toi_block_io_load(): identification, lifecycle hooks, the page and
- * header I/O routines, storage management callbacks and the sysfs table.
- */
-struct toi_module_ops toi_blockwriter_ops = {
-	.type					= WRITER_MODULE,
-	.name					= "block i/o",
-	.directory				= "block_io",
-	.module					= THIS_MODULE,
-	.memory_needed				= toi_bio_memory_needed,
-	.print_debug_info			= toi_bio_print_debug_stats,
-	.storage_needed				= toi_bio_storage_needed,
-	.save_config_info			= toi_bio_save_config_info,
-	.load_config_info			= toi_bio_load_config_info,
-	.initialise				= toi_bio_initialise,
-	.cleanup				= toi_bio_cleanup,
-	.post_atomic_restore			= toi_bio_chains_post_atomic,
-
-	.rw_init				= toi_rw_init,
-	.rw_cleanup				= toi_rw_cleanup,
-	.read_page				= toi_bio_read_page,
-	.write_page				= toi_bio_write_page,
-	.rw_header_chunk			= toi_rw_header_chunk,
-	.rw_header_chunk_noreadahead		= toi_rw_header_chunk_noreadahead,
-	.io_flusher				= bio_io_flusher,
-	.update_throughput_throttle		= update_throughput_throttle,
-	.finish_all_io				= toi_finish_all_io,
-
-	.noresume_reset				= toi_bio_noresume_reset,
-	.storage_available			= toi_bio_storage_available,
-	.storage_allocated			= toi_bio_storage_allocated,
-	.reserve_header_space			= toi_bio_reserve_header_space,
-	.allocate_storage			= toi_bio_allocate_storage,
-	.free_unused_storage			= toi_bio_free_unused_storage,
-	.image_exists				= toi_bio_image_exists,
-	.mark_resume_attempted			= toi_bio_mark_resume_attempted,
-	.write_header_init			= toi_bio_write_header_init,
-	.write_header_cleanup			= toi_bio_write_header_cleanup,
-	.read_header_init			= toi_bio_read_header_init,
-	.read_header_cleanup			= toi_bio_read_header_cleanup,
-	.get_header_version			= toi_bio_get_header_version,
-	.remove_image				= toi_bio_remove_image,
-	.parse_sig_location			= toi_bio_parse_sig_location,
-
-	.sysfs_data				= sysfs_params,
-	/* Count elements from the array itself so it tracks the array's type. */
-	.num_sysfs_entries			= sizeof(sysfs_params) /
-						  sizeof(sysfs_params[0]),
-};
-
-/**
- * toi_block_io_load - load time routine for block I/O module
- *
- * Passes toi_blockwriter_ops (which also carries the sysfs_params table)
- * to toi_register_module() and returns that function's result. Hooked in
- * with late_initcall() below.
- **/
-static __init int toi_block_io_load(void)
-{
- return toi_register_module(&toi_blockwriter_ops);
-}
-
-late_initcall(toi_block_io_load);
diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h
deleted file mode 100644
index cf9211ed9..000000000
--- a/kernel/power/tuxonice_bio_internal.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_internal.h
- *
- * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file contains declarations for functions exported from
- * tuxonice_bio.c, which contains low level io functions.
- */
-
-/*
- * This header defines struct types, so guard it against being included
- * twice in one translation unit.
- */
-#ifndef TUXONICE_BIO_INTERNAL_H
-#define TUXONICE_BIO_INTERNAL_H
-
-/* Extent chains */
-void toi_extent_state_goto_start(void);
-void toi_extent_state_save(int slot);
-int go_next_page(int writing, int section_barrier);
-void toi_extent_state_restore(int slot);
-void free_all_bdev_info(void);
-int devices_of_same_priority(struct toi_bdev_info *this);
-int toi_register_storage_chain(struct toi_bdev_info *new);
-int toi_serialise_extent_chains(void);
-int toi_load_extent_chains(void);
-int toi_bio_rw_page(int writing, struct page *page, int is_readahead,
-		int free_group);
-int toi_bio_restore_original_signature(void);
-int toi_bio_devinfo_storage_needed(void);
-unsigned long get_headerblock(void);
-dev_t get_header_dev_t(void);
-struct block_device *get_header_bdev(void);
-int toi_bio_allocate_storage(unsigned long request);
-void toi_bio_free_unused_storage(void);
-
-/* Signature functions */
-#define HaveImage "HaveImage"
-#define NoImage "TuxOnIce"
-/* Size of the signature fields below; includes the terminating NUL. */
-#define sig_size (sizeof(HaveImage))
-
-/*
- * Binary signature block kept at the start of the signature page
- * (toi_sig_data points at toi_cur_sig_page).
- */
-struct sig_data {
-	char sig[sig_size];
-	/* Set to 1 by toi_bio_mark_have_image(). */
-	int have_image;
-	/* Written by toi_bio_mark_resume_attempted(). */
-	int resumed_before;
-
-	/* Non-zero when header_uuid holds a UUID (16 bytes are copied in). */
-	char have_uuid;
-	char header_uuid[17];
-	dev_t header_dev_t;
-	unsigned long first_header_block;
-
-	/* Repeat the signature to be sure we have a header version */
-	char sig2[sig_size];
-	int header_version;
-};
-
-void forget_signature_page(void);
-int toi_check_for_signature(void);
-int toi_bio_image_exists(int quiet);
-int get_signature_page(void);
-int toi_bio_mark_resume_attempted(int);
-extern char *toi_cur_sig_page;
-extern char *toi_orig_sig_page;
-int toi_bio_mark_have_image(void);
-extern struct sig_data *toi_sig_data;
-extern dev_t resume_dev_t;
-extern struct block_device *resume_block_device;
-extern struct block_device *header_block_device;
-extern unsigned long resume_firstblock;
-
-struct block_device *open_bdev(dev_t device, int display_errs);
-extern int current_stream;
-extern int more_readahead;
-int toi_do_io(int writing, struct block_device *bdev, long block0,
-		struct page *page, int is_readahead, int syncio, int free_group);
-int get_main_pool_phys_params(void);
-
-void toi_close_bdev(struct block_device *bdev);
-struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
-		int display_errs);
-
-extern struct toi_module_ops toi_blockwriter_ops;
-void dump_block_chains(void);
-void debug_broken_header(void);
-extern unsigned long raw_pages_allocd, header_pages_reserved;
-int toi_bio_chains_debug_info(char *buffer, int size);
-void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd);
-int toi_bio_scan_for_image(int quiet);
-int toi_bio_get_header_version(void);
-
-void close_resume_dev_t(int force);
-int open_resume_dev_t(int force, int quiet);
-
-/* Saved part of an incremental image pointer: a block number and a chain. */
-struct toi_incremental_image_pointer_saved_data {
-	unsigned long block;
-	int chain;
-};
-
-/* The saved data above, plus a block device and a block number. */
-struct toi_incremental_image_pointer {
-	struct toi_incremental_image_pointer_saved_data save;
-	struct block_device *bdev;
-	unsigned long block;
-};
-
-void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
-void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
-
-#endif /* TUXONICE_BIO_INTERNAL_H */
diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c
deleted file mode 100644
index ead874f8e..000000000
--- a/kernel/power/tuxonice_bio_signature.c
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * kernel/power/tuxonice_bio_signature.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- */
-
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_bio_internal.h"
-
-struct sig_data *toi_sig_data;
-
-/* Struct of swap header pages */
-
-/*
- * Signature layout written by an older TuxOnIce version.
- * remove_old_signature() opens @device, reads a page at @sector, and picks
- * "SWAP-SPACE" (orig_sig_type == 0) or "SWAPSPACE2" (otherwise) as the
- * swap magic to put back.
- */
-struct old_sig_data {
- dev_t device;
- unsigned long sector;
- int resume_attempted;
- int orig_sig_type;
-};
-
-/* The different ways the signature page's contents are interpreted. */
-union diskpage {
- union swap_header swh; /* swh.magic is the only member used */
- struct sig_data sig_data;
- struct old_sig_data old_sig_data;
-};
-
-/*
- * One pointer-sized value viewed as a typed page pointer, a char pointer
- * or an integer address; the code below casts toi_cur_sig_page to it.
- */
-union p_diskpage {
- union diskpage *pointer;
- char *ptr;
- unsigned long address;
-};
-
-/* Signature page last read by get_signature_page(); NULL when not loaded. */
-char *toi_cur_sig_page;
-/* Preferred over toi_cur_sig_page by toi_bio_restore_original_signature(). */
-char *toi_orig_sig_page;
-/* Updated by toi_check_for_signature() and toi_bio_mark_have_image(). */
-int have_image;
-/* Set when toi_check_for_signature() finds an old-style TuxOnIce signature. */
-int have_old_image;
-
-/*
- * get_signature_page - read the signature page from the resume device
- *
- * Allocates toi_cur_sig_page on first use (pointing toi_sig_data at it),
- * then reads block resume_firstblock of resume_block_device into it.
- *
- * Returns -ENOMEM if the page cannot be allocated, otherwise the result
- * of the page read.
- */
-int get_signature_page(void)
-{
-	if (!toi_cur_sig_page) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0,
-			"Allocating current signature page.");
-		toi_cur_sig_page = (char *) toi_get_zeroed_page(38,
-			TOI_ATOMIC_GFP);
-		if (!toi_cur_sig_page) {
-			printk(KERN_ERR "Failed to allocate memory for the "
-				"current image signature.\n");
-			return -ENOMEM;
-		}
-
-		toi_sig_data = (struct sig_data *) toi_cur_sig_page;
-	}
-
-	/* bd_dev is a dev_t and resume_firstblock an unsigned long. */
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %x,"
-		" sector %lu.",
-		resume_block_device->bd_dev, resume_firstblock);
-
-	return toi_bio_ops.bdev_page_io(READ, resume_block_device,
-		resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-/*
- * forget_signature_page - free the current and original signature pages
- *
- * Either page is skipped if it is not allocated. toi_sig_data is cleared
- * along with the current page.
- */
-void forget_signature_page(void)
-{
-	if (toi_cur_sig_page != NULL) {
-		toi_sig_data = NULL;
-		toi_message(TOI_IO, TOI_VERBOSE, 0,
-			"Freeing toi_cur_sig_page (%p).", toi_cur_sig_page);
-		toi_free_page(38, (unsigned long) toi_cur_sig_page);
-		toi_cur_sig_page = NULL;
-	}
-
-	if (toi_orig_sig_page != NULL) {
-		toi_message(TOI_IO, TOI_VERBOSE, 0,
-			"Freeing toi_orig_sig_page (%p).", toi_orig_sig_page);
-		toi_free_page(38, (unsigned long) toi_orig_sig_page);
-		toi_orig_sig_page = NULL;
-	}
-}
-
-/*
- * toi_bio_mark_resume_attempted - record the resumed_before flag on disk
- * @flag: Value stored in the signature's resumed_before field.
- *
- * We need to ensure we use the signature page that's currently on disk,
- * so as to not remove the image header. Post-atomic-restore, the orig sig
- * page will be empty, so we can use that as our method of knowing that we
- * need to load the on-disk signature and not use the non-image sig in
- * memory. (We're going to powerdown after writing the change, so it's safe.)
- *
- * Returns the error from re-reading the signature page if that fails,
- * otherwise the result of writing the page back.
- */
-int toi_bio_mark_resume_attempted(int flag)
-{
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.",
-		flag);
-	if (!toi_orig_sig_page) {
-		int result;
-
-		forget_signature_page();
-		/*
-		 * forget_signature_page() cleared toi_sig_data; if the
-		 * reload fails there is no valid page to modify or write.
-		 */
-		result = get_signature_page();
-		if (result)
-			return result;
-	}
-	toi_sig_data->resumed_before = flag;
-	return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-/*
- * toi_bio_mark_have_image - write a signature saying an image exists
- *
- * Fills in the binary signature (both copies of the signature string,
- * header device, first header block, header version and, when it can be
- * obtained, the UUID of the header device) and writes the page to
- * resume_firstblock on resume_block_device.
- *
- * Returns the result of the page write.
- */
-int toi_bio_mark_have_image(void)
-{
-	char buf[32];
-	struct fs_info *fs_info;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists.");
-	memcpy(toi_sig_data->sig, tuxonice_signature,
-		sizeof(tuxonice_signature));
-	toi_sig_data->have_image = 1;
-	toi_sig_data->resumed_before = 0;
-	toi_sig_data->header_dev_t = get_header_dev_t();
-	toi_sig_data->have_uuid = 0;
-
-	/*
-	 * Only claim a UUID when one was actually copied: a NULL fs_info
-	 * must not be reported as success (PTR_ERR(NULL) is 0).
-	 */
-	fs_info = fs_info_from_block_dev(get_header_bdev());
-	if (fs_info && !IS_ERR(fs_info)) {
-		memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16);
-		free_fs_info(fs_info);
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.",
-			format_dev_t(buf, get_header_dev_t()));
-		toi_sig_data->have_uuid = 1;
-	} else
-		toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for "
-			"dev_t %s.",
-			format_dev_t(buf, get_header_dev_t()));
-
-	toi_sig_data->first_header_block = get_headerblock();
-	have_image = 1;
-	/* first_header_block is an unsigned long. */
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block "
-		"is %lu.", toi_sig_data->header_dev_t,
-		toi_sig_data->first_header_block);
-
-	memcpy(toi_sig_data->sig2, tuxonice_signature,
-		sizeof(tuxonice_signature));
-	toi_sig_data->header_version = TOI_HEADER_VERSION;
-
-	return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(toi_cur_sig_page));
-}
-
-/*
- * remove_old_signature - replace an old-style TuxOnIce signature
- *
- * Uses the old_sig_data view of toi_cur_sig_page to find the header
- * device and sector, reads that page, rewrites the start of the signature
- * page and writes it back to resume_firstblock on resume_block_device.
- * have_old_image is cleared on every exit path.
- *
- * Returns 0 on success, -ENOMEM if no scratch page is available, an error
- * if the header device cannot be opened, or the failing I/O result.
- */
-int remove_old_signature(void)
-{
-	union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page;
-	char *orig_sig;
-	char *header_start;
-	int result;
-	struct block_device *header_bdev;
-	struct old_sig_data *old_sig_data =
-		&swap_header_page.pointer->old_sig_data;
-
-	header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
-	if (!header_start) {
-		result = -ENOMEM;
-		goto out;
-	}
-
-	/*
-	 * NOTE(review): toi_open_bdev()'s failure convention is not visible
-	 * here, so both NULL and ERR_PTR values are treated as failure.
-	 */
-	header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1);
-	if (!header_bdev || IS_ERR(header_bdev)) {
-		result = header_bdev ? (int) PTR_ERR(header_bdev) : -ENODEV;
-		goto out_free;
-	}
-
-	result = toi_bio_ops.bdev_page_io(READ, header_bdev,
-		old_sig_data->sector, virt_to_page(header_start));
-
-	if (result)
-		goto out_close;
-
-	/*
-	 * TODO: Get the original contents of the first bytes of the swap
-	 * header page.
-	 */
-	if (!old_sig_data->orig_sig_type)
-		orig_sig = "SWAP-SPACE";
-	else
-		orig_sig = "SWAPSPACE2";
-
-	memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
-	memcpy(swap_header_page.ptr, header_start, 10);
-
-	result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(swap_header_page.ptr));
-
-out_close:
-	toi_close_bdev(header_bdev);
-out_free:
-	toi_free_page(38, (unsigned long) header_start);
-out:
-	have_old_image = 0;
-	return result;
-}
-
-/*
- * toi_bio_restore_original_signature - restore the original signature
- *
- * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used.
- * It will have the original signature page contents, stored in the image
- * header. Post atomic-restore, we use toi_cur_sig_page, which will contain
- * the contents that were loaded when we started the cycle.
- *
- * An old-style signature is handled by remove_old_signature() instead.
- *
- * Returns 0 if no signature page is loaded, otherwise the result of the
- * write (or of remove_old_signature()).
- */
-int toi_bio_restore_original_signature(void)
-{
-	char *use = toi_orig_sig_page ? toi_orig_sig_page : toi_cur_sig_page;
-
-	if (have_old_image)
-		return remove_old_signature();
-
-	if (!use) {
-		printk("toi_bio_restore_original_signature: No signature "
-			"page loaded.\n");
-		return 0;
-	}
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists.");
-	have_image = 0;
-	/*
-	 * toi_sig_data follows toi_cur_sig_page, so it can be NULL when only
-	 * the original signature page is present.
-	 */
-	if (toi_sig_data)
-		toi_sig_data->have_image = 0;
-	return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
-		resume_firstblock, virt_to_page(use));
-}
-
-/*
- * toi_check_for_signature - classify the contents of the signature page
- *
- * Reads the signature page first if none is loaded. Updates have_image,
- * and sets have_old_image when an old-style TuxOnIce signature is found.
- *
- * Returns:
- *	the have_image field of the binary header, when one is present;
- *	1 for an old file allocator "HaveImage" header;
- *	0 for a "TuxOnIce" (no image) header or a normal swap signature;
- *	2 for a swsusp or uswsusp signature;
- *	3 for an old TuxOnIce signature in the swap magic;
- *	-1 if nothing is recognised;
- *	the non-zero result of get_signature_page() if the page could not
- *	be read.
- */
-int toi_check_for_signature(void)
-{
-	const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" };
-	const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" };
-	char *magic;
-	int i;
-
-	if (!toi_cur_sig_page) {
-		int err = get_signature_page();
-
-		if (err)
-			return err;
-	}
-
-	/*
-	 * Start by looking for the binary header.
-	 */
-	if (memcmp(tuxonice_signature, toi_cur_sig_page,
-			sizeof(tuxonice_signature)) == 0) {
-		have_image = toi_sig_data->have_image;
-		toi_message(TOI_IO, TOI_VERBOSE, 0,
-			"Have binary signature. Have image is %d.",
-			have_image);
-		if (have_image)
-			toi_message(TOI_IO, TOI_VERBOSE, 0,
-				"header dev_t is %x. First block is %d.",
-				toi_sig_data->header_dev_t,
-				toi_sig_data->first_header_block);
-		return toi_sig_data->have_image;
-	}
-
-	/*
-	 * Failing that, try old file allocator headers.
-	 */
-	if (memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage)) == 0) {
-		have_image = 1;
-		return 1;
-	}
-
-	have_image = 0;
-
-	if (memcmp(NoImage, toi_cur_sig_page, strlen(NoImage)) == 0)
-		return 0;
-
-	/*
-	 * Nope? How about swap?
-	 */
-	magic = ((union diskpage *) toi_cur_sig_page)->swh.magic.magic;
-
-	/* Normal swapspace? */
-	for (i = 0; i < 2; i++) {
-		if (memcmp(normal_sigs[i], magic, strlen(normal_sigs[i])) == 0)
-			return 0;
-	}
-
-	/* Swsusp or uswsusp? */
-	for (i = 0; i < 3; i++) {
-		if (memcmp(swsusp_sigs[i], magic, strlen(swsusp_sigs[i])) == 0)
-			return 2;
-	}
-
-	/* Old TuxOnIce version? */
-	if (memcmp(tuxonice_signature, magic,
-			sizeof(tuxonice_signature) - 1) != 0)
-		return -1;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0,
-		"Found old TuxOnIce signature.");
-	have_old_image = 1;
-	return 3;
-}
-
-/*
- * toi_bio_image_exists - report whether the resume device holds an image
- * @quiet: Non-zero to suppress the informational messages.
- *
- * Opens the resume device, classifies its signature page with
- * toi_check_for_signature() and mirrors the signature's resumed_before
- * field into the TOI_RESUMED_BEFORE state bit.
- *
- * Returns -1 if resume_dev_t is unset or the device cannot be opened;
- * otherwise the result of toi_check_for_signature().
- */
-int toi_bio_image_exists(int quiet)
-{
-	int result;
-	const char *msg = NULL;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists.");
-
-	if (!resume_dev_t) {
-		if (!quiet)
-			printk(KERN_INFO "Not even trying to read header "
-				"because resume_dev_t is not set.\n");
-		return -1;
-	}
-
-	if (open_resume_dev_t(0, quiet))
-		return -1;
-
-	result = toi_check_for_signature();
-
-	clear_toi_state(TOI_RESUMED_BEFORE);
-	/* toi_sig_data stays NULL if the signature page allocation failed. */
-	if (toi_sig_data && toi_sig_data->resumed_before)
-		set_toi_state(TOI_RESUMED_BEFORE);
-
-	if (quiet || result == -ENOMEM)
-		return result;
-
-	switch (result) {
-	case -1:
-		msg = "TuxOnIce: Unable to find a signature."
-			" Could you have moved a swap file?\n";
-		break;
-	case 0:
-		msg = "TuxOnIce: No image found.\n";
-		break;
-	case 1:
-		msg = "TuxOnIce: Image found.\n";
-		break;
-	case 2:
-		msg = "TuxOnIce: uswsusp or swsusp image found.\n";
-		break;
-	case 3:
-		msg = "TuxOnIce: Old implementation's signature found.\n";
-		break;
-	default:
-		/* Any other value (e.g. an I/O error) has no message. */
-		break;
-	}
-
-	if (msg)
-		printk(KERN_INFO "%s", msg);
-
-	return result;
-}
-
-/*
- * toi_bio_scan_for_image - look for an image signature on swap devices
- * @quiet: Non-zero to suppress the debug messages.
- *
- * Walks the block devices returned by next_bdev_of_type(..., "swap"),
- * pointing resume_block_device and resume_dev_t at each one in turn while
- * its signature is checked. resume_file is set to "major:minor" of the
- * device holding an image, or of the first device tried if none does.
- *
- * Returns 1 if an image was found, 0 otherwise.
- */
-int toi_bio_scan_for_image(int quiet)
-{
-	char default_name[255] = "";
-	struct block_device *bdev;
-
-	if (!quiet)
-		printk(KERN_DEBUG "Scanning swap devices for TuxOnIce "
-			"signature...\n");
-
-	bdev = next_bdev_of_type(NULL, "swap");
-	while (bdev) {
-		char name[255] = "";
-		int found;
-
-		sprintf(name, "%u:%u", MAJOR(bdev->bd_dev),
-			MINOR(bdev->bd_dev));
-		if (!quiet)
-			printk(KERN_DEBUG "- Trying %s.\n", name);
-
-		/* The signature helpers work on the global resume device. */
-		resume_block_device = bdev;
-		resume_dev_t = bdev->bd_dev;
-
-		found = toi_check_for_signature();
-
-		resume_block_device = NULL;
-		resume_dev_t = MKDEV(0, 0);
-
-		/* Remember the first device as the fallback. */
-		if (default_name[0] == '\0')
-			strcpy(default_name, name);
-
-		if (found == 1) {
-			/* Got one! */
-			strcpy(resume_file, name);
-			next_bdev_of_type(bdev, NULL);
-			if (!quiet)
-				printk(KERN_DEBUG " ==> Image found on %s.\n",
-					resume_file);
-			return 1;
-		}
-		forget_signature_page();
-
-		bdev = next_bdev_of_type(bdev, "swap");
-	}
-
-	if (!quiet)
-		printk(KERN_DEBUG "TuxOnIce scan: No image found.\n");
-	strcpy(resume_file, default_name);
-	return 0;
-}
-
-/*
- * Return the header version stored in the signature, or 0 when the second
- * copy of the signature string does not match tuxonice_signature.
- */
-int toi_bio_get_header_version(void)
-{
-	if (memcmp(toi_sig_data->sig2, tuxonice_signature,
-			sizeof(tuxonice_signature)))
-		return 0;
-
-	return toi_sig_data->header_version;
-}
diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
deleted file mode 100644
index 0a6733ae0..000000000
--- a/kernel/power/tuxonice_builtin.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-#include <linux/kernel.h>
-#include <linux/swap.h>
-#include <linux/syscalls.h>
-#include <linux/bio.h>
-#include <linux/root_dev.h>
-#include <linux/freezer.h>
-#include <linux/reboot.h>
-#include <linux/writeback.h>
-#include <linux/tty.h>
-#include <linux/crypto.h>
-#include <linux/cpu.h>
-#include <linux/ctype.h>
-#include <linux/kthread.h>
-#include "tuxonice_io.h"
-#include "tuxonice.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_alloc.h"
-
-unsigned long toi_bootflags_mask;
-
-/*
- * Highmem related functions (x86 only).
- */
-
-#ifdef CONFIG_HIGHMEM
-
-/**
- * copyback_high: Restore highmem pages.
- *
- * Highmem data and pbe lists are/can be stored in highmem.
- * The format is slightly different to the lowmem pbe lists
- * used for the assembly code: the last pbe in each page is
- * a struct page * instead of struct pbe *, pointing to the
- * next page where pbes are stored (or NULL if happens to be
- * the end of the list). Since we don't want to generate
- * unnecessary deltas against swsusp code, we use a cast
- * instead of a union.
- *
- * Starts from restore_highmem_pblist and does nothing if it is NULL.
- **/
-
-static void copyback_high(void)
-{
-	struct page *pbe_page = (struct page *) restore_highmem_pblist;
-	struct pbe *this_pbe, *first_pbe;
-	unsigned long *origpage, *copypage;
-	int pbe_index = 1;
-
-	if (!pbe_page)
-		return;
-
-	this_pbe = (struct pbe *) kmap_atomic(pbe_page);
-	first_pbe = this_pbe;
-
-	while (this_pbe) {
-		int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
-
-		origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address));
-		copypage = kmap_atomic((struct page *) this_pbe->address);
-
-		while (loop >= 0) {
-			*(origpage + loop) = *(copypage + loop);
-			loop--;
-		}
-
-		/*
-		 * Atomic kmaps must be released in the reverse order of
-		 * their creation: last mapped, first unmapped.
-		 */
-		kunmap_atomic(copypage);
-		kunmap_atomic(origpage);
-
-		if (!this_pbe->next)
-			break;
-
-		if (pbe_index < PBES_PER_PAGE) {
-			this_pbe++;
-			pbe_index++;
-		} else {
-			/* Last pbe of the page: ->next is the next pbe page. */
-			pbe_page = (struct page *) this_pbe->next;
-			kunmap_atomic(first_pbe);
-			if (!pbe_page)
-				return;
-			this_pbe = (struct pbe *) kmap_atomic(pbe_page);
-			first_pbe = this_pbe;
-			pbe_index = 1;
-		}
-	}
-	kunmap_atomic(first_pbe);
-}
-
-#else /* CONFIG_HIGHMEM */
-static void copyback_high(void) { }
-#endif
-
-/*
- * toi_wait_for_keypress_dev_console - read one key from /dev/console
- * @timeout: How long to wait. A positive value is used up in slices of at
- *	most 25, each multiplied by 10 to form the terminal's VTIME value.
- *	With 0 a single read is made with VTIME 255; a negative value
- *	keeps retrying.
- *
- * The terminal is switched out of canonical mode with echo and signals
- * off for the duration and restored afterwards. While the sanity check
- * prompt is active only 'c' (which also sets TOI_CONTINUE_REQ) and space
- * end the wait.
- *
- * Returns the key, lower-cased, or '\0' if none was read or the console
- * could not be set up.
- */
-char toi_wait_for_keypress_dev_console(int timeout)
-{
-	int fd, this_timeout = 255, orig_kthread = 0;
-	char key = '\0';
-	struct termios t, t_backup;
-
-	/* We should be guaranteed /dev/console exists after populate_rootfs()
-	 * in init/main.c.
-	 */
-	fd = sys_open("/dev/console", O_RDONLY, 0);
-	if (fd < 0) {
-		printk(KERN_INFO "Couldn't open /dev/console.\n");
-		return key;
-	}
-
-	if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
-		goto out_close;
-
-	memcpy(&t_backup, &t, sizeof(t));
-
-	t.c_lflag &= ~(ISIG|ICANON|ECHO);
-	t.c_cc[VMIN] = 0;
-
-new_timeout:
-	if (timeout > 0) {
-		this_timeout = timeout < 26 ? timeout : 25;
-		timeout -= this_timeout;
-		this_timeout *= 10;
-	}
-
-	t.c_cc[VTIME] = this_timeout;
-
-	if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
-		goto out_restore;
-
-	/* PF_KTHREAD is dropped for the read and put back before returning. */
-	if (current->flags & PF_KTHREAD) {
-		orig_kthread = (current->flags & PF_KTHREAD);
-		current->flags &= ~PF_KTHREAD;
-	}
-
-	while (1) {
-		if (sys_read(fd, &key, 1) <= 0) {
-			if (timeout)
-				goto new_timeout;
-			key = '\0';
-			break;
-		}
-		key = tolower(key);
-		if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) {
-			if (key == 'c') {
-				set_toi_state(TOI_CONTINUE_REQ);
-				break;
-			} else if (key == ' ')
-				break;
-		} else
-			break;
-	}
-
-out_restore:
-	/*
-	 * Restore the flag here so that it is also put back when TCSETS
-	 * fails on a later pass through new_timeout.
-	 */
-	if (orig_kthread)
-		current->flags |= PF_KTHREAD;
-
-	sys_ioctl(fd, TCSETS, (long)&t_backup);
-out_close:
-	sys_close(fd);
-
-	return key;
-}
-
-/*
- * Boot kernel data: page aligned and marked __nosavedata. The initialisers
- * are positional: the data version, a zero, then a flags word with
- * TOI_NO_FLUSHER_THREAD and TOI_PAGESET2_FULL set (plus TOI_REPLACE_SWSUSP
- * when CONFIG_TOI_REPLACE_SWSUSP is enabled).
- * NOTE(review): the flags word is presumably toi_action - confirm against
- * the struct toi_boot_kernel_data layout.
- */
-struct toi_boot_kernel_data toi_bkd __nosavedata
- __attribute__((aligned(PAGE_SIZE))) = {
- MY_BOOT_KERNEL_DATA_VERSION,
- 0,
-#ifdef CONFIG_TOI_REPLACE_SWSUSP
- (1 << TOI_REPLACE_SWSUSP) |
-#endif
- (1 << TOI_NO_FLUSHER_THREAD) |
- (1 << TOI_PAGESET2_FULL),
-};
-
-/*
- * toi_open_by_devnum - open a block device by device number
- * @dev: Device number to look up.
- *
- * Returns the block device, opened with FMODE_READ | FMODE_NDELAY, or an
- * ERR_PTR: -ENOMEM if bdget() fails, otherwise blkdev_get()'s error.
- */
-struct block_device *toi_open_by_devnum(dev_t dev)
-{
-	struct block_device *bdev = bdget(dev);
-	int err;
-
-	if (!bdev)
-		return ERR_PTR(-ENOMEM);
-
-	err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
-	if (err)
-		return ERR_PTR(err);
-
-	return bdev;
-}
-
-/**
- * toi_close_bdev: Close a block device.
- *
- * @bdev: The block device to release. It is put with the same mode flags
- * (FMODE_READ | FMODE_NDELAY) that toi_open_by_devnum() opens with.
- */
-void toi_close_bdev(struct block_device *bdev)
-{
- blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
-}
-
-/* Default from Kconfig; the "toi_wait" boot parameter can override it. */
-int toi_wait = CONFIG_TOI_DEFAULT_WAIT;
-/* Core entry points; the try_tuxonice_*() functions test this for NULL. */
-struct toi_core_fns *toi_core_fns;
-unsigned long toi_result;
-struct pagedir pagedir1 = {1};
-struct toi_cbw **toi_first_cbw;
-int toi_next_cbw;
-
-/*
- * Forward to the core's get_nonconflicting_page().
- * NOTE(review): unlike try_tuxonice_hibernate(), toi_core_fns is not
- * checked for NULL here - presumably only called once the core is loaded.
- */
-unsigned long toi_get_nonconflicting_page(void)
-{
- return toi_core_fns->get_nonconflicting_page();
-}
-
-/*
- * Forward to the core's post_context_save().
- * NOTE(review): toi_core_fns is not checked for NULL here - presumably
- * only called once the core is loaded.
- */
-int toi_post_context_save(void)
-{
- return toi_core_fns->post_context_save();
-}
-
-/*
- * Start a hibernation cycle through the core.
- * Returns -ENODEV if the core is not available, otherwise whatever the
- * core's try_hibernate() returns.
- */
-int try_tuxonice_hibernate(void)
-{
-	return toi_core_fns ? toi_core_fns->try_hibernate() : -ENODEV;
-}
-
-/* Number of try_tuxonice_resume() calls so far; stops counting at 2. */
-static int num_resume_calls;
-/*
- * When set, the first try_tuxonice_resume() call is ignored. The default
- * comes from Kconfig and "toi_initramfs_resume_only=" can override it.
- */
-#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL
-static int ignore_late_initcall = 1;
-#else
-static int ignore_late_initcall;
-#endif
-
-/* Reset to 0 by the "toi_translate_retry" boot parameter. */
-int toi_translate_err_default = TOI_CONTINUE_REQ;
-
-/*
- * try_tuxonice_resume - attempt to resume through the TuxOnIce core
- *
- * Does nothing when hibernation is unavailable. The first call is skipped
- * if ignore_late_initcall is set. Otherwise the core's try_resume() is
- * invoked, or a message is logged if the core is not loaded.
- */
-void try_tuxonice_resume(void)
-{
-	if (!hibernation_available())
-		return;
-
-	/* Don't let it wrap around eventually */
-	if (num_resume_calls < 2)
-		num_resume_calls++;
-
-	if (ignore_late_initcall && num_resume_calls == 1) {
-		printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n");
-		return;
-	}
-
-	if (!toi_core_fns) {
-		printk(KERN_INFO "TuxOnIce core not loaded yet.\n");
-		return;
-	}
-
-	toi_core_fns->try_resume();
-}
-
-/*
- * toi_lowlevel_builtin - save processor state and call the arch suspend code
- *
- * When toi_in_hibernate is clear after swsusp_arch_suspend(), highmem
- * pages are copied back and TOI_NOW_RESUMING is set.
- *
- * Returns the result of swsusp_arch_suspend().
- */
-int toi_lowlevel_builtin(void)
-{
- int error = 0;
-
- save_processor_state();
- error = swsusp_arch_suspend();
- if (error)
- printk(KERN_ERR "Error %d hibernating\n", error);
-
- /* Restore control flow appears here */
- if (!toi_in_hibernate) {
- copyback_high();
- set_toi_state(TOI_NOW_RESUMING);
- }
-
- restore_processor_state();
- return error;
-}
-
-unsigned long toi_compress_bytes_in;
-unsigned long toi_compress_bytes_out;
-
-/* Accessor returning the current value of in_suspend. */
-int toi_in_suspend(void)
-{
- return in_suspend;
-}
-
-/* State bits; starts with BOOT_TIME, IGNORE_LOGLEVEL and IO_STOPPED set. */
-unsigned long toi_state = ((1 << TOI_BOOT_TIME) |
- (1 << TOI_IGNORE_LOGLEVEL) |
- (1 << TOI_IO_STOPPED));
-
-/* The number of hibernates we have started (some may have been cancelled) */
-unsigned int nr_hibernates;
-int toi_running;
-/* Tested by toi_lowlevel_builtin() after swsusp_arch_suspend() returns. */
-__nosavedata int toi_in_hibernate;
-/* Head of the highmem pbe list consumed by copyback_high(). */
-__nosavedata struct pbe *restore_highmem_pblist;
-
-/* Set by the "toi_trace_allocs=" boot parameter (CONFIG_PM_DEBUG only). */
-int toi_trace_allocs;
-
-/* Take tasklist_lock for reading; pair with toi_read_unlock_tasklist(). */
-void toi_read_lock_tasklist(void)
-{
- read_lock(&tasklist_lock);
-}
-
-/* Release the read lock on tasklist_lock. */
-void toi_read_unlock_tasklist(void)
-{
- read_unlock(&tasklist_lock);
-}
-
-#ifdef CONFIG_TOI_ZRAM_SUPPORT
-int (*toi_flag_zram_disks) (void);
-
-/* Call the toi_flag_zram_disks hook if one is installed; 0 otherwise. */
-int toi_do_flag_zram_disks(void)
-{
-	if (!toi_flag_zram_disks)
-		return 0;
-
-	return (*toi_flag_zram_disks)();
-}
-
-#endif
-
-/* toi_generate_free_page_map
- *
- * Description:	Rebuild the NosaveFree page flags from the lists used by
- *		the memory manager. For every populated zone, with the
- *		zone lock held, the flag is cleared on each valid page
- *		and then set on every page found on the zone's free
- *		lists and on the per-cpu page lists of the online cpus.
- *		The flags are later used to quickly calculate which
- *		pages to save and in which pagesets.
- */
-void toi_generate_free_page_map(void)
-{
-	struct zone *zone;
-	struct page *page;
-	unsigned long flags, pfn, offset;
-	int order, cpu, t;
-
-	for_each_populated_zone(zone) {
-
-		if (!zone->spanned_pages)
-			continue;
-
-		spin_lock_irqsave(&zone->lock, flags);
-
-		/* Start from a clean slate. */
-		for (offset = 0; offset < zone->spanned_pages; offset++) {
-			pfn = zone->zone_start_pfn + offset;
-
-			if (pfn_valid(pfn))
-				ClearPageNosaveFree(pfn_to_page(pfn));
-		}
-
-		/* Each free list entry heads a block of 2^order pages. */
-		for_each_migratetype_order(order, t) {
-			list_for_each_entry(page,
-				&zone->free_area[order].free_list[t], lru) {
-				unsigned long j;
-
-				pfn = page_to_pfn(page);
-				for (j = 0; j < (1UL << order); j++)
-					SetPageNosaveFree(pfn_to_page(pfn + j));
-			}
-		}
-
-		/* Pages sitting on the per-cpu lists. */
-		for_each_online_cpu(cpu) {
-			struct per_cpu_pages *pcp =
-				&per_cpu_ptr(zone->pageset, cpu)->pcp;
-
-			for (t = 0; t < MIGRATE_PCPTYPES; t++)
-				list_for_each_entry(page, &pcp->lists[t], lru)
-					SetPageNosaveFree(page);
-		}
-
-		spin_unlock_irqrestore(&zone->lock, flags);
-	}
-}
-
-/* toi_size_of_free_region
- *
- * Description:	Return the number of consecutive pages flagged NosaveFree,
- *		beginning with and including start_pfn. Counting stops at
- *		the first invalid pfn, at the end of the zone, or at the
- *		first page without the flag.
- */
-int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn)
-{
-	unsigned long end_pfn = zone_end_pfn(zone);
-	unsigned long pfn;
-
-	for (pfn = start_pfn; pfn_valid(pfn) && pfn < end_pfn; pfn++) {
-		if (!PageNosaveFree(pfn_to_page(pfn)))
-			break;
-	}
-
-	return pfn - start_pfn;
-}
-
-/*
- * "toi_wait=N" boot parameter: set toi_wait to N when N is in the range
- * -1 to 255; otherwise log a message and leave it alone. Always returns 1.
- */
-static int __init toi_wait_setup(char *str)
-{
-	int value;
-
-	if (!sscanf(str, "=%d", &value))
-		return 1;
-
-	if (value >= -1 && value <= 255)
-		toi_wait = value;
-	else
-		printk(KERN_INFO "TuxOnIce_wait outside range -1 to "
-			"255.\n");
-
-	return 1;
-}
-__setup("toi_wait", toi_wait_setup);
-
-/* "toi_translate_retry" boot parameter: zero toi_translate_err_default. */
-static int __init toi_translate_retry_setup(char *str)
-{
- toi_translate_err_default = 0;
- return 1;
-}
-__setup("toi_translate_retry", toi_translate_retry_setup);
-
-/*
- * "toi_debug_setup" boot parameter: turn on TOI_LOGALL (recording it in
- * toi_bootflags_mask), set the debug state to 255 and the default console
- * level to 7.
- */
-static int __init toi_debug_setup(char *str)
-{
- toi_bkd.toi_action |= (1 << TOI_LOGALL);
- toi_bootflags_mask |= (1 << TOI_LOGALL);
- toi_bkd.toi_debug_state = 255;
- toi_bkd.toi_default_console_level = 7;
- return 1;
-}
-__setup("toi_debug_setup", toi_debug_setup);
-
-/* "toi_pause" boot parameter: set TOI_PAUSE in the action and boot masks. */
-static int __init toi_pause_setup(char *str)
-{
- toi_bkd.toi_action |= (1 << TOI_PAUSE);
- toi_bootflags_mask |= (1 << TOI_PAUSE);
- return 1;
-}
-__setup("toi_pause", toi_pause_setup);
-
-#ifdef CONFIG_PM_DEBUG
-/* "toi_trace_allocs=N" boot parameter: store N in toi_trace_allocs. */
-static int __init toi_trace_allocs_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value))
- toi_trace_allocs = value;
-
- return 1;
-}
-__setup("toi_trace_allocs", toi_trace_allocs_setup);
-#endif
-
-/*
- * "toi_initramfs_resume_only=N" boot parameter: store N in
- * ignore_late_initcall, which try_tuxonice_resume() consults on its
- * first call.
- */
-static int __init toi_ignore_late_initcall_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value))
- ignore_late_initcall = value;
-
- return 1;
-}
-__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup);
-
-/*
- * "toi_no_multithreaded[=N]" boot parameter: mark TOI_NO_MULTITHREADED_IO
- * in toi_bootflags_mask, and set the action bit when N parses and is
- * non-zero, clearing it otherwise.
- */
-static int __init toi_force_no_multithreaded_setup(char *str)
-{
-	int value;
-
-	toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO);
-
-	if (sscanf(str, "=%d", &value) && value)
-		toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO);
-	else
-		toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO);
-
-	return 1;
-}
-__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup);
-
-#ifdef CONFIG_KGDB
-/*
- * "toi_post_resume_break[=N]" boot parameter: mark
- * TOI_POST_RESUME_BREAKPOINT in toi_bootflags_mask, and set the action bit
- * when N parses and is non-zero, clearing it otherwise.
- */
-static int __init toi_post_resume_breakpoint_setup(char *str)
-{
-	int value;
-
-	toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT);
-
-	if (sscanf(str, "=%d", &value) && value)
-		toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT);
-	else
-		toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT);
-
-	return 1;
-}
-__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup);
-#endif
-
-/*
- * "toi_no_readahead[=N]" boot parameter: mark TOI_NO_READAHEAD in
- * toi_bootflags_mask, and set the action bit when N parses and is
- * non-zero, clearing it otherwise.
- */
-static int __init toi_disable_readahead_setup(char *str)
-{
-	int value;
-
-	toi_bootflags_mask |= (1 << TOI_NO_READAHEAD);
-
-	if (sscanf(str, "=%d", &value) && value)
-		toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD);
-	else
-		toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD);
-
-	return 1;
-}
-__setup("toi_no_readahead", toi_disable_readahead_setup);
diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h
deleted file mode 100644
index 9539818e0..000000000
--- a/kernel/power/tuxonice_builtin.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-#include <asm/setup.h>
-
-/* Globals and entry points defined in tuxonice_builtin.c. */
-extern struct toi_core_fns *toi_core_fns;
-extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
-extern unsigned int nr_hibernates;
-extern int toi_in_hibernate;
-
-extern __nosavedata struct pbe *restore_highmem_pblist;
-
-int toi_lowlevel_builtin(void);
-
-#ifdef CONFIG_HIGHMEM
-extern __nosavedata struct zone_data *toi_nosave_zone_list;
-extern __nosavedata unsigned long toi_nosave_max_pfn;
-#endif
-
-/* Thin wrappers around the toi_core_fns callbacks. */
-extern unsigned long toi_get_nonconflicting_page(void);
-extern int toi_post_context_save(void);
-
-extern char toi_wait_for_keypress_dev_console(int timeout);
-extern struct block_device *toi_open_by_devnum(dev_t dev);
-extern void toi_close_bdev(struct block_device *bdev);
-extern int toi_wait;
-extern int toi_translate_err_default;
-extern int toi_force_no_multithreaded;
-extern void toi_read_lock_tasklist(void);
-extern void toi_read_unlock_tasklist(void);
-extern int toi_in_suspend(void);
-extern void toi_generate_free_page_map(void);
-extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn);
-
-/* Without zram support the call compiles away to the constant 0. */
-#ifdef CONFIG_TOI_ZRAM_SUPPORT
-extern int toi_do_flag_zram_disks(void);
-#else
-#define toi_do_flag_zram_disks() (0)
-#endif
diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c
deleted file mode 100644
index 8952c0fec..000000000
--- a/kernel/power/tuxonice_checksum.c
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * kernel/power/tuxonice_checksum.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data checksum routines for TuxOnIce,
- * using cryptoapi. They are used to locate any modifications
- * made to pageset 2 while we're saving it.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-
-static struct toi_module_ops toi_checksum_ops;
-
-/* Constant at the mo, but I might allow tuning later */
-static char toi_checksum_name[32] = "md4";
-/* Bytes per checksum */
-#define CHECKSUM_SIZE (16)
-
-#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE)
-
-/*
- * Per-CPU checksumming state. toi_checksum_initialise() allocates the
- * transform and a one-page buffer for each online CPU and points sg[0] at
- * that buffer; the digest routines copy a page into buf before hashing it.
- */
-struct cpu_context {
- struct crypto_hash *transform;
- struct hash_desc desc;
- struct scatterlist sg[2];
- char *buf;
-};
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-static int pages_allocated;
-static unsigned long page_list;
-
-static int toi_num_resaved;
-
-static unsigned long this_checksum, next_page;
-static int checksum_count;
-
-/* Pages needed to hold pagedir2.size checksums, CHECKSUMS_PER_PAGE each. */
-static inline int checksum_pages_needed(void)
-{
-	int pages = DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE);
-
-	return pages;
-}
-
-/* ---- Local buffer management ---- */
-
-/*
- * toi_checksum_cleanup
- * @ending_cycle: Non-zero when the per-CPU resources should be released.
- *
- * Releases the hash transform and the bounce buffer of every online CPU.
- * Does nothing when @ending_cycle is zero.
- */
-static void toi_checksum_cleanup(int ending_cycle)
-{
-	int cpu;
-
-	if (!ending_cycle)
-		return;
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
-		if (ctx->transform) {
-			crypto_free_hash(ctx->transform);
-			ctx->transform = NULL;
-			ctx->desc.tfm = NULL;
-		}
-
-		if (ctx->buf) {
-			toi_free_page(27, (unsigned long) ctx->buf);
-			ctx->buf = NULL;
-		}
-	}
-}
-
-/*
- * toi_checksum_initialise
- * @starting_cycle: Cycle flags; nothing is done unless SYSFS_HIBERNATE is set
- * and checksumming is enabled.
- *
- * Prepare to do some work by allocating a hash transform and a one-page
- * buffer for every online CPU.
- * Returns: Int: Zero on success or when there is nothing to do; one if no
- * algorithm name is set, the transform cannot be allocated, or the buffer
- * page cannot be allocated.
- *
- * NOTE(review): on failure, resources already allocated for earlier CPUs
- * are left in place; presumably toi_checksum_cleanup() releases them later -
- * confirm against the caller.
- */
-static int toi_checksum_initialise(int starting_cycle)
-{
- int cpu;
-
- if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled)
- return 0;
-
- if (!*toi_checksum_name) {
- printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n");
- return 1;
- }
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- struct page *page;
-
- this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0);
- if (IS_ERR(this->transform)) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise the "
- "%s checksum algorithm: %ld.\n",
- toi_checksum_name, (long) this->transform);
- /* Don't leave an ERR_PTR behind for cleanup to free. */
- this->transform = NULL;
- return 1;
- }
-
- this->desc.tfm = this->transform;
- this->desc.flags = 0;
-
- page = toi_alloc_page(27, GFP_KERNEL);
- if (!page)
- return 1;
- this->buf = page_address(page);
- /* The digest input is always the whole bounce buffer. */
- sg_init_one(&this->sg[0], this->buf, PAGE_SIZE);
- }
- return 0;
-}
-
-/*
- * toi_checksum_print_debug_stats
- * @buffer: Buffer that receives the debug text.
- * @size: Size of @buffer in bytes.
- *
- * Reports whether checksumming is enabled and, if so, the method in use and
- * the number of pages that were resaved.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_checksum_print_debug_stats(char *buffer, int size)
-{
-	int written;
-
-	if (toi_checksum_ops.enabled) {
-		written = scnprintf(buffer, size,
-				"- Checksum method is '%s'.\n",
-				toi_checksum_name);
-		written += scnprintf(buffer + written, size - written,
-				" %d pages resaved in atomic copy.\n",
-				toi_num_resaved);
-		return written;
-	}
-
-	return scnprintf(buffer, size, "- Checksumming disabled.\n");
-}
-
-/* Bytes of memory needed for checksum pages; zero when disabled. */
-static int toi_checksum_memory_needed(void)
-{
-	if (!toi_checksum_ops.enabled)
-		return 0;
-
-	return checksum_pages_needed() << PAGE_SHIFT;
-}
-
-/*
- * Bytes needed to store our configuration: an int-sized length field, the
- * algorithm name and its terminating NUL. Zero when disabled.
- */
-static int toi_checksum_storage_needed(void)
-{
-	if (!toi_checksum_ops.enabled)
-		return 0;
-
-	return strlen(toi_checksum_name) + sizeof(int) + 1;
-}
-
-/*
- * toi_checksum_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save information needed when reloading the image at resume time: the
- * length of the algorithm name (including its NUL) followed by the name.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_checksum_save_config_info(char *buffer)
-{
-	unsigned int *len_field = (unsigned int *) buffer;
-	char *name_field = buffer + sizeof(unsigned int);
-	int namelen = strlen(toi_checksum_name) + 1;
-
-	*len_field = namelen;
-	strncpy(name_field, toi_checksum_name, namelen);
-
-	return sizeof(unsigned int) + namelen;
-}
-
-/* toi_checksum_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description: Reload the checksum algorithm name that
- * toi_checksum_save_config_info() stored (a length field followed by the
- * name). The stored length comes from the image, so it is clamped to the
- * size of toi_checksum_name and the result is always NUL terminated.
- */
-static void toi_checksum_load_config_info(char *buffer, int size)
-{
-	unsigned int namelen = *((unsigned int *) buffer);
-
-	/* Never copy more than the destination can hold. */
-	if (namelen > sizeof(toi_checksum_name) - 1)
-		namelen = sizeof(toi_checksum_name) - 1;
-
-	strncpy(toi_checksum_name, buffer + sizeof(unsigned int), namelen);
-	/* strncpy() does not terminate when the source fills the count. */
-	toi_checksum_name[sizeof(toi_checksum_name) - 1] = '\0';
-}
-
-/*
- * Free Checksum Memory
- *
- * Walks the singly linked list of checksum pages (the first word of each
- * page holds the address of the next) and releases every page.
- */
-
-void free_checksum_pages(void)
-{
-	for (; pages_allocated; pages_allocated--) {
-		unsigned long following = *((unsigned long *) page_list);
-
-		ClearPageNosave(virt_to_page(page_list));
-		toi_free_page(15, (unsigned long) page_list);
-		page_list = following;
-	}
-}
-
-/*
- * Allocate Checksum Memory
- *
- * Grows the checksum page list until it can hold every checksum, then
- * rewinds the cursor used by tuxonice_get_next_checksum().
- * Returns zero on success (or when disabled), -ENOMEM on allocation failure.
- */
-
-int allocate_checksum_pages(void)
-{
-	int target = checksum_pages_needed();
-
-	if (!toi_checksum_ops.enabled)
-		return 0;
-
-	for (; pages_allocated < target; pages_allocated++) {
-		unsigned long *fresh = (unsigned long *)
-			toi_get_zeroed_page(15, TOI_ATOMIC_GFP);
-
-		if (!fresh) {
-			printk(KERN_ERR "Unable to allocate checksum pages.\n");
-			return -ENOMEM;
-		}
-
-		SetPageNosave(virt_to_page(fresh));
-		/* Push the new page onto the front of the list. */
-		*fresh = page_list;
-		page_list = (unsigned long) fresh;
-	}
-
-	next_page = (unsigned long) page_list;
-	checksum_count = 0;
-
-	return 0;
-}
-
-/*
- * Hand out the next free checksum slot, moving on to the next checksum page
- * when the current one is full. Returns NULL when checksumming is disabled.
- */
-char *tuxonice_get_next_checksum(void)
-{
-	if (!toi_checksum_ops.enabled)
-		return NULL;
-
-	if (checksum_count % CHECKSUMS_PER_PAGE == 0) {
-		/* First slot sits just after the page's next pointer. */
-		this_checksum = next_page + sizeof(void *);
-		next_page = *((unsigned long *) next_page);
-	} else {
-		this_checksum += CHECKSUM_SIZE;
-	}
-
-	checksum_count++;
-	return (char *) this_checksum;
-}
-
-/*
- * Copy @page into this CPU's bounce buffer and store its digest at
- * @checksum_locn. Returns zero when disabled or on success, otherwise the
- * error returned by crypto_hash_digest().
- */
-int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
-{
-	int cpu = smp_processor_id();
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-	char *mapped;
-	int err;
-
-	if (!toi_checksum_ops.enabled)
-		return 0;
-
-	mapped = kmap(page);
-	memcpy(ctx->buf, mapped, PAGE_SIZE);
-	kunmap(page);
-
-	err = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
-			checksum_locn);
-	if (err)
-		printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest "
-				"returned %d.\n", err);
-
-	return err;
-}
-/*
- * Verify checksums
- *
- * Walks every page in pageset2_map, recomputes its digest and compares it
- * with the stored checksum. Pages whose digest differs, and pages beyond
- * the number of stored checksums, are flagged with SetPageResave() and
- * counted in toi_num_resaved. Returns early if a digest call fails.
- */
-
-void check_checksums(void)
-{
- int index = 0, cpu = smp_processor_id();
- char current_checksum[CHECKSUM_SIZE];
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
- unsigned long pfn;
-
- if (!toi_checksum_ops.enabled) {
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled.");
- return;
- }
-
- /* Rewind to the first checksum page. */
- next_page = (unsigned long) page_list;
-
- toi_num_resaved = 0;
- this_checksum = 0;
-
- toi_trace_index++;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums.");
- memory_bm_position_reset(pageset2_map);
- for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != BM_END_OF_MAP;
- pfn = memory_bm_next_pfn(pageset2_map, 0)) {
- int ret, resave_needed = false;
- char *pa;
- struct page *page = pfn_to_page(pfn);
-
- if (index < checksum_count) {
- /* Same slot walk as tuxonice_get_next_checksum(). */
- if (index % CHECKSUMS_PER_PAGE) {
- this_checksum += CHECKSUM_SIZE;
- } else {
- this_checksum = next_page + sizeof(void *);
- next_page = *((unsigned long *) next_page);
- }
-
- /* Done when IRQs disabled so must be atomic */
- pa = kmap_atomic(page);
- memcpy(ctx->buf, pa, PAGE_SIZE);
- kunmap_atomic(pa);
- ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
- current_checksum);
-
- if (ret) {
- printk(KERN_INFO "Digest failed. Returned %d.\n", ret);
- return;
- }
-
- /* Non-zero memcmp() result means the page changed. */
- resave_needed = memcmp(current_checksum, (char *) this_checksum,
- CHECKSUM_SIZE);
- } else {
- /* No stored checksum for this page: always resave it. */
- resave_needed = true;
- }
-
- if (resave_needed) {
- TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed);
- SetPageResave(pfn_to_page(pfn));
- toi_num_resaved++;
- if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED))
- set_abort_result(TOI_RESAVE_NEEDED);
- }
-
- index++;
- }
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete.");
-}
-
-/*
- * Sysfs entries for this module: "enabled" is backed by
- * toi_checksum_ops.enabled, and "abort_if_resave_needed" is backed by the
- * TOI_ABORT_ON_RESAVE_NEEDED bit of toi_bkd.toi_action.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0,
- NULL),
- SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action,
- TOI_ABORT_ON_RESAVE_NEEDED, 0)
-};
-
-/*
- * Ops structure.
- *
- * Describes the checksumming module: its callbacks and its sysfs entries.
- * Registered and unregistered by toi_checksum_init()/toi_checksum_exit().
- */
-static struct toi_module_ops toi_checksum_ops = {
- .type = MISC_MODULE,
- .name = "checksumming",
- .directory = "checksum",
- .module = THIS_MODULE,
- .initialise = toi_checksum_initialise,
- .cleanup = toi_checksum_cleanup,
- .print_debug_info = toi_checksum_print_debug_stats,
- .save_config_info = toi_checksum_save_config_info,
- .load_config_info = toi_checksum_load_config_info,
- .memory_needed = toi_checksum_memory_needed,
- .storage_needed = toi_checksum_storage_needed,
-
- .sysfs_data = sysfs_params,
- /* Element count of sysfs_params[]. */
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-/* Register the checksumming module; returns toi_register_module()'s result. */
-int toi_checksum_init(void)
-{
-	return toi_register_module(&toi_checksum_ops);
-}
-
-/* Unregister the checksumming module. */
-void toi_checksum_exit(void)
-{
- toi_unregister_module(&toi_checksum_ops);
-}
diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h
deleted file mode 100644
index 7d6478a6a..000000000
--- a/kernel/power/tuxonice_checksum.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * kernel/power/tuxonice_checksum.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data checksum routines for TuxOnIce,
- * using cryptoapi. They are used to locate any modifications
- * made to pageset 2 while we're saving it.
- */
-
-#if defined(CONFIG_TOI_CHECKSUM)
-extern int toi_checksum_init(void);
-extern void toi_checksum_exit(void);
-void check_checksums(void);
-int allocate_checksum_pages(void);
-void free_checksum_pages(void);
-char *tuxonice_get_next_checksum(void);
-int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
-#else
-/*
- * Checksumming compiled out: no-op stubs with the same signatures. The stray
- * semicolons that followed some of the bodies have been dropped; ISO C does
- * not allow an empty declaration at file scope.
- */
-static inline int toi_checksum_init(void) { return 0; }
-static inline void toi_checksum_exit(void) { }
-static inline void check_checksums(void) { }
-static inline int allocate_checksum_pages(void) { return 0; }
-static inline void free_checksum_pages(void) { }
-static inline char *tuxonice_get_next_checksum(void) { return NULL; }
-static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
-	{ return 0; }
-#endif
-
diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
deleted file mode 100644
index cfe3383ab..000000000
--- a/kernel/power/tuxonice_cluster.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines for cluster hibernation support.
- *
- * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
- *
- * How does it work?
- *
- * There is no 'master' node that tells everyone else what to do. All nodes
- * send messages to the broadcast address/port, maintain a list of peers
- * and figure out when to progress to the next step in hibernating or resuming.
- * This makes us more fault tolerant when it comes to nodes coming and going
- * (which may be more of an issue if we're hibernating when power supplies
- * are being unreliable).
- *
- * At boot time, we start a ktuxonice thread that handles communication with
- * other nodes. This node maintains a state machine that controls our progress
- * through hibernating and resuming, keeping us in step with other nodes. Nodes
- * are identified by their hw address.
- *
- * On startup, the node sends CLUSTER_PING on the configured interface's
- * broadcast address, port $toi_cluster_port (see below) and begins to listen
- * for other broadcast messages. CLUSTER_PING messages are repeated at
- * intervals of 5 minutes, with a random offset to spread traffic out.
- *
- * A hibernation cycle is initiated from any node via
- *
- * echo > /sys/power/tuxonice/do_hibernate
- *
- * and (possibly) the hibernate script. At each step of the process, the node
- * completes its work, and waits for all other nodes to signal completion of
- * their work (or timeout) before progressing to the next step.
- *
- * Request/state Action before reply Possible reply Next state
- * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP
- * HIBERNATE|NACK INIT_0
- *
- * PREP prepare_image PREP|ACK IMAGE_WRITE
- * PREP|NACK INIT_0
- * ABORT RUNNING
- *
- * IO write image IO|ACK power off
- * ABORT POST_RESUME
- *
- * (Boot time) check for image IMAGE|ACK RESUME_PREP
- * (Note 1)
- * IMAGE|NACK (Note 2)
- *
- * PREP prepare read image PREP|ACK IMAGE_READ
- * PREP|NACK (As NACK_IMAGE)
- *
- * IO read image IO|ACK POST_RESUME
- *
- * POST_RESUME thaw, post-script RUNNING
- *
- * INIT_0 init 0
- *
- * Other messages:
- *
- * - PING: Request for all other live nodes to send a PONG. Used at startup to
- * announce presence, when a node is suspected dead and periodically, in case
- * segments of the network are [un]plugged.
- *
- * - PONG: Response to a PING.
- *
- * - ABORT: Request to cancel writing an image.
- *
- * - BYE: Notification that this node is shutting down.
- *
- * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
- * nodes which are slower to start up can get state synchronised. If a node
- * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
- * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
- * must invalidate its image (if any) and boot normally.
- *
- * Note 2: May occur when one node lost power or powered off while others
- * hibernated. This node waits for others to complete resuming (ACK_READ)
- * before completing its boot, so that it appears as a failed node restarting.
- *
- * If any node has an image, then it also has a list of nodes that hibernated
- * in synchronisation with it. The node will wait for other nodes to appear
- * or timeout before beginning its restoration.
- *
- * If a node has no image, it needs to wait, in case other nodes which do have
- * an image are going to resume, but are taking longer to announce their
- * presence. For this reason, the user can specify a timeout value and a number
- * of nodes detected before we just continue. (We might want to assume in a
- * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
- * the remaining nodes will too. This might help in situations where some nodes
- * are much slower to boot, or more subject to hardware failures or such like).
- */
-
-#include <linux/suspend.h>
-#include <linux/if.h>
-#include <linux/rtnetlink.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/if_arp.h>
-#include <linux/kthread.h>
-#include <linux/wait.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_io.h"
-
-/*
- * Debug output for the clustering code. Change the "#if 1" to "#if 0" to
- * compile every PRINTK() call away to nothing.
- */
-#if 1
-#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
-#else
-#define PRINTK(a, b...) do { } while (0)
-#endif
-
-static int loopback_mode;
-static int num_local_nodes = 1;
-#define MAX_LOCAL_NODES 8
-#define SADDR (loopback_mode ? b->sid : h->saddr)
-
-#define MYNAME "TuxOnIce Clustering"
-
-/*
- * Cluster message codes. Each value is a distinct bit: MSG_ACK and MSG_NACK
- * are OR-ed onto a state message (e.g. MSG_HIBERNATE | MSG_ACK) and are
- * separated again with MSG_ACK_MASK / MSG_STATE_MASK.
- */
-enum cluster_message {
- MSG_ACK = 1,
- MSG_NACK = 2,
- MSG_PING = 4,
- MSG_ABORT = 8,
- MSG_BYE = 16,
- MSG_HIBERNATE = 32,
- MSG_IMAGE = 64,
- MSG_IO = 128,
- MSG_RUNNING = 256
-};
-
-/*
- * Translate a cluster message code (state bit, optionally combined with
- * MSG_ACK or MSG_NACK) into a human readable string. Unknown codes are
- * logged and yield a generic string.
- */
-static char *str_message(int message)
-{
-	switch (message) {
-	case MSG_PING:
-		return "Ping";
-	case MSG_ABORT:
-		return "Abort";
-	case MSG_ABORT | MSG_ACK:
-		return "Abort acked";
-	case MSG_ABORT | MSG_NACK:
-		return "Abort nacked";
-	case MSG_BYE:
-		return "Bye";
-	case MSG_BYE | MSG_ACK:
-		return "Bye acked";
-	case MSG_BYE | MSG_NACK:
-		return "Bye nacked";
-	case MSG_HIBERNATE:
-		return "Hibernate request";
-	case MSG_HIBERNATE | MSG_ACK:
-		return "Hibernate ack";
-	case MSG_HIBERNATE | MSG_NACK:
-		return "Hibernate nack";
-	case MSG_IMAGE:
-		return "Image exists?";
-	case MSG_IMAGE | MSG_ACK:
-		return "Image does exist";
-	case MSG_IMAGE | MSG_NACK:
-		return "No image here";
-	case MSG_IO:
-		return "I/O";
-	case MSG_IO | MSG_ACK:
-		return "I/O okay";
-	case MSG_IO | MSG_NACK:
-		return "I/O failed";
-	case MSG_RUNNING:
-		return "Running";
-	default:
-		break;
-	}
-
-	printk(KERN_ERR "Unrecognised message %d.\n", message);
-	return "Unrecognised message (see dmesg)";
-}
-
-#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
-#define MSG_STATE_MASK (~MSG_ACK_MASK)
-
-/*
- * State of one local node (an entry of node_array[]).
- *
- * member_list holds the struct cluster_member peers this node has heard
- * from and is protected by member_list_lock; member_events is woken when a
- * peer is added or changes message. receive_lock serialises message
- * handling in toi_recv(). current_message is the message this node is
- * currently sending; zero means the node is offline.
- */
-struct node_info {
- struct list_head member_list;
- wait_queue_head_t member_events;
- spinlock_t member_list_lock;
- spinlock_t receive_lock;
- int peer_count, ignored_peer_count;
- struct toi_sysfs_data sysfs_data;
- enum cluster_message current_message;
-};
-
-struct node_info node_array[MAX_LOCAL_NODES];
-
-/*
- * One known peer: its address, the last message received from it, its
- * linkage in node_info.member_list, and whether it is currently ignored
- * (set by set_ignore(), cleared by reset_ignored()).
- */
-struct cluster_member {
- __be32 addr;
- enum cluster_message message;
- struct list_head list;
- int ignore;
-};
-
-#define toi_cluster_port_send 3501
-#define toi_cluster_port_recv 3502
-
-static struct net_device *net_dev;
-static struct toi_module_ops toi_cluster_ops;
-
-static int toi_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev);
-
-static struct packet_type toi_cluster_packet_type = {
- .type = __constant_htons(ETH_P_IP),
- .func = toi_recv,
-};
-
-/*
- * On-the-wire layout of a cluster message, built by toi_send_if() and
- * parsed by toi_recv(): IP header, UDP header, then the payload fields.
- *
- * NOTE(review): the struct is not declared packed and 'sid' is an unsigned
- * long, so the layout presumably differs between architectures - confirm
- * before relying on interoperability between unlike nodes.
- */
-struct toi_pkt { /* BOOTP packet format */
- struct iphdr iph; /* IP header */
- struct udphdr udph; /* UDP header */
- u8 htype; /* HW address type */
- u8 hlen; /* HW address length */
- __be32 xid; /* Transaction ID */
- __be16 secs; /* Seconds since we started */
- __be16 flags; /* Just what it says */
- u8 hw_addr[16]; /* Sender's HW address */
- u16 message; /* Message */
- unsigned long sid; /* Source ID for loopback testing */
-};
-
-static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
-
-static int added_pack;
-
-static int others_have_image;
-
-/* Key used to allow multiple clusters on the same lan */
-static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
-static char pre_hibernate_script[255] =
- CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
-static char post_hibernate_script[255] =
- CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
-
-/* List of cluster members */
-static unsigned long continue_delay = 5 * HZ;
-static unsigned long cluster_message_timeout = 3 * HZ;
-
-/* === Membership list === */
-
-/* Log every peer known to local node @index with its last message. */
-static void print_member_info(int index)
-{
-	struct list_head *members = &node_array[index].member_list;
-	struct cluster_member *peer;
-
-	printk(KERN_INFO "==> Dumping node %d.\n", index);
-
-	list_for_each_entry(peer, members, list) {
-		printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
-				NIPQUAD(peer->addr),
-				str_message(peer->message),
-				peer->ignore ? "(Ignored)" : "");
-	}
-
-	printk(KERN_INFO "== Done ==\n");
-}
-
-/*
- * Look up the peer with address @addr in local node @index's member list.
- * Returns the entry, or NULL when the peer is unknown. Takes no lock itself.
- */
-static struct cluster_member *__find_member(int index, __be32 addr)
-{
-	struct cluster_member *peer;
-
-	list_for_each_entry(peer, &node_array[index].member_list, list) {
-		if (peer->addr == addr)
-			return peer;
-	}
-
-	return NULL;
-}
-
-/*
- * Mark peer @this as ignored by local node @index and account for it in
- * ignored_peer_count. A peer that is already ignored is only logged.
- */
-static void set_ignore(int index, __be32 addr, struct cluster_member *this)
-{
-	if (!this->ignore) {
-		PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
-				index, NIPQUAD(addr));
-		this->ignore = 1;
-		node_array[index].ignored_peer_count++;
-		return;
-	}
-
-	PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
-			index, NIPQUAD(addr));
-}
-
-/*
- * __add_update_member - record the latest message seen from a peer.
- * @index: Local node whose member list is updated.
- * @addr: Address identifying the peer.
- * @message: Message just received from the peer.
- *
- * A peer that NACKs a hibernate, image or I/O request is marked ignored.
- * Returns 0 when an existing peer was found (and updated if its message
- * changed), 1 when a new peer was added, -1 when allocation failed.
- *
- * add_update_member() calls this with member_list_lock held and interrupts
- * disabled, and that path is reached from the packet receive handler, so
- * the allocation must not sleep: GFP_ATOMIC rather than GFP_KERNEL.
- */
-static int __add_update_member(int index, __be32 addr, int message)
-{
-	struct cluster_member *this;
-	int nacked_request = (message & MSG_NACK) &&
-		(message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO));
-
-	this = __find_member(index, addr);
-	if (this) {
-		if (this->message != message) {
-			this->message = message;
-			if (nacked_request)
-				set_ignore(index, addr, this);
-			PRINTK("Node %d sees node %d.%d.%d.%d now sending "
-					"%s.\n", index, NIPQUAD(addr),
-					str_message(message));
-			wake_up(&node_array[index].member_events);
-		}
-		return 0;
-	}
-
-	this = (struct cluster_member *) toi_kzalloc(36,
-			sizeof(struct cluster_member), GFP_ATOMIC);
-
-	if (!this)
-		return -1;
-
-	this->addr = addr;
-	this->message = message;
-	this->ignore = 0;
-	INIT_LIST_HEAD(&this->list);
-
-	node_array[index].peer_count++;
-
-	PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
-			NIPQUAD(addr), str_message(message));
-
-	if (nacked_request)
-		set_ignore(index, addr, this);
-	list_add_tail(&this->list, &node_array[index].member_list);
-	return 1;
-}
-
-/*
- * Locked wrapper around __add_update_member(): takes member_list_lock with
- * interrupts disabled, updates the list, then dumps the member list and
- * wakes anyone waiting on member_events. Returns __add_update_member()'s
- * result (0, 1 or -1).
- */
-static int add_update_member(int index, __be32 addr, int message)
-{
- int result;
- unsigned long flags;
- spin_lock_irqsave(&node_array[index].member_list_lock, flags);
- result = __add_update_member(index, addr, message);
- spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-
- /* NOTE(review): walks member_list after the lock has been dropped. */
- print_member_info(index);
-
- wake_up(&node_array[index].member_events);
-
- return result;
-}
-
-/*
- * Remove the peer with address @addr from local node @index's member list
- * and free it. Does nothing if the peer is unknown.
- */
-static void del_member(int index, __be32 addr)
-{
-	struct node_info *node = &node_array[index];
-	struct cluster_member *peer;
-	unsigned long flags;
-
-	spin_lock_irqsave(&node->member_list_lock, flags);
-
-	peer = __find_member(index, addr);
-	if (!peer)
-		goto out_unlock;
-
-	list_del_init(&peer->list);
-	toi_kfree(36, peer, sizeof(*peer));
-	node->peer_count--;
-
-out_unlock:
-	spin_unlock_irqrestore(&node->member_list_lock, flags);
-}
-
-/* === Message transmission === */
-
-static void toi_send_if(int message, unsigned long my_id);
-
-/*
- * Process received TOI packet.
- *
- * Packet handler registered through toi_cluster_packet_type. Validates that
- * the frame is an unfragmented IPv4/UDP datagram on net_dev using the
- * cluster ports and that it is large enough to hold a struct toi_pkt, then
- * lets every online local node record the sender's message and, where
- * needed, send or resend its own message.
- *
- * The skb is always consumed. Returns NET_RX_DROP if the skb could not be
- * unshared, zero otherwise.
- *
- * NOTE(review): outside loopback mode SADDR is the IP source address, so
- * the "index == SADDR" self check compares a node index with an address;
- * toi_send_if() also never fills in iph.saddr. Confirm the intent.
- */
-static int toi_recv(struct sk_buff *skb, struct net_device *dev,
-		struct packet_type *pt, struct net_device *orig_dev)
-{
-	struct toi_pkt *b;
-	struct iphdr *h;
-	int result, index;
-	unsigned long addr, ack;
-
-	/* Perform verifications before taking the lock. */
-	if (skb->pkt_type == PACKET_OTHERHOST)
-		goto drop;
-
-	if (dev != net_dev)
-		goto drop;
-
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		return NET_RX_DROP;
-
-	if (!pskb_may_pull(skb,
-			sizeof(struct iphdr) +
-			sizeof(struct udphdr)))
-		goto drop;
-
-	b = (struct toi_pkt *)skb_network_header(skb);
-	h = &b->iph;
-
-	if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
-		goto drop;
-
-	/* Fragments are not supported */
-	if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
-		if (net_ratelimit())
-			printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
-					"cluster message.\n");
-		goto drop;
-	}
-
-	if (skb->len < ntohs(h->tot_len))
-		goto drop;
-
-	if (ip_fast_csum((char *) h, h->ihl))
-		goto drop;
-
-	if (b->udph.source != htons(toi_cluster_port_send) ||
-	    b->udph.dest != htons(toi_cluster_port_recv))
-		goto drop;
-
-	if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
-		goto drop;
-
-	/*
-	 * The payload fields (message, sid) are read below, so the datagram
-	 * must carry a complete struct toi_pkt. Together with the two length
-	 * checks above this guarantees skb->len >= sizeof(struct toi_pkt).
-	 */
-	if (ntohs(b->udph.len) <
-			sizeof(struct toi_pkt) - sizeof(struct iphdr))
-		goto drop;
-
-	/* Ok the front looks good, make sure we can get at the rest. */
-	if (!pskb_may_pull(skb, skb->len))
-		goto drop;
-
-	/* The pull may have moved the data; reload the pointers. */
-	b = (struct toi_pkt *)skb_network_header(skb);
-	h = &b->iph;
-
-	addr = SADDR;
-	PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
-			str_message(b->message), NIPQUAD(addr));
-
-	ack = b->message & MSG_ACK_MASK;
-
-	for (index = 0; index < num_local_nodes; index++) {
-		int new_message = node_array[index].current_message,
-		    old_message = new_message;
-
-		if (index == SADDR || !old_message) {
-			PRINTK("Ignoring node %d (offline or self).\n", index);
-			continue;
-		}
-
-		/* One message at a time, please. */
-		spin_lock(&node_array[index].receive_lock);
-
-		result = add_update_member(index, SADDR, b->message);
-		if (result == -1) {
-			printk(KERN_INFO "Failed to add new cluster member "
-					NIPQUAD_FMT ".\n",
-					NIPQUAD(addr));
-			goto drop_unlock;
-		}
-
-		switch (b->message & MSG_STATE_MASK) {
-		case MSG_PING:
-			break;
-		case MSG_ABORT:
-			break;
-		case MSG_BYE:
-			break;
-		case MSG_HIBERNATE:
-			/* Can I hibernate? */
-			new_message = MSG_HIBERNATE |
-				((index & 1) ? MSG_NACK : MSG_ACK);
-			break;
-		case MSG_IMAGE:
-			/* Can I resume? */
-			new_message = MSG_IMAGE |
-				((index & 1) ? MSG_NACK : MSG_ACK);
-			if (new_message != old_message)
-				printk(KERN_ERR "Setting whether I can resume "
-						"to %d.\n", new_message);
-			break;
-		case MSG_IO:
-			new_message = MSG_IO | MSG_ACK;
-			break;
-		case MSG_RUNNING:
-			break;
-		default:
-			if (net_ratelimit())
-				printk(KERN_ERR "Unrecognised TuxOnIce cluster"
-					" message %d from " NIPQUAD_FMT ".\n",
-					b->message, NIPQUAD(addr));
-		}
-
-		if (old_message != new_message) {
-			node_array[index].current_message = new_message;
-			printk(KERN_INFO ">>> Sending new message for node "
-					"%d.\n", index);
-			toi_send_if(new_message, index);
-		} else if (!ack) {
-			printk(KERN_INFO ">>> Resending message for node %d.\n",
-					index);
-			toi_send_if(new_message, index);
-		}
-drop_unlock:
-		spin_unlock(&node_array[index].receive_lock);
-	}
-
-drop:
-	/* Throw the packet out. */
-	kfree_skb(skb);
-
-	return 0;
-}
-
-/*
- * Send cluster message to single interface.
- * @message: Message code to place in the packet.
- * @my_id: Source id (local node index), used for loopback testing.
- *
- * Builds an IP/UDP broadcast packet holding a struct toi_pkt and queues it
- * on net_dev. Failures are only reported by printing "E".
- *
- * toi_recv() calls this while holding a spinlock, so the skb is allocated
- * with GFP_ATOMIC. The skb is freed here if the link-layer header cannot be
- * built; once handed to dev_queue_xmit() it is owned by the network stack.
- */
-static void toi_send_if(int message, unsigned long my_id)
-{
-	struct sk_buff *skb;
-	struct toi_pkt *b;
-	int hh_len = LL_RESERVED_SPACE(net_dev);
-	struct iphdr *h;
-
-	/* Allocate packet */
-	skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_ATOMIC);
-	if (!skb)
-		return;
-	skb_reserve(skb, hh_len);
-	b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
-	memset(b, 0, sizeof(struct toi_pkt));
-
-	/* Construct IP header */
-	skb_reset_network_header(skb);
-	h = ip_hdr(skb);
-	h->version = 4;
-	h->ihl = 5;
-	h->tot_len = htons(sizeof(struct toi_pkt));
-	h->frag_off = htons(IP_DF);
-	h->ttl = 64;
-	h->protocol = IPPROTO_UDP;
-	h->daddr = htonl(INADDR_BROADCAST);
-	/* h->check is still zero from the memset above. */
-	h->check = ip_fast_csum((unsigned char *) h, h->ihl);
-
-	/* Construct UDP header */
-	b->udph.source = htons(toi_cluster_port_send);
-	b->udph.dest = htons(toi_cluster_port_recv);
-	b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
-	/* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
-
-	/* Construct message */
-	b->message = message;
-	b->sid = my_id;
-	b->htype = net_dev->type; /* can cause undefined behavior */
-	b->hlen = net_dev->addr_len;
-	memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
-	b->secs = htons(3); /* 3 seconds */
-
-	/* Chain packet down the line... */
-	skb->dev = net_dev;
-	skb->protocol = htons(ETH_P_IP);
-	if (dev_hard_header(skb, net_dev, ntohs(skb->protocol),
-			net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) {
-		/* Never queued, so it is still ours to free. */
-		kfree_skb(skb);
-		printk(KERN_INFO "E");
-		return;
-	}
-
-	if (dev_queue_xmit(skb) < 0)
-		printk(KERN_INFO "E");
-}
-
-/* ========================================= */
-
-/* kTOICluster */
-
-static atomic_t num_cluster_threads;
-static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
-
-/*
- * Per-node cluster daemon thread.
- * @data: Initial message for this node, passed as an integer cast to a
- * pointer.
- *
- * Claims the next node id from num_cluster_threads, then repeatedly sends
- * the node's current message and sleeps for cluster_message_timeout until
- * current_message becomes zero (see kill_clusterd()). On exit it sends
- * MSG_BYE, drops the thread count and wakes clusterd_events waiters.
- *
- * NOTE(review): my_id is not checked against MAX_LOCAL_NODES here; this
- * presumably relies on the creator starting at most that many threads.
- */
-static int kTOICluster(void *data)
-{
- unsigned long my_id;
-
- my_id = atomic_add_return(1, &num_cluster_threads) - 1;
- node_array[my_id].current_message = (unsigned long) data;
-
- PRINTK("kTOICluster daemon %lu starting.\n", my_id);
-
- current->flags |= PF_NOFREEZE;
-
- while (node_array[my_id].current_message) {
- toi_send_if(node_array[my_id].current_message, my_id);
- sleep_on_timeout(&clusterd_events,
- cluster_message_timeout);
- PRINTK("Link state %lu is %d.\n", my_id,
- node_array[my_id].current_message);
- }
-
- toi_send_if(MSG_BYE, my_id);
- atomic_dec(&num_cluster_threads);
- /* Lets kill_clusterd() re-check the thread count. */
- wake_up(&clusterd_events);
-
- PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
- __set_current_state(TASK_RUNNING);
- return 0;
-}
-
-/*
- * Ask every running cluster daemon to stop by zeroing its node's
- * current_message, then wait until num_cluster_threads drops to zero.
- */
-static void kill_clusterd(void)
-{
-	int node;
-
-	for (node = 0; node < num_local_nodes; node++) {
-		if (!node_array[node].current_message)
-			continue;
-
-		PRINTK("Seeking to kill clusterd %d.\n", node);
-		node_array[node].current_message = 0;
-	}
-
-	wait_event(clusterd_events,
-			!atomic_read(&num_cluster_threads));
-	PRINTK("All cluster daemons have exited.\n");
-}
-
-/*
- * Count the non-ignored peers of local node @index whose last message does
- * not match @message.
- * @precise: When non-zero the whole message (including ACK/NACK bits) must
- * match; otherwise only the state bits are compared.
- *
- * Returns the number of peers that do NOT match. The closing debug line
- * used to describe this count as peers "in" the sought message.
- */
-static int peers_not_in_message(int index, int message, int precise)
-{
-	struct cluster_member *this;
-	unsigned long flags;
-	int result = 0;
-
-	spin_lock_irqsave(&node_array[index].member_list_lock, flags);
-	list_for_each_entry(this, &node_array[index].member_list, list) {
-		int seen;
-
-		if (this->ignore)
-			continue;
-
-		PRINTK("Peer %d.%d.%d.%d sending %s. "
-				"Seeking %s.\n",
-				NIPQUAD(this->addr),
-				str_message(this->message), str_message(message));
-
-		seen = precise ? this->message :
-			this->message & MSG_STATE_MASK;
-		if (seen != message)
-			result++;
-	}
-	spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
-	PRINTK("%d peers not in sought message.\n", result);
-	return result;
-}
-
-/* Clear the ignore flag of every peer of local node @index. */
-static void reset_ignored(int index)
-{
-	struct node_info *node = &node_array[index];
-	struct cluster_member *peer;
-	unsigned long flags;
-
-	spin_lock_irqsave(&node->member_list_lock, flags);
-
-	list_for_each_entry(peer, &node->member_list, list) {
-		peer->ignore = 0;
-	}
-	node->ignored_peer_count = 0;
-
-	spin_unlock_irqrestore(&node->member_list_lock, flags);
-}
-
-/*
- * Number of non-ignored peers of local node @index whose last message
- * matches @message: total peers, minus ignored peers, minus the peers that
- * peers_not_in_message() reports as not matching.
- */
-static int peers_in_message(int index, int message, int precise)
-{
-	int considered = node_array[index].peer_count -
-		node_array[index].ignored_peer_count;
-
-	return considered - peers_not_in_message(index, message, precise);
-}
-
-/*
- * wait_event() condition: returns 1 when both peer counts computed below
- * are zero, or when continue_delay jiffies have passed since @start;
- * returns 0 otherwise.
- */
-static int time_to_continue(int index, unsigned long start, int message)
-{
-	unsigned long deadline = start + continue_delay;
-	int loose_misses = peers_not_in_message(index, message, 0);
-	int exact_hits = peers_in_message(index, message, 1);
-
-	PRINTK("First part returns %d, second returns %d.\n",
-			loose_misses, exact_hits);
-
-	if (loose_misses == 0 && exact_hits == 0) {
-		PRINTK("All peers answered message %d.\n",
-				message);
-		return 1;
-	}
-
-	if (time_after(jiffies, deadline)) {
-		PRINTK("Timeout reached.\n");
-		return 1;
-	}
-
-	PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
-			deadline);
-	return 0;
-}
-
-/*
- * Drive a cluster-wide hibernation from local node 0: prepare the image,
- * broadcast MSG_HIBERNATE and wait for peers (or the timeout), then either
- * abort (freezer test) or broadcast MSG_IO, save the image and power down.
- * Returns early, without further messages, if a step reports failure.
- */
-void toi_initiate_cluster_hibernate(void)
-{
- int result;
- unsigned long start;
-
- result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
- if (result)
- return;
-
- toi_send_if(MSG_HIBERNATE, 0);
-
- start = jiffies;
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_HIBERNATE));
-
- /* Freezer test: tell peers to abort, wait for them, then clean up. */
- if (test_action_state(TOI_FREEZER_TEST)) {
- toi_send_if(MSG_ABORT, 0);
-
- start = jiffies;
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_RUNNING));
-
- do_toi_step(STEP_QUIET_CLEANUP);
- return;
- }
-
- toi_send_if(MSG_IO, 0);
-
- result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
- if (result)
- return;
-
- /* This code runs at resume time too! */
- if (toi_in_hibernate)
- result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
-}
-
-/* toi_cluster_print_debug_stats
- *
- * Description: Report the configured cluster interface, or that cluster
- * support is disabled when no interface name is set.
- * Arguments: buffer: Buffer into which the debug info is printed.
- * size: Size of the buffer.
- * Returns: Number of characters written to the buffer.
- */
-static int toi_cluster_print_debug_stats(char *buffer, int size)
-{
-	if (toi_cluster_iface[0] == '\0')
-		return scnprintf(buffer, size,
-				"- Cluster support is disabled.\n");
-
-	return scnprintf(buffer, size,
-			"- Cluster interface is '%s'.\n",
-			toi_cluster_iface);
-}
-
-/* toi_cluster_memory_needed
- *
- * Description: Tell the caller how much memory we need to operate during
- * hibernate/resume.
- * Returns: Int. Always zero.
- */
-static int toi_cluster_memory_needed(void)
-{
- return 0;
-}
-
-/* Bytes needed to store the interface name plus its terminating NUL. */
-static int toi_cluster_storage_needed(void)
-{
-	return strlen(toi_cluster_iface) + 1;
-}
-
-/* toi_cluster_save_config_info
- *
- * Description: Save information needed when reloading the image at resume
- * time: the cluster interface name including its NUL.
- * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE.
- * Returns: Number of bytes used for saving our data. This matches
- * toi_cluster_storage_needed(); the original returned
- * strlen(toi_cluster_iface + 1), which is two bytes short for any
- * non-empty name.
- */
-static int toi_cluster_save_config_info(char *buffer)
-{
-	strcpy(buffer, toi_cluster_iface);
-	return strlen(toi_cluster_iface) + 1;
-}
-
-/*
- * toi_cluster_load_config_info - reload the interface name saved in an image
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * The length comes from the image, so it is clamped before copying and the
- * result is always NUL-terminated.
- */
-static void toi_cluster_load_config_info(char *buffer, int size)
-{
-	int len = size;
-
-	/*
-	 * NOTE(review): assumes toi_cluster_iface holds at least IFNAMSIZ
-	 * bytes, which is the limit its sysfs entry is registered with -
-	 * confirm against the declaration.
-	 */
-	if (len < 0)
-		len = 0;
-	if (len > IFNAMSIZ - 1)
-		len = IFNAMSIZ - 1;
-
-	strncpy(toi_cluster_iface, buffer, len);
-	toi_cluster_iface[len] = '\0';
-}
-
-/*
- * cluster_startup - start the per-node threads and run the boot-time
- * handshake on node 0.
- *
- * One "ktoiclusterd/<n>" kernel thread is created and woken for each local
- * node, each being handed MSG_IMAGE as its initial message. The function
- * then waits on node 0's member_events queue, records how many peers report
- * an image, optionally walks the (currently stubbed out) resume sequence and
- * finally marks node 0 as MSG_RUNNING.
- *
- * If a thread cannot be created the function returns immediately; threads
- * that were already started are left running.
- */
-static void cluster_startup(void)
-{
- int have_image = do_check_can_resume(), i;
- unsigned long start = jiffies, initial_message;
- struct task_struct *p;
-
- initial_message = MSG_IMAGE;
-
- /*
-  * NOTE(review): this overrides the do_check_can_resume() result above,
-  * so the "have an image" branch below is always taken.
-  */
- have_image = 1;
-
- for (i = 0; i < num_local_nodes; i++) {
- PRINTK("Starting ktoiclusterd %d.\n", i);
- p = kthread_create(kTOICluster, (void *) initial_message,
- "ktoiclusterd/%d", i);
- if (IS_ERR(p)) {
- printk(KERN_ERR "Failed to start ktoiclusterd.\n");
- return;
- }
-
- wake_up_process(p);
- }
-
- /* Wait for delay or someone else sending first message */
- wait_event(node_array[0].member_events, time_to_continue(0, start,
- MSG_IMAGE));
-
- others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
-
- printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
- " %d.\n", have_image ? "" : "don't ", others_have_image);
-
- if (have_image) {
- int result;
-
- /* Start to resume */
- printk(KERN_INFO " === Starting to resume === \n");
- node_array[0].current_message = MSG_IO;
- toi_send_if(MSG_IO, 0);
-
- /* The real resume steps are commented out; result is forced to 0. */
- /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
- result = 0;
-
- if (!result) {
- /*
- * Atomic restore - we'll come back in the hibernation
- * path.
- */
-
- /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
- result = 0;
-
- /* do_toi_step(STEP_QUIET_CLEANUP); */
- }
-
- node_array[0].current_message |= MSG_NACK;
-
- /* For debugging - disable for real life? */
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_IO));
- }
-
- if (others_have_image) {
- /* Wait for them to resume */
- printk(KERN_INFO "Waiting for other nodes to resume.\n");
- /* Restart the timeout clock for this wait. */
- start = jiffies;
- wait_event(node_array[0].member_events,
- time_to_continue(0, start, MSG_RUNNING));
- if (peers_not_in_message(0, MSG_RUNNING, 0))
- printk(KERN_INFO "Timed out while waiting for other "
- "nodes to resume.\n");
- }
-
- /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
- * as appropriate.
- *
- * If we don't have an image:
- * - Wait until someone else says they have one, or conditions are met
- * for continuing to boot (n machines or t seconds).
- * - If anyone has an image, wait for them to resume before continuing
- * to boot.
- *
- * If we have an image:
- * - Wait until conditions are met before continuing to resume (n
- * machines or t seconds). Send RESUME_PREP and freeze processes.
- * NACK_PREP if freezing fails (shouldn't) and follow logic for
- * us having no image above. On success, wait for [N]ACK_PREP from
- * other machines. Read image (including atomic restore) until done.
- * Wait for ACK_READ from others (should never fail). Thaw processes
- * and do post-resume. (The section after the atomic restore is done
- * via the code for hibernating).
- */
-
- node_array[0].current_message = MSG_RUNNING;
-}
-
-/*
- * toi_cluster_open_iface - prepare to use the configured interface
- *
- * Looks up the network device named by toi_cluster_iface, registers the
- * cluster packet type (once), works out whether we are in loopback mode and
- * then runs the startup handshake.
- *
- * Returns: 0 on success, -ENODEV if no device with that name exists.
- */
-
-static int toi_cluster_open_iface(void)
-{
-	struct net_device *dev, *found = NULL;
-
-	rtnl_lock();
-
-	for_each_netdev(&init_net, dev) {
-		if (!strcmp(dev->name, toi_cluster_iface)) {
-			found = dev;
-			break;
-		}
-	}
-
-	rtnl_unlock();
-
-	/*
-	 * Publish the lookup result even when it failed, so that a device
-	 * found by an earlier call cannot satisfy the check below after the
-	 * interface name has been changed to one that does not exist.
-	 */
-	net_dev = found;
-
-	if (!net_dev) {
-		printk(KERN_ERR MYNAME ": Device %s not found.\n",
-				toi_cluster_iface);
-		return -ENODEV;
-	}
-
-	/* Registering the same packet_type twice would corrupt the list. */
-	if (!added_pack) {
-		dev_add_pack(&toi_cluster_packet_type);
-		added_pack = 1;
-	}
-
-	loopback_mode = (net_dev == init_net.loopback_dev);
-	num_local_nodes = loopback_mode ? 8 : 1;
-
-	PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
-			loopback_mode ? "on" : "off", num_local_nodes);
-
-	cluster_startup();
-	return 0;
-}
-
-/*
- * toi_cluster_close_iface - stop using the cluster interface
- *
- * Stops the cluster threads and, if the packet type was registered,
- * removes it again.
- *
- * Returns: Always 0.
- */
-
-static int toi_cluster_close_iface(void)
-{
-	kill_clusterd();
-
-	/* Nothing more to undo unless the packet handler was installed. */
-	if (!added_pack)
-		return 0;
-
-	dev_remove_pack(&toi_cluster_packet_type);
-	added_pack = 0;
-
-	return 0;
-}
-
-/*
- * write_side_effect - react to a write to the "enabled" sysfs entry
- *
- * Opens the interface and enters cluster mode when the module has been
- * enabled. When it has been disabled, or the interface could not be
- * opened, the interface is closed and cluster mode is left, so the
- * "enabled" flag never claims a state we are not actually in.
- */
-static void write_side_effect(void)
-{
-	if (toi_cluster_ops.enabled && !toi_cluster_open_iface()) {
-		set_toi_state(TOI_CLUSTER_MODE);
-		return;
-	}
-
-	/* Either switched off by the user or the open attempt failed. */
-	toi_cluster_ops.enabled = 0;
-	toi_cluster_close_iface();
-	clear_toi_state(TOI_CLUSTER_MODE);
-}
-
-/*
- * node_write_side_effect - hook run after a per-node sysfs entry is written
- *
- * Intentionally empty: it only exists so the node_<n> entries have a
- * callback to register.
- */
-static void node_write_side_effect(void)
-{
-	/* No action required. */
-}
-
-/*
- * Data for our sysfs entries.
- *
- * Only "enabled" has a write side effect (it opens or closes the
- * interface); every other entry passes NULL for the callback.
- */
-static struct toi_sysfs_data sysfs_params[] = {
-	SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0,
-			NULL),
-	SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
-			write_side_effect),
-	SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL),
-	SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
-			256, 0, NULL),
-	/* The callback slot previously held a stray STRING token. */
-	SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
-			256, 0, NULL),
-	SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
-			0)
-};
-
-/*
- * Module description handed to the TuxOnIce core at registration time.
- */
-
-static struct toi_module_ops toi_cluster_ops = {
-	/* Identity */
-	.type			= FILTER_MODULE,
-	.name			= "Cluster",
-	.directory		= "cluster",
-	.module			= THIS_MODULE,
-
-	/* Resource accounting and debug output */
-	.memory_needed		= toi_cluster_memory_needed,
-	.storage_needed		= toi_cluster_storage_needed,
-	.print_debug_info	= toi_cluster_print_debug_stats,
-
-	/* Image header save/restore */
-	.save_config_info	= toi_cluster_save_config_info,
-	.load_config_info	= toi_cluster_load_config_info,
-
-	/* sysfs interface */
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-					sizeof(sysfs_params[0]),
-};
-
-/* ---- Registration ---- */
-
-/*
- * INIT/EXIT decorate the entry points below: empty when built in,
- * static __init / static __exit when built as a module.
- */
-#ifndef MODULE
-#define INIT
-#define EXIT
-#else
-#define INIT static __init
-#define EXIT static __exit
-#endif
-
-/*
- * toi_cluster_init - register the cluster module and its per-node entries
- *
- * Registers the module, initialises every slot of node_array and creates a
- * "node_<n>" sysfs file for each one. If an interface name is already set,
- * the module is enabled and the interface opened straight away.
- *
- * Returns: the result of toi_register_module(), or -ENOMEM if a sysfs name
- * could not be allocated (nodes set up before the failure stay registered).
- */
-INIT int toi_cluster_init(void)
-{
-	/* Room for "node_", the decimal index and the terminator. */
-	enum { NODE_NAME_LEN = 16 };
-	int temp = toi_register_module(&toi_cluster_ops), i;
-	struct kobject *kobj = toi_cluster_ops.dir_kobj;
-
-	for (i = 0; i < MAX_LOCAL_NODES; i++) {
-		char *name;
-
-		node_array[i].current_message = 0;
-		INIT_LIST_HEAD(&node_array[i].member_list);
-		init_waitqueue_head(&node_array[i].member_events);
-		spin_lock_init(&node_array[i].member_list_lock);
-		spin_lock_init(&node_array[i].receive_lock);
-
-		/*
-		 * Set up sysfs entry. The buffer is sized for the string it
-		 * has to hold, not for the pointer that refers to it, and
-		 * the allocation is checked before it is written to.
-		 */
-		name = toi_kzalloc(8, NODE_NAME_LEN, GFP_KERNEL);
-		if (!name)
-			return -ENOMEM;
-		snprintf(name, NODE_NAME_LEN, "node_%d", i);
-
-		node_array[i].sysfs_data.attr.name = name;
-		node_array[i].sysfs_data.attr.mode = SYSFS_RW;
-		node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
-		node_array[i].sysfs_data.flags = 0;
-		node_array[i].sysfs_data.data.integer.variable =
-			(int *) &node_array[i].current_message;
-		node_array[i].sysfs_data.data.integer.minimum = 0;
-		node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
-		node_array[i].sysfs_data.write_side_effect =
-			node_write_side_effect;
-		toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
-	}
-
-	toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
-
-	if (toi_cluster_ops.enabled)
-		toi_cluster_open_iface();
-
-	return temp;
-}
-
-/*
- * toi_cluster_exit - undo toi_cluster_init()
- *
- * Closes the interface, removes every per-node sysfs file and unregisters
- * the module.
- */
-EXIT void toi_cluster_exit(void)
-{
-	int node;
-
-	toi_cluster_close_iface();
-
-	for (node = 0; node < MAX_LOCAL_NODES; node++) {
-		toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
-				&node_array[node].sysfs_data);
-	}
-
-	toi_unregister_module(&toi_cluster_ops);
-}
-
-/*
- * toi_cluster_iface_setup - handle the "toi_cluster=" boot parameter
- * @iface: Text following the '=' on the command line.
- *
- * An empty value or "off" disables the module; anything else enables it
- * and is remembered as the interface name (truncated to fit).
- *
- * Returns: 1, telling the boot code the parameter has been consumed.
- */
-static int __init toi_cluster_iface_setup(char *iface)
-{
-	toi_cluster_ops.enabled = (*iface &&
-			strcmp(iface, "off"));
-
-	if (toi_cluster_ops.enabled) {
-		/*
-		 * Bound the copy by the destination and terminate it
-		 * explicitly; strncpy() does not do so on truncation.
-		 * NOTE(review): assumes toi_cluster_iface holds at least
-		 * IFNAMSIZ bytes, as its sysfs entry implies - confirm.
-		 */
-		strncpy(toi_cluster_iface, iface, IFNAMSIZ - 1);
-		toi_cluster_iface[IFNAMSIZ - 1] = '\0';
-	}
-
-	return 1;
-}
-
-__setup("toi_cluster=", toi_cluster_iface_setup);
diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
deleted file mode 100644
index 84356b304..000000000
--- a/kernel/power/tuxonice_cluster.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * kernel/power/tuxonice_cluster.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifndef TUXONICE_CLUSTER_H
-#define TUXONICE_CLUSTER_H
-
-/*
- * Cluster entry points. Without CONFIG_TOI_CLUSTER they collapse to no-op
- * inline stubs; the include guard keeps those stubs from being defined
- * twice when the header is pulled in more than once.
- */
-#ifdef CONFIG_TOI_CLUSTER
-extern int toi_cluster_init(void);
-extern void toi_cluster_exit(void);
-extern void toi_initiate_cluster_hibernate(void);
-#else
-static inline int toi_cluster_init(void) { return 0; }
-static inline void toi_cluster_exit(void) { }
-static inline void toi_initiate_cluster_hibernate(void) { }
-#endif
-
-#endif /* TUXONICE_CLUSTER_H */
-
diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
deleted file mode 100644
index d118568b7..000000000
--- a/kernel/power/tuxonice_compress.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * kernel/power/compression.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains data compression routines for TuxOnIce,
- * using cryptoapi.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-
-#include "tuxonice_builtin.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-
-/* User-supplied estimate (sysfs "expected_compression", 0-99 percent). */
-static int toi_expected_compression;
-
-static struct toi_module_ops toi_compression_ops;
-/* Next filter in the pipeline; looked up in toi_compress_init(). */
-static struct toi_module_ops *next_driver;
-
-/* Algorithm name passed to crypto_alloc_comp(). */
-static char toi_compressor_name[32] = "lzo";
-
-/* Held while toi_compress_bytes_in/out are updated in the write path. */
-static DEFINE_MUTEX(stats_lock);
-
-/* Per-CPU working state for compression and decompression. */
-struct cpu_context {
- /* One page; receives the data read from the next driver. */
- u8 *page_buffer;
- /* Compression transform, NULL when compression is not set up. */
- struct crypto_comp *transform;
- /* In/out length for crypto_comp_compress() in the write path. */
- unsigned int len;
- /* Mapped address of the page being compressed. */
- u8 *buffer_start;
- /* OUT_BUF_SIZE bytes receiving the compressed output. */
- u8 *output_buffer;
-};
-
-/* Size of each CPU's compression output buffer. */
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_compress_crypto_prepare
- *
- * Set up every online CPU's context: a compression transform, a one page
- * input buffer and an OUT_BUF_SIZE output buffer.
- *
- * Returns: 0 on success, 1 if no compressor is named or the transform
- * cannot be created, -ENOMEM if a buffer cannot be allocated.
- */
-static int toi_compress_crypto_prepare(void)
-{
-	int cpu;
-
-	if (toi_compressor_name[0] == '\0') {
-		printk(KERN_INFO "TuxOnIce: Compression enabled but no "
-				"compressor name set.\n");
-		return 1;
-	}
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *ctx = &per_cpu(contexts, cpu);
-		struct crypto_comp *tfm;
-
-		tfm = crypto_alloc_comp(toi_compressor_name, 0, 0);
-		if (IS_ERR(tfm)) {
-			printk(KERN_INFO "TuxOnIce: Failed to initialise the "
-					"%s compression transform.\n",
-					toi_compressor_name);
-			/* Never leave an error pointer in the context. */
-			ctx->transform = NULL;
-			return 1;
-		}
-		ctx->transform = tfm;
-
-		ctx->page_buffer =
-			(u8 *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
-		if (!ctx->page_buffer) {
-			printk(KERN_ERR
-			  "Failed to allocate a page buffer for TuxOnIce "
-			  "compression driver.\n");
-			return -ENOMEM;
-		}
-
-		ctx->output_buffer = (u8 *) vmalloc_32(OUT_BUF_SIZE);
-		if (!ctx->output_buffer) {
-			printk(KERN_ERR
-			  "Failed to allocate a output buffer for TuxOnIce "
-			  "compression driver.\n");
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * toi_compress_rw_cleanup
- * @writing: Unused.
- *
- * Release the transform and both buffers of every online CPU's context and
- * reset the pointers to NULL.
- *
- * Returns: Always 0.
- */
-static int toi_compress_rw_cleanup(int writing)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
-		if (ctx->transform)
-			crypto_free_comp(ctx->transform);
-		ctx->transform = NULL;
-
-		if (ctx->page_buffer)
-			toi_free_page(16, (unsigned long) ctx->page_buffer);
-		ctx->page_buffer = NULL;
-
-		if (ctx->output_buffer)
-			vfree(ctx->output_buffer);
-		ctx->output_buffer = NULL;
-	}
-
-	return 0;
-}
-
-/*
- * toi_compress_init
- * @toi_or_resume: Zero means there is nothing to do.
- *
- * Reset the byte counters and look up the next filter in the pipeline.
- *
- * Returns: 0 on success (or when @toi_or_resume is zero), -ECHILD if there
- * is no following driver.
- */
-
-static int toi_compress_init(int toi_or_resume)
-{
-	if (!toi_or_resume)
-		return 0;
-
-	toi_compress_bytes_in = 0;
-	toi_compress_bytes_out = 0;
-
-	next_driver = toi_get_next_filter(&toi_compression_ops);
-	if (!next_driver)
-		return -ECHILD;
-
-	return 0;
-}
-
-/*
- * toi_compress_rw_init()
- * @rw: READ when an image is being loaded.
- * @stream_number: Unused.
- *
- * Prepare the per-CPU contexts. A failure is fatal when reading; when
- * writing, compression is simply switched off and the cycle continues.
- *
- * Returns: 0, or -ENODEV if preparation failed while reading.
- */
-
-static int toi_compress_rw_init(int rw, int stream_number)
-{
-	if (!toi_compress_crypto_prepare())
-		return 0;
-
-	printk(KERN_ERR "Failed to initialise compression "
-			"algorithm.\n");
-
-	if (rw == READ) {
-		printk(KERN_INFO "Unable to read the image.\n");
-		return -ENODEV;
-	}
-
-	printk(KERN_INFO "Continuing without "
-			"compressing the image.\n");
-	toi_compression_ops.enabled = 0;
-
-	return 0;
-}
-
-/*
- * toi_compress_write_page()
- * @index: Index passed through to the next module.
- * @buf_type: How @buffer_page must be mapped (see TOI_MAP).
- * @buffer_page: The data to be compressed.
- * @buf_size: Number of bytes in @buffer_page.
- *
- * Compress the data with this CPU's transform and hand the result to the
- * next module in the pipeline. If this CPU has no transform, or the output
- * is not smaller than the input, the original data is passed on untouched.
- *
- * Returns: 0 on success. Otherwise the error from the compressor or from
- * the next module.
- */
-static int toi_compress_write_page(unsigned long index, int buf_type,
-		void *buffer_page, unsigned int buf_size)
-{
-	int cpu = smp_processor_id();
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-	u8 *payload = buffer_page;
-	int payload_len = buf_size;
-	int payload_type = buf_type;
-	int ret = 0;
-
-	if (ctx->transform) {
-		ctx->buffer_start = TOI_MAP(buf_type, buffer_page);
-		ctx->len = OUT_BUF_SIZE;
-
-		ret = crypto_comp_compress(ctx->transform,
-				ctx->buffer_start, buf_size,
-				ctx->output_buffer, &ctx->len);
-
-		TOI_UNMAP(buf_type, buffer_page);
-
-		toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
-				"CPU %d, index %lu: %d bytes",
-				cpu, index, ctx->len);
-
-		/* Use the compressed copy only if it really is smaller. */
-		if (ret == 0 && ctx->len < buf_size) {
-			payload = ctx->output_buffer;
-			payload_len = ctx->len;
-			payload_type = TOI_VIRT;
-		}
-	}
-
-	mutex_lock(&stats_lock);
-	toi_compress_bytes_in += buf_size;
-	toi_compress_bytes_out += payload_len;
-	mutex_unlock(&stats_lock);
-
-	if (ret)
-		return ret;
-
-	return next_driver->write_page(index, payload_type,
-			payload, payload_len);
-}
-
-/*
- * toi_compress_read_page()
- * @index: Receives the index reported by the next module.
- * @buf_type: How @buffer_page must be mapped (see TOI_MAP).
- * @buffer_page: Destination for one page of decompressed data.
- * @buf_size: Passed to the next module when this CPU has no transform;
- *	otherwise only written when decompression yields a short page.
- *
- * Retrieve data from later modules and decompress it into @buffer_page.
- * A full page of input is taken to be uncompressed and copied verbatim.
- *
- * Returns: Zero if successful. Error condition from me or from downstream
- * on failure.
- */
-static int toi_compress_read_page(unsigned long *index, int buf_type,
-		void *buffer_page, unsigned int *buf_size)
-{
-	int ret, cpu = smp_processor_id();
-	unsigned int len = 0;
-	unsigned int outlen = PAGE_SIZE;
-	char *buffer_start;
-	struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
-	if (!ctx->transform)
-		return next_driver->read_page(index, TOI_PAGE, buffer_page,
-				buf_size);
-
-	/*
-	 * All our reads must be synchronous - we can't decompress
-	 * data that hasn't been read yet.
-	 */
-
-	ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len);
-
-	/* On failure @len is not trustworthy, so there is nothing to copy. */
-	if (ret)
-		return ret;
-
-	/* The staging buffer is a single page; reject anything larger. */
-	if (len > PAGE_SIZE) {
-		printk(KERN_ERR "Compressed data length %u exceeds a page.\n",
-				len);
-		return -EIO;
-	}
-
-	/* Map with the same helper the TOI_UNMAP() below pairs with. */
-	buffer_start = TOI_MAP(buf_type, buffer_page);
-
-	/* Uncompressed data */
-	if (len == PAGE_SIZE) {
-		memcpy(buffer_start, ctx->page_buffer, len);
-		goto out;
-	}
-
-	ret = crypto_comp_decompress(
-			ctx->transform,
-			ctx->page_buffer,
-			len, buffer_start, &outlen);
-
-	toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
-			"CPU %d, index %lu: %d=>%d (%d).",
-			cpu, *index, len, outlen, ret);
-
-	if (ret)
-		abort_hibernate(TOI_FAILED_IO,
-				"Compress_read returned %d.\n", ret);
-	else if (outlen != PAGE_SIZE) {
-		abort_hibernate(TOI_FAILED_IO,
-			"Decompression yielded %d bytes instead of %ld.\n",
-			outlen, PAGE_SIZE);
-		printk(KERN_ERR "Decompression yielded %d bytes instead of "
-				"%ld.\n", outlen, PAGE_SIZE);
-		ret = -EIO;
-		*buf_size = outlen;
-	}
-out:
-	TOI_UNMAP(buf_type, buffer_page);
-	return ret;
-}
-
-/*
- * toi_compress_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Report the compressor in use and, once at least a page of data has gone
- * through, the byte totals and the percentage saved.
- *
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_compress_print_debug_stats(char *buffer, int size)
-{
-	unsigned long pages_in, pages_out;
-	int len;
-
-	if (toi_compressor_name[0] != '\0')
-		len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
-				toi_compressor_name);
-	else
-		len = scnprintf(buffer, size, "- Compressor is not set.\n");
-
-	/* No ratio to report until some data has been compressed. */
-	pages_in = toi_compress_bytes_in >> PAGE_SHIFT;
-	if (!pages_in)
-		return len;
-
-	pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
-
-	return len + scnprintf(buffer + len, size - len, " Compressed "
-			"%lu bytes into %lu (%ld percent compression).\n",
-			toi_compress_bytes_in,
-			toi_compress_bytes_out,
-			(pages_in - pages_out) * 100 / pages_in);
-}
-
-/*
- * toi_compress_memory_needed
- *
- * Tell the caller how much memory we need to operate during
- * hibernate/resume.
- *
- * Returns: Number of bytes required, reported as two pages.
- */
-static int toi_compress_memory_needed(void)
-{
-	return PAGE_SIZE * 2;
-}
-
-/*
- * toi_compress_storage_needed
- *
- * Returns: Image header bytes used by toi_compress_save_config_info():
- * two unsigned longs, two ints and the NUL-terminated compressor name.
- */
-static int toi_compress_storage_needed(void)
-{
-	return strlen(toi_compressor_name) + 1 +
-		2 * sizeof(int) + 2 * sizeof(unsigned long);
-}
-
-/*
- * toi_compress_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save information needed when reloading the image at resume time. The
- * layout is: bytes in, bytes out (unsigned long each), expected
- * compression, name length (int each), then the name with its terminator.
- *
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_compress_save_config_info(char *buffer)
-{
-	int name_len = strlen(toi_compressor_name) + 1;
-	char *cursor = buffer;
-
-	*((unsigned long *) cursor) = toi_compress_bytes_in;
-	cursor += sizeof(unsigned long);
-
-	*((unsigned long *) cursor) = toi_compress_bytes_out;
-	cursor += sizeof(unsigned long);
-
-	*((int *) cursor) = toi_expected_compression;
-	cursor += sizeof(int);
-
-	*((int *) cursor) = name_len;
-	cursor += sizeof(int);
-
-	strncpy(cursor, toi_compressor_name, name_len);
-
-	return (cursor - buffer) + name_len;
-}
-
-/*
- * toi_compress_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Reload information needed for decompressing the image at resume time.
- * The name length is read from the image, so it is clamped to the size of
- * toi_compressor_name and the result is always NUL-terminated.
- */
-static void toi_compress_load_config_info(char *buffer, int size)
-{
-	int len, offset = 0;
-
-	toi_compress_bytes_in = *((unsigned long *) buffer);
-	offset += sizeof(unsigned long);
-	toi_compress_bytes_out = *((unsigned long *) (buffer + offset));
-	offset += sizeof(unsigned long);
-	toi_expected_compression = *((int *) (buffer + offset));
-	offset += sizeof(int);
-	len = *((int *) (buffer + offset));
-	offset += sizeof(int);
-
-	/* Never trust a stored length further than the destination allows. */
-	if (len < 0)
-		len = 0;
-	if (len > (int) sizeof(toi_compressor_name) - 1)
-		len = sizeof(toi_compressor_name) - 1;
-
-	strncpy(toi_compressor_name, buffer + offset, len);
-	toi_compressor_name[len] = '\0';
-}
-
-/*
- * toi_compress_pre_atomic_restore
- * @bkd: Boot kernel data block.
- *
- * Stash the byte counters in @bkd before the atomic restore.
- */
-static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
-	bkd->compress_bytes_out = toi_compress_bytes_out;
-	bkd->compress_bytes_in = toi_compress_bytes_in;
-}
-
-/*
- * toi_compress_post_atomic_restore
- * @bkd: Boot kernel data block.
- *
- * Take the byte counters back out of @bkd after the atomic restore.
- */
-static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
-	toi_compress_bytes_out = bkd->compress_bytes_out;
-	toi_compress_bytes_in = bkd->compress_bytes_in;
-}
-
-/*
- * toi_compress_expected_ratio
- *
- * Description: Returns the expected ratio between data passed into this
- *		module and the amount of data output when writing.
- * Returns: 100 if the module is disabled. Otherwise 100 minus the
- *		expected compression set by the user via our sysfs entry.
- */
-
-static int toi_compress_expected_ratio(void)
-{
-	return toi_compression_ops.enabled ?
-		100 - toi_expected_compression : 100;
-}
-
-/*
- * data for our sysfs entries.
- *
- * "expected_compression" accepts 0-99, "enabled" accepts 0 or 1 and
- * "algorithm" is a string limited to 31 characters, one less than the
- * size of toi_compressor_name. None of them has a write side effect.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression,
- 0, 99, 0, NULL),
- SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0,
- NULL),
- SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL),
-};
-
-/*
- * Module description handed to the TuxOnIce core at registration time.
- */
-static struct toi_module_ops toi_compression_ops = {
-	/* Identity */
-	.type			= FILTER_MODULE,
-	.name			= "compression",
-	.directory		= "compression",
-	.module			= THIS_MODULE,
-
-	/* Lifecycle */
-	.initialise		= toi_compress_init,
-	.rw_init		= toi_compress_rw_init,
-	.rw_cleanup		= toi_compress_rw_cleanup,
-
-	/* Data path */
-	.write_page		= toi_compress_write_page,
-	.read_page		= toi_compress_read_page,
-
-	/* Accounting and debug output */
-	.memory_needed		= toi_compress_memory_needed,
-	.storage_needed		= toi_compress_storage_needed,
-	.expected_compression	= toi_compress_expected_ratio,
-	.print_debug_info	= toi_compress_print_debug_stats,
-
-	/* Image header save/restore */
-	.save_config_info	= toi_compress_save_config_info,
-	.load_config_info	= toi_compress_load_config_info,
-	.pre_atomic_restore	= toi_compress_pre_atomic_restore,
-	.post_atomic_restore	= toi_compress_post_atomic_restore,
-
-	/* sysfs interface */
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-					sizeof(sysfs_params[0]),
-};
-
-/* ---- Registration ---- */
-
-/* Register the compression module with the TuxOnIce core at late init. */
-static __init int toi_compress_load(void)
-{
-	int result = toi_register_module(&toi_compression_ops);
-
-	return result;
-}
-
-late_initcall(toi_compress_load);
diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c
deleted file mode 100644
index dc02a4acf..000000000
--- a/kernel/power/tuxonice_copy_before_write.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * kernel/power/tuxonice_copy_before_write.c
- *
- * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines (apart from the fault handling code) to deal with allocating memory
- * for copying pages before they are modified, restoring the contents and getting
- * the contents written to disk.
- */
-
-#include <linux/percpu-defs.h>
-#include <linux/sched.h>
-#include <linux/tuxonice.h>
-#include "tuxonice_alloc.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-
-/* One copy-before-write state per CPU. */
-DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states);
-/* Number of struct toi_cbw descriptors that fit in one page. */
-#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw))
-/* Allocation continues until a CPU has at least this many descriptors. */
-#define toi_cbw_pool_size 100
-
-/*
- * _toi_free_cbw_data - release one CPU's copy-before-write pool
- * @state: The per-CPU state to empty.
- *
- * Walks the descriptor list, freeing each descriptor's data page. The
- * descriptors themselves live in pages too; whenever the walk moves on to
- * a descriptor in a different page, the page just left is freed, and the
- * final one is freed after the loop. The state is then reset to empty.
- */
-static void _toi_free_cbw_data(struct toi_cbw_state *state)
-{
-	struct toi_cbw *desc_page = state->first;
-	struct toi_cbw *entry, *following;
-
-	for (entry = state->first; entry; entry = following) {
-		/* Read the link first: the page holding it may go away. */
-		following = entry->next;
-
-		if (entry->virt)
-			toi__free_page(40, virt_to_page(entry->virt));
-
-		if (((unsigned long) entry & PAGE_MASK) !=
-				(unsigned long) desc_page) {
-			/* Must be on a new page - free the previous one. */
-			toi__free_page(40, virt_to_page(desc_page));
-			desc_page = entry;
-		}
-	}
-
-	if (desc_page)
-		toi__free_page(40, virt_to_page(desc_page));
-
-	state->first = state->next = state->last = NULL;
-	state->size = 0;
-}
-
-/*
- * toi_free_cbw_data - release the copy-before-write pools of all online CPUs
- *
- * For every CPU that has a pool: mark it disabled, keep calling schedule()
- * until it is no longer flagged active, then free it.
- */
-void toi_free_cbw_data(void)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		struct toi_cbw_state *state = &per_cpu(toi_cbw_states, cpu);
-
-		if (state->first) {
-			state->enabled = 0;
-
-			while (state->active)
-				schedule();
-
-			_toi_free_cbw_data(state);
-		}
-	}
-}
-
-/*
- * _toi_allocate_cbw_data - fill one CPU's copy-before-write pool
- * @state: The per-CPU state to populate.
- *
- * Allocates pages of descriptors, and one data page per descriptor, until
- * the pool holds at least toi_cbw_pool_size descriptors. Each descriptor is
- * appended to the list once its data page has been obtained.
- *
- * Returns: 0 on success (the state is then marked enabled), -ENOMEM if an
- * allocation fails. Whatever was linked in before the failure is left for
- * toi_free_cbw_data() to release.
- */
-static int _toi_allocate_cbw_data(struct toi_cbw_state *state)
-{
-	while (state->size < toi_cbw_pool_size) {
-		int i;
-		struct toi_cbw *ptr;
-
-		ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL);
-
-		if (!ptr)
-			return -ENOMEM;
-
-		if (!state->first)
-			state->first = state->next = state->last = ptr;
-
-		for (i = 0; i < CBWS_PER_PAGE; i++) {
-			struct toi_cbw *cbw = &ptr[i];
-
-			cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL);
-			if (!cbw->virt) {
-				state->size += i;
-				printk("Out of memory allocating CBW pages.\n");
-				/*
-				 * If nothing from this descriptor page has
-				 * been linked into the list yet, the free
-				 * path cannot reach it - release it here.
-				 */
-				if (!i && ptr != state->first)
-					toi__free_page(40, virt_to_page(ptr));
-				return -ENOMEM;
-			}
-
-			/* The very first descriptor is already the list head. */
-			if (cbw == state->first)
-				continue;
-
-			state->last->next = cbw;
-			state->last = cbw;
-		}
-
-		state->size += CBWS_PER_PAGE;
-	}
-
-	state->enabled = 1;
-
-	return 0;
-}
-
-
-
-/*
- * toi_allocate_cbw_data - fill the copy-before-write pool of each online CPU
- *
- * Returns: 0 on success, or the first error reported for any CPU (later
- * CPUs are then not attempted).
- */
-int toi_allocate_cbw_data(void)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		int result;
-
-		result = _toi_allocate_cbw_data(&per_cpu(toi_cbw_states, cpu));
-		if (result)
-			return result;
-	}
-
-	return 0;
-}
-
-/*
- * toi_cbw_restore - placeholder for restoring copied pages
- *
- * Currently does nothing, whether or not an image is being kept.
- */
-void toi_cbw_restore(void)
-{
-	if (toi_keeping_image) {
-		/* Nothing is implemented for this case yet. */
-	}
-}
-
-/*
- * toi_cbw_write - placeholder for writing copied pages to disk
- *
- * Currently does nothing, whether or not an image is being kept.
- */
-void toi_cbw_write(void)
-{
-	if (toi_keeping_image) {
-		/* Nothing is implemented for this case yet. */
-	}
-}
-
-/**
- * toi_cbw_test_read - Test copy before write on one page
- * @buffer: Destination for the textual test report.
- * @count: Not used by this function.
- *
- * Allocate copy before write buffers, then make one page only copy-before-write
- * and attempt to write to it. We should then be able to retrieve the original
- * version from the cbw buffer and the modified version from the page itself.
- *
- * Returns: number of bytes written to @buffer, or -ENOMEM if the test page
- * or the cbw buffers could not be allocated.
- */
-static int toi_cbw_test_read(const char *buffer, int count)
-{
-	unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL);
-	char *original = "Original contents";
-	char *modified = "Modified material";
-	struct page *page;
-	unsigned long pfn;
-	int i, len = 0, found = 0;
-
-	/*
-	 * Test the address the allocator returned rather than the struct
-	 * page derived from it.
-	 */
-	if (!virt) {
-		printk("toi_cbw_test_read: Unable to allocate a page for testing.\n");
-		return -ENOMEM;
-	}
-
-	page = virt_to_page(virt);
-	/* Kept as unsigned long so the frame number is not truncated. */
-	pfn = page_to_pfn(page);
-
-	memcpy((char *) virt, original, strlen(original));
-
-	if (toi_allocate_cbw_data()) {
-		printk("toi_cbw_test_read: Unable to allocate cbw data.\n");
-		/* Give back the partial pools and the test page. */
-		toi_free_cbw_data();
-		toi__free_page(40, page);
-		return -ENOMEM;
-	}
-
-	toi_reset_dirtiness_one(pfn, 0);
-
-	SetPageTOI_CBW(page);
-
-	memcpy((char *) virt, modified, strlen(modified));
-
-	if (strncmp((char *) virt, modified, strlen(modified))) {
-		len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n");
-	}
-
-	for_each_online_cpu(i) {
-		struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
-		struct toi_cbw *ptr = state->first, *last_ptr = ptr;
-
-		if (!found) {
-			/* Look for the descriptor that captured our page. */
-			while (ptr) {
-				if (ptr->pfn == pfn) {
-					found = 1;
-					if (strncmp(ptr->virt, original, strlen(original))) {
-						len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n");
-					} else {
-						len += sprintf((char *) buffer + len, "Test passed. Buffer changed and original contents preserved.\n");
-					}
-					break;
-				}
-
-				last_ptr = ptr;
-				ptr = ptr->next;
-			}
-		}
-
-		if (!last_ptr)
-			len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i);
-	}
-
-	if (!found)
-		len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n");
-
-	toi_free_cbw_data();
-
-	/*
-	 * NOTE(review): the test page is still flagged TOI_CBW here and is
-	 * not released; freeing it needs the flag cleared first - confirm
-	 * the matching clear helper before adding that.
-	 */
-	return len;
-}
-
-/*
- * sysfs entries for the copy-before-write debugging module: a single
- * custom "test" entry that is wired to toi_cbw_test_read().
- * NOTE(review): presumably the third SYSFS_CUSTOM argument is the read
- * handler and the fourth (NULL here) the write handler - confirm against
- * the macro definition.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read,
- NULL, SYSFS_NEEDS_SM_FOR_READ, NULL),
-};
-
-/* Module description for the copy-before-write debugging entries. */
-static struct toi_module_ops toi_cbw_ops = {
-	.name			= "copy_before_write debugging",
-	.directory		= "cbw",
-	.type			= MISC_HIDDEN_MODULE,
-	.module			= THIS_MODULE,
-	.early			= 1,
-
-	.sysfs_data		= sysfs_params,
-	.num_sysfs_entries	= sizeof(sysfs_params) /
-					sizeof(sysfs_params[0]),
-};
-
-/*
- * toi_cbw_init - register the copy-before-write debugging module
- *
- * Returns: whatever toi_register_module() returns.
- */
-int toi_cbw_init(void)
-{
-	return toi_register_module(&toi_cbw_ops);
-}
diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
deleted file mode 100644
index 3b558b220..000000000
--- a/kernel/power/tuxonice_extent.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * kernel/power/tuxonice_extent.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * These functions encapsulate the manipulation of storage metadata.
- */
-
-#include <linux/suspend.h>
-#include "tuxonice_modules.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_ui.h"
-#include "tuxonice.h"
-
-/**
- * toi_get_extent - allocate a zeroed extent
- *
- * Returns the new extent, or NULL if the allocation fails.
- **/
-static struct hibernate_extent *toi_get_extent(void)
-{
-	struct hibernate_extent *ext;
-
-	ext = (struct hibernate_extent *) toi_kzalloc(2,
-			sizeof(struct hibernate_extent), TOI_ATOMIC_GFP);
-
-	return ext;
-}
-
-/**
- * toi_put_extent_chain_from - free a chain of extents starting from value 'from'
- * @chain:	Chain to trim.
- * @from:	First value to remove.
- *
- * Note that 'from' is an extent value, and may be part way through an extent.
- * In this case, the extent is truncated so that it ends just before 'from',
- * and following extents are freed. The chain's size, extent count and
- * first/last_touched/current_extent pointers are kept consistent.
- **/
-void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from)
-{
-	struct hibernate_extent *this = chain->first, *prev = NULL;
-
-	while (this) {
-		struct hibernate_extent *next = this->next;
-
-		if (this->start >= from) {
-			/* Delete the whole extent and unlink it. */
-			chain->size -= (this->end - this->start + 1);
-			if (prev)
-				prev->next = next;
-			else
-				chain->first = next;
-			if (chain->last_touched == this)
-				chain->last_touched = NULL;
-			if (chain->current_extent == this)
-				chain->current_extent = NULL;
-			toi_kfree(2, this, sizeof(*this));
-			chain->num_extents--;
-		} else {
-			if (this->end >= from) {
-				/*
-				 * Delete the tail of the extent. start < from
-				 * here, so from - 1 cannot wrap.
-				 */
-				chain->size -= (this->end - from + 1);
-				this->end = from - 1;
-			}
-			/* This extent survives; later removals hang off it. */
-			prev = this;
-		}
-
-		this = next;
-	}
-}
-
-/**
- * toi_put_extent_chain - free a whole chain of extents
- * @chain:	Chain to free.
- *
- * Equivalent to trimming the chain from value zero onwards.
- **/
-void toi_put_extent_chain(struct hibernate_extent_chain *chain)
-{
-	const unsigned long from_the_start = 0;
-
-	toi_put_extent_chain_from(chain, from_the_start);
-}
-
-/**
- * toi_add_to_extent_chain - add an extent to an existing chain
- * @chain:	Chain to which the extent should be added
- * @start:	Start of the extent (first physical block)
- * @end:	End of the extent (last physical block)
- *
- * If the new range directly follows an existing extent, that extent is
- * grown (and merged with its successor when they now touch). Otherwise a
- * new extent is allocated and linked in at the position found.
- *
- * Returns 0 on success, -ENOMEM if a new extent cannot be allocated. The
- * chain information is updated if the insertion is successful.
- **/
-int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
-		unsigned long start, unsigned long end)
-{
-	struct hibernate_extent *new_ext = NULL, *cur_ext = NULL;
-
-	toi_message(TOI_IO, TOI_VERBOSE, 0,
-		"Adding extent %lu-%lu to chain %p.\n", start, end, chain);
-
-	/* Find the right place in the chain */
-	if (chain->last_touched && chain->last_touched->start < start)
-		cur_ext = chain->last_touched;
-	else if (chain->first && chain->first->start < start)
-		cur_ext = chain->first;
-
-	if (cur_ext) {
-		while (cur_ext->next && cur_ext->next->start < start)
-			cur_ext = cur_ext->next;
-
-		if (cur_ext->end == (start - 1)) {
-			struct hibernate_extent *next_ext = cur_ext->next;
-			cur_ext->end = end;
-
-			/* Merge with the following one? */
-			if (next_ext && cur_ext->end + 1 == next_ext->start) {
-				cur_ext->end = next_ext->end;
-				cur_ext->next = next_ext->next;
-				/*
-				 * Don't leave current_extent pointing at the
-				 * extent about to be freed; the merged extent
-				 * now covers its range.
-				 */
-				if (chain->current_extent == next_ext)
-					chain->current_extent = cur_ext;
-				toi_kfree(2, next_ext, sizeof(*next_ext));
-				chain->num_extents--;
-			}
-
-			chain->last_touched = cur_ext;
-			chain->size += (end - start + 1);
-
-			return 0;
-		}
-	}
-
-	new_ext = toi_get_extent();
-	if (!new_ext) {
-		printk(KERN_INFO "Error unable to append a new extent to the "
-				"chain.\n");
-		return -ENOMEM;
-	}
-
-	chain->num_extents++;
-	chain->size += (end - start + 1);
-	new_ext->start = start;
-	new_ext->end = end;
-
-	chain->last_touched = new_ext;
-
-	if (cur_ext) {
-		/* Insert after the extent found above. */
-		new_ext->next = cur_ext->next;
-		cur_ext->next = new_ext;
-	} else {
-		/* Nothing starts before us: become the new head. */
-		if (chain->first)
-			new_ext->next = chain->first;
-		chain->first = new_ext;
-	}
-
-	return 0;
-}
diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h
deleted file mode 100644
index cf1289efc..000000000
--- a/kernel/power/tuxonice_extent.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * kernel/power/tuxonice_extent.h
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations related to extents. Extents are
- * TuxOnIce's method of storing some of the metadata for the image.
- * See tuxonice_extent.c for more info.
- *
- */
-
-#include "tuxonice_modules.h"
-
-#ifndef EXTENT_H
-#define EXTENT_H
-
-/* One contiguous run of values; both bounds are inclusive. */
-struct hibernate_extent {
- unsigned long start, end;
- /* Following extent in the chain, NULL at the tail. */
- struct hibernate_extent *next;
-};
-
-/* A singly linked list of extents plus bookkeeping about it. */
-struct hibernate_extent_chain {
- unsigned long size; /* size of the chain ie sum (max-min+1) */
- /* Number of extents currently linked into the chain. */
- int num_extents;
- /* Head of the list, and the extent most recently added to or grown. */
- struct hibernate_extent *first, *last_touched;
- /* NOTE(review): presumably an iteration cursor kept by users of the
-  * chain - confirm against the callers. */
- struct hibernate_extent *current_extent;
- unsigned long current_offset;
-};
-
-/*
- * Simplify iterating through all the values in an extent chain.
- *
- * Expands to a single for statement (no leading bare "if"), so it can be
- * used safely as the body of an if/else. With an empty chain the extent
- * pointer is set to NULL, the value to 0 and the body never runs.
- */
-#define toi_extent_for_each(extent_chain, extentpointer, value) \
-	for ((extentpointer) = (extent_chain)->first, (value) = \
-			((extentpointer) ? (extentpointer)->start : 0); \
-	     ((extentpointer) && ((extentpointer)->next || (value) <= \
-				 (extentpointer)->end)); \
-	     (((value) == (extentpointer)->end) ? \
-		((extentpointer) = (extentpointer)->next, (value) = \
-		 ((extentpointer) ? (extentpointer)->start : 0)) : \
-			(value)++))
-
-/* Remove every value >= @from from @chain (see tuxonice_extent.c). */
-extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from);
-#endif
diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
deleted file mode 100644
index 607246051..000000000
--- a/kernel/power/tuxonice_file.c
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
- * kernel/power/tuxonice_file.c
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file encapsulates functions for usage of a simple file as a
- * backing store. It is based upon the swapallocator, and shares the
- * same basic working. Here, though, we have nothing to do with
- * swapspace, and only one device to worry about.
- *
- * The user can just
- *
- * echo TuxOnIce > /path/to/my_file
- *
- * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
- *
- * and
- *
- * echo /path/to/my_file > /sys/power/tuxonice/file/target
- *
- * then put what they find in /sys/power/tuxonice/resume
- * as their resume= parameter in lilo.conf (and rerun lilo if using it).
- *
- * Having done this, they're ready to hibernate and resume.
- *
- * TODO:
- * - File resizing.
- */
-
-#include <linux/blkdev.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_io.h"
-
-#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
-
-static struct toi_module_ops toi_fileops;
-
-static struct file *target_file;
-static struct block_device *toi_file_target_bdev;
-static unsigned long pages_available, pages_allocated;
-static char toi_file_target[256];
-static struct inode *target_inode;
-static int file_target_priority;
-static int used_devt;
-static int target_claim;
-static dev_t toi_file_dev_t;
-static int sig_page_index;
-
-/* For test_toi_file_target */
-static struct toi_bdev_info *file_chain;
-
-static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num)
-{
- int j;
- sector_t last = 0;
-
- for (j = 0; j < dev_info->blocks_per_page; j++) {
- sector_t this = bmap(target_inode,
- page_num * dev_info->blocks_per_page + j);
-
- if (!this || (last && (last + 1) != this))
- break;
-
- last = this;
- }
-
- return j == dev_info->blocks_per_page;
-}
-
-static unsigned long get_usable_pages(struct toi_bdev_info *dev_info)
-{
- unsigned long result = 0;
- struct block_device *bdev = dev_info->bdev;
- int i;
-
- switch (target_inode->i_mode & S_IFMT) {
- case S_IFSOCK:
- case S_IFCHR:
- case S_IFIFO: /* Socket, Char, Fifo */
- return -1;
- case S_IFREG: /* Regular file: current size - holes + free
- space on part */
- for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) {
- if (has_contiguous_blocks(dev_info, i))
- result++;
- }
- break;
- case S_IFBLK: /* Block device */
- if (!bdev->bd_disk) {
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "bdev->bd_disk null.");
- return 0;
- }
-
- result = (bdev->bd_part ?
- bdev->bd_part->nr_sects :
- get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9);
- }
-
-
- return result;
-}
-
-static int toi_file_register_storage(void)
-{
- struct toi_bdev_info *devinfo;
- int result = 0;
- struct fs_info *fs_info;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage.");
- if (!strlen(toi_file_target)) {
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: "
- "No target filename set.");
- return 0;
- }
-
- target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0);
- toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.",
- toi_file_target, target_file);
-
- if (IS_ERR(target_file) || !target_file) {
- target_file = NULL;
- toi_file_dev_t = name_to_dev_t(toi_file_target);
- if (!toi_file_dev_t) {
- struct kstat stat;
- int error = vfs_stat(toi_file_target, &stat);
- printk(KERN_INFO "Open file %s returned %p and "
- "name_to_devt failed.\n",
- toi_file_target, target_file);
- if (error) {
- printk(KERN_INFO "Stating the file also failed."
- " Nothing more we can do.\n");
- return 0;
- } else
- toi_file_dev_t = stat.rdev;
- }
-
- toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t);
- if (IS_ERR(toi_file_target_bdev)) {
- printk(KERN_INFO "Got a dev_num (%lx) but failed to "
- "open it.\n",
- (unsigned long) toi_file_dev_t);
- toi_file_target_bdev = NULL;
- return 0;
- }
- used_devt = 1;
- target_inode = toi_file_target_bdev->bd_inode;
- } else
- target_inode = target_file->f_mapping->host;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target.");
- if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
- S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
- printk(KERN_INFO "File support works with regular files,"
- " character files and block devices.\n");
- /* Cleanup routine will undo the above */
- return 0;
- }
-
- if (!used_devt) {
- if (S_ISBLK(target_inode->i_mode)) {
- toi_file_target_bdev = I_BDEV(target_inode);
- if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE |
- FMODE_READ, NULL))
- target_claim = 1;
- } else
- toi_file_target_bdev = target_inode->i_sb->s_bdev;
- if (!toi_file_target_bdev) {
- printk(KERN_INFO "%s is not a valid file allocator "
- "target.\n", toi_file_target);
- return 0;
- }
- toi_file_dev_t = toi_file_target_bdev->bd_dev;
- }
-
- devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC);
- if (!devinfo) {
- printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n");
- return -ENOMEM;
- }
-
- devinfo->bdev = toi_file_target_bdev;
- devinfo->allocator = &toi_fileops;
- devinfo->allocator_index = 0;
-
- fs_info = fs_info_from_block_dev(toi_file_target_bdev);
- if (fs_info && !IS_ERR(fs_info)) {
- memcpy(devinfo->uuid, &fs_info->uuid, 16);
- free_fs_info(fs_info);
- } else
- result = (int) PTR_ERR(fs_info);
-
- /* Unlike swap code, only complain if fs_info_from_block_dev returned
- * -ENOMEM. The 'file' might be a full partition, so might validly not
- * have an identifiable type, UUID etc.
- */
- if (result)
- printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n",
- result);
- devinfo->dev_t = toi_file_dev_t;
- devinfo->prio = file_target_priority;
- devinfo->bmap_shift = target_inode->i_blkbits - 9;
- devinfo->blocks_per_page =
- (1 << (PAGE_SHIFT - target_inode->i_blkbits));
- sprintf(devinfo->name, "file %s", toi_file_target);
- file_chain = devinfo;
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap "
- "shift is %d. Blocks per page %d.",
- devinfo->dev_t, devinfo->prio, devinfo->bmap_shift,
- devinfo->blocks_per_page);
-
- /* Keep one aside for the signature */
- pages_available = get_usable_pages(devinfo) - 1;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu "
- "pages.", pages_available);
-
- toi_bio_ops.register_storage(devinfo);
- return 0;
-}
-
-static unsigned long toi_file_storage_available(void)
-{
- return pages_available;
-}
-
-static int toi_file_allocate_storage(struct toi_bdev_info *chain,
- unsigned long request)
-{
- unsigned long available = pages_available - pages_allocated;
- unsigned long to_add = min(available, request);
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated "
- "is %lu. Allocating %lu pages from file.",
- pages_available, pages_allocated, to_add);
- pages_allocated += to_add;
-
- return to_add;
-}
-
-/**
- * __populate_block_list - add an extent to the chain
- * @min: Start of the extent (first physical block = sector)
- * @max: End of the extent (last physical block = sector)
- *
- * If TOI_TEST_BIO is set, print a debug message, outputting the min and max
- * fs block numbers.
- **/
-static int __populate_block_list(struct toi_bdev_info *chain, int min, int max)
-{
- if (test_action_state(TOI_TEST_BIO))
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.",
- min << chain->bmap_shift,
- ((max + 1) << chain->bmap_shift) - 1);
-
- return toi_add_to_extent_chain(&chain->blocks, min, max);
-}
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
- int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0;
- unsigned long pages_mapped = 0;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks.");
-
- if (chain->blocks.first)
- toi_put_extent_chain(&chain->blocks);
-
- if (!target_is_normal_file()) {
- result = (pages_available > 0) ?
- __populate_block_list(chain, chain->blocks_per_page,
- (pages_allocated + 1) *
- chain->blocks_per_page - 1) : 0;
- return result;
- }
-
- /*
- * FIXME: We are assuming the first page is contiguous. Is that
- * assumption always right?
- */
-
- for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
- sector_t new_sector;
-
- if (!has_contiguous_blocks(chain, i))
- continue;
-
- if (!have_sig_page) {
- have_sig_page = 1;
- sig_page_index = i;
- continue;
- }
-
- pages_mapped++;
-
- /* Ignore first page - it has the header */
- if (pages_mapped == 1)
- continue;
-
- new_sector = bmap(target_inode, (i * chain->blocks_per_page));
-
- /*
- * I'd love to be able to fill in holes and resize
- * files, but not yet...
- */
-
- if (new_sector == extent_max + 1)
- extent_max += chain->blocks_per_page;
- else {
- if (extent_min > -1) {
- result = __populate_block_list(chain,
- extent_min, extent_max);
- if (result)
- return result;
- }
-
- extent_min = new_sector;
- extent_max = extent_min +
- chain->blocks_per_page - 1;
- }
-
- if (pages_mapped == pages_allocated)
- break;
- }
-
- if (extent_min > -1) {
- result = __populate_block_list(chain, extent_min, extent_max);
- if (result)
- return result;
- }
-
- return 0;
-}
-
-static void toi_file_free_storage(struct toi_bdev_info *chain)
-{
- pages_allocated = 0;
- file_chain = NULL;
-}
-
-/**
- * toi_file_print_debug_stats - print debug info
- * @buffer: Buffer to data to populate
- * @size: Size of the buffer
- **/
-static int toi_file_print_debug_stats(char *buffer, int size)
-{
- int len = scnprintf(buffer, size, "- File Allocator active.\n");
-
- len += scnprintf(buffer+len, size-len, " Storage available for "
- "image: %lu pages.\n", pages_available);
-
- return len;
-}
-
-static void toi_file_cleanup(int finishing_cycle)
-{
- if (toi_file_target_bdev) {
- if (target_claim) {
- blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
- target_claim = 0;
- }
-
- if (used_devt) {
- blkdev_put(toi_file_target_bdev,
- FMODE_READ | FMODE_NDELAY);
- used_devt = 0;
- }
- toi_file_target_bdev = NULL;
- target_inode = NULL;
- }
-
- if (target_file) {
- filp_close(target_file, NULL);
- target_file = NULL;
- }
-
- pages_available = 0;
-}
-
-/**
- * test_toi_file_target - sysfs callback for /sys/power/tuxonince/file/target
- *
- * Test wheter the target file is valid for hibernating.
- **/
-static void test_toi_file_target(void)
-{
- int result = toi_file_register_storage();
- sector_t sector;
- char buf[50];
- struct fs_info *fs_info;
-
- if (result || !file_chain)
- return;
-
- /* This doesn't mean we're in business. Is any storage available? */
- if (!pages_available)
- goto out;
-
- toi_file_allocate_storage(file_chain, 1);
- result = get_main_pool_phys_params(file_chain);
- if (result)
- goto out;
-
-
- sector = bmap(target_inode, sig_page_index *
- file_chain->blocks_per_page) << file_chain->bmap_shift;
-
- /* Use the uuid, or the dev_t if that fails */
- fs_info = fs_info_from_block_dev(toi_file_target_bdev);
- if (!fs_info || IS_ERR(fs_info)) {
- bdevname(toi_file_target_bdev, buf);
- sprintf(resume_file, "/dev/%s:%llu", buf,
- (unsigned long long) sector);
- } else {
- int i;
- hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0);
-
- /* Remove the spaces */
- for (i = 1; i < 16; i++) {
- buf[2 * i] = buf[3 * i];
- buf[2 * i + 1] = buf[3 * i + 1];
- }
- buf[32] = 0;
- sprintf(resume_file, "UUID=%s:0x%llx", buf,
- (unsigned long long) sector);
- free_fs_info(fs_info);
- }
-
- toi_attempt_to_parse_resume_device(0);
-out:
- toi_file_free_storage(file_chain);
- toi_bio_ops.free_storage();
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256,
- SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target),
- SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL),
- SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095,
- 4096, 0, NULL),
-};
-
-static struct toi_bio_allocator_ops toi_bio_fileops = {
- .register_storage = toi_file_register_storage,
- .storage_available = toi_file_storage_available,
- .allocate_storage = toi_file_allocate_storage,
- .bmap = get_main_pool_phys_params,
- .free_storage = toi_file_free_storage,
-};
-
-static struct toi_module_ops toi_fileops = {
- .type = BIO_ALLOCATOR_MODULE,
- .name = "file storage",
- .directory = "file",
- .module = THIS_MODULE,
- .print_debug_info = toi_file_print_debug_stats,
- .cleanup = toi_file_cleanup,
- .bio_allocator_ops = &toi_bio_fileops,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-static __init int toi_file_load(void)
-{
- return toi_register_module(&toi_fileops);
-}
-
-late_initcall(toi_file_load);
diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
deleted file mode 100644
index bdcd832f3..000000000
--- a/kernel/power/tuxonice_highlevel.c
+++ /dev/null
@@ -1,1413 +0,0 @@
-/*
- * kernel/power/tuxonice_highlevel.c
- */
-/** \mainpage TuxOnIce.
- *
- * TuxOnIce provides support for saving and restoring an image of
- * system memory to an arbitrary storage device, either on the local computer,
- * or across some network. The support is entirely OS based, so TuxOnIce
- * works without requiring BIOS, APM or ACPI support. The vast majority of the
- * code is also architecture independant, so it should be very easy to port
- * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem
- * and preemption. Initramfses and initrds are also supported.
- *
- * TuxOnIce uses a modular design, in which the method of storing the image is
- * completely abstracted from the core code, as are transformations on the data
- * such as compression and/or encryption (multiple 'modules' can be used to
- * provide arbitrary combinations of functionality). The user interface is also
- * modular, so that arbitrarily simple or complex interfaces can be used to
- * provide anything from debugging information through to eye candy.
- *
- * \section Copyright
- *
- * TuxOnIce is released under the GPLv2.
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR>
- *
- * \section Credits
- *
- * Nigel would like to thank the following people for their work:
- *
- * Bernard Blackham <bernard@blackham.com.au><BR>
- * Web page & Wiki administration, some coding. A person without whom
- * TuxOnIce would not be where it is.
- *
- * Michael Frank <mhf@linuxmail.org><BR>
- * Extensive testing and help with improving stability. I was constantly
- * amazed by the quality and quantity of Michael's help.
- *
- * Pavel Machek <pavel@ucw.cz><BR>
- * Modifications, defectiveness pointing, being with Gabor at the very
- * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and
- * 2.5.17. Even though Pavel and I disagree on the direction suspend to
- * disk should take, I appreciate the valuable work he did in helping Gabor
- * get the concept working.
- *
- * ..and of course the myriads of TuxOnIce users who have helped diagnose
- * and fix bugs, made suggestions on how to improve the code, proofread
- * documentation, and donated time and money.
- *
- * Thanks also to corporate sponsors:
- *
- * <B>Redhat.</B>Sometime employer from May 2006 (my fault, not Redhat's!).
- *
- * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who
- * allowed him to work on TuxOnIce and PM related issues on company time.
- *
- * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct
- * 2003 to Jan 2004.
- *
- * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
- * maintenance of SMP and Highmem support.
- *
- * <B>OSDL.</B> Provided access to various hardware configurations, make
- * occasional small donations to the project.
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include <linux/freezer.h>
-#include <generated/utsrelease.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/writeback.h>
-#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */
-#include <linux/bio.h>
-#include <linux/kgdb.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_cluster.h"
-
-/*! Pageset metadata. */
-struct pagedir pagedir2 = {2};
-
-static mm_segment_t oldfs;
-static DEFINE_MUTEX(tuxonice_in_use);
-static int block_dump_save;
-
-int toi_trace_index;
-
-/* Binary signature if an image is present */
-char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c";
-
-unsigned long boot_kernel_data_buffer;
-
-static char *result_strings[] = {
- "Hibernation was aborted",
- "The user requested that we cancel the hibernation",
- "No storage was available",
- "Insufficient storage was available",
- "Freezing filesystems and/or tasks failed",
- "A pre-existing image was used",
- "We would free memory, but image size limit doesn't allow this",
- "Unable to free enough memory to hibernate",
- "Unable to obtain the Power Management Semaphore",
- "A device suspend/resume returned an error",
- "A system device suspend/resume returned an error",
- "The extra pages allowance is too small",
- "We were unable to successfully prepare an image",
- "TuxOnIce module initialisation failed",
- "TuxOnIce module cleanup failed",
- "I/O errors were encountered",
- "Ran out of memory",
- "An error was encountered while reading the image",
- "Platform preparation failed",
- "CPU Hotplugging failed",
- "Architecture specific preparation failed",
- "Pages needed resaving, but we were told to abort if this happens",
- "We can't hibernate at the moment (invalid resume= or filewriter "
- "target?)",
- "A hibernation preparation notifier chain member cancelled the "
- "hibernation",
- "Pre-snapshot preparation failed",
- "Pre-restore preparation failed",
- "Failed to disable usermode helpers",
- "Can't resume from alternate image",
- "Header reservation too small",
- "Device Power Management Preparation failed",
-};
-
-/**
- * toi_finish_anything - cleanup after doing anything
- * @hibernate_or_resume: Whether finishing a cycle or attempt at
- * resuming.
- *
- * This is our basic clean-up routine, matching start_anything below. We
- * call cleanup routines, drop module references and restore process fs and
- * cpus allowed masks, together with the global block_dump variable's value.
- **/
-void toi_finish_anything(int hibernate_or_resume)
-{
- toi_running = 0;
- toi_cleanup_modules(hibernate_or_resume);
- toi_put_modules();
- if (hibernate_or_resume) {
- block_dump = block_dump_save;
- set_cpus_allowed_ptr(current, cpu_all_mask);
- toi_alloc_print_debug_stats();
- atomic_inc(&snapshot_device_available);
- unlock_system_sleep();
- }
-
- set_fs(oldfs);
- mutex_unlock(&tuxonice_in_use);
-}
-
-/**
- * toi_start_anything - basic initialisation for TuxOnIce
- * @toi_or_resume: Whether starting a cycle or attempt at resuming.
- *
- * Our basic initialisation routine. Take references on modules, use the
- * kernel segment, recheck resume= if no active allocator is set, initialise
- * modules, save and reset block_dump and ensure we're running on CPU0.
- **/
-int toi_start_anything(int hibernate_or_resume)
-{
- mutex_lock(&tuxonice_in_use);
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
-
- toi_trace_index = 0;
-
- if (hibernate_or_resume) {
- lock_system_sleep();
-
- if (!atomic_add_unless(&snapshot_device_available, -1, 0))
- goto snapshotdevice_unavailable;
- }
-
- if (hibernate_or_resume == SYSFS_HIBERNATE)
- toi_print_modules();
-
- if (toi_get_modules()) {
- printk(KERN_INFO "TuxOnIce: Get modules failed!\n");
- goto prehibernate_err;
- }
-
- if (hibernate_or_resume) {
- block_dump_save = block_dump;
- block_dump = 0;
- set_cpus_allowed_ptr(current,
- cpumask_of(cpumask_first(cpu_online_mask)));
- }
-
- if (toi_initialise_modules_early(hibernate_or_resume))
- goto early_init_err;
-
- if (!toiActiveAllocator)
- toi_attempt_to_parse_resume_device(!hibernate_or_resume);
-
- if (!toi_initialise_modules_late(hibernate_or_resume)) {
- toi_running = 1; /* For the swsusp code we use :< */
- return 0;
- }
-
- toi_cleanup_modules(hibernate_or_resume);
-early_init_err:
- if (hibernate_or_resume) {
- block_dump_save = block_dump;
- set_cpus_allowed_ptr(current, cpu_all_mask);
- }
- toi_put_modules();
-prehibernate_err:
- if (hibernate_or_resume)
- atomic_inc(&snapshot_device_available);
-snapshotdevice_unavailable:
- if (hibernate_or_resume)
- mutex_unlock(&pm_mutex);
- set_fs(oldfs);
- mutex_unlock(&tuxonice_in_use);
- return -EBUSY;
-}
-
-/*
- * Nosave page tracking.
- *
- * Here rather than in prepare_image because we want to do it once only at the
- * start of a cycle.
- */
-
-/**
- * mark_nosave_pages - set up our Nosave bitmap
- *
- * Build a bitmap of Nosave pages from the list. The bitmap allows faster
- * use when preparing the image.
- **/
-static void mark_nosave_pages(void)
-{
- struct nosave_region *region;
-
- list_for_each_entry(region, &nosave_regions, list) {
- unsigned long pfn;
-
- for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn)) {
- SetPageNosave(pfn_to_page(pfn));
- }
- }
-}
-
-/**
- * allocate_bitmaps - allocate bitmaps used to record page states
- *
- * Allocate the bitmaps we use to record the various TuxOnIce related
- * page states.
- **/
-static int allocate_bitmaps(void)
-{
- if (toi_alloc_bitmap(&pageset1_map) ||
- toi_alloc_bitmap(&pageset1_copy_map) ||
- toi_alloc_bitmap(&pageset2_map) ||
- toi_alloc_bitmap(&io_map) ||
- toi_alloc_bitmap(&nosave_map) ||
- toi_alloc_bitmap(&free_map) ||
- toi_alloc_bitmap(&compare_map) ||
- toi_alloc_bitmap(&page_resave_map))
- return 1;
-
- return 0;
-}
-
-/**
- * free_bitmaps - free the bitmaps used to record page states
- *
- * Free the bitmaps allocated above. It is not an error to call
- * memory_bm_free on a bitmap that isn't currently allocated.
- **/
-static void free_bitmaps(void)
-{
- toi_free_bitmap(&pageset1_map);
- toi_free_bitmap(&pageset1_copy_map);
- toi_free_bitmap(&pageset2_map);
- toi_free_bitmap(&io_map);
- toi_free_bitmap(&nosave_map);
- toi_free_bitmap(&free_map);
- toi_free_bitmap(&compare_map);
- toi_free_bitmap(&page_resave_map);
-}
-
-/**
- * io_MB_per_second - return the number of MB/s read or written
- * @write: Whether to return the speed at which we wrote.
- *
- * Calculate the number of megabytes per second that were read or written.
- **/
-static int io_MB_per_second(int write)
-{
- return (toi_bkd.toi_io_time[write][1]) ?
- MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ /
- toi_bkd.toi_io_time[write][1] : 0;
-}
-
-#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \
- count - len - 1, ## a); } while (0)
-
-/**
- * get_debug_info - fill a buffer with debugging information
- * @buffer: The buffer to be filled.
- * @count: The size of the buffer, in bytes.
- *
- * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
- * either printk or return via sysfs.
- **/
-static int get_toi_debug_info(const char *buffer, int count)
-{
- int len = 0, i, first_result = 1;
-
- SNPRINTF("TuxOnIce debugging info:\n");
- SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n");
- SNPRINTF("- Kernel Version : " UTS_RELEASE "\n");
- SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
- SNPRINTF("- Attempt number : %d\n", nr_hibernates);
- SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n",
- toi_result,
- toi_bkd.toi_action,
- toi_bkd.toi_debug_state,
- toi_bkd.toi_default_console_level,
- image_size_limit,
- toi_poweroff_method);
- SNPRINTF("- Overall expected compression percentage: %d.\n",
- 100 - toi_expected_compression_ratio());
- len += toi_print_module_debug_info(((char *) buffer) + len,
- count - len - 1);
- if (toi_bkd.toi_io_time[0][1]) {
- if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
- SNPRINTF("- I/O speed: Write %ld KB/s",
- (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
- toi_bkd.toi_io_time[0][1]));
- if (toi_bkd.toi_io_time[1][1])
- SNPRINTF(", Read %ld KB/s",
- (KB((unsigned long)
- toi_bkd.toi_io_time[1][0]) * HZ /
- toi_bkd.toi_io_time[1][1]));
- } else {
- SNPRINTF("- I/O speed: Write %ld MB/s",
- (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
- toi_bkd.toi_io_time[0][1]));
- if (toi_bkd.toi_io_time[1][1])
- SNPRINTF(", Read %ld MB/s",
- (MB((unsigned long)
- toi_bkd.toi_io_time[1][0]) * HZ /
- toi_bkd.toi_io_time[1][1]));
- }
- SNPRINTF(".\n");
- } else
- SNPRINTF("- No I/O speed stats available.\n");
- SNPRINTF("- Extra pages : %lu used/%lu.\n",
- extra_pd1_pages_used, extra_pd1_pages_allowance);
-
- for (i = 0; i < TOI_NUM_RESULT_STATES; i++)
- if (test_result_state(i)) {
- SNPRINTF("%s: %s.\n", first_result ?
- "- Result " :
- " ",
- result_strings[i]);
- first_result = 0;
- }
- if (first_result)
- SNPRINTF("- Result : %s.\n", nr_hibernates ?
- "Succeeded" :
- "No hibernation attempts so far");
- return len;
-}
-
-#ifdef CONFIG_TOI_INCREMENTAL
-/**
- * get_toi_page_state - fill a buffer with page state information
- * @buffer: The buffer to be filled.
- * @count: The size of the buffer, in bytes.
- *
- * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
- * either printk or return via sysfs.
- **/
-static int get_toi_page_state(const char *buffer, int count)
-{
- int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0;
- int len = 0;
- struct zone *zone;
- int allocated_bitmaps = 0;
-
- set_cpus_allowed_ptr(current,
- cpumask_of(cpumask_first(cpu_online_mask)));
-
- if (!free_map) {
- BUG_ON(toi_alloc_bitmap(&free_map));
- allocated_bitmaps = 1;
- }
-
- toi_generate_free_page_map();
-
- for_each_populated_zone(zone) {
- unsigned long loop;
-
- total += zone->spanned_pages;
-
- for (loop = 0; loop < zone->spanned_pages; loop++) {
- unsigned long pfn = zone->zone_start_pfn + loop;
- struct page *page;
- int chunk_size;
-
- if (!pfn_valid(pfn)) {
- continue;
- }
-
- chunk_size = toi_size_of_free_region(zone, pfn);
- if (chunk_size) {
- /*
- * If the page gets allocated, it will be need
- * saving in an image.
- * Don't bother with explicitly removing any
- * RO protection applied below.
- * We'll SetPageTOI_Dirty(page) if/when it
- * gets allocated.
- */
- free += chunk_size;
- loop += chunk_size - 1;
- continue;
- }
-
- page = pfn_to_page(pfn);
-
- if (PageTOI_Untracked(page)) {
- untracked++;
- } else if (PageTOI_RO(page)) {
- ro++;
- } else if (PageTOI_Dirty(page)) {
- dirty++;
- } else {
- printk("Page %ld state 'other'.\n", pfn);
- other++;
- }
- }
- }
-
- if (allocated_bitmaps) {
- toi_free_bitmap(&free_map);
- }
-
- set_cpus_allowed_ptr(current, cpu_all_mask);
-
- SNPRINTF("TuxOnIce page breakdown:\n");
- SNPRINTF("- Free : %d\n", free);
- SNPRINTF("- Untracked : %d\n", untracked);
- SNPRINTF("- Read only : %d\n", ro);
- SNPRINTF("- Dirty : %d\n", dirty);
- SNPRINTF("- Other : %d\n", other);
- SNPRINTF("- Invalid : %d\n", invalid);
- SNPRINTF("- Total : %d\n", total);
- return len;
-}
-#endif
-
-/**
- * do_cleanup - cleanup after attempting to hibernate or resume
- * @get_debug_info: Whether to allocate and return debugging info.
- *
- * Cleanup after attempting to hibernate or resume, possibly getting
- * debugging info as we do so.
- **/
-static void do_cleanup(int get_debug_info, int restarting)
-{
- int i = 0;
- char *buffer = NULL;
-
- trap_non_toi_io = 0;
-
- if (get_debug_info)
- toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
-
- free_checksum_pages();
-
- toi_cbw_restore();
- toi_free_cbw_data();
-
- if (get_debug_info)
- buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP);
-
- if (buffer)
- i = get_toi_debug_info(buffer, PAGE_SIZE);
-
- toi_free_extra_pagedir_memory();
-
- pagedir1.size = 0;
- pagedir2.size = 0;
- set_highmem_size(pagedir1, 0);
- set_highmem_size(pagedir2, 0);
-
- if (boot_kernel_data_buffer) {
- if (!test_toi_state(TOI_BOOT_KERNEL))
- toi_free_page(37, boot_kernel_data_buffer);
- boot_kernel_data_buffer = 0;
- }
-
- if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) {
- unlock_device_hotplug();
- clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
- }
-
- clear_toi_state(TOI_BOOT_KERNEL);
- if (current->flags & PF_SUSPEND_TASK)
- thaw_processes();
-
- if (!restarting)
- toi_stop_other_threads();
-
- if (toi_keeping_image &&
- !test_result_state(TOI_ABORTED)) {
- toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
- "TuxOnIce: Not invalidating the image due "
- "to Keep Image or Incremental Image being enabled.");
- set_result_state(TOI_KEPT_IMAGE);
-
- /*
- * For an incremental image, free unused storage so
- * swap (if any) can be used for normal system operation,
- * if so desired.
- */
-
- toiActiveAllocator->free_unused_storage();
- } else
- if (toiActiveAllocator)
- toiActiveAllocator->remove_image();
-
- free_bitmaps();
- usermodehelper_enable();
-
- if (test_toi_state(TOI_NOTIFIERS_PREPARE)) {
- pm_notifier_call_chain(PM_POST_HIBERNATION);
- clear_toi_state(TOI_NOTIFIERS_PREPARE);
- }
-
- if (buffer && i) {
- /* Printk can only handle 1023 bytes, including
- * its level mangling. */
- for (i = 0; i < 3; i++)
- printk(KERN_ERR "%s", buffer + (1023 * i));
- toi_free_page(20, (unsigned long) buffer);
- }
-
- if (!restarting)
- toi_cleanup_console();
-
- free_attention_list();
-
- if (!restarting)
- toi_deactivate_storage(0);
-
- clear_toi_state(TOI_IGNORE_LOGLEVEL);
- clear_toi_state(TOI_TRYING_TO_RESUME);
- clear_toi_state(TOI_NOW_RESUMING);
-}
-
-/**
- * check_still_keeping_image - we kept an image; check whether to reuse it.
- *
- * We enter this routine when we have kept an image. If the user has said they
- * want to still keep it, all we need to do is powerdown. If powering down
- * means hibernating to ram and the power doesn't run out, we'll return 1.
- * If we do power off properly or the battery runs out, we'll resume via the
- * normal paths.
- *
- * If the user has said they want to remove the previously kept image, we
- * remove it, and return 0. We'll then store a new image.
- **/
-static int check_still_keeping_image(void)
-{
- if (toi_keeping_image) {
- if (!test_action_state(TOI_INCREMENTAL_IMAGE)) {
- printk(KERN_INFO "Image already stored: powering down "
- "immediately.");
- do_toi_step(STEP_HIBERNATE_POWERDOWN);
- return 1;
- }
- /**
- * Incremental image - need to write new part.
- * We detect that we're writing an incremental image by looking
- * at test_result_state(TOI_KEPT_IMAGE)
- **/
- return 0;
- }
-
- printk(KERN_INFO "Invalidating previous image.\n");
- toiActiveAllocator->remove_image();
-
- return 0;
-}
-
-/**
- * toi_init - prepare to hibernate to disk
- *
- * Initialise variables & data structures, in preparation for
- * hibernating to disk.
- **/
-static int toi_init(int restarting)
-{
- int result, i, j;
-
- toi_result = 0;
-
- printk(KERN_INFO "Initiating a hibernation cycle.\n");
-
- nr_hibernates++;
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- toi_bkd.toi_io_time[i][j] = 0;
-
- if (!test_toi_state(TOI_CAN_HIBERNATE) ||
- allocate_bitmaps())
- return 1;
-
- mark_nosave_pages();
-
- if (!restarting)
- toi_prepare_console();
-
- result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (result) {
- set_result_state(TOI_NOTIFIERS_PREPARE_FAILED);
- return 1;
- }
- set_toi_state(TOI_NOTIFIERS_PREPARE);
-
- if (!restarting) {
- printk(KERN_ERR "Starting other threads.");
- toi_start_other_threads();
- }
-
- result = usermodehelper_disable();
- if (result) {
- printk(KERN_ERR "TuxOnIce: Failed to disable usermode "
- "helpers\n");
- set_result_state(TOI_USERMODE_HELPERS_ERR);
- return 1;
- }
-
- boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP);
- if (!boot_kernel_data_buffer) {
- printk(KERN_ERR "TuxOnIce: Failed to allocate "
- "boot_kernel_data_buffer.\n");
- set_result_state(TOI_OUT_OF_MEMORY);
- return 1;
- }
-
- toi_allocate_cbw_data();
-
- return 0;
-}
-
-/**
- * can_hibernate - perform basic 'Can we hibernate?' tests
- *
- * Perform basic tests that must pass if we're going to be able to hibernate:
- * Can we get the pm_mutex? Is resume= valid (we need to know where to write
- * the image header).
- **/
-static int can_hibernate(void)
-{
- if (!test_toi_state(TOI_CAN_HIBERNATE))
- toi_attempt_to_parse_resume_device(0);
-
- if (!test_toi_state(TOI_CAN_HIBERNATE)) {
- printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n"
- "This may be because you haven't put something along "
- "the lines of\n\nresume=swap:/dev/hda1\n\n"
- "in lilo.conf or equivalent. (Where /dev/hda1 is your "
- "swap partition).\n");
- set_abort_result(TOI_CANT_SUSPEND);
- return 0;
- }
-
- if (strlen(alt_resume_param)) {
- attempt_to_parse_alt_resume_param();
-
- if (!strlen(alt_resume_param)) {
- printk(KERN_INFO "Alternate resume parameter now "
- "invalid. Aborting.\n");
- set_abort_result(TOI_CANT_USE_ALT_RESUME);
- return 0;
- }
- }
-
- return 1;
-}
-
-/**
- * do_post_image_write - having written an image, figure out what to do next
- *
- * After writing an image, we might load an alternate image or power down.
- * Powering down might involve hibernating to ram, in which case we also
- * need to handle reloading pageset2.
- **/
-static int do_post_image_write(void)
-{
- /* If switching images fails, do normal powerdown */
- if (alt_resume_param[0])
- do_toi_step(STEP_RESUME_ALT_IMAGE);
-
- toi_power_down();
-
- barrier();
- mb();
- return 0;
-}
-
-/**
- * __save_image - do the hard work of saving the image
- *
- * High level routine for getting the image saved. The key assumptions made
- * are that processes have been frozen and sufficient memory is available.
- *
- * We also exit through here at resume time, coming back from toi_hibernate
- * after the atomic restore. This is the reason for the toi_in_hibernate
- * test.
- **/
-static int __save_image(void)
-{
- int temp_result, did_copy = 0;
-
- toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image..");
-
- toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
- " - Final values: %d and %d.",
- pagedir1.size, pagedir2.size);
-
- toi_cond_pause(1, "About to write pagedir2.");
-
- temp_result = write_pageset(&pagedir2);
-
- if (temp_result == -1 || test_result_state(TOI_ABORTED))
- return 1;
-
- toi_cond_pause(1, "About to copy pageset 1.");
-
- if (test_result_state(TOI_ABORTED))
- return 1;
-
- toi_deactivate_storage(1);
-
- toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
-
- toi_in_hibernate = 1;
-
- if (toi_go_atomic(PMSG_FREEZE, 1))
- goto Failed;
-
- temp_result = toi_hibernate();
-
-#ifdef CONFIG_KGDB
- if (test_action_state(TOI_POST_RESUME_BREAKPOINT))
- kgdb_breakpoint();
-#endif
-
- if (!temp_result)
- did_copy = 1;
-
- /* We return here at resume time too! */
- toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result);
-
-Failed:
- if (toi_activate_storage(1))
- panic("Failed to reactivate our storage.");
-
- /* Resume time? */
- if (!toi_in_hibernate) {
- copyback_post();
- return 0;
- }
-
- /* Nope. Hibernating. So, see if we can save the image... */
-
- if (temp_result || test_result_state(TOI_ABORTED)) {
- if (did_copy)
- goto abort_reloading_pagedir_two;
- else
- return 1;
- }
-
- toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size,
- NULL);
-
- if (test_result_state(TOI_ABORTED))
- goto abort_reloading_pagedir_two;
-
- toi_cond_pause(1, "About to write pageset1.");
-
- toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1");
-
- temp_result = write_pageset(&pagedir1);
-
- /* We didn't overwrite any memory, so no reread needs to be done. */
- if (test_action_state(TOI_TEST_FILTER_SPEED) ||
- test_action_state(TOI_TEST_BIO))
- return 1;
-
- if (temp_result == 1 || test_result_state(TOI_ABORTED))
- goto abort_reloading_pagedir_two;
-
- toi_cond_pause(1, "About to write header.");
-
- if (test_result_state(TOI_ABORTED))
- goto abort_reloading_pagedir_two;
-
- temp_result = write_image_header();
-
- if (!temp_result && !test_result_state(TOI_ABORTED))
- return 0;
-
-abort_reloading_pagedir_two:
- temp_result = read_pageset2(1);
-
- /* If that failed, we're sunk. Panic! */
- if (temp_result)
- panic("Attempt to reload pagedir 2 while aborting "
- "a hibernate failed.");
-
- return 1;
-}
-
-static void map_ps2_pages(int enable)
-{
- unsigned long pfn = 0;
-
- memory_bm_position_reset(pageset2_map);
- pfn = memory_bm_next_pfn(pageset2_map, 0);
-
- while (pfn != BM_END_OF_MAP) {
- struct page *page = pfn_to_page(pfn);
- kernel_map_pages(page, 1, enable);
- pfn = memory_bm_next_pfn(pageset2_map, 0);
- }
-}
-
-/**
- * do_save_image - save the image and handle the result
- *
- * Save the prepared image. If we fail or we're in the path returning
- * from the atomic restore, cleanup.
- **/
-static int do_save_image(void)
-{
- int result;
- map_ps2_pages(0);
- result = __save_image();
- map_ps2_pages(1);
- return result;
-}
-
-/**
- * do_prepare_image - try to prepare an image
- *
- * Seek to initialise and prepare an image to be saved. On failure,
- * cleanup.
- **/
-static int do_prepare_image(void)
-{
- int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
-
- if (!restarting && toi_activate_storage(0))
- return 1;
-
- /*
- * If kept image and still keeping image and hibernating to RAM, (non
- * incremental image case) we will return 1 after hibernating and
- * resuming (provided the power doesn't run out. In that case, we skip
- * directly to cleaning up and exiting.
- */
-
- if (!can_hibernate() ||
- (test_result_state(TOI_KEPT_IMAGE) &&
- check_still_keeping_image()))
- return 1;
-
- if (toi_init(restarting) || toi_prepare_image() ||
- test_result_state(TOI_ABORTED))
- return 1;
-
- trap_non_toi_io = 1;
-
- return 0;
-}
-
-/**
- * do_check_can_resume - find out whether an image has been stored
- *
- * Read whether an image exists. We use the same routine as the
- * image_exists sysfs entry, and just look to see whether the
- * first character in the resulting buffer is a '1'.
- **/
-int do_check_can_resume(void)
-{
- int result = -1;
-
- if (toi_activate_storage(0))
- return -1;
-
- if (!test_toi_state(TOI_RESUME_DEVICE_OK))
- toi_attempt_to_parse_resume_device(1);
-
- if (toiActiveAllocator)
- result = toiActiveAllocator->image_exists(1);
-
- toi_deactivate_storage(0);
- return result;
-}
-
-/**
- * do_load_atomic_copy - load the first part of an image, if it exists
- *
- * Check whether we have an image. If one exists, do sanity checking
- * (possibly invalidating the image or even rebooting if the user
- * requests that) before loading it into memory in preparation for the
- * atomic restore.
- *
- * If and only if we have an image loaded and ready to restore, we return 1.
- **/
-static int do_load_atomic_copy(void)
-{
- int read_image_result = 0;
-
- if (sizeof(swp_entry_t) != sizeof(long)) {
- printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
- " of long. Please report this!\n");
- return 1;
- }
-
- if (!resume_file[0])
- printk(KERN_WARNING "TuxOnIce: "
- "You need to use a resume= command line parameter to "
- "tell TuxOnIce where to look for an image.\n");
-
- toi_activate_storage(0);
-
- if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
- !toi_attempt_to_parse_resume_device(0)) {
- /*
- * Without a usable storage device we can do nothing -
- * even if noresume is given
- */
-
- if (!toiNumAllocators)
- printk(KERN_ALERT "TuxOnIce: "
- "No storage allocators have been registered.\n");
- else
- printk(KERN_ALERT "TuxOnIce: "
- "Missing or invalid storage location "
- "(resume= parameter). Please correct and "
- "rerun lilo (or equivalent) before "
- "hibernating.\n");
- toi_deactivate_storage(0);
- return 1;
- }
-
- if (allocate_bitmaps())
- return 1;
-
- read_image_result = read_pageset1(); /* non fatal error ignored */
-
- if (test_toi_state(TOI_NORESUME_SPECIFIED))
- clear_toi_state(TOI_NORESUME_SPECIFIED);
-
- toi_deactivate_storage(0);
-
- if (read_image_result)
- return 1;
-
- return 0;
-}
-
-/**
- * prepare_restore_load_alt_image - save & restore alt image variables
- *
- * Save and restore the pageset1 maps, when loading an alternate image.
- **/
-static void prepare_restore_load_alt_image(int prepare)
-{
- static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save;
-
- if (prepare) {
- pageset1_map_save = pageset1_map;
- pageset1_map = NULL;
- pageset1_copy_map_save = pageset1_copy_map;
- pageset1_copy_map = NULL;
- set_toi_state(TOI_LOADING_ALT_IMAGE);
- toi_reset_alt_image_pageset2_pfn();
- } else {
- toi_free_bitmap(&pageset1_map);
- pageset1_map = pageset1_map_save;
- toi_free_bitmap(&pageset1_copy_map);
- pageset1_copy_map = pageset1_copy_map_save;
- clear_toi_state(TOI_NOW_RESUMING);
- clear_toi_state(TOI_LOADING_ALT_IMAGE);
- }
-}
-
-/**
- * do_toi_step - perform a step in hibernating or resuming
- *
- * Perform a step in hibernating or resuming an image. This abstraction
- * is in preparation for implementing cluster support, and perhaps replacing
- * uswsusp too (haven't looked whether that's possible yet).
- **/
-int do_toi_step(int step)
-{
- switch (step) {
- case STEP_HIBERNATE_PREPARE_IMAGE:
- return do_prepare_image();
- case STEP_HIBERNATE_SAVE_IMAGE:
- return do_save_image();
- case STEP_HIBERNATE_POWERDOWN:
- return do_post_image_write();
- case STEP_RESUME_CAN_RESUME:
- return do_check_can_resume();
- case STEP_RESUME_LOAD_PS1:
- return do_load_atomic_copy();
- case STEP_RESUME_DO_RESTORE:
- /*
- * If we succeed, this doesn't return.
- * Instead, we return from do_save_image() in the
- * hibernated kernel.
- */
- return toi_atomic_restore();
- case STEP_RESUME_ALT_IMAGE:
- printk(KERN_INFO "Trying to resume alternate image.\n");
- toi_in_hibernate = 0;
- save_restore_alt_param(SAVE, NOQUIET);
- prepare_restore_load_alt_image(1);
- if (!do_check_can_resume()) {
- printk(KERN_INFO "Nothing to resume from.\n");
- goto out;
- }
- if (!do_load_atomic_copy())
- toi_atomic_restore();
-
- printk(KERN_INFO "Failed to load image.\n");
-out:
- prepare_restore_load_alt_image(0);
- save_restore_alt_param(RESTORE, NOQUIET);
- break;
- case STEP_CLEANUP:
- do_cleanup(1, 0);
- break;
- case STEP_QUIET_CLEANUP:
- do_cleanup(0, 0);
- break;
- }
-
- return 0;
-}
-
-/* -- Functions for kickstarting a hibernate or resume --- */
-
-/**
- * toi_try_resume - try to do the steps in resuming
- *
- * Check if we have an image and if so try to resume. Clear the status
- * flags too.
- **/
-void toi_try_resume(void)
-{
- set_toi_state(TOI_TRYING_TO_RESUME);
- resume_attempted = 1;
-
- current->flags |= PF_MEMALLOC;
- toi_start_other_threads();
-
- if (do_toi_step(STEP_RESUME_CAN_RESUME) &&
- !do_toi_step(STEP_RESUME_LOAD_PS1))
- do_toi_step(STEP_RESUME_DO_RESTORE);
-
- toi_stop_other_threads();
- do_cleanup(0, 0);
-
- current->flags &= ~PF_MEMALLOC;
-
- clear_toi_state(TOI_IGNORE_LOGLEVEL);
- clear_toi_state(TOI_TRYING_TO_RESUME);
- clear_toi_state(TOI_NOW_RESUMING);
-}
-
-/**
- * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume
- *
- * Wrapper for when __toi_try_resume is called from swsusp resume path,
- * rather than from echo > /sys/power/tuxonice/do_resume.
- **/
-static void toi_sys_power_disk_try_resume(void)
-{
- resume_attempted = 1;
-
- /*
- * There's a comment in kernel/power/disk.c that indicates
- * we should be able to use mutex_lock_nested below. That
- * doesn't seem to cut it, though, so let's just turn lockdep
- * off for now.
- */
- lockdep_off();
-
- if (toi_start_anything(SYSFS_RESUMING))
- goto out;
-
- toi_try_resume();
-
- /*
- * For initramfs, we have to clear the boot time
- * flag after trying to resume
- */
- clear_toi_state(TOI_BOOT_TIME);
-
- toi_finish_anything(SYSFS_RESUMING);
-out:
- lockdep_on();
-}
-
-/**
- * toi_try_hibernate - try to start a hibernation cycle
- *
- * Start a hibernation cycle, coming in from either
- * echo > /sys/power/tuxonice/do_suspend
- *
- * or
- *
- * echo disk > /sys/power/state
- *
- * In the later case, we come in without pm_sem taken; in the
- * former, it has been taken.
- **/
-int toi_try_hibernate(void)
-{
- int result = 0, sys_power_disk = 0, retries = 0;
-
- if (!mutex_is_locked(&tuxonice_in_use)) {
- /* Came in via /sys/power/disk */
- if (toi_start_anything(SYSFS_HIBERNATING))
- return -EBUSY;
- sys_power_disk = 1;
- }
-
- current->flags |= PF_MEMALLOC;
-
- if (test_toi_state(TOI_CLUSTER_MODE)) {
- toi_initiate_cluster_hibernate();
- goto out;
- }
-
-prepare:
- result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
-
- if (result)
- goto out;
-
- if (test_action_state(TOI_FREEZER_TEST))
- goto out_restore_gfp_mask;
-
- result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
-
- if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) {
- if (retries < 2) {
- do_cleanup(0, 1);
- retries++;
- clear_result_state(TOI_ABORTED);
- extra_pd1_pages_allowance = extra_pd1_pages_used + 500;
- printk(KERN_INFO "Automatically adjusting the extra"
- " pages allowance to %ld and restarting.\n",
- extra_pd1_pages_allowance);
- pm_restore_gfp_mask();
- goto prepare;
- }
-
- printk(KERN_INFO "Adjusted extra pages allowance twice and "
- "still couldn't hibernate successfully. Giving up.");
- }
-
- /* This code runs at resume time too! */
- if (!result && toi_in_hibernate)
- result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
-
-out_restore_gfp_mask:
- pm_restore_gfp_mask();
-out:
- do_cleanup(1, 0);
- current->flags &= ~PF_MEMALLOC;
-
- if (sys_power_disk)
- toi_finish_anything(SYSFS_HIBERNATING);
-
- return result;
-}
-
-/*
- * channel_no: If !0, -c <channel_no> is added to args (userui).
- */
-int toi_launch_userspace_program(char *command, int channel_no,
- int wait, int debug)
-{
- int retval;
- static char *envp[] = {
- "HOME=/",
- "TERM=linux",
- "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL };
- static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
- };
- char *channel = NULL;
- int arg = 0, size;
- char test_read[255];
- char *orig_posn = command;
-
- if (!strlen(orig_posn))
- return 1;
-
- if (channel_no) {
- channel = toi_kzalloc(4, 6, GFP_KERNEL);
- if (!channel) {
- printk(KERN_INFO "Failed to allocate memory in "
- "preparing to launch userspace program.\n");
- return 1;
- }
- }
-
- /* Up to 6 args supported */
- while (arg < 6) {
- sscanf(orig_posn, "%s", test_read);
- size = strlen(test_read);
- if (!(size))
- break;
- argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
- strcpy(argv[arg], test_read);
- orig_posn += size + 1;
- *test_read = 0;
- arg++;
- }
-
- if (channel_no) {
- sprintf(channel, "-c%d", channel_no);
- argv[arg] = channel;
- } else
- arg--;
-
- if (debug) {
- argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP);
- strcpy(argv[arg], "--debug");
- }
-
- retval = call_usermodehelper(argv[0], argv, envp, wait);
-
- /*
- * If the program reports an error, retval = 256. Don't complain
- * about that here.
- */
- if (retval && retval != 256)
- printk(KERN_ERR "Failed to launch userspace program '%s': "
- "Error %d\n", command, retval);
-
- {
- int i;
- for (i = 0; i < arg; i++)
- if (argv[i] && argv[i] != channel)
- toi_kfree(5, argv[i], sizeof(*argv[i]));
- }
-
- toi_kfree(4, channel, sizeof(*channel));
-
- return retval;
-}
-
-/*
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_LONG("extra_pages_allowance", SYSFS_RW,
- &extra_pd1_pages_allowance, 0, LONG_MAX, 0),
- SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read,
- image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL),
- SYSFS_STRING("resume", SYSFS_RW, resume_file, 255,
- SYSFS_NEEDS_SM_FOR_WRITE,
- attempt_to_parse_resume_device2),
- SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255,
- SYSFS_NEEDS_SM_FOR_WRITE,
- attempt_to_parse_alt_resume_param),
- SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0,
- NULL),
- SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action,
- TOI_IGNORE_ROOTFS, 0),
- SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2,
- INT_MAX, 0),
- SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0),
- SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_MULTITHREADED_IO, 0),
- SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_FLUSHER_THREAD, 0),
- SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action,
- TOI_PAGESET2_FULL, 0),
- SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0),
- SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action,
- TOI_REPLACE_SWSUSP, 0),
- SYSFS_STRING("resume_commandline", SYSFS_RW,
- toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0,
- NULL),
- SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL),
- SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action,
- TOI_FREEZER_TEST, 0),
- SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0),
- SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action,
- TOI_TEST_FILTER_SPEED, 0),
- SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_PAGESET2, 0),
- SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_PS2_IF_UNNEEDED, 0),
- SYSFS_STRING("binary_signature", SYSFS_READONLY,
- tuxonice_signature, 9, 0, NULL),
- SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0,
- NULL),
-#ifdef CONFIG_KGDB
- SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action,
- TOI_POST_RESUME_BREAKPOINT, 0),
-#endif
- SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action,
- TOI_NO_READAHEAD, 0),
- SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action,
- TOI_TRACE_DEBUG_ON, 0),
-#ifdef CONFIG_TOI_KEEP_IMAGE
- SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE,
- 0),
-#endif
-#ifdef CONFIG_TOI_INCREMENTAL
- SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0,
- NULL),
- SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action,
- TOI_INCREMENTAL_IMAGE, 1),
-#endif
-};
-
-static struct toi_core_fns my_fns = {
- .get_nonconflicting_page = __toi_get_nonconflicting_page,
- .post_context_save = __toi_post_context_save,
- .try_hibernate = toi_try_hibernate,
- .try_resume = toi_sys_power_disk_try_resume,
-};
-
-/**
- * core_load - initialisation of TuxOnIce core
- *
- * Initialise the core, beginning with sysfs. Checksum and so on are part of
- * the core, but have their own initialisation routines because they either
- * aren't compiled in all the time or have their own subdirectories.
- **/
-static __init int core_load(void)
-{
- int i,
- numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
- printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION
- " (http://tuxonice.net)\n");
-
- if (!hibernation_available()) {
- printk(KERN_INFO "TuxOnIce disabled due to request for hibernation"
- " to be disabled in this kernel.\n");
- return 1;
- }
-
- if (toi_sysfs_init())
- return 1;
-
- for (i = 0; i < numfiles; i++)
- toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-
- toi_core_fns = &my_fns;
-
- if (toi_alloc_init())
- return 1;
- if (toi_checksum_init())
- return 1;
- if (toi_usm_init())
- return 1;
- if (toi_ui_init())
- return 1;
- if (toi_poweroff_init())
- return 1;
- if (toi_cluster_init())
- return 1;
- if (toi_cbw_init())
- return 1;
-
- return 0;
-}
-
-late_initcall(core_load);
diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c
deleted file mode 100644
index c5a09789e..000000000
--- a/kernel/power/tuxonice_incremental.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * kernel/power/tuxonice_incremental.c
- *
- * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains routines related to storing incremental images - that
- * is, retaining an image after an initial cycle and then storing incremental
- * changes on subsequent hibernations.
- *
- * Based in part on on...
- *
- * Debug helper to dump the current kernel pagetables of the system
- * so that we can see what the various memory ranges are set to.
- *
- * (C) Copyright 2008 Intel Corporation
- *
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/mm.h>
-#include <linux/tuxonice.h>
-#include <linux/sched.h>
-#include <asm/pgtable.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/page.h>
-#include "tuxonice_pageflags.h"
-#include "tuxonice_builtin.h"
-#include "power.h"
-
-int toi_do_incremental_initcall;
-
-extern void kdb_init(int level);
-extern noinline void kgdb_breakpoint(void);
-
-#undef pr_debug
-#if 0
-#define pr_debug(a, b...) do { printk(a, ##b); } while(0)
-#else
-#define pr_debug(a, b...) do { } while(0)
-#endif
-
-/* Multipliers for offsets within the PTEs */
-#define PTE_LEVEL_MULT (PAGE_SIZE)
-#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
-#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
-
-/*
- * This function gets called on a break in a continuous series
- * of PTE entries; the next one is different so we need to
- * print what we collected so far.
- */
-static void note_page(void *addr)
-{
- static struct page *lastpage;
- struct page *page;
-
- page = virt_to_page(addr);
-
- if (page != lastpage) {
- unsigned int level;
- pte_t *pte = lookup_address((unsigned long) addr, &level);
- struct page *pt_page2 = pte_page(*pte);
- //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2));
- SetPageTOI_Untracked(pt_page2);
- lastpage = page;
- }
-}
-
-static void walk_pte_level(pmd_t addr)
-{
- int i;
- pte_t *start;
-
- start = (pte_t *) pmd_page_vaddr(addr);
- for (i = 0; i < PTRS_PER_PTE; i++) {
- note_page(start);
- start++;
- }
-}
-
-#if PTRS_PER_PMD > 1
-
-static void walk_pmd_level(pud_t addr)
-{
- int i;
- pmd_t *start;
-
- start = (pmd_t *) pud_page_vaddr(addr);
- for (i = 0; i < PTRS_PER_PMD; i++) {
- if (!pmd_none(*start)) {
- if (pmd_large(*start) || !pmd_present(*start))
- note_page(start);
- else
- walk_pte_level(*start);
- } else
- note_page(start);
- start++;
- }
-}
-
-#else
-#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a)))
-#define pud_large(a) pmd_large(__pmd(pud_val(a)))
-#define pud_none(a) pmd_none(__pmd(pud_val(a)))
-#endif
-
-#if PTRS_PER_PUD > 1
-
-static void walk_pud_level(pgd_t addr)
-{
- int i;
- pud_t *start;
-
- start = (pud_t *) pgd_page_vaddr(addr);
-
- for (i = 0; i < PTRS_PER_PUD; i++) {
- if (!pud_none(*start)) {
- if (pud_large(*start) || !pud_present(*start))
- note_page(start);
- else
- walk_pmd_level(*start);
- } else
- note_page(start);
-
- start++;
- }
-}
-
-#else
-#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a)))
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
-#endif
-
-/*
- * Not static in the original at the time of writing, so needs renaming here.
- */
-static void toi_ptdump_walk_pgd_level(pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
- pgd_t *start = (pgd_t *) &init_level4_pgt;
-#else
- pgd_t *start = swapper_pg_dir;
-#endif
- int i;
- if (pgd) {
- start = pgd;
- }
-
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (!pgd_none(*start)) {
- if (pgd_large(*start) || !pgd_present(*start))
- note_page(start);
- else
- walk_pud_level(*start);
- } else
- note_page(start);
-
- start++;
- }
-
- /* Flush out the last page */
- note_page(start);
-}
-
-#ifdef CONFIG_PARAVIRT
-extern struct pv_info pv_info;
-
-static void toi_set_paravirt_ops_untracked(void) {
- int i;
-
- unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)),
- pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end));
- //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end);
- for (i = pvpfn; i <= pvpfn_end; i++) {
- SetPageTOI_Untracked(pfn_to_page(i));
- }
-}
-#else
-#define toi_set_paravirt_ops_untracked() { do { } while(0) }
-#endif
-
-extern void toi_mark_per_cpus_pages_untracked(void);
-
-void toi_untrack_stack(unsigned long *stack)
-{
- int i;
- struct page *stack_page = virt_to_page(stack);
-
- for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) {
- pr_debug("Untrack stack page %p.\n", page_address(stack_page + i));
- SetPageTOI_Untracked(stack_page + i);
- }
-}
-void toi_untrack_process(struct task_struct *p)
-{
- SetPageTOI_Untracked(virt_to_page(p));
- pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p)));
-
- toi_untrack_stack(p->stack);
-}
-
-void toi_generate_untracked_map(void)
-{
- struct task_struct *p, *t;
- struct page *page;
- pte_t *pte;
- int i;
- unsigned int level;
- static int been_here = 0;
-
- if (been_here)
- return;
-
- been_here = 1;
-
- /* Pagetable pages */
- toi_ptdump_walk_pgd_level(NULL);
-
- /* Printk buffer - not normally needed but can be helpful for debugging. */
- //toi_set_logbuf_untracked();
-
- /* Paravirt ops */
- toi_set_paravirt_ops_untracked();
-
- /* Task structs and stacks */
- for_each_process_thread(p, t) {
- toi_untrack_process(p);
- //toi_untrack_stack((unsigned long *) t->thread.sp);
- }
-
- for (i = 0; i < NR_CPUS; i++) {
- struct task_struct *idle = idle_task(i);
-
- if (idle) {
- pr_debug("Untrack idle process for CPU %d.\n", i);
- toi_untrack_process(idle);
- }
-
- /* IRQ stack */
- pr_debug("Untrack IRQ stack for CPU %d.\n", i);
- toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
- }
-
- /* Per CPU data */
- //pr_debug("Untracking per CPU variable pages.\n");
- toi_mark_per_cpus_pages_untracked();
-
- /* Init stack - for bringing up secondary CPUs */
- page = virt_to_page(init_stack);
- for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
- SetPageTOI_Untracked(page + i);
- }
-
- pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
- SetPageTOI_Untracked(pte_page(*pte));
- SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
-}
-
-/**
- * toi_reset_dirtiness_one
- */
-
-void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
-{
- struct page *page = pfn_to_page(pfn);
-
- /**
- * Don't worry about whether the Dirty flag is
- * already set. If this is our first call, it
- * won't be.
- */
-
- preempt_disable();
-
- ClearPageTOI_Dirty(page);
- SetPageTOI_RO(page);
- if (verbose)
- printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));
-
- set_memory_ro((unsigned long) page_address(page), 1);
-
- preempt_enable();
-}
-
-/**
- * TuxOnIce's incremental image support works by marking all memory apart from
- * the page tables read-only, then in the page-faults that result enabling
- * writing if appropriate and flagging the page as dirty. Free pages are also
- * marked as dirty and not protected so that if allocated, they will be included
- * in the image without further processing.
- *
- * toi_reset_dirtiness is called when and image exists and incremental images are
- * enabled, and each time we resume thereafter. It is not invoked on a fresh boot.
- *
- * This routine should be called from a single-cpu-running context to avoid races in setting
- * page dirty/read only flags.
- *
- * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it!
- *
- * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
- **/
-
-int toi_reset_dirtiness(int verbose)
-{
- struct zone *zone;
- unsigned long loop;
- int allocated_map = 0;
-
- toi_generate_untracked_map();
-
- if (!free_map) {
- if (!toi_alloc_bitmap(&free_map))
- return -ENOMEM;
- allocated_map = 1;
- }
-
- toi_generate_free_page_map();
-
- pr_debug(KERN_EMERG "Reset dirtiness.\n");
- for_each_populated_zone(zone) {
- // 64 bit only. No need to worry about highmem.
- for (loop = 0; loop < zone->spanned_pages; loop++) {
- unsigned long pfn = zone->zone_start_pfn + loop;
- struct page *page;
- int chunk_size;
-
- if (!pfn_valid(pfn)) {
- continue;
- }
-
- chunk_size = toi_size_of_free_region(zone, pfn);
- if (chunk_size) {
- loop += chunk_size - 1;
- continue;
- }
-
- page = pfn_to_page(pfn);
-
- if (PageNosave(page) || !saveable_page(zone, pfn)) {
- continue;
- }
-
- if (PageTOI_Untracked(page)) {
- continue;
- }
-
- /**
- * Do we need to (re)protect the page?
- * If it is already protected (PageTOI_RO), there is
- * nothing to do - skip the following.
- * If it is marked as dirty (PageTOI_Dirty), it was
- * either free and has been allocated or has been
- * written to and marked dirty. Reset the dirty flag
- * and (re)apply the protection.
- */
- if (!PageTOI_RO(page)) {
- toi_reset_dirtiness_one(pfn, verbose);
- }
- }
- }
-
- pr_debug(KERN_EMERG "Done resetting dirtiness.\n");
-
- if (allocated_map) {
- toi_free_bitmap(&free_map);
- }
- return 0;
-}
-
-static int toi_reset_dirtiness_initcall(void)
-{
- if (toi_do_incremental_initcall) {
- pr_info("TuxOnIce: Enabling dirty page tracking.\n");
- toi_reset_dirtiness(0);
- }
- return 1;
-}
-extern void toi_generate_untracked_map(void);
-
-// Leave early_initcall for pages to register untracked sections.
-early_initcall(toi_reset_dirtiness_initcall);
-
-static int __init toi_incremental_initcall_setup(char *str)
-{
- int value;
-
- if (sscanf(str, "=%d", &value) && value)
- toi_do_incremental_initcall = value;
-
- return 1;
-}
-__setup("toi_incremental_initcall", toi_incremental_initcall_setup);
diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c
deleted file mode 100644
index 91b0c4fd0..000000000
--- a/kernel/power/tuxonice_io.c
+++ /dev/null
@@ -1,1932 +0,0 @@
-/*
- * kernel/power/tuxonice_io.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains high level IO routines for hibernating.
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/version.h>
-#include <linux/utsname.h>
-#include <linux/mount.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-#include <linux/cpu.h>
-#include <linux/fs_struct.h>
-#include <linux/bio.h>
-#include <linux/fs_uuid.h>
-#include <linux/kmod.h>
-#include <asm/tlbflush.h>
-
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_pageflags.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_alloc.h"
-char alt_resume_param[256];
-
-/* Version read from image header at resume */
-static int toi_image_header_version;
-
-#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \
- if (likely(toi_image_header_version >= VERS)) \
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, \
- (char *) &VAR, sizeof(VAR))) { \
- abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \
- ERR_ACT; \
- } \
-} while(0) \
-
-/* Variables shared between threads and updated under the mutex */
-static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result;
-static int io_index, io_nextupdate, io_pc, io_pc_step;
-static DEFINE_MUTEX(io_mutex);
-static DEFINE_PER_CPU(struct page *, last_sought);
-static DEFINE_PER_CPU(struct page *, last_high_page);
-static DEFINE_PER_CPU(char *, checksum_locn);
-static DEFINE_PER_CPU(struct pbe *, last_low_page);
-static atomic_t io_count;
-atomic_t toi_io_workers;
-
-static int using_flusher;
-
-DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher);
-
-int toi_bio_queue_flusher_should_finish;
-
-int toi_max_workers;
-
-static char *image_version_error = "The image header version is newer than " \
- "this kernel supports.";
-
-struct toi_module_ops *first_filter;
-
-static atomic_t toi_num_other_threads;
-static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue);
-enum toi_worker_commands {
- TOI_IO_WORKER_STOP,
- TOI_IO_WORKER_RUN,
- TOI_IO_WORKER_EXIT
-};
-static enum toi_worker_commands toi_worker_command;
-
-/**
- * toi_attempt_to_parse_resume_device - determine if we can hibernate
- *
- * Can we hibernate, using the current resume= parameter?
- **/
-int toi_attempt_to_parse_resume_device(int quiet)
-{
- struct list_head *Allocator;
- struct toi_module_ops *thisAllocator;
- int result, returning = 0;
-
- if (toi_activate_storage(0))
- return 0;
-
- toiActiveAllocator = NULL;
- clear_toi_state(TOI_RESUME_DEVICE_OK);
- clear_toi_state(TOI_CAN_RESUME);
- clear_result_state(TOI_ABORTED);
-
- if (!toiNumAllocators) {
- if (!quiet)
- printk(KERN_INFO "TuxOnIce: No storage allocators have "
- "been registered. Hibernating will be "
- "disabled.\n");
- goto cleanup;
- }
-
- list_for_each(Allocator, &toiAllocators) {
- thisAllocator = list_entry(Allocator, struct toi_module_ops,
- type_list);
-
- /*
- * Not sure why you'd want to disable an allocator, but
- * we should honour the flag if we're providing it
- */
- if (!thisAllocator->enabled)
- continue;
-
- result = thisAllocator->parse_sig_location(
- resume_file, (toiNumAllocators == 1),
- quiet);
-
- switch (result) {
- case -EINVAL:
- /* For this allocator, but not a valid
- * configuration. Error already printed. */
- goto cleanup;
-
- case 0:
- /* For this allocator and valid. */
- toiActiveAllocator = thisAllocator;
-
- set_toi_state(TOI_RESUME_DEVICE_OK);
- set_toi_state(TOI_CAN_RESUME);
- returning = 1;
- goto cleanup;
- }
- }
- if (!quiet)
- printk(KERN_INFO "TuxOnIce: No matching enabled allocator "
- "found. Resuming disabled.\n");
-cleanup:
- toi_deactivate_storage(0);
- return returning;
-}
-
-void attempt_to_parse_resume_device2(void)
-{
- toi_prepare_usm();
- toi_attempt_to_parse_resume_device(0);
- toi_cleanup_usm();
-}
-
-void save_restore_alt_param(int replace, int quiet)
-{
- static char resume_param_save[255];
- static unsigned long toi_state_save;
-
- if (replace) {
- toi_state_save = toi_state;
- strcpy(resume_param_save, resume_file);
- strcpy(resume_file, alt_resume_param);
- } else {
- strcpy(resume_file, resume_param_save);
- toi_state = toi_state_save;
- }
- toi_attempt_to_parse_resume_device(quiet);
-}
-
-void attempt_to_parse_alt_resume_param(void)
-{
- int ok = 0;
-
- /* Temporarily set resume_param to the poweroff value */
- if (!strlen(alt_resume_param))
- return;
-
- printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n");
- save_restore_alt_param(SAVE, NOQUIET);
- if (test_toi_state(TOI_CAN_RESUME))
- ok = 1;
-
- printk(KERN_INFO "=== Done ===\n");
- save_restore_alt_param(RESTORE, QUIET);
-
- /* If not ok, clear the string */
- if (ok)
- return;
-
- printk(KERN_INFO "Can't resume from that location; clearing "
- "alt_resume_param.\n");
- alt_resume_param[0] = '\0';
-}
-
-/**
- * noresume_reset_modules - reset data structures in case of non resuming
- *
- * When we read the start of an image, modules (and especially the
- * active allocator) might need to reset data structures if we
- * decide to remove the image rather than resuming from it.
- **/
-static void noresume_reset_modules(void)
-{
- struct toi_module_ops *this_filter;
-
- list_for_each_entry(this_filter, &toi_filters, type_list)
- if (this_filter->noresume_reset)
- this_filter->noresume_reset();
-
- if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
- toiActiveAllocator->noresume_reset();
-}
-
-/**
- * fill_toi_header - fill the hibernate header structure
- * @struct toi_header: Header data structure to be filled.
- **/
-static int fill_toi_header(struct toi_header *sh)
-{
- int i, error;
-
- error = init_header((struct swsusp_info *) sh);
- if (error)
- return error;
-
- sh->pagedir = pagedir1;
- sh->pageset_2_size = pagedir2.size;
- sh->param0 = toi_result;
- sh->param1 = toi_bkd.toi_action;
- sh->param2 = toi_bkd.toi_debug_state;
- sh->param3 = toi_bkd.toi_default_console_level;
- sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
- for (i = 0; i < 4; i++)
- sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
- sh->bkd = boot_kernel_data_buffer;
- return 0;
-}
-
-/**
- * rw_init_modules - initialize modules
- * @rw: Whether we are reading of writing an image.
- * @which: Section of the image being processed.
- *
- * Iterate over modules, preparing the ones that will be used to read or write
- * data.
- **/
-static int rw_init_modules(int rw, int which)
-{
- struct toi_module_ops *this_module;
- /* Initialise page transformers */
- list_for_each_entry(this_module, &toi_filters, type_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->rw_init && this_module->rw_init(rw, which)) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Failed to initialize the %s filter.",
- this_module->name);
- return 1;
- }
- }
-
- /* Initialise allocator */
- if (toiActiveAllocator->rw_init(rw, which)) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Failed to initialise the allocator.");
- return 1;
- }
-
- /* Initialise other modules */
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- this_module->type == FILTER_MODULE ||
- this_module->type == WRITER_MODULE)
- continue;
- if (this_module->rw_init && this_module->rw_init(rw, which)) {
- set_abort_result(TOI_FAILED_MODULE_INIT);
- printk(KERN_INFO "Setting aborted flag due to module "
- "init failure.\n");
- return 1;
- }
- }
-
- return 0;
-}
-
-/**
- * rw_cleanup_modules - cleanup modules
- * @rw: Whether we are reading of writing an image.
- *
- * Cleanup components after reading or writing a set of pages.
- * Only the allocator may fail.
- **/
-static int rw_cleanup_modules(int rw)
-{
- struct toi_module_ops *this_module;
- int result = 0;
-
- /* Cleanup other modules */
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- this_module->type == FILTER_MODULE ||
- this_module->type == WRITER_MODULE)
- continue;
- if (this_module->rw_cleanup)
- result |= this_module->rw_cleanup(rw);
- }
-
- /* Flush data and cleanup */
- list_for_each_entry(this_module, &toi_filters, type_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->rw_cleanup)
- result |= this_module->rw_cleanup(rw);
- }
-
- result |= toiActiveAllocator->rw_cleanup(rw);
-
- return result;
-}
-
-static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high)
-{
- int index, min, max;
- struct page *high_page = NULL,
- **my_last_high_page = raw_cpu_ptr(&last_high_page),
- **my_last_sought = raw_cpu_ptr(&last_sought);
- struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page);
- void *compare;
-
- if (is_high) {
- if (*my_last_sought && *my_last_high_page &&
- *my_last_sought < orig_page)
- high_page = *my_last_high_page;
- else
- high_page = (struct page *) restore_highmem_pblist;
- this = (struct pbe *) kmap(high_page);
- compare = orig_page;
- } else {
- if (*my_last_sought && *my_last_low_page &&
- *my_last_sought < orig_page)
- this = *my_last_low_page;
- else
- this = restore_pblist;
- compare = page_address(orig_page);
- }
-
- *my_last_sought = orig_page;
-
- /* Locate page containing pbe */
- while (this[PBES_PER_PAGE - 1].next &&
- this[PBES_PER_PAGE - 1].orig_address < compare) {
- if (is_high) {
- struct page *next_high_page = (struct page *)
- this[PBES_PER_PAGE - 1].next;
- kunmap(high_page);
- this = kmap(next_high_page);
- high_page = next_high_page;
- } else
- this = this[PBES_PER_PAGE - 1].next;
- }
-
- /* Do a binary search within the page */
- min = 0;
- max = PBES_PER_PAGE;
- index = PBES_PER_PAGE / 2;
- while (max - min) {
- if (!this[index].orig_address ||
- this[index].orig_address > compare)
- max = index;
- else if (this[index].orig_address == compare) {
- if (is_high) {
- struct page *page = this[index].address;
- *my_last_high_page = high_page;
- kunmap(high_page);
- return page;
- }
- *my_last_low_page = this;
- return virt_to_page(this[index].address);
- } else
- min = index;
- index = ((max + min) / 2);
- };
-
- if (is_high)
- kunmap(high_page);
-
- abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for"
- " orig page %p. This[min].orig_address=%p.\n", orig_page,
- this[index].orig_address);
- return NULL;
-}
-
-/**
- * write_next_page - write the next page in a pageset
- * @data_pfn: The pfn where the next data to write is located.
- * @my_io_index: The index of the page in the pageset.
- * @write_pfn: The pfn number to write in the image (where the data belongs).
- *
- * Get the pfn of the next page to write, map the page if necessary and do the
- * write.
- **/
-static int write_next_page(unsigned long *data_pfn, int *my_io_index,
- unsigned long *write_pfn)
-{
- struct page *page;
- char **my_checksum_locn = raw_cpu_ptr(&checksum_locn);
- int result = 0, was_present;
-
- *data_pfn = memory_bm_next_pfn(io_map, 0);
-
- /* Another thread could have beaten us to it. */
- if (*data_pfn == BM_END_OF_MAP) {
- if (atomic_read(&io_count)) {
- printk(KERN_INFO "Ran out of pfns but io_count is "
- "still %d.\n", atomic_read(&io_count));
- BUG();
- }
- mutex_unlock(&io_mutex);
- return -ENODATA;
- }
-
- *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
-
- memory_bm_clear_bit(io_map, 0, *data_pfn);
- page = pfn_to_page(*data_pfn);
-
- was_present = kernel_page_present(page);
- if (!was_present)
- kernel_map_pages(page, 1, 1);
-
- if (io_pageset == 1)
- *write_pfn = memory_bm_next_pfn(pageset1_map, 0);
- else {
- *write_pfn = *data_pfn;
- *my_checksum_locn = tuxonice_get_next_checksum();
- }
-
- TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index);
-
- mutex_unlock(&io_mutex);
-
- if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn))
- return 1;
-
- result = first_filter->write_page(*write_pfn, TOI_PAGE, page,
- PAGE_SIZE);
-
- if (!was_present)
- kernel_map_pages(page, 1, 0);
-
- return result;
-}
-
-/**
- * read_next_page - read the next page in a pageset
- * @my_io_index: The index of the page in the pageset.
- * @write_pfn: The pfn in which the data belongs.
- *
- * Read a page of the image into our buffer. It can happen (here and in the
- * write routine) that threads don't get run until after other CPUs have done
- * all the work. This was the cause of the long standing issue with
- * occasionally getting -ENODATA errors at the end of reading the image. We
- * therefore need to check there's actually a page to read before trying to
- * retrieve one.
- **/
-
-static int read_next_page(int *my_io_index, unsigned long *write_pfn,
- struct page *buffer)
-{
- unsigned int buf_size = PAGE_SIZE;
- unsigned long left = atomic_read(&io_count);
-
- if (!left)
- return -ENODATA;
-
- /* Start off assuming the page we read isn't resaved */
- *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
-
- mutex_unlock(&io_mutex);
-
- /*
- * Are we aborting? If so, don't submit any more I/O as
- * resetting the resume_attempted flag (from ui.c) will
- * clear the bdev flags, making this thread oops.
- */
- if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
- atomic_dec(&toi_io_workers);
- if (!atomic_read(&toi_io_workers)) {
- /*
- * So we can be sure we'll have memory for
- * marking that we haven't resumed.
- */
- rw_cleanup_modules(READ);
- set_toi_state(TOI_IO_STOPPED);
- }
- while (1)
- schedule();
- }
-
- /*
- * See toi_bio_read_page in tuxonice_bio.c:
- * read the next page in the image.
- */
- return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size);
-}
-
-static void use_read_page(unsigned long write_pfn, struct page *buffer)
-{
- struct page *final_page = pfn_to_page(write_pfn),
- *copy_page = final_page;
- char *virt, *buffer_virt;
- int was_present, cpu = smp_processor_id();
- unsigned long idx = 0;
-
- if (io_pageset == 1 && (!pageset1_copy_map ||
- !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) {
- int is_high = PageHighMem(final_page);
- copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high);
- }
-
- if (!memory_bm_test_bit(io_map, cpu, write_pfn)) {
- int test = !memory_bm_test_bit(io_map, cpu, write_pfn);
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test);
- mutex_lock(&io_mutex);
- idx = atomic_add_return(1, &io_count);
- mutex_unlock(&io_mutex);
- return;
- }
-
- virt = kmap(copy_page);
- buffer_virt = kmap(buffer);
- was_present = kernel_page_present(copy_page);
- if (!was_present)
- kernel_map_pages(copy_page, 1, 1);
- memcpy(virt, buffer_virt, PAGE_SIZE);
- if (!was_present)
- kernel_map_pages(copy_page, 1, 0);
- kunmap(copy_page);
- kunmap(buffer);
- memory_bm_clear_bit(io_map, cpu, write_pfn);
- TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset);
-}
-
-static unsigned long status_update(int writing, unsigned long done,
- unsigned long ticks)
-{
- int cs_index = writing ? 0 : 1;
- unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks;
- unsigned long msec = jiffies_to_msecs(abs(ticks_so_far));
- unsigned long pgs_per_s, estimate = 0, pages_left;
-
- if (msec) {
- pages_left = io_barmax - done;
- pgs_per_s = 1000 * done / msec;
- if (pgs_per_s)
- estimate = DIV_ROUND_UP(pages_left, pgs_per_s);
- }
-
- if (estimate && ticks > HZ / 2)
- return toi_update_status(done, io_barmax,
- " %d/%d MB (%lu sec left)",
- MB(done+1), MB(io_barmax), estimate);
-
- return toi_update_status(done, io_barmax, " %d/%d MB",
- MB(done+1), MB(io_barmax));
-}
-
-/**
- * worker_rw_loop - main loop to read/write pages
- *
- * The main I/O loop for reading or writing pages. The io_map bitmap is used to
- * track the pages to read/write.
- * If we are reading, the pages are loaded to their final (mapped) pfn.
- * Data is non zero iff this is a thread started via start_other_threads.
- * In that case, we stay in here until told to quit.
- **/
-static int worker_rw_loop(void *data)
-{
- unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4,
- jif_index = 1, start_time = jiffies, thread_num;
- int result = 0, my_io_index = 0, last_worker;
- struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP);
- cpumask_var_t orig_mask;
-
- if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) {
- printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data);
- result = -ENOMEM;
- goto out;
- }
-
- cpumask_copy(orig_mask, tsk_cpus_allowed(current));
-
- current->flags |= PF_NOFREEZE;
-
-top:
- mutex_lock(&io_mutex);
- thread_num = atomic_read(&toi_io_workers);
-
- cpumask_copy(tsk_cpus_allowed(current), orig_mask);
- schedule();
-
- atomic_inc(&toi_io_workers);
-
- while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) &&
- !(io_write && test_result_state(TOI_ABORTED)) &&
- toi_worker_command == TOI_IO_WORKER_RUN) {
- if (!thread_num && jiffies > next_jiffies) {
- next_jiffies += HZ / 4;
- if (toiActiveAllocator->update_throughput_throttle)
- toiActiveAllocator->update_throughput_throttle(
- jif_index);
- jif_index++;
- }
-
- /*
- * What page to use? If reading, don't know yet which page's
- * data will be read, so always use the buffer. If writing,
- * use the copy (Pageset1) or original page (Pageset2), but
- * always write the pfn of the original page.
- */
- if (io_write)
- result = write_next_page(&data_pfn, &my_io_index,
- &write_pfn);
- else /* Reading */
- result = read_next_page(&my_io_index, &write_pfn,
- buffer);
-
- if (result) {
- mutex_lock(&io_mutex);
- /* Nothing to do? */
- if (result == -ENODATA) {
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "Thread %d has no more work.",
- smp_processor_id());
- break;
- }
-
- io_result = result;
-
- if (io_write) {
- printk(KERN_INFO "Write chunk returned %d.\n",
- result);
- abort_hibernate(TOI_FAILED_IO,
- "Failed to write a chunk of the "
- "image.");
- break;
- }
-
- if (io_pageset == 1) {
- printk(KERN_ERR "\nBreaking out of I/O loop "
- "because of result code %d.\n", result);
- break;
- }
- panic("Read chunk returned (%d)", result);
- }
-
- /*
- * Discard reads of resaved pages while reading ps2
- * and unwanted pages while rereading ps2 when aborting.
- */
- if (!io_write) {
- if (!PageResave(pfn_to_page(write_pfn)))
- use_read_page(write_pfn, buffer);
- else {
- mutex_lock(&io_mutex);
- toi_message(TOI_IO, TOI_VERBOSE, 0,
- "Resaved %ld.", write_pfn);
- atomic_inc(&io_count);
- mutex_unlock(&io_mutex);
- }
- }
-
- if (!thread_num) {
- if(my_io_index + io_base > io_nextupdate)
- io_nextupdate = status_update(io_write,
- my_io_index + io_base,
- jiffies - start_time);
-
- if (my_io_index > io_pc) {
- printk(KERN_CONT "...%d%%", 20 * io_pc_step);
- io_pc_step++;
- io_pc = io_finish_at * io_pc_step / 5;
- }
- }
-
- toi_cond_pause(0, NULL);
-
- /*
- * Subtle: If there's less I/O still to be done than threads
- * running, quit. This stops us doing I/O beyond the end of
- * the image when reading.
- *
- * Possible race condition. Two threads could do the test at
- * the same time; one should exit and one should continue.
- * Therefore we take the mutex before comparing and exiting.
- */
-
- mutex_lock(&io_mutex);
- }
-
- last_worker = atomic_dec_and_test(&toi_io_workers);
- toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers));
- mutex_unlock(&io_mutex);
-
- if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) {
- /* Were we the last thread and we're using a flusher thread? */
- if (last_worker && using_flusher) {
- toiActiveAllocator->finish_all_io();
- }
- /* First, if we're doing I/O, wait for it to finish */
- wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN);
- /* Then wait to be told what to do next */
- wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP);
- if (toi_worker_command == TOI_IO_WORKER_RUN)
- goto top;
- }
-
- if (thread_num)
- atomic_dec(&toi_num_other_threads);
-
-out:
- toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num);
- toi__free_page(28, buffer);
- free_cpumask_var(orig_mask);
-
- return result;
-}
-
-int toi_start_other_threads(void)
-{
- int cpu;
- struct task_struct *p;
- int to_start = (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1;
- unsigned long num_started = 0;
-
- if (test_action_state(TOI_NO_MULTITHREADED_IO))
- return 0;
-
- toi_worker_command = TOI_IO_WORKER_STOP;
-
- for_each_online_cpu(cpu) {
- if (num_started == to_start)
- break;
-
- if (cpu == smp_processor_id())
- continue;
-
- p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1,
- cpu_to_node(cpu), "ktoi_io/%d", cpu);
- if (IS_ERR(p)) {
- printk(KERN_ERR "ktoi_io for %i failed\n", cpu);
- continue;
- }
- kthread_bind(p, cpu);
- p->flags |= PF_MEMALLOC;
- wake_up_process(p);
- num_started++;
- atomic_inc(&toi_num_other_threads);
- }
-
- toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started);
- return num_started;
-}
-
-void toi_stop_other_threads(void)
-{
- toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads.");
- toi_worker_command = TOI_IO_WORKER_EXIT;
- wake_up(&toi_worker_wait_queue);
-}
-
-/**
- * do_rw_loop - main highlevel function for reading or writing pages
- *
- * Create the io_map bitmap and call worker_rw_loop to perform I/O operations.
- **/
-static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags,
- int base, int barmax, int pageset)
-{
- int index = 0, cpu, result = 0, workers_started;
- unsigned long pfn, next;
-
- first_filter = toi_get_next_filter(NULL);
-
- if (!finish_at)
- return 0;
-
- io_write = write;
- io_finish_at = finish_at;
- io_base = base;
- io_barmax = barmax;
- io_pageset = pageset;
- io_index = 0;
- io_pc = io_finish_at / 5;
- io_pc_step = 1;
- io_result = 0;
- io_nextupdate = base + 1;
- toi_bio_queue_flusher_should_finish = 0;
-
- for_each_online_cpu(cpu) {
- per_cpu(last_sought, cpu) = NULL;
- per_cpu(last_low_page, cpu) = NULL;
- per_cpu(last_high_page, cpu) = NULL;
- }
-
- /* Ensure all bits clear */
- memory_bm_clear(io_map);
-
- memory_bm_position_reset(io_map);
- next = memory_bm_next_pfn(io_map, 0);
-
- BUG_ON(next != BM_END_OF_MAP);
-
- /* Set the bits for the pages to write */
- memory_bm_position_reset(pageflags);
-
- pfn = memory_bm_next_pfn(pageflags, 0);
- toi_trace_index++;
-
- while (pfn != BM_END_OF_MAP && index < finish_at) {
- TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at);
- memory_bm_set_bit(io_map, 0, pfn);
- pfn = memory_bm_next_pfn(pageflags, 0);
- index++;
- }
-
- BUG_ON(next != BM_END_OF_MAP || index < finish_at);
-
- memory_bm_position_reset(io_map);
- toi_trace_index++;
-
- atomic_set(&io_count, finish_at);
-
- memory_bm_position_reset(pageset1_map);
-
- mutex_lock(&io_mutex);
-
- clear_toi_state(TOI_IO_STOPPED);
-
- using_flusher = (atomic_read(&toi_num_other_threads) &&
- toiActiveAllocator->io_flusher &&
- !test_action_state(TOI_NO_FLUSHER_THREAD));
-
- workers_started = atomic_read(&toi_num_other_threads);
-
- memory_bm_position_reset(io_map);
- memory_bm_position_reset(pageset1_copy_map);
-
- toi_worker_command = TOI_IO_WORKER_RUN;
- wake_up(&toi_worker_wait_queue);
-
- mutex_unlock(&io_mutex);
-
- if (using_flusher)
- result = toiActiveAllocator->io_flusher(write);
- else
- worker_rw_loop(NULL);
-
- while (atomic_read(&toi_io_workers))
- schedule();
-
- printk(KERN_CONT "\n");
-
- toi_worker_command = TOI_IO_WORKER_STOP;
- wake_up(&toi_worker_wait_queue);
-
- if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
- if (!atomic_read(&toi_io_workers)) {
- rw_cleanup_modules(READ);
- set_toi_state(TOI_IO_STOPPED);
- }
- while (1)
- schedule();
- }
- set_toi_state(TOI_IO_STOPPED);
-
- if (!io_result && !result && !test_result_state(TOI_ABORTED)) {
- unsigned long next;
-
- toi_update_status(io_base + io_finish_at, io_barmax,
- " %d/%d MB ",
- MB(io_base + io_finish_at), MB(io_barmax));
-
- memory_bm_position_reset(io_map);
- next = memory_bm_next_pfn(io_map, 0);
- if (next != BM_END_OF_MAP) {
- printk(KERN_INFO "Finished I/O loop but still work to "
- "do?\nFinish at = %d. io_count = %d.\n",
- finish_at, atomic_read(&io_count));
- printk(KERN_INFO "I/O bitmap still records work to do."
- "%ld.\n", next);
- BUG();
- do {
- cpu_relax();
- } while (0);
- }
- }
-
- return io_result ? io_result : result;
-}
-
-/**
- * write_pageset - write a pageset to disk.
- * @pagedir: Which pagedir to write.
- *
- * Returns:
- * Zero on success or -1 on failure.
- **/
-int write_pageset(struct pagedir *pagedir)
-{
- int finish_at, base = 0;
- int barmax = pagedir1.size + pagedir2.size;
- long error = 0;
- struct memory_bitmap *pageflags;
- unsigned long start_time, end_time;
-
- /*
- * Even if there is nothing to read or write, the allocator
- * may need the init/cleanup for it's housekeeping. (eg:
- * Pageset1 may start where pageset2 ends when writing).
- */
- finish_at = pagedir->size;
-
- if (pagedir->id == 1) {
- toi_prepare_status(DONT_CLEAR_BAR,
- "Writing kernel & process data...");
- base = pagedir2.size;
- if (test_action_state(TOI_TEST_FILTER_SPEED) ||
- test_action_state(TOI_TEST_BIO))
- pageflags = pageset1_map;
- else
- pageflags = pageset1_copy_map;
- } else {
- toi_prepare_status(DONT_CLEAR_BAR, "Writing caches...");
- pageflags = pageset2_map;
- }
-
- start_time = jiffies;
-
- if (rw_init_modules(WRITE, pagedir->id)) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Failed to initialise modules for writing.");
- error = 1;
- }
-
- if (!error)
- error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax,
- pagedir->id);
-
- if (rw_cleanup_modules(WRITE) && !error) {
- abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
- "Failed to cleanup after writing.");
- error = 1;
- }
-
- end_time = jiffies;
-
- if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
- toi_bkd.toi_io_time[0][0] += finish_at,
- toi_bkd.toi_io_time[0][1] += (end_time - start_time);
- }
-
- return error;
-}
-
-/**
- * read_pageset - highlevel function to read a pageset from disk
- * @pagedir: pageset to read
- * @overwrittenpagesonly: Whether to read the whole pageset or
- * only part of it.
- *
- * Returns:
- * Zero on success or -1 on failure.
- **/
-static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly)
-{
- int result = 0, base = 0;
- int finish_at = pagedir->size;
- int barmax = pagedir1.size + pagedir2.size;
- struct memory_bitmap *pageflags;
- unsigned long start_time, end_time;
-
- if (pagedir->id == 1) {
- toi_prepare_status(DONT_CLEAR_BAR,
- "Reading kernel & process data...");
- pageflags = pageset1_map;
- } else {
- toi_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
- if (overwrittenpagesonly) {
- barmax = min(pagedir1.size, pagedir2.size);
- finish_at = min(pagedir1.size, pagedir2.size);
- } else
- base = pagedir1.size;
- pageflags = pageset2_map;
- }
-
- start_time = jiffies;
-
- if (rw_init_modules(READ, pagedir->id)) {
- toiActiveAllocator->remove_image();
- result = 1;
- } else
- result = do_rw_loop(READ, finish_at, pageflags, base, barmax,
- pagedir->id);
-
- if (rw_cleanup_modules(READ) && !result) {
- abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
- "Failed to cleanup after reading.");
- result = 1;
- }
-
- /* Statistics */
- end_time = jiffies;
-
- if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
- toi_bkd.toi_io_time[1][0] += finish_at,
- toi_bkd.toi_io_time[1][1] += (end_time - start_time);
- }
-
- return result;
-}
-
-/**
- * write_module_configs - store the modules configuration
- *
- * The configuration for each module is stored in the image header.
- * Returns: Int
- * Zero on success, Error value otherwise.
- **/
-static int write_module_configs(void)
-{
- struct toi_module_ops *this_module;
- char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP);
- int len, index = 1;
- struct toi_module_header toi_module_header;
-
- if (!buffer) {
- printk(KERN_INFO "Failed to allocate a buffer for saving "
- "module configuration info.\n");
- return -ENOMEM;
- }
-
- /*
- * We have to know which data goes with which module, so we at
- * least write a length of zero for a module. Note that we are
- * also assuming every module's config data takes <= PAGE_SIZE.
- */
-
- /* For each module (in registration order) */
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled || !this_module->storage_needed ||
- (this_module->type == WRITER_MODULE &&
- toiActiveAllocator != this_module))
- continue;
-
- /* Get the data from the module */
- len = 0;
- if (this_module->save_config_info)
- len = this_module->save_config_info(buffer);
-
- /* Save the details of the module */
- toi_module_header.enabled = this_module->enabled;
- toi_module_header.type = this_module->type;
- toi_module_header.index = index++;
- strncpy(toi_module_header.name, this_module->name,
- sizeof(toi_module_header.name));
- toiActiveAllocator->rw_header_chunk(WRITE,
- this_module,
- (char *) &toi_module_header,
- sizeof(toi_module_header));
-
- /* Save the size of the data and any data returned */
- toiActiveAllocator->rw_header_chunk(WRITE,
- this_module,
- (char *) &len, sizeof(int));
- if (len)
- toiActiveAllocator->rw_header_chunk(
- WRITE, this_module, buffer, len);
- }
-
- /* Write a blank header to terminate the list */
- toi_module_header.name[0] = '\0';
- toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &toi_module_header, sizeof(toi_module_header));
-
- toi_free_page(22, (unsigned long) buffer);
- return 0;
-}
-
-/**
- * read_one_module_config - read and configure one module
- *
- * Read the configuration for one module, and configure the module
- * to match if it is loaded.
- *
- * Returns: Int
- * Zero on success, Error value otherwise.
- **/
-static int read_one_module_config(struct toi_module_header *header)
-{
- struct toi_module_ops *this_module;
- int result, len;
- char *buffer;
-
- /* Find the module */
- this_module = toi_find_module_given_name(header->name);
-
- if (!this_module) {
- if (header->enabled) {
- toi_early_boot_message(1, TOI_CONTINUE_REQ,
- "It looks like we need module %s for reading "
- "the image but it hasn't been registered.\n",
- header->name);
- if (!(test_toi_state(TOI_CONTINUE_REQ)))
- return -EINVAL;
- } else
- printk(KERN_INFO "Module %s configuration data found, "
- "but the module hasn't registered. Looks like "
- "it was disabled, so we're ignoring its data.",
- header->name);
- }
-
- /* Get the length of the data (if any) */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len,
- sizeof(int));
- if (result) {
- printk(KERN_ERR "Failed to read the length of the module %s's"
- " configuration data.\n",
- header->name);
- return -EINVAL;
- }
-
- /* Read any data and pass to the module (if we found one) */
- if (!len)
- return 0;
-
- buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP);
-
- if (!buffer) {
- printk(KERN_ERR "Failed to allocate a buffer for reloading "
- "module configuration info.\n");
- return -ENOMEM;
- }
-
- toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len);
-
- if (!this_module)
- goto out;
-
- if (!this_module->save_config_info)
- printk(KERN_ERR "Huh? Module %s appears to have a "
- "save_config_info, but not a load_config_info "
- "function!\n", this_module->name);
- else
- this_module->load_config_info(buffer, len);
-
- /*
- * Now move this module to the tail of its lists. This will put it in
- * order. Any new modules will end up at the top of the lists. They
- * should have been set to disabled when loaded (people will
- * normally not edit an initrd to load a new module and then hibernate
- * without using it!).
- */
-
- toi_move_module_tail(this_module);
-
- this_module->enabled = header->enabled;
-
-out:
- toi_free_page(23, (unsigned long) buffer);
- return 0;
-}
-
-/**
- * read_module_configs - reload module configurations from the image header.
- *
- * Returns: Int
- * Zero on success or an error code.
- **/
-static int read_module_configs(void)
-{
- int result = 0;
- struct toi_module_header toi_module_header;
- struct toi_module_ops *this_module;
-
- /* All modules are initially disabled. That way, if we have a module
- * loaded now that wasn't loaded when we hibernated, it won't be used
- * in trying to read the data.
- */
- list_for_each_entry(this_module, &toi_modules, module_list)
- this_module->enabled = 0;
-
- /* Get the first module header */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL,
- (char *) &toi_module_header,
- sizeof(toi_module_header));
- if (result) {
- printk(KERN_ERR "Failed to read the next module header.\n");
- return -EINVAL;
- }
-
- /* For each module (in registration order) */
- while (toi_module_header.name[0]) {
- result = read_one_module_config(&toi_module_header);
-
- if (result)
- return -EINVAL;
-
- /* Get the next module header */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL,
- (char *) &toi_module_header,
- sizeof(toi_module_header));
-
- if (result) {
- printk(KERN_ERR "Failed to read the next module "
- "header.\n");
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
-static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev)
-{
- return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 0 : 1;
-}
-
-int fs_info_space_needed(void)
-{
- const struct super_block *sb;
- int result = sizeof(int);
-
- list_for_each_entry(sb, &super_blocks, s_list) {
- struct fs_info *fs;
-
- if (!sb->s_bdev)
- continue;
-
- fs = fs_info_from_block_dev(sb->s_bdev);
- if (save_fs_info(fs, sb->s_bdev))
- result += 16 + sizeof(dev_t) + sizeof(int) +
- fs->last_mount_size;
- free_fs_info(fs);
- }
- return result;
-}
-
-static int fs_info_num_to_save(void)
-{
- const struct super_block *sb;
- int to_save = 0;
-
- list_for_each_entry(sb, &super_blocks, s_list) {
- struct fs_info *fs;
-
- if (!sb->s_bdev)
- continue;
-
- fs = fs_info_from_block_dev(sb->s_bdev);
- if (save_fs_info(fs, sb->s_bdev))
- to_save++;
- free_fs_info(fs);
- }
-
- return to_save;
-}
-
-static int fs_info_save(void)
-{
- const struct super_block *sb;
- int to_save = fs_info_num_to_save();
-
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save,
- sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info"
- " to save.");
- return -EIO;
- }
-
- list_for_each_entry(sb, &super_blocks, s_list) {
- struct fs_info *fs;
-
- if (!sb->s_bdev)
- continue;
-
- fs = fs_info_from_block_dev(sb->s_bdev);
- if (save_fs_info(fs, sb->s_bdev)) {
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- &fs->uuid[0], 16)) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write uuid.");
- return -EIO;
- }
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &fs->dev_t, sizeof(dev_t))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write dev_t.");
- return -EIO;
- }
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &fs->last_mount_size, sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write last mount length.");
- return -EIO;
- }
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- fs->last_mount, fs->last_mount_size)) {
- abort_hibernate(TOI_FAILED_IO, "Failed to "
- "write uuid.");
- return -EIO;
- }
- }
- free_fs_info(fs);
- }
- return 0;
-}
-
-static int fs_info_load_and_check_one(void)
-{
- char uuid[16], *last_mount;
- int result = 0, ln;
- dev_t dev_t;
- struct block_device *dev;
- struct fs_info *fs_info, seek;
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) {
- abort_hibernate(TOI_FAILED_IO, "Failed to read uuid.");
- return -EIO;
- }
-
- read_if_version(3, dev_t, "uuid dev_t field", return -EIO);
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln,
- sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to read last mount size.");
- return -EIO;
- }
-
- last_mount = kzalloc(ln, GFP_KERNEL);
-
- if (!last_mount)
- return -ENOMEM;
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to read last mount timestamp.");
- result = -EIO;
- goto out_lmt;
- }
-
- strncpy((char *) &seek.uuid, uuid, 16);
- seek.dev_t = dev_t;
- seek.last_mount_size = ln;
- seek.last_mount = last_mount;
- dev_t = blk_lookup_fs_info(&seek);
- if (!dev_t)
- goto out_lmt;
-
- dev = toi_open_by_devnum(dev_t);
-
- fs_info = fs_info_from_block_dev(dev);
- if (fs_info && !IS_ERR(fs_info)) {
- if (ln != fs_info->last_mount_size) {
- printk(KERN_EMERG "Found matching uuid but last mount "
- "time lengths differ?! "
- "(%d vs %d).\n", ln,
- fs_info->last_mount_size);
- result = -EINVAL;
- } else {
- char buf[BDEVNAME_SIZE];
- result = !!memcmp(fs_info->last_mount, last_mount, ln);
- if (result)
- printk(KERN_EMERG "Last mount time for %s has "
- "changed!\n", bdevname(dev, buf));
- }
- }
- toi_close_bdev(dev);
- free_fs_info(fs_info);
-out_lmt:
- kfree(last_mount);
- return result;
-}
-
-static int fs_info_load_and_check(void)
-{
- int to_do, result = 0;
-
- if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do,
- sizeof(int))) {
- abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info "
- "to load.");
- return -EIO;
- }
-
- while(to_do--)
- result |= fs_info_load_and_check_one();
-
- return result;
-}
-
-/**
- * write_image_header - write the image header after write the image proper
- *
- * Returns: Int
- * Zero on success, error value otherwise.
- **/
-int write_image_header(void)
-{
- int ret;
- int total = pagedir1.size + pagedir2.size+2;
- char *header_buffer = NULL;
-
- /* Now prepare to write the header */
- ret = toiActiveAllocator->write_header_init();
- if (ret) {
- abort_hibernate(TOI_FAILED_MODULE_INIT,
- "Active allocator's write_header_init"
- " function failed.");
- goto write_image_header_abort;
- }
-
- /* Get a buffer */
- header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
- if (!header_buffer) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Out of memory when trying to get page for header!");
- goto write_image_header_abort;
- }
-
- /* Write hibernate header */
- if (fill_toi_header((struct toi_header *) header_buffer)) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Failure to fill header information!");
- goto write_image_header_abort;
- }
-
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- header_buffer, sizeof(struct toi_header))) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Failure to write header info.");
- goto write_image_header_abort;
- }
-
- if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
- (char *) &toi_max_workers, sizeof(toi_max_workers))) {
- abort_hibernate(TOI_OUT_OF_MEMORY,
- "Failure to number of workers to use.");
- goto write_image_header_abort;
- }
-
- /* Write filesystem info */
- if (fs_info_save())
- goto write_image_header_abort;
-
- /* Write module configurations */
- ret = write_module_configs();
- if (ret) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to write module configs.");
- goto write_image_header_abort;
- }
-
- if (memory_bm_write(pageset1_map,
- toiActiveAllocator->rw_header_chunk)) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to write bitmaps.");
- goto write_image_header_abort;
- }
-
- /* Flush data and let allocator cleanup */
- if (toiActiveAllocator->write_header_cleanup()) {
- abort_hibernate(TOI_FAILED_IO,
- "Failed to cleanup writing header.");
- goto write_image_header_abort_no_cleanup;
- }
-
- if (test_result_state(TOI_ABORTED))
- goto write_image_header_abort_no_cleanup;
-
- toi_update_status(total, total, NULL);
-
-out:
- if (header_buffer)
- toi_free_page(24, (unsigned long) header_buffer);
- return ret;
-
-write_image_header_abort:
- toiActiveAllocator->write_header_cleanup();
-write_image_header_abort_no_cleanup:
- ret = -1;
- goto out;
-}
-
-/**
- * sanity_check - check the header
- * @sh: the header which was saved at hibernate time.
- *
- * Perform a few checks, seeking to ensure that the kernel being
- * booted matches the one hibernated. They need to match so we can
- * be _sure_ things will work. It is not absolutely impossible for
- * resuming from a different kernel to work, just not assured.
- **/
-static char *sanity_check(struct toi_header *sh)
-{
- char *reason = check_image_kernel((struct swsusp_info *) sh);
-
- if (reason)
- return reason;
-
- if (!test_action_state(TOI_IGNORE_ROOTFS)) {
- const struct super_block *sb;
- list_for_each_entry(sb, &super_blocks, s_list) {
- if ((!(sb->s_flags & MS_RDONLY)) &&
- (sb->s_type->fs_flags & FS_REQUIRES_DEV))
- return "Device backed fs has been mounted "
- "rw prior to resume or initrd/ramfs "
- "is mounted rw.";
- }
- }
-
- return NULL;
-}
-
-static DECLARE_WAIT_QUEUE_HEAD(freeze_wait);
-
-#define FREEZE_IN_PROGRESS (~0)
-
-static int freeze_result;
-
-static void do_freeze(struct work_struct *dummy)
-{
- freeze_result = freeze_processes();
- wake_up(&freeze_wait);
- trap_non_toi_io = 1;
-}
-
-static DECLARE_WORK(freeze_work, do_freeze);
-
-/**
- * __read_pageset1 - test for the existence of an image and attempt to load it
- *
- * Returns: Int
- * Zero if image found and pageset1 successfully loaded.
- * Error if no image found or loaded.
- **/
-static int __read_pageset1(void)
-{
- int i, result = 0;
- char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP),
- *sanity_error = NULL;
- struct toi_header *toi_header;
-
- if (!header_buffer) {
- printk(KERN_INFO "Unable to allocate a page for reading the "
- "signature.\n");
- return -ENOMEM;
- }
-
- /* Check for an image */
- result = toiActiveAllocator->image_exists(1);
- if (result == 3) {
- result = -ENODATA;
- toi_early_boot_message(1, 0, "The signature from an older "
- "version of TuxOnIce has been detected.");
- goto out_remove_image;
- }
-
- if (result != 1) {
- result = -ENODATA;
- noresume_reset_modules();
- printk(KERN_INFO "TuxOnIce: No image found.\n");
- goto out;
- }
-
- /*
- * Prepare the active allocator for reading the image header. The
- * activate allocator might read its own configuration.
- *
- * NB: This call may never return because there might be a signature
- * for a different image such that we warn the user and they choose
- * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the
- * location of the image might be unavailable if it was stored on a
- * network connection).
- */
-
- result = toiActiveAllocator->read_header_init();
- if (result) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the "
- "image header.\n");
- goto out_remove_image;
- }
-
- /* Check for noresume command line option */
- if (test_toi_state(TOI_NORESUME_SPECIFIED)) {
- printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed "
- "image.\n");
- goto out_remove_image;
- }
-
- /* Check whether we've resumed before */
- if (test_toi_state(TOI_RESUMED_BEFORE)) {
- toi_early_boot_message(1, 0, NULL);
- if (!(test_toi_state(TOI_CONTINUE_REQ))) {
- printk(KERN_INFO "TuxOnIce: Tried to resume before: "
- "Invalidated image.\n");
- goto out_remove_image;
- }
- }
-
- clear_toi_state(TOI_CONTINUE_REQ);
-
- toi_image_header_version = toiActiveAllocator->get_header_version();
-
- if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) {
- toi_early_boot_message(1, 0, image_version_error);
- if (!(test_toi_state(TOI_CONTINUE_REQ))) {
- printk(KERN_INFO "TuxOnIce: Header version too new: "
- "Invalidated image.\n");
- goto out_remove_image;
- }
- }
-
- /* Read hibernate header */
- result = toiActiveAllocator->rw_header_chunk(READ, NULL,
- header_buffer, sizeof(struct toi_header));
- if (result < 0) {
- printk(KERN_ERR "TuxOnIce: Failed to read the image "
- "signature.\n");
- goto out_remove_image;
- }
-
- toi_header = (struct toi_header *) header_buffer;
-
- /*
- * NB: This call may also result in a reboot rather than returning.
- */
-
- sanity_error = sanity_check(toi_header);
- if (sanity_error) {
- toi_early_boot_message(1, TOI_CONTINUE_REQ,
- sanity_error);
- printk(KERN_INFO "TuxOnIce: Sanity check failed.\n");
- goto out_remove_image;
- }
-
- /*
- * We have an image and it looks like it will load okay.
- *
- * Get metadata from header. Don't override commandline parameters.
- *
- * We don't need to save the image size limit because it's not used
- * during resume and will be restored with the image anyway.
- */
-
- memcpy((char *) &pagedir1,
- (char *) &toi_header->pagedir, sizeof(pagedir1));
- toi_result = toi_header->param0;
- if (!toi_bkd.toi_debug_state) {
- toi_bkd.toi_action =
- (toi_header->param1 & ~toi_bootflags_mask) |
- (toi_bkd.toi_action & toi_bootflags_mask);
- toi_bkd.toi_debug_state = toi_header->param2;
- toi_bkd.toi_default_console_level = toi_header->param3;
- }
- clear_toi_state(TOI_IGNORE_LOGLEVEL);
- pagedir2.size = toi_header->pageset_2_size;
- for (i = 0; i < 4; i++)
- toi_bkd.toi_io_time[i/2][i%2] =
- toi_header->io_time[i/2][i%2];
-
- set_toi_state(TOI_BOOT_KERNEL);
- boot_kernel_data_buffer = toi_header->bkd;
-
- read_if_version(1, toi_max_workers, "TuxOnIce max workers",
- goto out_remove_image);
-
- /* Read filesystem info */
- if (fs_info_load_and_check()) {
- printk(KERN_EMERG "TuxOnIce: File system mount time checks "
- "failed. Refusing to corrupt your filesystems!\n");
- goto out_remove_image;
- }
-
- /* Read module configurations */
- result = read_module_configs();
- if (result) {
- pagedir1.size = 0;
- pagedir2.size = 0;
- printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module "
- "configurations.\n");
- clear_action_state(TOI_KEEP_IMAGE);
- goto out_remove_image;
- }
-
- toi_prepare_console();
-
- set_toi_state(TOI_NOW_RESUMING);
-
- result = pm_notifier_call_chain(PM_RESTORE_PREPARE);
- if (result)
- goto out_notifier_call_chain;;
-
- if (usermodehelper_disable())
- goto out_enable_usermodehelper;
-
- current->flags |= PF_NOFREEZE;
- freeze_result = FREEZE_IN_PROGRESS;
-
- schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work);
-
- toi_cond_pause(1, "About to read original pageset1 locations.");
-
- /*
- * See _toi_rw_header_chunk in tuxonice_bio.c:
- * Initialize pageset1_map by reading the map from the image.
- */
- if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk))
- goto out_thaw;
-
- /*
- * See toi_rw_cleanup in tuxonice_bio.c:
- * Clean up after reading the header.
- */
- result = toiActiveAllocator->read_header_cleanup();
- if (result) {
- printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the "
- "image header.\n");
- goto out_thaw;
- }
-
- toi_cond_pause(1, "About to read pagedir.");
-
- /*
- * Get the addresses of pages into which we will load the kernel to
- * be copied back and check if they conflict with the ones we are using.
- */
- if (toi_get_pageset1_load_addresses()) {
- printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
- "pageset1.\n");
- goto out_thaw;
- }
-
- /* Read the original kernel back */
- toi_cond_pause(1, "About to read pageset 1.");
-
- /* Given the pagemap, read back the data from disk */
- if (read_pageset(&pagedir1, 0)) {
- toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1.");
- result = -EIO;
- goto out_thaw;
- }
-
- toi_cond_pause(1, "About to restore original kernel.");
- result = 0;
-
- if (!toi_keeping_image &&
- toiActiveAllocator->mark_resume_attempted)
- toiActiveAllocator->mark_resume_attempted(1);
-
- wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
-out:
- current->flags &= ~PF_NOFREEZE;
- toi_free_page(25, (unsigned long) header_buffer);
- return result;
-
-out_thaw:
- wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
- trap_non_toi_io = 0;
- thaw_processes();
-out_enable_usermodehelper:
- usermodehelper_enable();
-out_notifier_call_chain:
- pm_notifier_call_chain(PM_POST_RESTORE);
- toi_cleanup_console();
-out_remove_image:
- result = -EINVAL;
- if (!toi_keeping_image)
- toiActiveAllocator->remove_image();
- toiActiveAllocator->read_header_cleanup();
- noresume_reset_modules();
- goto out;
-}
-
-/**
- * read_pageset1 - highlevel function to read the saved pages
- *
- * Attempt to read the header and pageset1 of a hibernate image.
- * Handle the outcome, complaining where appropriate.
- **/
-int read_pageset1(void)
-{
- int error;
-
- error = __read_pageset1();
-
- if (error && error != -ENODATA && error != -EINVAL &&
- !test_result_state(TOI_ABORTED))
- abort_hibernate(TOI_IMAGE_ERROR,
- "TuxOnIce: Error %d resuming\n", error);
-
- return error;
-}
-
-/**
- * get_have_image_data - check the image header
- **/
-static char *get_have_image_data(void)
-{
- char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
- struct toi_header *toi_header;
-
- if (!output_buffer) {
- printk(KERN_INFO "Output buffer null.\n");
- return NULL;
- }
-
- /* Check for an image */
- if (!toiActiveAllocator->image_exists(1) ||
- toiActiveAllocator->read_header_init() ||
- toiActiveAllocator->rw_header_chunk(READ, NULL,
- output_buffer, sizeof(struct toi_header))) {
- sprintf(output_buffer, "0\n");
- /*
- * From an initrd/ramfs, catting have_image and
- * getting a result of 0 is sufficient.
- */
- clear_toi_state(TOI_BOOT_TIME);
- goto out;
- }
-
- toi_header = (struct toi_header *) output_buffer;
-
- sprintf(output_buffer, "1\n%s\n%s\n",
- toi_header->uts.machine,
- toi_header->uts.version);
-
- /* Check whether we've resumed before */
- if (test_toi_state(TOI_RESUMED_BEFORE))
- strcat(output_buffer, "Resumed before.\n");
-
-out:
- noresume_reset_modules();
- return output_buffer;
-}
-
-/**
- * read_pageset2 - read second part of the image
- * @overwrittenpagesonly: Read only pages which would have been
- * verwritten by pageset1?
- *
- * Read in part or all of pageset2 of an image, depending upon
- * whether we are hibernating and have only overwritten a portion
- * with pageset1 pages, or are resuming and need to read them
- * all.
- *
- * Returns: Int
- * Zero if no error, otherwise the error value.
- **/
-int read_pageset2(int overwrittenpagesonly)
-{
- int result = 0;
-
- if (!pagedir2.size)
- return 0;
-
- result = read_pageset(&pagedir2, overwrittenpagesonly);
-
- toi_cond_pause(1, "Pagedir 2 read.");
-
- return result;
-}
-
-/**
- * image_exists_read - has an image been found?
- * @page: Output buffer
- *
- * Store 0 or 1 in page, depending on whether an image is found.
- * Incoming buffer is PAGE_SIZE and result is guaranteed
- * to be far less than that, so we don't worry about
- * overflow.
- **/
-int image_exists_read(const char *page, int count)
-{
- int len = 0;
- char *result;
-
- if (toi_activate_storage(0))
- return count;
-
- if (!test_toi_state(TOI_RESUME_DEVICE_OK))
- toi_attempt_to_parse_resume_device(0);
-
- if (!toiActiveAllocator) {
- len = sprintf((char *) page, "-1\n");
- } else {
- result = get_have_image_data();
- if (result) {
- len = sprintf((char *) page, "%s", result);
- toi_free_page(26, (unsigned long) result);
- }
- }
-
- toi_deactivate_storage(0);
-
- return len;
-}
-
-/**
- * image_exists_write - invalidate an image if one exists
- **/
-int image_exists_write(const char *buffer, int count)
-{
- if (toi_activate_storage(0))
- return count;
-
- if (toiActiveAllocator && toiActiveAllocator->image_exists(1))
- toiActiveAllocator->remove_image();
-
- toi_deactivate_storage(0);
-
- clear_result_state(TOI_KEPT_IMAGE);
-
- return count;
-}
diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h
deleted file mode 100644
index 56645a5c6..000000000
--- a/kernel/power/tuxonice_io.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * kernel/power/tuxonice_io.h
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains high level IO routines for hibernating.
- *
- */
-
-#include <linux/utsname.h>
-#include "tuxonice_pagedir.h"
-
-/* Non-module data saved in our image header */
-struct toi_header {
- /*
- * Mirror struct swsusp_info, but without
- * the page aligned attribute
- */
- struct new_utsname uts;
- u32 version_code;
- unsigned long num_physpages;
- int cpus;
- unsigned long image_pages;
- unsigned long pages;
- unsigned long size;
-
- /* Our own data */
- unsigned long orig_mem_free;
- int page_size;
- int pageset_2_size;
- int param0;
- int param1;
- int param2;
- int param3;
- int progress0;
- int progress1;
- int progress2;
- int progress3;
- int io_time[2][2];
- struct pagedir pagedir;
- dev_t root_fs;
- unsigned long bkd; /* Boot kernel data locn */
-};
-
-extern int write_pageset(struct pagedir *pagedir);
-extern int write_image_header(void);
-extern int read_pageset1(void);
-extern int read_pageset2(int overwrittenpagesonly);
-
-extern int toi_attempt_to_parse_resume_device(int quiet);
-extern void attempt_to_parse_resume_device2(void);
-extern void attempt_to_parse_alt_resume_param(void);
-int image_exists_read(const char *page, int count);
-int image_exists_write(const char *buffer, int count);
-extern void save_restore_alt_param(int replace, int quiet);
-extern atomic_t toi_io_workers;
-
-/* Args to save_restore_alt_param */
-#define RESTORE 0
-#define SAVE 1
-
-#define NOQUIET 0
-#define QUIET 1
-
-extern wait_queue_head_t toi_io_queue_flusher;
-extern int toi_bio_queue_flusher_should_finish;
-
-int fs_info_space_needed(void);
-
-extern int toi_max_workers;
diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
deleted file mode 100644
index 18f22bdb6..000000000
--- a/kernel/power/tuxonice_modules.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_ui.h"
-
-LIST_HEAD(toi_filters);
-LIST_HEAD(toiAllocators);
-
-LIST_HEAD(toi_modules);
-
-struct toi_module_ops *toiActiveAllocator;
-
-static int toi_num_filters;
-int toiNumAllocators, toi_num_modules;
-
-/*
- * toi_header_storage_for_modules
- *
- * Returns the amount of space needed to store configuration
- * data needed by the modules prior to copying back the original
- * kernel. We can exclude data for pageset2 because it will be
- * available anyway once the kernel is copied back.
- */
-long toi_header_storage_for_modules(void)
-{
- struct toi_module_ops *this_module;
- int bytes = 0;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- (this_module->type == WRITER_MODULE &&
- toiActiveAllocator != this_module))
- continue;
- if (this_module->storage_needed) {
- int this = this_module->storage_needed() +
- sizeof(struct toi_module_header) +
- sizeof(int);
- this_module->header_requested = this;
- bytes += this;
- }
- }
-
- /* One more for the empty terminator */
- return bytes + sizeof(struct toi_module_header);
-}
-
-void print_toi_header_storage_for_modules(void)
-{
- struct toi_module_ops *this_module;
- int bytes = 0;
-
- printk(KERN_DEBUG "Header storage:\n");
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled ||
- (this_module->type == WRITER_MODULE &&
- toiActiveAllocator != this_module))
- continue;
- if (this_module->storage_needed) {
- int this = this_module->storage_needed() +
- sizeof(struct toi_module_header) +
- sizeof(int);
- this_module->header_requested = this;
- bytes += this;
- printk(KERN_DEBUG "+ %16s : %-4d/%d.\n",
- this_module->name,
- this_module->header_used, this);
- }
- }
-
- printk(KERN_DEBUG "+ empty terminator : %zu.\n",
- sizeof(struct toi_module_header));
- printk(KERN_DEBUG " ====\n");
- printk(KERN_DEBUG " %zu\n",
- bytes + sizeof(struct toi_module_header));
-}
-
-/*
- * toi_memory_for_modules
- *
- * Returns the amount of memory requested by modules for
- * doing their work during the cycle.
- */
-
-long toi_memory_for_modules(int print_parts)
-{
- long bytes = 0, result;
- struct toi_module_ops *this_module;
-
- if (print_parts)
- printk(KERN_INFO "Memory for modules:\n===================\n");
- list_for_each_entry(this_module, &toi_modules, module_list) {
- int this;
- if (!this_module->enabled)
- continue;
- if (this_module->memory_needed) {
- this = this_module->memory_needed();
- if (print_parts)
- printk(KERN_INFO "%10d bytes (%5ld pages) for "
- "module '%s'.\n", this,
- DIV_ROUND_UP(this, PAGE_SIZE),
- this_module->name);
- bytes += this;
- }
- }
-
- result = DIV_ROUND_UP(bytes, PAGE_SIZE);
- if (print_parts)
- printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result);
-
- return result;
-}
-
-/*
- * toi_expected_compression_ratio
- *
- * Returns the compression ratio expected when saving the image.
- */
-
-int toi_expected_compression_ratio(void)
-{
- int ratio = 100;
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->expected_compression)
- ratio = ratio * this_module->expected_compression()
- / 100;
- }
-
- return ratio;
-}
-
-/* toi_find_module_given_dir
- * Functionality : Return a module (if found), given a pointer
- * to its directory name
- */
-
-static struct toi_module_ops *toi_find_module_given_dir(char *name)
-{
- struct toi_module_ops *this_module, *found_module = NULL;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!strcmp(name, this_module->directory)) {
- found_module = this_module;
- break;
- }
- }
-
- return found_module;
-}
-
-/* toi_find_module_given_name
- * Functionality : Return a module (if found), given a pointer
- * to its name
- */
-
-struct toi_module_ops *toi_find_module_given_name(char *name)
-{
- struct toi_module_ops *this_module, *found_module = NULL;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!strcmp(name, this_module->name)) {
- found_module = this_module;
- break;
- }
- }
-
- return found_module;
-}
-
-/*
- * toi_print_module_debug_info
- * Functionality : Get debugging info from modules into a buffer.
- */
-int toi_print_module_debug_info(char *buffer, int buffer_size)
-{
- struct toi_module_ops *this_module;
- int len = 0;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled)
- continue;
- if (this_module->print_debug_info) {
- int result;
- result = this_module->print_debug_info(buffer + len,
- buffer_size - len);
- len += result;
- }
- }
-
- /* Ensure null terminated */
- buffer[buffer_size] = 0;
-
- return len;
-}
-
-/*
- * toi_register_module
- *
- * Register a module.
- */
-int toi_register_module(struct toi_module_ops *module)
-{
- int i;
- struct kobject *kobj;
-
- if (!hibernation_available())
- return -ENODEV;
-
- module->enabled = 1;
-
- if (toi_find_module_given_name(module->name)) {
- printk(KERN_INFO "TuxOnIce: Trying to load module %s,"
- " which is already registered.\n",
- module->name);
- return -EBUSY;
- }
-
- switch (module->type) {
- case FILTER_MODULE:
- list_add_tail(&module->type_list, &toi_filters);
- toi_num_filters++;
- break;
- case WRITER_MODULE:
- list_add_tail(&module->type_list, &toiAllocators);
- toiNumAllocators++;
- break;
- case MISC_MODULE:
- case MISC_HIDDEN_MODULE:
- case BIO_ALLOCATOR_MODULE:
- break;
- default:
- printk(KERN_ERR "Hmmm. Module '%s' has an invalid type."
- " It has been ignored.\n", module->name);
- return -EINVAL;
- }
- list_add_tail(&module->module_list, &toi_modules);
- toi_num_modules++;
-
- if ((!module->directory && !module->shared_directory) ||
- !module->sysfs_data || !module->num_sysfs_entries)
- return 0;
-
- /*
- * Modules may share a directory, but those with shared_dir
- * set must be loaded (via symbol dependencies) after parents
- * and unloaded beforehand.
- */
- if (module->shared_directory) {
- struct toi_module_ops *shared =
- toi_find_module_given_dir(module->shared_directory);
- if (!shared) {
- printk(KERN_ERR "TuxOnIce: Module %s wants to share "
- "%s's directory but %s isn't loaded.\n",
- module->name, module->shared_directory,
- module->shared_directory);
- toi_unregister_module(module);
- return -ENODEV;
- }
- kobj = shared->dir_kobj;
- } else {
- if (!strncmp(module->directory, "[ROOT]", 6))
- kobj = tuxonice_kobj;
- else
- kobj = make_toi_sysdir(module->directory);
- }
- module->dir_kobj = kobj;
- for (i = 0; i < module->num_sysfs_entries; i++) {
- int result = toi_register_sysfs_file(kobj,
- &module->sysfs_data[i]);
- if (result)
- return result;
- }
- return 0;
-}
-
-/*
- * toi_unregister_module
- *
- * Remove a module.
- */
-void toi_unregister_module(struct toi_module_ops *module)
-{
- int i;
-
- if (module->dir_kobj)
- for (i = 0; i < module->num_sysfs_entries; i++)
- toi_unregister_sysfs_file(module->dir_kobj,
- &module->sysfs_data[i]);
-
- if (!module->shared_directory && module->directory &&
- strncmp(module->directory, "[ROOT]", 6))
- remove_toi_sysdir(module->dir_kobj);
-
- switch (module->type) {
- case FILTER_MODULE:
- list_del(&module->type_list);
- toi_num_filters--;
- break;
- case WRITER_MODULE:
- list_del(&module->type_list);
- toiNumAllocators--;
- if (toiActiveAllocator == module) {
- toiActiveAllocator = NULL;
- clear_toi_state(TOI_CAN_RESUME);
- clear_toi_state(TOI_CAN_HIBERNATE);
- }
- break;
- case MISC_MODULE:
- case MISC_HIDDEN_MODULE:
- case BIO_ALLOCATOR_MODULE:
- break;
- default:
- printk(KERN_ERR "Module '%s' has an invalid type."
- " It has been ignored.\n", module->name);
- return;
- }
- list_del(&module->module_list);
- toi_num_modules--;
-}
-
-/*
- * toi_move_module_tail
- *
- * Rearrange modules when reloading the config.
- */
-void toi_move_module_tail(struct toi_module_ops *module)
-{
- switch (module->type) {
- case FILTER_MODULE:
- if (toi_num_filters > 1)
- list_move_tail(&module->type_list, &toi_filters);
- break;
- case WRITER_MODULE:
- if (toiNumAllocators > 1)
- list_move_tail(&module->type_list, &toiAllocators);
- break;
- case MISC_MODULE:
- case MISC_HIDDEN_MODULE:
- case BIO_ALLOCATOR_MODULE:
- break;
- default:
- printk(KERN_ERR "Module '%s' has an invalid type."
- " It has been ignored.\n", module->name);
- return;
- }
- if ((toi_num_filters + toiNumAllocators) > 1)
- list_move_tail(&module->module_list, &toi_modules);
-}
-
-/*
- * toi_initialise_modules
- *
- * Get ready to do some work!
- */
-int toi_initialise_modules(int starting_cycle, int early)
-{
- struct toi_module_ops *this_module;
- int result;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- this_module->header_requested = 0;
- this_module->header_used = 0;
- if (!this_module->enabled)
- continue;
- if (this_module->early != early)
- continue;
- if (this_module->initialise) {
- result = this_module->initialise(starting_cycle);
- if (result) {
- toi_cleanup_modules(starting_cycle);
- return result;
- }
- this_module->initialised = 1;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_cleanup_modules
- *
- * Tell modules the work is done.
- */
-void toi_cleanup_modules(int finishing_cycle)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (!this_module->enabled || !this_module->initialised)
- continue;
- if (this_module->cleanup)
- this_module->cleanup(finishing_cycle);
- this_module->initialised = 0;
- }
-}
-
-/*
- * toi_pre_atomic_restore_modules
- *
- * Get ready to do some work!
- */
-void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (this_module->enabled && this_module->pre_atomic_restore)
- this_module->pre_atomic_restore(bkd);
- }
-}
-
-/*
- * toi_post_atomic_restore_modules
- *
- * Get ready to do some work!
- */
-void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (this_module->enabled && this_module->post_atomic_restore)
- this_module->post_atomic_restore(bkd);
- }
-}
-
-/*
- * toi_get_next_filter
- *
- * Get the next filter in the pipeline.
- */
-struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought)
-{
- struct toi_module_ops *last_filter = NULL, *this_filter = NULL;
-
- list_for_each_entry(this_filter, &toi_filters, type_list) {
- if (!this_filter->enabled)
- continue;
- if ((last_filter == filter_sought) || (!filter_sought))
- return this_filter;
- last_filter = this_filter;
- }
-
- return toiActiveAllocator;
-}
-
-/**
- * toi_show_modules: Printk what support is loaded.
- */
-void toi_print_modules(void)
-{
- struct toi_module_ops *this_module;
- int prev = 0;
-
- printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for");
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- if (this_module->type == MISC_HIDDEN_MODULE)
- continue;
- printk("%s %s%s%s", prev ? "," : "",
- this_module->enabled ? "" : "[",
- this_module->name,
- this_module->enabled ? "" : "]");
- prev = 1;
- }
-
- printk(".\n");
-}
-
-/* toi_get_modules
- *
- * Take a reference to modules so they can't go away under us.
- */
-
-int toi_get_modules(void)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list) {
- struct toi_module_ops *this_module2;
-
- if (try_module_get(this_module->module))
- continue;
-
- /* Failed! Reverse gets and return error */
- list_for_each_entry(this_module2, &toi_modules,
- module_list) {
- if (this_module == this_module2)
- return -EINVAL;
- module_put(this_module2->module);
- }
- }
- return 0;
-}
-
-/* toi_put_modules
- *
- * Release our references to modules we used.
- */
-
-void toi_put_modules(void)
-{
- struct toi_module_ops *this_module;
-
- list_for_each_entry(this_module, &toi_modules, module_list)
- module_put(this_module->module);
-}
diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h
deleted file mode 100644
index 34ffe2ee3..000000000
--- a/kernel/power/tuxonice_modules.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * kernel/power/tuxonice_modules.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * It contains declarations for modules. Modules are additions to
- * TuxOnIce that provide facilities such as image compression or
- * encryption, backends for storage of the image and user interfaces.
- *
- */
-
-#ifndef TOI_MODULES_H
-#define TOI_MODULES_H
-
-/* This is the maximum size we store in the image header for a module name */
-#define TOI_MAX_MODULE_NAME_LENGTH 30
-
-struct toi_boot_kernel_data;
-
-/* Per-module metadata */
-struct toi_module_header {
- char name[TOI_MAX_MODULE_NAME_LENGTH];
- int enabled;
- int type;
- int index;
- int data_length;
- unsigned long signature;
-};
-
-enum {
- FILTER_MODULE,
- WRITER_MODULE,
- BIO_ALLOCATOR_MODULE,
- MISC_MODULE,
- MISC_HIDDEN_MODULE,
-};
-
-enum {
- TOI_ASYNC,
- TOI_SYNC
-};
-
-enum {
- TOI_VIRT,
- TOI_PAGE,
-};
-
-#define TOI_MAP(type, addr) \
- (type == TOI_PAGE ? kmap(addr) : addr)
-
-#define TOI_UNMAP(type, addr) \
- do { \
- if (type == TOI_PAGE) \
- kunmap(addr); \
- } while(0)
-
-struct toi_module_ops {
- /* Functions common to all modules */
- int type;
- char *name;
- char *directory;
- char *shared_directory;
- struct kobject *dir_kobj;
- struct module *module;
- int enabled, early, initialised;
- struct list_head module_list;
-
- /* List of filters or allocators */
- struct list_head list, type_list;
-
- /*
- * Requirements for memory and storage in
- * the image header..
- */
- int (*memory_needed) (void);
- int (*storage_needed) (void);
-
- int header_requested, header_used;
-
- int (*expected_compression) (void);
-
- /*
- * Debug info
- */
- int (*print_debug_info) (char *buffer, int size);
- int (*save_config_info) (char *buffer);
- void (*load_config_info) (char *buffer, int len);
-
- /*
- * Initialise & cleanup - general routines called
- * at the start and end of a cycle.
- */
- int (*initialise) (int starting_cycle);
- void (*cleanup) (int finishing_cycle);
-
- void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd);
- void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd);
-
- /*
- * Calls for allocating storage (allocators only).
- *
- * Header space is requested separately and cannot fail, but the
- * reservation is only applied when main storage is allocated.
- * The header space reservation is thus always set prior to
- * requesting the allocation of storage - and prior to querying
- * how much storage is available.
- */
-
- unsigned long (*storage_available) (void);
- void (*reserve_header_space) (unsigned long space_requested);
- int (*register_storage) (void);
- int (*allocate_storage) (unsigned long space_requested);
- unsigned long (*storage_allocated) (void);
- void (*free_unused_storage) (void);
-
- /*
- * Routines used in image I/O.
- */
- int (*rw_init) (int rw, int stream_number);
- int (*rw_cleanup) (int rw);
- int (*write_page) (unsigned long index, int buf_type, void *buf,
- unsigned int buf_size);
- int (*read_page) (unsigned long *index, int buf_type, void *buf,
- unsigned int *buf_size);
- int (*io_flusher) (int rw);
-
- /* Reset module if image exists but reading aborted */
- void (*noresume_reset) (void);
-
- /* Read and write the metadata */
- int (*write_header_init) (void);
- int (*write_header_cleanup) (void);
-
- int (*read_header_init) (void);
- int (*read_header_cleanup) (void);
-
- /* To be called after read_header_init */
- int (*get_header_version) (void);
-
- int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
- char *buffer_start, int buffer_size);
-
- int (*rw_header_chunk_noreadahead) (int rw,
- struct toi_module_ops *owner, char *buffer_start,
- int buffer_size);
-
- /* Attempt to parse an image location */
- int (*parse_sig_location) (char *buffer, int only_writer, int quiet);
-
- /* Throttle I/O according to throughput */
- void (*update_throughput_throttle) (int jif_index);
-
- /* Flush outstanding I/O */
- int (*finish_all_io) (void);
-
- /* Determine whether image exists that we can restore */
- int (*image_exists) (int quiet);
-
- /* Mark the image as having tried to resume */
- int (*mark_resume_attempted) (int);
-
- /* Destroy image if one exists */
- int (*remove_image) (void);
-
- /* Sysfs Data */
- struct toi_sysfs_data *sysfs_data;
- int num_sysfs_entries;
-
- /* Block I/O allocator */
- struct toi_bio_allocator_ops *bio_allocator_ops;
-};
-
-extern int toi_num_modules, toiNumAllocators;
-
-extern struct toi_module_ops *toiActiveAllocator;
-extern struct list_head toi_filters, toiAllocators, toi_modules;
-
-extern void toi_prepare_console_modules(void);
-extern void toi_cleanup_console_modules(void);
-
-extern struct toi_module_ops *toi_find_module_given_name(char *name);
-extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *);
-
-extern int toi_register_module(struct toi_module_ops *module);
-extern void toi_move_module_tail(struct toi_module_ops *module);
-
-extern long toi_header_storage_for_modules(void);
-extern long toi_memory_for_modules(int print_parts);
-extern void print_toi_header_storage_for_modules(void);
-extern int toi_expected_compression_ratio(void);
-
-extern int toi_print_module_debug_info(char *buffer, int buffer_size);
-extern int toi_register_module(struct toi_module_ops *module);
-extern void toi_unregister_module(struct toi_module_ops *module);
-
-extern int toi_initialise_modules(int starting_cycle, int early);
-#define toi_initialise_modules_early(starting) \
- toi_initialise_modules(starting, 1)
-#define toi_initialise_modules_late(starting) \
- toi_initialise_modules(starting, 0)
-extern void toi_cleanup_modules(int finishing_cycle);
-
-extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
-extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
-
-extern void toi_print_modules(void);
-
-int toi_get_modules(void);
-void toi_put_modules(void);
-#endif
diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
deleted file mode 100644
index 0db58af8b..000000000
--- a/kernel/power/tuxonice_netlink.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * kernel/power/tuxonice_netlink.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Functions for communicating with a userspace helper via netlink.
- */
-
-#include <linux/suspend.h>
-#include <linux/sched.h>
-#include <linux/kmod.h>
-#include "tuxonice_netlink.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-
-static struct user_helper_data *uhd_list;
-
-/*
- * Refill our pool of SKBs for use in emergencies (eg, when eating memory and
- * none can be allocated).
- */
-static void toi_fill_skb_pool(struct user_helper_data *uhd)
-{
- while (uhd->pool_level < uhd->pool_limit) {
- struct sk_buff *new_skb =
- alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
-
- if (!new_skb)
- break;
-
- new_skb->next = uhd->emerg_skbs;
- uhd->emerg_skbs = new_skb;
- uhd->pool_level++;
- }
-}
-
-/*
- * Try to allocate a single skb. If we can't get one, try to use one from
- * our pool.
- */
-static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
-{
- struct sk_buff *skb =
- alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
-
- if (skb)
- return skb;
-
- skb = uhd->emerg_skbs;
- if (skb) {
- uhd->pool_level--;
- uhd->emerg_skbs = skb->next;
- skb->next = NULL;
- }
-
- return skb;
-}
-
-void toi_send_netlink_message(struct user_helper_data *uhd,
- int type, void *params, size_t len)
-{
- struct sk_buff *skb;
- struct nlmsghdr *nlh;
- void *dest;
- struct task_struct *t;
-
- if (uhd->pid == -1)
- return;
-
- if (uhd->debug)
- printk(KERN_ERR "toi_send_netlink_message: Send "
- "message type %d.\n", type);
-
- skb = toi_get_skb(uhd);
- if (!skb) {
- printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
- return;
- }
-
- nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0);
- uhd->sock_seq++;
-
- dest = NLMSG_DATA(nlh);
- if (params && len > 0)
- memcpy(dest, params, len);
-
- netlink_unicast(uhd->nl, skb, uhd->pid, 0);
-
- toi_read_lock_tasklist();
- t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
- if (!t) {
- toi_read_unlock_tasklist();
- if (uhd->pid > -1)
- printk(KERN_INFO "Hmm. Can't find the userspace task"
- " %d.\n", uhd->pid);
- return;
- }
- wake_up_process(t);
- toi_read_unlock_tasklist();
-
- yield();
-}
-
-static void send_whether_debugging(struct user_helper_data *uhd)
-{
- static u8 is_debugging = 1;
-
- toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
- &is_debugging, sizeof(u8));
-}
-
-/*
- * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
- * are hibernating.
- */
-static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid)
-{
- struct task_struct *t;
-
- if (uhd->debug)
- printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid);
-
- toi_read_lock_tasklist();
- t = find_task_by_pid_ns(pid, &init_pid_ns);
- if (!t) {
- toi_read_unlock_tasklist();
- printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
- pid);
- return -EINVAL;
- }
-
- t->flags |= PF_NOFREEZE;
-
- toi_read_unlock_tasklist();
- uhd->pid = pid;
-
- toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
-
- return 0;
-}
-
-/*
- * Called when the userspace process has informed us that it's ready to roll.
- */
-static int nl_ready(struct user_helper_data *uhd, u32 version)
-{
- if (version != uhd->interface_version) {
- printk(KERN_INFO "%s userspace process using invalid interface"
- " version (%d - kernel wants %d). Trying to "
- "continue without it.\n",
- uhd->name, version, uhd->interface_version);
- if (uhd->not_ready)
- uhd->not_ready();
- return -EINVAL;
- }
-
- complete(&uhd->wait_for_process);
-
- return 0;
-}
-
-void toi_netlink_close_complete(struct user_helper_data *uhd)
-{
- if (uhd->nl) {
- netlink_kernel_release(uhd->nl);
- uhd->nl = NULL;
- }
-
- while (uhd->emerg_skbs) {
- struct sk_buff *next = uhd->emerg_skbs->next;
- kfree_skb(uhd->emerg_skbs);
- uhd->emerg_skbs = next;
- }
-
- uhd->pid = -1;
-}
-
-static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
- struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- int type = nlh->nlmsg_type;
- int *data;
- int err;
-
- if (uhd->debug)
- printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n",
- type);
-
- /* Let the more specific handler go first. It returns
- * 1 for valid messages that it doesn't know. */
- err = uhd->rcv_msg(skb, nlh);
- if (err != 1)
- return err;
-
- /* Only allow one task to receive NOFREEZE privileges */
- if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
- printk(KERN_INFO "Received extra nofreeze me requests.\n");
- return -EBUSY;
- }
-
- data = NLMSG_DATA(nlh);
-
- switch (type) {
- case NETLINK_MSG_NOFREEZE_ME:
- return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
- case NETLINK_MSG_GET_DEBUGGING:
- send_whether_debugging(uhd);
- return 0;
- case NETLINK_MSG_READY:
- if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) {
- printk(KERN_INFO "Invalid ready mesage.\n");
- if (uhd->not_ready)
- uhd->not_ready();
- return -EINVAL;
- }
- return nl_ready(uhd, (u32) *data);
- case NETLINK_MSG_CLEANUP:
- toi_netlink_close_complete(uhd);
- return 0;
- }
-
- return -EINVAL;
-}
-
-static void toi_user_rcv_skb(struct sk_buff *skb)
-{
- int err;
- struct nlmsghdr *nlh;
- struct user_helper_data *uhd = uhd_list;
-
- while (uhd && uhd->netlink_id != skb->sk->sk_protocol)
- uhd = uhd->next;
-
- if (!uhd)
- return;
-
- while (skb->len >= NLMSG_SPACE(0)) {
- u32 rlen;
-
- nlh = (struct nlmsghdr *) skb->data;
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
-
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
-
- err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
- if (err)
- netlink_ack(skb, nlh, err);
- else if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
- }
-}
-
-static int netlink_prepare(struct user_helper_data *uhd)
-{
- struct netlink_kernel_cfg cfg = {
- .groups = 0,
- .input = toi_user_rcv_skb,
- };
-
- uhd->next = uhd_list;
- uhd_list = uhd;
-
- uhd->sock_seq = 0x42c0ffee;
- uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg);
- if (!uhd->nl) {
- printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
- uhd->name);
- return -ENOMEM;
- }
-
- toi_fill_skb_pool(uhd);
-
- return 0;
-}
-
-void toi_netlink_close(struct user_helper_data *uhd)
-{
- struct task_struct *t;
-
- toi_read_lock_tasklist();
- t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
- if (t)
- t->flags &= ~PF_NOFREEZE;
- toi_read_unlock_tasklist();
-
- toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
-}
-int toi_netlink_setup(struct user_helper_data *uhd)
-{
- /* In case userui didn't cleanup properly on us */
- toi_netlink_close_complete(uhd);
-
- if (netlink_prepare(uhd) < 0) {
- printk(KERN_INFO "Netlink prepare failed.\n");
- return 1;
- }
-
- if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
- UMH_WAIT_EXEC, uhd->debug) < 0) {
- printk(KERN_INFO "Launch userspace program failed.\n");
- toi_netlink_close_complete(uhd);
- return 1;
- }
-
- /* Wait 2 seconds for the userspace process to make contact */
- wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
-
- if (uhd->pid == -1) {
- printk(KERN_INFO "%s: Failed to contact userspace process.\n",
- uhd->name);
- toi_netlink_close_complete(uhd);
- return 1;
- }
-
- return 0;
-}
diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h
deleted file mode 100644
index 89e154599..000000000
--- a/kernel/power/tuxonice_netlink.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * kernel/power/tuxonice_netlink.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Declarations for functions for communicating with a userspace helper
- * via netlink.
- */
-
-#include <linux/netlink.h>
-#include <net/sock.h>
-
-#define NETLINK_MSG_BASE 0x10
-
-#define NETLINK_MSG_READY 0x10
-#define NETLINK_MSG_NOFREEZE_ME 0x16
-#define NETLINK_MSG_GET_DEBUGGING 0x19
-#define NETLINK_MSG_CLEANUP 0x24
-#define NETLINK_MSG_NOFREEZE_ACK 0x27
-#define NETLINK_MSG_IS_DEBUGGING 0x28
-
-struct user_helper_data {
- int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
- void (*not_ready) (void);
- struct sock *nl;
- u32 sock_seq;
- pid_t pid;
- char *comm;
- char program[256];
- int pool_level;
- int pool_limit;
- struct sk_buff *emerg_skbs;
- int skb_size;
- int netlink_id;
- char *name;
- struct user_helper_data *next;
- struct completion wait_for_process;
- u32 interface_version;
- int must_init;
- int debug;
-};
-
-#ifdef CONFIG_NET
-int toi_netlink_setup(struct user_helper_data *uhd);
-void toi_netlink_close(struct user_helper_data *uhd);
-void toi_send_netlink_message(struct user_helper_data *uhd,
- int type, void *params, size_t len);
-void toi_netlink_close_complete(struct user_helper_data *uhd);
-#else
-static inline int toi_netlink_setup(struct user_helper_data *uhd)
-{
- return 0;
-}
-
-static inline void toi_netlink_close(struct user_helper_data *uhd) { };
-static inline void toi_send_netlink_message(struct user_helper_data *uhd,
- int type, void *params, size_t len) { };
-static inline void toi_netlink_close_complete(struct user_helper_data *uhd)
- { };
-#endif
diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c
deleted file mode 100644
index 9ea185af1..000000000
--- a/kernel/power/tuxonice_pagedir.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * kernel/power/tuxonice_pagedir.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for handling pagesets.
- * Note that pbes aren't actually stored as such. They're stored as
- * bitmaps and extents.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/hardirq.h>
-#include <linux/sched.h>
-#include <linux/cpu.h>
-#include <asm/tlbflush.h>
-
-#include "tuxonice_pageflags.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_pagedir.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice.h"
-#include "tuxonice_builtin.h"
-#include "tuxonice_alloc.h"
-
-static int ptoi_pfn;
-static struct pbe *this_low_pbe;
-static struct pbe **last_low_pbe_ptr;
-
-void toi_reset_alt_image_pageset2_pfn(void)
-{
- memory_bm_position_reset(pageset2_map);
-}
-
-static struct page *first_conflicting_page;
-
-/*
- * free_conflicting_pages
- */
-
-static void free_conflicting_pages(void)
-{
- while (first_conflicting_page) {
- struct page *next =
- *((struct page **) kmap(first_conflicting_page));
- kunmap(first_conflicting_page);
- toi__free_page(29, first_conflicting_page);
- first_conflicting_page = next;
- }
-}
-
-/* __toi_get_nonconflicting_page
- *
- * Description: Gets order zero pages that won't be overwritten
- * while copying the original pages.
- */
-
-struct page *___toi_get_nonconflicting_page(int can_be_highmem)
-{
- struct page *page;
- gfp_t flags = TOI_ATOMIC_GFP;
- if (can_be_highmem)
- flags |= __GFP_HIGHMEM;
-
-
- if (test_toi_state(TOI_LOADING_ALT_IMAGE) &&
- pageset2_map && ptoi_pfn) {
- do {
- ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0);
- if (ptoi_pfn != BM_END_OF_MAP) {
- page = pfn_to_page(ptoi_pfn);
- if (!PagePageset1(page) &&
- (can_be_highmem || !PageHighMem(page)))
- return page;
- }
- } while (ptoi_pfn);
- }
-
- do {
- page = toi_alloc_page(29, flags | __GFP_ZERO);
- if (!page) {
- printk(KERN_INFO "Failed to get nonconflicting "
- "page.\n");
- return NULL;
- }
- if (PagePageset1(page)) {
- struct page **next = (struct page **) kmap(page);
- *next = first_conflicting_page;
- first_conflicting_page = page;
- kunmap(page);
- }
- } while (PagePageset1(page));
-
- return page;
-}
-
-unsigned long __toi_get_nonconflicting_page(void)
-{
- struct page *page = ___toi_get_nonconflicting_page(0);
- return page ? (unsigned long) page_address(page) : 0;
-}
-
-static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe,
- int highmem)
-{
- if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1))
- + 2 * sizeof(struct pbe)) > PAGE_SIZE) {
- struct page *new_page =
- ___toi_get_nonconflicting_page(highmem);
- if (!new_page)
- return ERR_PTR(-ENOMEM);
- this_pbe = (struct pbe *) kmap(new_page);
- memset(this_pbe, 0, PAGE_SIZE);
- *page_ptr = new_page;
- } else
- this_pbe++;
-
- return this_pbe;
-}
-
-/**
- * get_pageset1_load_addresses - generate pbes for conflicting pages
- *
- * We check here that pagedir & pages it points to won't collide
- * with pages where we're going to restore from the loaded pages
- * later.
- *
- * Returns:
- * Zero on success, one if couldn't find enough pages (shouldn't
- * happen).
- **/
-int toi_get_pageset1_load_addresses(void)
-{
- int pfn, highallocd = 0, lowallocd = 0;
- int low_needed = pagedir1.size - get_highmem_size(pagedir1);
- int high_needed = get_highmem_size(pagedir1);
- int low_pages_for_highmem = 0;
- gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM;
- struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL,
- *low_pbe_page, *last_low_pbe_page = NULL;
- struct pbe **last_high_pbe_ptr = &restore_highmem_pblist,
- *this_high_pbe = NULL;
- unsigned long orig_low_pfn, orig_high_pfn;
- int high_pbes_done = 0, low_pbes_done = 0;
- int low_direct = 0, high_direct = 0, result = 0, i;
- int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0;
-
- toi_trace_index++;
-
- memory_bm_position_reset(pageset1_map);
- memory_bm_position_reset(pageset1_copy_map);
-
- last_low_pbe_ptr = &restore_pblist;
-
- /* First, allocate pages for the start of our pbe lists. */
- if (high_needed) {
- high_pbe_page = ___toi_get_nonconflicting_page(1);
- if (!high_pbe_page) {
- result = -ENOMEM;
- goto out;
- }
- this_high_pbe = (struct pbe *) kmap(high_pbe_page);
- memset(this_high_pbe, 0, PAGE_SIZE);
- }
-
- low_pbe_page = ___toi_get_nonconflicting_page(0);
- if (!low_pbe_page) {
- result = -ENOMEM;
- goto out;
- }
- this_low_pbe = (struct pbe *) page_address(low_pbe_page);
-
- /*
- * Next, allocate the number of pages we need.
- */
-
- i = low_needed + high_needed;
-
- do {
- int is_high;
-
- if (i == low_needed)
- flags &= ~__GFP_HIGHMEM;
-
- page = toi_alloc_page(30, flags);
- BUG_ON(!page);
-
- SetPagePageset1Copy(page);
- is_high = PageHighMem(page);
-
- if (PagePageset1(page)) {
- if (is_high)
- high_direct++;
- else
- low_direct++;
- } else {
- if (is_high)
- highallocd++;
- else
- lowallocd++;
- }
- } while (--i);
-
- high_needed -= high_direct;
- low_needed -= low_direct;
-
- /*
- * Do we need to use some lowmem pages for the copies of highmem
- * pages?
- */
- if (high_needed > highallocd) {
- low_pages_for_highmem = high_needed - highallocd;
- high_needed -= low_pages_for_highmem;
- low_needed += low_pages_for_highmem;
- }
-
- /*
- * Now generate our pbes (which will be used for the atomic restore),
- * and free unneeded pages.
- */
- memory_bm_position_reset(pageset1_copy_map);
- for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP;
- pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) {
- int is_high;
- page = pfn_to_page(pfn);
- is_high = PageHighMem(page);
-
- if (PagePageset1(page))
- continue;
-
- /* Nope. We're going to use this page. Add a pbe. */
- if (is_high || low_pages_for_highmem) {
- struct page *orig_page;
- high_pbes_done++;
- if (!is_high)
- low_pages_for_highmem--;
- do {
- orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0);
- BUG_ON(orig_high_pfn == BM_END_OF_MAP);
- orig_page = pfn_to_page(orig_high_pfn);
- } while (!PageHighMem(orig_page) ||
- PagePageset1Copy(orig_page));
-
- this_high_pbe->orig_address = (void *) orig_high_pfn;
- this_high_pbe->address = page;
- this_high_pbe->next = NULL;
- toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p",
- high_page, high_offset, page, orig_high_pfn, orig_page);
- if (last_high_pbe_page != high_pbe_page) {
- *last_high_pbe_ptr =
- (struct pbe *) high_pbe_page;
- if (last_high_pbe_page) {
- kunmap(last_high_pbe_page);
- high_page++;
- high_offset = 0;
- } else
- high_offset++;
- last_high_pbe_page = high_pbe_page;
- } else {
- *last_high_pbe_ptr = this_high_pbe;
- high_offset++;
- }
- last_high_pbe_ptr = &this_high_pbe->next;
- this_high_pbe = get_next_pbe(&high_pbe_page,
- this_high_pbe, 1);
- if (IS_ERR(this_high_pbe)) {
- printk(KERN_INFO
- "This high pbe is an error.\n");
- return -ENOMEM;
- }
- } else {
- struct page *orig_page;
- low_pbes_done++;
- do {
- orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0);
- BUG_ON(orig_low_pfn == BM_END_OF_MAP);
- orig_page = pfn_to_page(orig_low_pfn);
- } while (PageHighMem(orig_page) ||
- PagePageset1Copy(orig_page));
-
- this_low_pbe->orig_address = page_address(orig_page);
- this_low_pbe->address = page_address(page);
- this_low_pbe->next = NULL;
- toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p",
- low_page, low_offset, this_low_pbe->orig_address,
- orig_low_pfn, this_low_pbe->address);
- TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address);
- *last_low_pbe_ptr = this_low_pbe;
- last_low_pbe_ptr = &this_low_pbe->next;
- this_low_pbe = get_next_pbe(&low_pbe_page,
- this_low_pbe, 0);
- if (low_pbe_page != last_low_pbe_page) {
- if (last_low_pbe_page) {
- low_page++;
- low_offset = 0;
- } else {
- low_offset++;
- }
- last_low_pbe_page = low_pbe_page;
- } else
- low_offset++;
- if (IS_ERR(this_low_pbe)) {
- printk(KERN_INFO "this_low_pbe is an error.\n");
- return -ENOMEM;
- }
- }
- }
-
- if (high_pbe_page)
- kunmap(high_pbe_page);
-
- if (last_high_pbe_page != high_pbe_page) {
- if (last_high_pbe_page)
- kunmap(last_high_pbe_page);
- toi__free_page(29, high_pbe_page);
- }
-
- free_conflicting_pages();
-
-out:
- return result;
-}
-
-int add_boot_kernel_data_pbe(void)
-{
- this_low_pbe->address = (char *) __toi_get_nonconflicting_page();
- if (!this_low_pbe->address) {
- printk(KERN_INFO "Failed to get bkd atomic restore buffer.");
- return -ENOMEM;
- }
-
- toi_bkd.size = sizeof(toi_bkd);
- memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd));
-
- *last_low_pbe_ptr = this_low_pbe;
- this_low_pbe->orig_address = (char *) boot_kernel_data_buffer;
- this_low_pbe->next = NULL;
- return 0;
-}
diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h
deleted file mode 100644
index 80d1a3d8c..000000000
--- a/kernel/power/tuxonice_pagedir.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * kernel/power/tuxonice_pagedir.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Declarations for routines for handling pagesets.
- */
-
-#ifndef KERNEL_POWER_PAGEDIR_H
-#define KERNEL_POWER_PAGEDIR_H
-
-/* Pagedir
- *
- * Contains the metadata for a set of pages saved in the image.
- */
-
-struct pagedir {
- int id;
- unsigned long size;
-#ifdef CONFIG_HIGHMEM
- unsigned long size_high;
-#endif
-};
-
-#ifdef CONFIG_HIGHMEM
-#define get_highmem_size(pagedir) (pagedir.size_high)
-#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0)
-#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0)
-#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high)
-#else
-#define get_highmem_size(pagedir) (0)
-#define set_highmem_size(pagedir, sz) do { } while (0)
-#define inc_highmem_size(pagedir) do { } while (0)
-#define get_lowmem_size(pagedir) (pagedir.size)
-#endif
-
-extern struct pagedir pagedir1, pagedir2;
-
-extern void toi_copy_pageset1(void);
-
-extern int toi_get_pageset1_load_addresses(void);
-
-extern unsigned long __toi_get_nonconflicting_page(void);
-struct page *___toi_get_nonconflicting_page(int can_be_highmem);
-
-extern void toi_reset_alt_image_pageset2_pfn(void);
-extern int add_boot_kernel_data_pbe(void);
-#endif
diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c
deleted file mode 100644
index 307d09f33..000000000
--- a/kernel/power/tuxonice_pageflags.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * kernel/power/tuxonice_pageflags.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for serialising and relocating pageflags in which we
- * store our image metadata.
- */
-
-#include "tuxonice_pageflags.h"
-#include "power.h"
-
-int toi_pageflags_space_needed(void)
-{
- return memory_bm_space_needed(pageset1_map);
-}
diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h
deleted file mode 100644
index 30ee577c3..000000000
--- a/kernel/power/tuxonice_pageflags.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * kernel/power/tuxonice_pageflags.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H
-#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H
-
-struct memory_bitmap;
-void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
-void memory_bm_clear(struct memory_bitmap *bm);
-
-int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
-unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index);
-void memory_bm_position_reset(struct memory_bitmap *bm);
-void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
-int toi_alloc_bitmap(struct memory_bitmap **bm);
-void toi_free_bitmap(struct memory_bitmap **bm);
-void memory_bm_clear(struct memory_bitmap *bm);
-void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
-int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
-void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
-
-struct toi_module_ops;
-int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
-int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
- (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
-int memory_bm_space_needed(struct memory_bitmap *bm);
-
-extern struct memory_bitmap *pageset1_map;
-extern struct memory_bitmap *pageset1_copy_map;
-extern struct memory_bitmap *pageset2_map;
-extern struct memory_bitmap *page_resave_map;
-extern struct memory_bitmap *io_map;
-extern struct memory_bitmap *nosave_map;
-extern struct memory_bitmap *free_map;
-extern struct memory_bitmap *compare_map;
-
-#define PagePageset1(page) \
- (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset1(page) \
- (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset1(page) \
- (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PagePageset1Copy(page) \
- (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset1Copy(page) \
- (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset1Copy(page) \
- (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PagePageset2(page) \
- (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPagePageset2(page) \
- (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPagePageset2(page) \
- (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageWasRW(page) \
- (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define SetPageWasRW(page) \
- (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageWasRW(page) \
- (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageResave(page) (page_resave_map ? \
- memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageResave(page) \
- (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageResave(page) \
- (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageNosave(page) (nosave_map ? \
- memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageNosave(page) \
- (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageNosave(page) \
- (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageNosaveFree(page) (free_map ? \
- memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageNosaveFree(page) \
- (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageNosaveFree(page) \
- (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page)))
-
-#define PageCompareChanged(page) (compare_map ? \
- memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0)
-#define SetPageCompareChanged(page) \
- (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
-#define ClearPageCompareChanged(page) \
- (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
-
-extern void save_pageflags(struct memory_bitmap *pagemap);
-extern int load_pageflags(struct memory_bitmap *pagemap);
-extern int toi_pageflags_space_needed(void);
-#endif
diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c
deleted file mode 100644
index f8e969625..000000000
--- a/kernel/power/tuxonice_power_off.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * kernel/power/tuxonice_power_off.c
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Support for powering down.
- */
-
-#include <linux/device.h>
-#include <linux/suspend.h>
-#include <linux/mm.h>
-#include <linux/pm.h>
-#include <linux/reboot.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
-#include <linux/fs.h>
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-
-unsigned long toi_poweroff_method; /* 0 - Kernel power off */
-
-static int wake_delay;
-static char lid_state_file[256], wake_alarm_dir[256];
-static struct file *lid_file, *alarm_file, *epoch_file;
-static int post_wake_state = -1;
-
-static int did_suspend_to_both;
-
-/*
- * __toi_power_down
- * Functionality : Powers down or reboots the computer once the image
- * has been written to disk.
- * Key Assumptions : Able to reboot/power down via code called or that
- * the warning emitted if the calls fail will be visible
- * to the user (ie printk resumes devices).
- */
-
-static void __toi_power_down(int method)
-{
- int error;
-
- toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." :
- "Powering down.");
-
- if (test_result_state(TOI_ABORTED))
- goto out;
-
- if (test_action_state(TOI_REBOOT))
- kernel_restart(NULL);
-
- switch (method) {
- case 0:
- break;
- case 3:
- /*
- * Re-read the overwritten part of pageset2 to make post-resume
- * faster.
- */
- if (read_pageset2(1))
- panic("Attempt to reload pagedir 2 failed. "
- "Try rebooting.");
-
- pm_prepare_console();
-
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (!error) {
- pm_restore_gfp_mask();
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- pm_restrict_gfp_mask();
- if (!error)
- did_suspend_to_both = 1;
- }
- pm_notifier_call_chain(PM_POST_SUSPEND);
- pm_restore_console();
-
- /* Success - we're now post-resume-from-ram */
- if (did_suspend_to_both)
- return;
-
- /* Failed to suspend to ram - do normal power off */
- break;
- case 4:
- /*
- * If succeeds, doesn't return. If fails, do a simple
- * powerdown.
- */
- hibernation_platform_enter();
- break;
- case 5:
- /* Historic entry only now */
- break;
- }
-
- if (method && method != 5)
- toi_cond_pause(1,
- "Falling back to alternate power off method.");
-
- if (test_result_state(TOI_ABORTED))
- goto out;
-
- if (pm_power_off)
- kernel_power_off();
- kernel_halt();
- toi_cond_pause(1, "Powerdown failed.");
- while (1)
- cpu_relax();
-
-out:
- if (read_pageset2(1))
- panic("Attempt to reload pagedir 2 failed. Try rebooting.");
- return;
-}
-
-#define CLOSE_FILE(file) \
- if (file) { \
- filp_close(file, NULL); file = NULL; \
- }
-
-static void powerdown_cleanup(int toi_or_resume)
-{
- if (!toi_or_resume)
- return;
-
- CLOSE_FILE(lid_file);
- CLOSE_FILE(alarm_file);
- CLOSE_FILE(epoch_file);
-}
-
-static void open_file(char *format, char *arg, struct file **var, int mode,
- char *desc)
-{
- char buf[256];
-
- if (strlen(arg)) {
- sprintf(buf, format, arg);
- *var = filp_open(buf, mode, 0);
- if (IS_ERR(*var) || !*var) {
- printk(KERN_INFO "Failed to open %s file '%s' (%p).\n",
- desc, buf, *var);
- *var = NULL;
- }
- }
-}
-
-static int powerdown_init(int toi_or_resume)
-{
- if (!toi_or_resume)
- return 0;
-
- did_suspend_to_both = 0;
-
- open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file,
- O_RDONLY, "lid");
-
- if (strlen(wake_alarm_dir)) {
- open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir,
- &alarm_file, O_WRONLY, "alarm");
-
- open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir,
- &epoch_file, O_RDONLY, "epoch");
- }
-
- return 0;
-}
-
-static int lid_closed(void)
-{
- char array[25];
- ssize_t size;
- loff_t pos = 0;
-
- if (!lid_file)
- return 0;
-
- size = vfs_read(lid_file, (char __user *) array, 25, &pos);
- if ((int) size < 1) {
- printk(KERN_INFO "Failed to read lid state file (%d).\n",
- (int) size);
- return 0;
- }
-
- if (!strcmp(array, "state: closed\n"))
- return 1;
-
- return 0;
-}
-
-static void write_alarm_file(int value)
-{
- ssize_t size;
- char buf[40];
- loff_t pos = 0;
-
- if (!alarm_file)
- return;
-
- sprintf(buf, "%d\n", value);
-
- size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos);
-
- if (size < 0)
- printk(KERN_INFO "Error %d writing alarm value %s.\n",
- (int) size, buf);
-}
-
-/**
- * toi_check_resleep: See whether to powerdown again after waking.
- *
- * After waking, check whether we should powerdown again in a (usually
- * different) way. We only do this if the lid switch is still closed.
- */
-void toi_check_resleep(void)
-{
- /* We only return if we suspended to ram and woke. */
- if (lid_closed() && post_wake_state >= 0)
- __toi_power_down(post_wake_state);
-}
-
-void toi_power_down(void)
-{
- if (alarm_file && wake_delay) {
- char array[25];
- loff_t pos = 0;
- size_t size = vfs_read(epoch_file, (char __user *) array, 25,
- &pos);
-
- if (((int) size) < 1)
- printk(KERN_INFO "Failed to read epoch file (%d).\n",
- (int) size);
- else {
- unsigned long since_epoch;
- if (!kstrtoul(array, 0, &since_epoch)) {
- /* Clear any wakeup time. */
- write_alarm_file(0);
-
- /* Set new wakeup time. */
- write_alarm_file(since_epoch + wake_delay);
- }
- }
- }
-
- __toi_power_down(toi_poweroff_method);
-
- toi_check_resleep();
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_ACPI)
- SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL),
- SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL),
- SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL),
- SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0,
- NULL),
- SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0),
- SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both,
- 0, 0, 0, NULL)
-#endif
-};
-
-static struct toi_module_ops powerdown_ops = {
- .type = MISC_HIDDEN_MODULE,
- .name = "poweroff",
- .initialise = powerdown_init,
- .cleanup = powerdown_cleanup,
- .directory = "[ROOT]",
- .module = THIS_MODULE,
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-int toi_poweroff_init(void)
-{
- return toi_register_module(&powerdown_ops);
-}
-
-void toi_poweroff_exit(void)
-{
- toi_unregister_module(&powerdown_ops);
-}
diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h
deleted file mode 100644
index 6e1d8bb39..000000000
--- a/kernel/power/tuxonice_power_off.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * kernel/power/tuxonice_power_off.h
- *
- * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Support for the powering down.
- */
-
-int toi_pm_state_finish(void);
-void toi_power_down(void);
-extern unsigned long toi_poweroff_method;
-int toi_poweroff_init(void);
-void toi_poweroff_exit(void);
-void toi_check_resleep(void);
-
-extern int platform_begin(int platform_mode);
-extern int platform_pre_snapshot(int platform_mode);
-extern void platform_leave(int platform_mode);
-extern void platform_end(int platform_mode);
-extern void platform_finish(int platform_mode);
-extern int platform_pre_restore(int platform_mode);
-extern void platform_restore_cleanup(int platform_mode);
diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c
deleted file mode 100644
index e0593252f..000000000
--- a/kernel/power/tuxonice_prepare_image.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- * kernel/power/tuxonice_prepare_image.c
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * We need to eat memory until we can:
- * 1. Perform the save without changing anything (RAM_NEEDED < #pages)
- * 2. Fit it all in available space (toiActiveAllocator->available_space() >=
- * main_storage_needed())
- * 3. Reload the pagedir and pageset1 to places that don't collide with their
- * final destinations, not knowing to what extent the resumed kernel will
- * overlap with the one loaded at boot time. I think the resumed kernel
- * should overlap completely, but I don't want to rely on this as it is
- * an unproven assumption. We therefore assume there will be no overlap at
- * all (worse case).
- * 4. Meet the user's requested limit (if any) on the size of the image.
- * The limit is in MB, so pages/256 (assuming 4K pages).
- *
- */
-
-#include <linux/highmem.h>
-#include <linux/freezer.h>
-#include <linux/hardirq.h>
-#include <linux/mmzone.h>
-#include <linux/console.h>
-#include <linux/tuxonice.h>
-
-#include "tuxonice_pageflags.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_prepare_image.h"
-#include "tuxonice.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_checksum.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_atomic_copy.h"
-#include "tuxonice_builtin.h"
-
-static unsigned long num_nosave, main_storage_allocated, storage_limit,
- header_storage_needed;
-unsigned long extra_pd1_pages_allowance =
- CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE;
-long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT;
-static int no_ps2_needed;
-
-struct attention_list {
- struct task_struct *task;
- struct attention_list *next;
-};
-
-static struct attention_list *attention_list;
-
-#define PAGESET1 0
-#define PAGESET2 1
-
-void free_attention_list(void)
-{
- struct attention_list *last = NULL;
-
- while (attention_list) {
- last = attention_list;
- attention_list = attention_list->next;
- toi_kfree(6, last, sizeof(*last));
- }
-}
-
-static int build_attention_list(void)
-{
- int i, task_count = 0;
- struct task_struct *p;
- struct attention_list *next;
-
- /*
- * Count all userspace process (with task->mm) marked PF_NOFREEZE.
- */
- toi_read_lock_tasklist();
- for_each_process(p)
- if ((p->flags & PF_NOFREEZE) || p == current)
- task_count++;
- toi_read_unlock_tasklist();
-
- /*
- * Allocate attention list structs.
- */
- for (i = 0; i < task_count; i++) {
- struct attention_list *this =
- toi_kzalloc(6, sizeof(struct attention_list),
- TOI_WAIT_GFP);
- if (!this) {
- printk(KERN_INFO "Failed to allocate slab for "
- "attention list.\n");
- free_attention_list();
- return 1;
- }
- this->next = NULL;
- if (attention_list)
- this->next = attention_list;
- attention_list = this;
- }
-
- next = attention_list;
- toi_read_lock_tasklist();
- for_each_process(p)
- if ((p->flags & PF_NOFREEZE) || p == current) {
- next->task = p;
- next = next->next;
- }
- toi_read_unlock_tasklist();
- return 0;
-}
-
-static void pageset2_full(void)
-{
- struct zone *zone;
- struct page *page;
- unsigned long flags;
- int i;
-
- toi_trace_index++;
-
- for_each_populated_zone(zone) {
- spin_lock_irqsave(&zone->lru_lock, flags);
- for_each_lru(i) {
- if (!zone_page_state(zone, NR_LRU_BASE + i))
- continue;
-
- list_for_each_entry(page, &zone->lruvec.lists[i], lru) {
- struct address_space *mapping;
-
- mapping = page_mapping(page);
- if (!mapping || !mapping->host ||
- !(mapping->host->i_flags & S_ATOMIC_COPY)) {
- if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified.");
- } else {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full.");
- SetPagePageset2(page);
- }
- }
- }
- }
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- }
-}
-
-/*
- * toi_mark_task_as_pageset
- * Functionality : Marks all the saveable pages belonging to a given process
- * as belonging to a particular pageset.
- */
-
-static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm;
-
- mm = t->active_mm;
-
- if (!mm || !mm->mmap)
- return;
-
- toi_trace_index++;
-
- if (!irqs_disabled())
- down_read(&mm->mmap_sem);
-
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- unsigned long posn;
-
- if (!vma->vm_start ||
- vma->vm_flags & VM_PFNMAP)
- continue;
-
- for (posn = vma->vm_start; posn < vma->vm_end;
- posn += PAGE_SIZE) {
- struct page *page = follow_page(vma, posn, 0);
- struct address_space *mapping;
-
- if (!page || !pfn_valid(page_to_pfn(page)))
- continue;
-
- mapping = page_mapping(page);
- if (mapping && mapping->host &&
- mapping->host->i_flags & S_ATOMIC_COPY && pageset2)
- continue;
-
- if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2);
- continue;
- }
-
- if (pageset2) {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1");
- SetPagePageset2(page);
- } else {
- TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2");
- ClearPagePageset2(page);
- SetPagePageset1(page);
- }
- }
- }
-
- if (!irqs_disabled())
- up_read(&mm->mmap_sem);
-}
-
-static void mark_tasks(int pageset)
-{
- struct task_struct *p;
-
- toi_read_lock_tasklist();
- for_each_process(p) {
- if (!p->mm)
- continue;
-
- if (p->flags & PF_KTHREAD)
- continue;
-
- toi_mark_task_as_pageset(p, pageset);
- }
- toi_read_unlock_tasklist();
-
-}
-
-/* mark_pages_for_pageset2
- *
- * Description: Mark unshared pages in processes not needed for hibernate as
- * being able to be written out in a separate pagedir.
- * HighMem pages are simply marked as pageset2. They won't be
- * needed during hibernate.
- */
-
-static void toi_mark_pages_for_pageset2(void)
-{
- struct attention_list *this = attention_list;
-
- memory_bm_clear(pageset2_map);
-
- if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed)
- return;
-
- if (test_action_state(TOI_PAGESET2_FULL))
- pageset2_full();
- else
- mark_tasks(PAGESET2);
-
- /*
- * Because the tasks in attention_list are ones related to hibernating,
- * we know that they won't go away under us.
- */
-
- while (this) {
- if (!test_result_state(TOI_ABORTED))
- toi_mark_task_as_pageset(this->task, PAGESET1);
- this = this->next;
- }
-}
-
-/*
- * The atomic copy of pageset1 is stored in pageset2 pages.
- * But if pageset1 is larger (normally only just after boot),
- * we need to allocate extra pages to store the atomic copy.
- * The following data struct and functions are used to handle
- * the allocation and freeing of that memory.
- */
-
-static unsigned long extra_pages_allocated;
-
-struct extras {
- struct page *page;
- int order;
- struct extras *next;
-};
-
-static struct extras *extras_list;
-
-/* toi_free_extra_pagedir_memory
- *
- * Description: Free previously allocated extra pagedir memory.
- */
-void toi_free_extra_pagedir_memory(void)
-{
- /* Free allocated pages */
- while (extras_list) {
- struct extras *this = extras_list;
- int i;
-
- extras_list = this->next;
-
- for (i = 0; i < (1 << this->order); i++)
- ClearPageNosave(this->page + i);
-
- toi_free_pages(9, this->page, this->order);
- toi_kfree(7, this, sizeof(*this));
- }
-
- extra_pages_allocated = 0;
-}
-
-/* toi_allocate_extra_pagedir_memory
- *
- * Description: Allocate memory for making the atomic copy of pagedir1 in the
- * case where it is bigger than pagedir2.
- * Arguments: int num_to_alloc: Number of extra pages needed.
- * Result: int. Number of extra pages we now have allocated.
- */
-static int toi_allocate_extra_pagedir_memory(int extra_pages_needed)
-{
- int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated;
- gfp_t flags = TOI_ATOMIC_GFP;
-
- if (num_to_alloc < 1)
- return 0;
-
- order = fls(num_to_alloc);
- if (order >= MAX_ORDER)
- order = MAX_ORDER - 1;
-
- while (num_to_alloc) {
- struct page *newpage;
- unsigned long virt;
- struct extras *extras_entry;
-
- while ((1 << order) > num_to_alloc)
- order--;
-
- extras_entry = (struct extras *) toi_kzalloc(7,
- sizeof(struct extras), TOI_ATOMIC_GFP);
-
- if (!extras_entry)
- return extra_pages_allocated;
-
- virt = toi_get_free_pages(9, flags, order);
- while (!virt && order) {
- order--;
- virt = toi_get_free_pages(9, flags, order);
- }
-
- if (!virt) {
- toi_kfree(7, extras_entry, sizeof(*extras_entry));
- return extra_pages_allocated;
- }
-
- newpage = virt_to_page(virt);
-
- extras_entry->page = newpage;
- extras_entry->order = order;
- extras_entry->next = extras_list;
-
- extras_list = extras_entry;
-
- for (j = 0; j < (1 << order); j++) {
- SetPageNosave(newpage + j);
- SetPagePageset1Copy(newpage + j);
- }
-
- extra_pages_allocated += (1 << order);
- num_to_alloc -= (1 << order);
- }
-
- return extra_pages_allocated;
-}
-
-/*
- * real_nr_free_pages: Count pcp pages for a zone type or all zones
- * (-1 for all, otherwise zone_idx() result desired).
- */
-unsigned long real_nr_free_pages(unsigned long zone_idx_mask)
-{
- struct zone *zone;
- int result = 0, cpu;
-
- /* PCP lists */
- for_each_populated_zone(zone) {
- if (!(zone_idx_mask & (1 << zone_idx(zone))))
- continue;
-
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pset =
- per_cpu_ptr(zone->pageset, cpu);
- struct per_cpu_pages *pcp = &pset->pcp;
- result += pcp->count;
- }
-
- result += zone_page_state(zone, NR_FREE_PAGES);
- }
- return result;
-}
-
-/*
- * Discover how much extra memory will be required by the drivers
- * when they're asked to hibernate. We can then ensure that amount
- * of memory is available when we really want it.
- */
-static void get_extra_pd1_allowance(void)
-{
- unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final;
-
- toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
-
- if (toi_go_atomic(PMSG_FREEZE, 1))
- return;
-
- final = real_nr_free_pages(all_zones_mask);
- toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0);
-
- extra_pd1_pages_allowance = (orig_num_free > final) ?
- orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE :
- MIN_EXTRA_PAGES_ALLOWANCE;
-}
-
-/*
- * Amount of storage needed, possibly taking into account the
- * expected compression ratio and possibly also ignoring our
- * allowance for extra pages.
- */
-static unsigned long main_storage_needed(int use_ecr,
- int ignore_extra_pd1_allow)
-{
- return (pagedir1.size + pagedir2.size +
- (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
- (use_ecr ? toi_expected_compression_ratio() : 100) / 100;
-}
-
-/*
- * Storage needed for the image header, in bytes until the return.
- */
-unsigned long get_header_storage_needed(void)
-{
- unsigned long bytes = sizeof(struct toi_header) +
- toi_header_storage_for_modules() +
- toi_pageflags_space_needed() +
- fs_info_space_needed();
-
- return DIV_ROUND_UP(bytes, PAGE_SIZE);
-}
-
-/*
- * When freeing memory, pages from either pageset might be freed.
- *
- * When seeking to free memory to be able to hibernate, for every ps1 page
- * freed, we need 2 less pages for the atomic copy because there is one less
- * page to copy and one more page into which data can be copied.
- *
- * Freeing ps2 pages saves us nothing directly. No more memory is available
- * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but
- * that's too much work to figure out.
- *
- * => ps1_to_free functions
- *
- * Of course if we just want to reduce the image size, because of storage
- * limitations or an image size limit either ps will do.
- *
- * => any_to_free function
- */
-
-static unsigned long lowpages_usable_for_highmem_copy(void)
-{
- unsigned long needed = get_lowmem_size(pagedir1) +
- extra_pd1_pages_allowance + MIN_FREE_RAM +
- toi_memory_for_modules(0),
- available = get_lowmem_size(pagedir2) +
- real_nr_free_low_pages() + extra_pages_allocated;
-
- return available > needed ? available - needed : 0;
-}
-
-static unsigned long highpages_ps1_to_free(void)
-{
- unsigned long need = get_highmem_size(pagedir1),
- available = get_highmem_size(pagedir2) +
- real_nr_free_high_pages() +
- lowpages_usable_for_highmem_copy();
-
- return need > available ? DIV_ROUND_UP(need - available, 2) : 0;
-}
-
-static unsigned long lowpages_ps1_to_free(void)
-{
- unsigned long needed = get_lowmem_size(pagedir1) +
- extra_pd1_pages_allowance + MIN_FREE_RAM +
- toi_memory_for_modules(0),
- available = get_lowmem_size(pagedir2) +
- real_nr_free_low_pages() + extra_pages_allocated;
-
- return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0;
-}
-
-static unsigned long current_image_size(void)
-{
- return pagedir1.size + pagedir2.size + header_storage_needed;
-}
-
-static unsigned long storage_still_required(void)
-{
- unsigned long needed = main_storage_needed(1, 1);
- return needed > storage_limit ? needed - storage_limit : 0;
-}
-
-static unsigned long ram_still_required(void)
-{
- unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) +
- 2 * extra_pd1_pages_allowance,
- available = real_nr_free_low_pages() + extra_pages_allocated;
- return needed > available ? needed - available : 0;
-}
-
-unsigned long any_to_free(int use_image_size_limit)
-{
- int use_soft_limit = use_image_size_limit && image_size_limit > 0;
- unsigned long current_size = current_image_size(),
- soft_limit = use_soft_limit ? (image_size_limit << 8) : 0,
- to_free = use_soft_limit ? (current_size > soft_limit ?
- current_size - soft_limit : 0) : 0,
- storage_limit = storage_still_required(),
- ram_limit = ram_still_required(),
- first_max = max(to_free, storage_limit);
-
- return max(first_max, ram_limit);
-}
-
-static int need_pageset2(void)
-{
- return (real_nr_free_low_pages() + extra_pages_allocated -
- 2 * extra_pd1_pages_allowance - MIN_FREE_RAM -
- toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size;
-}
-
-/* amount_needed
- *
- * Calculates the amount by which the image size needs to be reduced to meet
- * our constraints.
- */
-static unsigned long amount_needed(int use_image_size_limit)
-{
- return max(highpages_ps1_to_free() + lowpages_ps1_to_free(),
- any_to_free(use_image_size_limit));
-}
-
-static int image_not_ready(int use_image_size_limit)
-{
- toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
- "Amount still needed (%lu) > 0:%u,"
- " Storage allocd: %lu < %lu: %u.\n",
- amount_needed(use_image_size_limit),
- (amount_needed(use_image_size_limit) > 0),
- main_storage_allocated,
- main_storage_needed(1, 1),
- main_storage_allocated < main_storage_needed(1, 1));
-
- toi_cond_pause(0, NULL);
-
- return (amount_needed(use_image_size_limit) > 0) ||
- main_storage_allocated < main_storage_needed(1, 1);
-}
-
-static void display_failure_reason(int tries_exceeded)
-{
- unsigned long storage_required = storage_still_required(),
- ram_required = ram_still_required(),
- high_ps1 = highpages_ps1_to_free(),
- low_ps1 = lowpages_ps1_to_free();
-
- printk(KERN_INFO "Failed to prepare the image because...\n");
-
- if (!storage_limit) {
- printk(KERN_INFO "- You need some storage available to be "
- "able to hibernate.\n");
- return;
- }
-
- if (tries_exceeded)
- printk(KERN_INFO "- The maximum number of iterations was "
- "reached without successfully preparing the "
- "image.\n");
-
- if (storage_required) {
- printk(KERN_INFO " - We need at least %lu pages of storage "
- "(ignoring the header), but only have %lu.\n",
- main_storage_needed(1, 1),
- main_storage_allocated);
- set_abort_result(TOI_INSUFFICIENT_STORAGE);
- }
-
- if (ram_required) {
- printk(KERN_INFO " - We need %lu more free pages of low "
- "memory.\n", ram_required);
- printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM);
- printk(KERN_INFO " + Reqd. by modules : %8lu\n",
- toi_memory_for_modules(0));
- printk(KERN_INFO " + 2 * extra allow : %8lu\n",
- 2 * extra_pd1_pages_allowance);
- printk(KERN_INFO " - Currently free : %8lu\n",
- real_nr_free_low_pages());
- printk(KERN_INFO " - Pages allocd : %8lu\n",
- extra_pages_allocated);
- printk(KERN_INFO " : ========\n");
- printk(KERN_INFO " Still needed : %8lu\n",
- ram_required);
-
- /* Print breakdown of memory needed for modules */
- toi_memory_for_modules(1);
- set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
- }
-
- if (high_ps1) {
- printk(KERN_INFO "- We need to free %lu highmem pageset 1 "
- "pages.\n", high_ps1);
- set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
- }
-
- if (low_ps1) {
- printk(KERN_INFO " - We need to free %ld lowmem pageset 1 "
- "pages.\n", low_ps1);
- set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
- }
-}
-
-static void display_stats(int always, int sub_extra_pd1_allow)
-{
- char buffer[255];
- snprintf(buffer, 254,
- "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). "
- "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). "
- "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n",
-
- /* Free */
- real_nr_free_pages(all_zones_mask),
- real_nr_free_low_pages(),
-
- /* Sets */
- pagedir1.size, pagedir1.size - get_highmem_size(pagedir1),
- pagedir2.size, pagedir2.size - get_highmem_size(pagedir2),
-
- /* Nosave */
- num_nosave, extra_pages_allocated,
- num_nosave - extra_pages_allocated,
-
- /* Storage */
- main_storage_allocated,
- storage_limit,
- main_storage_needed(1, sub_extra_pd1_allow),
- main_storage_needed(1, 1),
-
- /* Needed */
- lowpages_ps1_to_free(), highpages_ps1_to_free(),
- any_to_free(1),
- MIN_FREE_RAM, toi_memory_for_modules(0),
- extra_pd1_pages_allowance,
- image_size_limit,
-
- need_pageset2() ? "yes" : "no");
-
- if (always)
- printk("%s", buffer);
- else
- toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer);
-}
-
-/* flag_image_pages
- *
- * This routine generates our lists of pages to be stored in each
- * pageset. Since we store the data using extents, and adding new
- * extents might allocate a new extent page, this routine may well
- * be called more than once.
- */
-static void flag_image_pages(int atomic_copy)
-{
- int num_free = 0, num_unmodified = 0;
- unsigned long loop;
- struct zone *zone;
-
- pagedir1.size = 0;
- pagedir2.size = 0;
-
- set_highmem_size(pagedir1, 0);
- set_highmem_size(pagedir2, 0);
-
- num_nosave = 0;
- toi_trace_index++;
-
- memory_bm_clear(pageset1_map);
-
- toi_generate_free_page_map();
-
- /*
- * Pages not to be saved are marked Nosave irrespective of being
- * reserved.
- */
- for_each_populated_zone(zone) {
- int highmem = is_highmem(zone);
-
- for (loop = 0; loop < zone->spanned_pages; loop++) {
- unsigned long pfn = zone->zone_start_pfn + loop;
- struct page *page;
- int chunk_size;
-
- if (!pfn_valid(pfn)) {
- TOI_TRACE_DEBUG(pfn, "_Flag Invalid");
- continue;
- }
-
- chunk_size = toi_size_of_free_region(zone, pfn);
- if (chunk_size) {
- unsigned long y;
- for (y = pfn; y < pfn + chunk_size; y++) {
- page = pfn_to_page(y);
- TOI_TRACE_DEBUG(y, "_Flag Free");
- ClearPagePageset1(page);
- ClearPagePageset2(page);
- }
- num_free += chunk_size;
- loop += chunk_size - 1;
- continue;
- }
-
- page = pfn_to_page(pfn);
-
- if (PageNosave(page)) {
- char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave";
- TOI_TRACE_DEBUG(pfn, "_Flag %s", desc);
- num_nosave++;
- continue;
- }
-
- page = highmem ? saveable_highmem_page(zone, pfn) :
- saveable_page(zone, pfn);
-
- if (!page) {
- TOI_TRACE_DEBUG(pfn, "_Flag Nosave2");
- num_nosave++;
- continue;
- }
-
- if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
- TOI_TRACE_DEBUG(pfn, "_Unmodified");
- num_unmodified++;
- continue;
- }
-
- if (PagePageset2(page)) {
- pagedir2.size++;
- TOI_TRACE_DEBUG(pfn, "_Flag PS2");
- if (PageHighMem(page))
- inc_highmem_size(pagedir2);
- else
- SetPagePageset1Copy(page);
- if (PageResave(page)) {
- SetPagePageset1(page);
- ClearPagePageset1Copy(page);
- pagedir1.size++;
- if (PageHighMem(page))
- inc_highmem_size(pagedir1);
- }
- } else {
- pagedir1.size++;
- TOI_TRACE_DEBUG(pfn, "_Flag PS1");
- SetPagePageset1(page);
- if (PageHighMem(page))
- inc_highmem_size(pagedir1);
- }
- }
- }
-
- if (!atomic_copy)
- toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0,
- "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)"
- " + Unmodified (%d) + NumFree (%d) = %d.\n",
- pagedir1.size, pagedir2.size, num_nosave, num_unmodified,
- num_free, pagedir1.size + pagedir2.size + num_nosave + num_free);
-}
-
-void toi_recalculate_image_contents(int atomic_copy)
-{
- memory_bm_clear(pageset1_map);
- if (!atomic_copy) {
- unsigned long pfn;
- memory_bm_position_reset(pageset2_map);
- for (pfn = memory_bm_next_pfn(pageset2_map, 0);
- pfn != BM_END_OF_MAP;
- pfn = memory_bm_next_pfn(pageset2_map, 0))
- ClearPagePageset1Copy(pfn_to_page(pfn));
- /* Need to call this before getting pageset1_size! */
- toi_mark_pages_for_pageset2();
- }
- memory_bm_position_reset(pageset2_map);
- flag_image_pages(atomic_copy);
-
- if (!atomic_copy) {
- storage_limit = toiActiveAllocator->storage_available();
- display_stats(0, 0);
- }
-}
-
-int try_allocate_extra_memory(void)
-{
- unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance -
- get_lowmem_size(pagedir2);
- if (wanted > extra_pages_allocated) {
- unsigned long got = toi_allocate_extra_pagedir_memory(wanted);
- if (wanted < got) {
- toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
- "Want %d extra pages for pageset1, got %d.\n",
- wanted, got);
- return 1;
- }
- }
- return 0;
-}
-
-/* update_image
- *
- * Allocate [more] memory and storage for the image.
- */
-static void update_image(int ps2_recalc)
-{
- int old_header_req;
- unsigned long seek;
-
- if (try_allocate_extra_memory())
- return;
-
- if (ps2_recalc)
- goto recalc;
-
- thaw_kernel_threads();
-
- /*
- * Allocate remaining storage space, if possible, up to the
- * maximum we know we'll need. It's okay to allocate the
- * maximum if the writer is the swapwriter, but
- * we don't want to grab all available space on an NFS share.
- * We therefore ignore the expected compression ratio here,
- * thereby trying to allocate the maximum image size we could
- * need (assuming compression doesn't expand the image), but
- * don't complain if we can't get the full amount we're after.
- */
-
- do {
- int result;
-
- old_header_req = header_storage_needed;
- toiActiveAllocator->reserve_header_space(header_storage_needed);
-
- /* How much storage is free with the reservation applied? */
- storage_limit = toiActiveAllocator->storage_available();
- seek = min(storage_limit, main_storage_needed(0, 0));
-
- result = toiActiveAllocator->allocate_storage(seek);
- if (result)
- printk("Failed to allocate storage (%d).\n", result);
-
- main_storage_allocated =
- toiActiveAllocator->storage_allocated();
-
- /* Need more header because more storage allocated? */
- header_storage_needed = get_header_storage_needed();
-
- } while (header_storage_needed > old_header_req);
-
- if (freeze_kernel_threads())
- set_abort_result(TOI_FREEZING_FAILED);
-
-recalc:
- toi_recalculate_image_contents(0);
-}
-
-/* attempt_to_freeze
- *
- * Try to freeze processes.
- */
-
-static int attempt_to_freeze(void)
-{
- int result;
-
- /* Stop processes before checking again */
- toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
- "filesystems.");
- result = freeze_processes();
-
- if (result)
- set_abort_result(TOI_FREEZING_FAILED);
-
- result = freeze_kernel_threads();
-
- if (result)
- set_abort_result(TOI_FREEZING_FAILED);
-
- return result;
-}
-
-/* eat_memory
- *
- * Try to free some memory, either to meet hard or soft constraints on the image
- * characteristics.
- *
- * Hard constraints:
- * - Pageset1 must be < half of memory;
- * - We must have enough memory free at resume time to have pageset1
- * be able to be loaded in pages that don't conflict with where it has to
- * be restored.
- * Soft constraints
- * - User specificied image size limit.
- */
-static void eat_memory(void)
-{
- unsigned long amount_wanted = 0;
- int did_eat_memory = 0;
-
- /*
- * Note that if we have enough storage space and enough free memory, we
- * may exit without eating anything. We give up when the last 10
- * iterations ate no extra pages because we're not going to get much
- * more anyway, but the few pages we get will take a lot of time.
- *
- * We freeze processes before beginning, and then unfreeze them if we
- * need to eat memory until we think we have enough. If our attempts
- * to freeze fail, we give up and abort.
- */
-
- amount_wanted = amount_needed(1);
-
- switch (image_size_limit) {
- case -1: /* Don't eat any memory */
- if (amount_wanted > 0) {
- set_abort_result(TOI_WOULD_EAT_MEMORY);
- return;
- }
- break;
- case -2: /* Free caches only */
- drop_pagecache();
- toi_recalculate_image_contents(0);
- amount_wanted = amount_needed(1);
- break;
- default:
- break;
- }
-
- if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
- image_size_limit != -1) {
- unsigned long request = amount_wanted;
- unsigned long high_req = max(highpages_ps1_to_free(),
- any_to_free(1));
- unsigned long low_req = lowpages_ps1_to_free();
- unsigned long got = 0;
-
- toi_prepare_status(CLEAR_BAR,
- "Seeking to free %ldMB of memory.",
- MB(amount_wanted));
-
- thaw_kernel_threads();
-
- /*
- * Ask for too many because shrink_memory_mask doesn't
- * currently return enough most of the time.
- */
-
- if (low_req)
- got = shrink_memory_mask(low_req, GFP_KERNEL);
- if (high_req)
- shrink_memory_mask(high_req - got, GFP_HIGHUSER);
-
- did_eat_memory = 1;
-
- toi_recalculate_image_contents(0);
-
- amount_wanted = amount_needed(1);
-
- printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &"
- " %ld pages from anywhere, got %ld.\n",
- high_req, low_req,
- request - amount_wanted);
-
- toi_cond_pause(0, NULL);
-
- if (freeze_kernel_threads())
- set_abort_result(TOI_FREEZING_FAILED);
- }
-
- if (did_eat_memory)
- toi_recalculate_image_contents(0);
-}
-
-/* toi_prepare_image
- *
- * Entry point to the whole image preparation section.
- *
- * We do four things:
- * - Freeze processes;
- * - Ensure image size constraints are met;
- * - Complete all the preparation for saving the image,
- * including allocation of storage. The only memory
- * that should be needed when we're finished is that
- * for actually storing the image (and we know how
- * much is needed for that because the modules tell
- * us).
- * - Make sure that all dirty buffers are written out.
- */
-#define MAX_TRIES 2
-int toi_prepare_image(void)
-{
- int result = 1, tries = 1;
-
- main_storage_allocated = 0;
- no_ps2_needed = 0;
-
- if (attempt_to_freeze())
- return 1;
-
- lock_device_hotplug();
- set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
-
- if (!extra_pd1_pages_allowance)
- get_extra_pd1_allowance();
-
- storage_limit = toiActiveAllocator->storage_available();
-
- if (!storage_limit) {
- printk(KERN_INFO "No storage available. Didn't try to prepare "
- "an image.\n");
- display_failure_reason(0);
- set_abort_result(TOI_NOSTORAGE_AVAILABLE);
- return 1;
- }
-
- if (build_attention_list()) {
- abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
- "Unable to successfully prepare the image.\n");
- return 1;
- }
-
- toi_recalculate_image_contents(0);
-
- do {
- toi_prepare_status(CLEAR_BAR,
- "Preparing Image. Try %d.", tries);
-
- eat_memory();
-
- if (test_result_state(TOI_ABORTED))
- break;
-
- update_image(0);
-
- tries++;
-
- } while (image_not_ready(1) && tries <= MAX_TRIES &&
- !test_result_state(TOI_ABORTED));
-
- result = image_not_ready(0);
-
- /* TODO: Handle case where need to remove existing image and resave
- * instead of adding to incremental image. */
-
- if (!test_result_state(TOI_ABORTED)) {
- if (result) {
- display_stats(1, 0);
- display_failure_reason(tries > MAX_TRIES);
- abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
- "Unable to successfully prepare the image.\n");
- } else {
- /* Pageset 2 needed? */
- if (!need_pageset2() &&
- test_action_state(TOI_NO_PS2_IF_UNNEEDED)) {
- no_ps2_needed = 1;
- toi_recalculate_image_contents(0);
- update_image(1);
- }
-
- toi_cond_pause(1, "Image preparation complete.");
- }
- }
-
- return result ? result : allocate_checksum_pages();
-}
diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h
deleted file mode 100644
index af6769ee2..000000000
--- a/kernel/power/tuxonice_prepare_image.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * kernel/power/tuxonice_prepare_image.h
- *
- * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- */
-
-#include <asm/sections.h>
-
-extern int toi_prepare_image(void);
-extern void toi_recalculate_image_contents(int storage_available);
-extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask);
-extern long image_size_limit;
-extern void toi_free_extra_pagedir_memory(void);
-extern unsigned long extra_pd1_pages_allowance;
-extern void free_attention_list(void);
-
-#define MIN_FREE_RAM 100
-#define MIN_EXTRA_PAGES_ALLOWANCE 500
-
-#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1))
-#ifdef CONFIG_HIGHMEM
-#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM))
-#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \
- (1 << ZONE_HIGHMEM)))
-#else
-#define real_nr_free_high_pages() (0)
-#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask))
-
-/* For eat_memory function */
-#define ZONE_HIGHMEM (MAX_NR_ZONES + 1)
-#endif
-
-unsigned long get_header_storage_needed(void);
-unsigned long any_to_free(int use_image_size_limit);
-int try_allocate_extra_memory(void);
diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c
deleted file mode 100644
index 710e48dee..000000000
--- a/kernel/power/tuxonice_prune.c
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * kernel/power/tuxonice_prune.c
- *
- * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file implements a TuxOnIce module that seeks to prune the
- * amount of data written to disk. It builds a table of hashes
- * of the uncompressed data, and writes the pfn of the previous page
- * with the same contents instead of repeating the data when a match
- * is found.
- */
-
-#include <linux/suspend.h>
-#include <linux/highmem.h>
-#include <linux/vmalloc.h>
-#include <linux/crypto.h>
-#include <linux/scatterlist.h>
-#include <crypto/hash.h>
-
-#include "tuxonice_builtin.h"
-#include "tuxonice.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_alloc.h"
-
-/*
- * We never write a page bigger than PAGE_SIZE, so use a large number
- * to indicate that data is a PFN.
- */
-#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100)
-
-static unsigned long toi_pruned_pages;
-
-static struct toi_module_ops toi_prune_ops;
-static struct toi_module_ops *next_driver;
-
-static char toi_prune_hash_algo_name[32] = "sha1";
-
-static DEFINE_MUTEX(stats_lock);
-
-struct cpu_context {
- struct shash_desc desc;
- char *digest;
-};
-
-#define OUT_BUF_SIZE (2 * PAGE_SIZE)
-
-static DEFINE_PER_CPU(struct cpu_context, contexts);
-
-/*
- * toi_crypto_prepare
- *
- * Prepare to do some work by allocating buffers and transforms.
- */
-static int toi_prune_crypto_prepare(void)
-{
- int cpu, ret, digestsize;
-
- if (!*toi_prune_hash_algo_name) {
- printk(KERN_INFO "TuxOnIce: Pruning enabled but no "
- "hash algorithm set.\n");
- return 1;
- }
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0);
- if (IS_ERR(this->desc.tfm)) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- this->desc.tfm = NULL;
- return 1;
- }
-
- if (!digestsize)
- digestsize = crypto_shash_digestsize(this->desc.tfm);
-
- this->digest = kmalloc(digestsize, GFP_KERNEL);
- if (!this->digest) {
- printk(KERN_INFO "TuxOnIce: Failed to allocate space "
- "for digest output.\n");
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- }
-
- this->desc.flags = 0;
-
- ret = crypto_shash_init(&this->desc);
- if (ret < 0) {
- printk(KERN_INFO "TuxOnIce: Failed to initialise the "
- "%s prune hash algorithm.\n",
- toi_prune_hash_algo_name);
- kfree(this->digest);
- this->digest = NULL;
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- return 1;
- }
- }
-
- return 0;
-}
-
-static int toi_prune_rw_cleanup(int writing)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct cpu_context *this = &per_cpu(contexts, cpu);
- if (this->desc.tfm) {
- crypto_free_shash(this->desc.tfm);
- this->desc.tfm = NULL;
- }
-
- if (this->digest) {
- kfree(this->digest);
- this->digest = NULL;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_init
- */
-
-static int toi_prune_init(int toi_or_resume)
-{
- if (!toi_or_resume)
- return 0;
-
- toi_pruned_pages = 0;
-
- next_driver = toi_get_next_filter(&toi_prune_ops);
-
- return next_driver ? 0 : -ECHILD;
-}
-
-/*
- * toi_prune_rw_init()
- */
-
-static int toi_prune_rw_init(int rw, int stream_number)
-{
- if (toi_prune_crypto_prepare()) {
- printk(KERN_ERR "Failed to initialise prune "
- "algorithm.\n");
- if (rw == READ) {
- printk(KERN_INFO "Unable to read the image.\n");
- return -ENODEV;
- } else {
- printk(KERN_INFO "Continuing without "
- "pruning the image.\n");
- toi_prune_ops.enabled = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * toi_prune_write_page()
- *
- * Compress a page of data, buffering output and passing on filled
- * pages to the next module in the pipeline.
- *
- * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
- * data to be checked.
- *
- * Returns: 0 on success. Otherwise the error is that returned by later
- * modules, -ECHILD if we have a broken pipeline or -EIO if
- * zlib errs.
- */
-static int toi_prune_write_page(unsigned long index, int buf_type,
- void *buffer_page, unsigned int buf_size)
-{
- int ret = 0, cpu = smp_processor_id(), write_data = 1;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
- u8* output_buffer = buffer_page;
- int output_len = buf_size;
- int out_buf_type = buf_type;
- void *buffer_start;
- u32 buf[4];
-
- if (ctx->desc.tfm) {
-
- buffer_start = TOI_MAP(buf_type, buffer_page);
- ctx->len = OUT_BUF_SIZE;
-
- ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest);
- if (ret) {
- printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret);
- } else {
- mutex_lock(&stats_lock);
-
- toi_pruned_pages++;
-
- mutex_unlock(&stats_lock);
-
- }
-
- TOI_UNMAP(buf_type, buffer_page);
- }
-
- if (write_data)
- ret = next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
- else
- ret = next_driver->write_page(index, out_buf_type,
- output_buffer, output_len);
-
- return ret;
-}
-
-/*
- * toi_prune_read_page()
- * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
- *
- * Retrieve data from later modules or from a previously loaded page and
- * fill the input buffer.
- * Zero if successful. Error condition from me or from downstream on failure.
- */
-static int toi_prune_read_page(unsigned long *index, int buf_type,
- void *buffer_page, unsigned int *buf_size)
-{
- int ret, cpu = smp_processor_id();
- unsigned int len;
- char *buffer_start;
- struct cpu_context *ctx = &per_cpu(contexts, cpu);
-
- if (!ctx->desc.tfm)
- return next_driver->read_page(index, TOI_PAGE, buffer_page,
- buf_size);
-
- /*
- * All our reads must be synchronous - we can't handle
- * data that hasn't been read yet.
- */
-
- ret = next_driver->read_page(index, buf_type, buffer_page, &len);
-
- if (len == PRUNE_DATA_IS_PFN) {
- buffer_start = kmap(buffer_page);
- }
-
- return ret;
-}
-
-/*
- * toi_prune_print_debug_stats
- * @buffer: Pointer to a buffer into which the debug info will be printed.
- * @size: Size of the buffer.
- *
- * Print information to be recorded for debugging purposes into a buffer.
- * Returns: Number of characters written to the buffer.
- */
-
-static int toi_prune_print_debug_stats(char *buffer, int size)
-{
- int len;
-
- /* Output the number of pages pruned. */
- if (*toi_prune_hash_algo_name)
- len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
- toi_prune_hash_algo_name);
- else
- len = scnprintf(buffer, size, "- Compressor is not set.\n");
-
- if (toi_pruned_pages)
- len += scnprintf(buffer+len, size - len, " Pruned "
- "%lu pages).\n",
- toi_pruned_pages);
- return len;
-}
-
-/*
- * toi_prune_memory_needed
- *
- * Tell the caller how much memory we need to operate during hibernate/resume.
- * Returns: Unsigned long. Maximum number of bytes of memory required for
- * operation.
- */
-static int toi_prune_memory_needed(void)
-{
- return 2 * PAGE_SIZE;
-}
-
-static int toi_prune_storage_needed(void)
-{
- return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
- strlen(toi_prune_hash_algo_name) + 1;
-}
-
-/*
- * toi_prune_save_config_info
- * @buffer: Pointer to a buffer of size PAGE_SIZE.
- *
- * Save informaton needed when reloading the image at resume time.
- * Returns: Number of bytes used for saving our data.
- */
-static int toi_prune_save_config_info(char *buffer)
-{
- int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0;
-
- *((unsigned long *) buffer) = toi_pruned_pages;
- offset += sizeof(unsigned long);
- *((int *) (buffer + offset)) = len;
- offset += sizeof(int);
- strncpy(buffer + offset, toi_prune_hash_algo_name, len);
- return offset + len;
-}
-
-/* toi_prune_load_config_info
- * @buffer: Pointer to the start of the data.
- * @size: Number of bytes that were saved.
- *
- * Description: Reload information needed for passing back to the
- * resumed kernel.
- */
-static void toi_prune_load_config_info(char *buffer, int size)
-{
- int len, offset = 0;
-
- toi_pruned_pages = *((unsigned long *) buffer);
- offset += sizeof(unsigned long);
- len = *((int *) (buffer + offset));
- offset += sizeof(int);
- strncpy(toi_prune_hash_algo_name, buffer + offset, len);
-}
-
-static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- bkd->pruned_pages = toi_pruned_pages;
-}
-
-static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- toi_pruned_pages = bkd->pruned_pages;
-}
-
-/*
- * toi_expected_ratio
- *
- * Description: Returns the expected ratio between data passed into this module
- * and the amount of data output when writing.
- * Returns: 100 - we have no idea how many pages will be pruned.
- */
-
-static int toi_prune_expected_ratio(void)
-{
- return 100;
-}
-
-/*
- * data for our sysfs entries.
- */
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0,
- NULL),
- SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL),
-};
-
-/*
- * Ops structure.
- */
-static struct toi_module_ops toi_prune_ops = {
- .type = FILTER_MODULE,
- .name = "prune",
- .directory = "prune",
- .module = THIS_MODULE,
- .initialise = toi_prune_init,
- .memory_needed = toi_prune_memory_needed,
- .print_debug_info = toi_prune_print_debug_stats,
- .save_config_info = toi_prune_save_config_info,
- .load_config_info = toi_prune_load_config_info,
- .storage_needed = toi_prune_storage_needed,
- .expected_compression = toi_prune_expected_ratio,
-
- .pre_atomic_restore = toi_prune_pre_atomic_restore,
- .post_atomic_restore = toi_prune_post_atomic_restore,
-
- .rw_init = toi_prune_rw_init,
- .rw_cleanup = toi_prune_rw_cleanup,
-
- .write_page = toi_prune_write_page,
- .read_page = toi_prune_read_page,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-
-static __init int toi_prune_load(void)
-{
- return toi_register_module(&toi_prune_ops);
-}
-
-late_initcall(toi_prune_load);
diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
deleted file mode 100644
index e99f6e24f..000000000
--- a/kernel/power/tuxonice_storage.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * kernel/power/tuxonice_storage.c
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for talking to a userspace program that manages storage.
- *
- * The kernel side:
- * - starts the userspace program;
- * - sends messages telling it when to open and close the connection;
- * - tells it when to quit;
- *
- * The user space side:
- * - passes messages regarding status;
- *
- */
-
-#include <linux/suspend.h>
-#include <linux/freezer.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_ui.h"
-
-static struct user_helper_data usm_helper_data;
-static struct toi_module_ops usm_ops;
-static int message_received, usm_prepare_count;
-static int storage_manager_last_action, storage_manager_action;
-
-static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- int type;
- int *data;
-
- type = nlh->nlmsg_type;
-
- /* A control message: ignore them */
- if (type < NETLINK_MSG_BASE)
- return 0;
-
- /* Unknown message: reply with EINVAL */
- if (type >= USM_MSG_MAX)
- return -EINVAL;
-
- /* All operations require privileges, even GET */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- /* Only allow one task to receive NOFREEZE privileges */
- if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
- return -EBUSY;
-
- data = (int *) NLMSG_DATA(nlh);
-
- switch (type) {
- case USM_MSG_SUCCESS:
- case USM_MSG_FAILED:
- message_received = type;
- complete(&usm_helper_data.wait_for_process);
- break;
- default:
- printk(KERN_INFO "Storage manager doesn't recognise "
- "message %d.\n", type);
- }
-
- return 1;
-}
-
-#ifdef CONFIG_NET
-static int activations;
-
-int toi_activate_storage(int force)
-{
- int tries = 1;
-
- if (usm_helper_data.pid == -1 || !usm_ops.enabled)
- return 0;
-
- message_received = 0;
- activations++;
-
- if (activations > 1 && !force)
- return 0;
-
- while ((!message_received || message_received == USM_MSG_FAILED) &&
- tries < 2) {
- toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
- "%d.\n", tries);
-
- init_completion(&usm_helper_data.wait_for_process);
-
- toi_send_netlink_message(&usm_helper_data,
- USM_MSG_CONNECT,
- NULL, 0);
-
- /* Wait 2 seconds for the userspace process to make contact */
- wait_for_completion_timeout(&usm_helper_data.wait_for_process,
- 2*HZ);
-
- tries++;
- }
-
- return 0;
-}
-
-int toi_deactivate_storage(int force)
-{
- if (usm_helper_data.pid == -1 || !usm_ops.enabled)
- return 0;
-
- message_received = 0;
- activations--;
-
- if (activations && !force)
- return 0;
-
- init_completion(&usm_helper_data.wait_for_process);
-
- toi_send_netlink_message(&usm_helper_data,
- USM_MSG_DISCONNECT,
- NULL, 0);
-
- wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
-
- if (!message_received || message_received == USM_MSG_FAILED) {
- printk(KERN_INFO "Returning failure disconnecting storage.\n");
- return 1;
- }
-
- return 0;
-}
-#endif
-
-static void storage_manager_simulate(void)
-{
- printk(KERN_INFO "--- Storage manager simulate ---\n");
- toi_prepare_usm();
- schedule();
- printk(KERN_INFO "--- Activate storage 1 ---\n");
- toi_activate_storage(1);
- schedule();
- printk(KERN_INFO "--- Deactivate storage 1 ---\n");
- toi_deactivate_storage(1);
- schedule();
- printk(KERN_INFO "--- Cleanup usm ---\n");
- toi_cleanup_usm();
- schedule();
- printk(KERN_INFO "--- Storage manager simulate ends ---\n");
-}
-
-static int usm_storage_needed(void)
-{
- return sizeof(int) + strlen(usm_helper_data.program) + 1;
-}
-
-static int usm_save_config_info(char *buf)
-{
- int len = strlen(usm_helper_data.program);
- memcpy(buf, usm_helper_data.program, len + 1);
- return sizeof(int) + len + 1;
-}
-
-static void usm_load_config_info(char *buf, int size)
-{
- /* Don't load the saved path if one has already been set */
- if (usm_helper_data.program[0])
- return;
-
- memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf));
-}
-
-static int usm_memory_needed(void)
-{
- /* ball park figure of 32 pages */
- return 32 * PAGE_SIZE;
-}
-
-/* toi_prepare_usm
- */
-int toi_prepare_usm(void)
-{
- usm_prepare_count++;
-
- if (usm_prepare_count > 1 || !usm_ops.enabled)
- return 0;
-
- usm_helper_data.pid = -1;
-
- if (!*usm_helper_data.program)
- return 0;
-
- toi_netlink_setup(&usm_helper_data);
-
- if (usm_helper_data.pid == -1)
- printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
- " start it.\n");
-
- toi_activate_storage(0);
-
- return usm_helper_data.pid != -1;
-}
-
-void toi_cleanup_usm(void)
-{
- usm_prepare_count--;
-
- if (usm_helper_data.pid > -1 && !usm_prepare_count) {
- toi_deactivate_storage(0);
- toi_netlink_close(&usm_helper_data);
- }
-}
-
-static void storage_manager_activate(void)
-{
- if (storage_manager_action == storage_manager_last_action)
- return;
-
- if (storage_manager_action)
- toi_prepare_usm();
- else
- toi_cleanup_usm();
-
- storage_manager_last_action = storage_manager_action;
-}
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate),
- SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL),
- SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0,
- NULL),
- SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1,
- 0, storage_manager_activate)
-};
-
-static struct toi_module_ops usm_ops = {
- .type = MISC_MODULE,
- .name = "usm",
- .directory = "storage_manager",
- .module = THIS_MODULE,
- .storage_needed = usm_storage_needed,
- .save_config_info = usm_save_config_info,
- .load_config_info = usm_load_config_info,
- .memory_needed = usm_memory_needed,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* toi_usm_sysfs_init
- * Description: Boot time initialisation for user interface.
- */
-int toi_usm_init(void)
-{
- usm_helper_data.nl = NULL;
- usm_helper_data.program[0] = '\0';
- usm_helper_data.pid = -1;
- usm_helper_data.skb_size = 0;
- usm_helper_data.pool_limit = 6;
- usm_helper_data.netlink_id = NETLINK_TOI_USM;
- usm_helper_data.name = "userspace storage manager";
- usm_helper_data.rcv_msg = usm_user_rcv_msg;
- usm_helper_data.interface_version = 2;
- usm_helper_data.must_init = 0;
- init_completion(&usm_helper_data.wait_for_process);
-
- return toi_register_module(&usm_ops);
-}
-
-void toi_usm_exit(void)
-{
- toi_netlink_close_complete(&usm_helper_data);
- toi_unregister_module(&usm_ops);
-}
diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
deleted file mode 100644
index 1ed9ab156..000000000
--- a/kernel/power/tuxonice_storage.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * kernel/power/tuxonice_storage.h
- *
- * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#ifdef CONFIG_NET
-int toi_prepare_usm(void);
-void toi_cleanup_usm(void);
-
-int toi_activate_storage(int force);
-int toi_deactivate_storage(int force);
-extern int toi_usm_init(void);
-extern void toi_usm_exit(void);
-#else
-static inline int toi_usm_init(void) { return 0; }
-static inline void toi_usm_exit(void) { }
-
-static inline int toi_activate_storage(int force)
-{
- return 0;
-}
-
-static inline int toi_deactivate_storage(int force)
-{
- return 0;
-}
-
-static inline int toi_prepare_usm(void) { return 0; }
-static inline void toi_cleanup_usm(void) { }
-#endif
-
-enum {
- USM_MSG_BASE = 0x10,
-
- /* Kernel -> Userspace */
- USM_MSG_CONNECT = 0x30,
- USM_MSG_DISCONNECT = 0x31,
- USM_MSG_SUCCESS = 0x40,
- USM_MSG_FAILED = 0x41,
-
- USM_MSG_MAX,
-};
diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
deleted file mode 100644
index ce3215033..000000000
--- a/kernel/power/tuxonice_swap.c
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- * kernel/power/tuxonice_swap.c
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * Distributed under GPLv2.
- *
- * This file encapsulates functions for usage of swap space as a
- * backing store.
- */
-
-#include <linux/suspend.h>
-#include <linux/blkdev.h>
-#include <linux/swapops.h>
-#include <linux/swap.h>
-#include <linux/syscalls.h>
-#include <linux/fs_uuid.h>
-
-#include "tuxonice.h"
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice_io.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_extent.h"
-#include "tuxonice_bio.h"
-#include "tuxonice_alloc.h"
-#include "tuxonice_builtin.h"
-
-static struct toi_module_ops toi_swapops;
-
-/* For swapfile automatically swapon/off'd. */
-static char swapfilename[255] = "";
-static int toi_swapon_status;
-
-/* Swap Pages */
-static unsigned long swap_allocated;
-
-static struct sysinfo swapinfo;
-
-static int is_ram_backed(struct swap_info_struct *si)
-{
- if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) ||
- !strncmp(si->bdev->bd_disk->disk_name, "zram", 4))
- return 1;
-
- return 0;
-}
-
-/**
- * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
- *
- * Activate the given swapfile if it wasn't already enabled. Remember whether
- * we really did swapon it for swapoffing later.
- */
-static void enable_swapfile(void)
-{
- int activateswapresult = -EINVAL;
-
- if (swapfilename[0]) {
- /* Attempt to swap on with maximum priority */
- activateswapresult = sys_swapon(swapfilename, 0xFFFF);
- if (activateswapresult && activateswapresult != -EBUSY)
- printk(KERN_ERR "TuxOnIce: The swapfile/partition "
- "specified by /sys/power/tuxonice/swap/swapfile"
- " (%s) could not be turned on (error %d). "
- "Attempting to continue.\n",
- swapfilename, activateswapresult);
- if (!activateswapresult)
- toi_swapon_status = 1;
- }
-}
-
-/**
- * disable_swapfile: Swapoff any file swaponed at the start of the cycle.
- *
- * If we did successfully swapon a file at the start of the cycle, swapoff
- * it now (finishing up).
- */
-static void disable_swapfile(void)
-{
- if (!toi_swapon_status)
- return;
-
- sys_swapoff(swapfilename);
- toi_swapon_status = 0;
-}
-
-static int add_blocks_to_extent_chain(struct toi_bdev_info *chain,
- unsigned long start, unsigned long end)
-{
- if (test_action_state(TOI_TEST_BIO))
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to "
- "chain %p.", start << chain->bmap_shift,
- end << chain->bmap_shift, chain);
-
- return toi_add_to_extent_chain(&chain->blocks, start, end);
-}
-
-
-static int get_main_pool_phys_params(struct toi_bdev_info *chain)
-{
- struct hibernate_extent *extentpointer = NULL;
- unsigned long address, extent_min = 0, extent_max = 0;
- int empty = 1;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for "
- "chain %d.", chain->allocator_index);
-
- if (!chain->allocations.first)
- return 0;
-
- if (chain->blocks.first)
- toi_put_extent_chain(&chain->blocks);
-
- toi_extent_for_each(&chain->allocations, extentpointer, address) {
- swp_entry_t swap_address = (swp_entry_t) { address };
- struct block_device *bdev;
- sector_t new_sector = map_swap_entry(swap_address, &bdev);
-
- if (empty) {
- empty = 0;
- extent_min = extent_max = new_sector;
- continue;
- }
-
- if (new_sector == extent_max + 1) {
- extent_max++;
- continue;
- }
-
- if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
- printk(KERN_ERR "Out of memory while making block "
- "chains.\n");
- return -ENOMEM;
- }
-
- extent_min = new_sector;
- extent_max = new_sector;
- }
-
- if (!empty &&
- add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
- printk(KERN_ERR "Out of memory while making block chains.\n");
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*
- * Like si_swapinfo, except that we don't include ram backed swap (compcache!)
- * and don't need to use the spinlocks (userspace is stopped when this
- * function is called).
- */
-void si_swapinfo_no_compcache(void)
-{
- unsigned int i;
-
- si_swapinfo(&swapinfo);
- swapinfo.freeswap = 0;
- swapinfo.totalswap = 0;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- struct swap_info_struct *si = get_swap_info_struct(i);
- if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) {
- swapinfo.totalswap += si->inuse_pages;
- swapinfo.freeswap += si->pages - si->inuse_pages;
- }
- }
-}
-/*
- * We can't just remember the value from allocation time, because other
- * processes might have allocated swap in the mean time.
- */
-static unsigned long toi_swap_storage_available(void)
-{
- toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available.");
- si_swapinfo_no_compcache();
- return swapinfo.freeswap + swap_allocated;
-}
-
-static int toi_swap_initialise(int starting_cycle)
-{
- if (!starting_cycle)
- return 0;
-
- enable_swapfile();
- return 0;
-}
-
-static void toi_swap_cleanup(int ending_cycle)
-{
- if (!ending_cycle)
- return;
-
- disable_swapfile();
-}
-
-static void toi_swap_free_storage(struct toi_bdev_info *chain)
-{
- /* Free swap entries */
- struct hibernate_extent *extentpointer;
- unsigned long extentvalue;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.",
- chain);
-
- swap_allocated -= chain->allocations.size;
- toi_extent_for_each(&chain->allocations, extentpointer, extentvalue)
- swap_free((swp_entry_t) { extentvalue });
-
- toi_put_extent_chain(&chain->allocations);
-}
-
-static void free_swap_range(unsigned long min, unsigned long max)
-{
- int j;
-
- for (j = min; j <= max; j++)
- swap_free((swp_entry_t) { j });
- swap_allocated -= (max - min + 1);
-}
-
-/*
- * Allocation of a single swap type. Swap priorities are handled at the higher
- * level.
- */
-static int toi_swap_allocate_storage(struct toi_bdev_info *chain,
- unsigned long request)
-{
- unsigned long gotten = 0;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to"
- " allocate %lu pages from device %d.", request,
- chain->allocator_index);
-
- while (gotten < request) {
- swp_entry_t start, end;
- if (0) {
- /* Broken at the moment for SSDs */
- get_swap_range_of_type(chain->allocator_index, &start, &end,
- request - gotten + 1);
- } else {
- start = end = get_swap_page_of_type(chain->allocator_index);
- }
- if (start.val) {
- int added = end.val - start.val + 1;
- if (toi_add_to_extent_chain(&chain->allocations,
- start.val, end.val)) {
- printk(KERN_INFO "Failed to allocate extent for "
- "%lu-%lu.\n", start.val, end.val);
- free_swap_range(start.val, end.val);
- break;
- }
- gotten += added;
- swap_allocated += added;
- } else
- break;
- }
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten);
- return gotten;
-}
-
-static int toi_swap_register_storage(void)
-{
- int i, result = 0;
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage.");
- for (i = 0; i < MAX_SWAPFILES; i++) {
- struct swap_info_struct *si = get_swap_info_struct(i);
- struct toi_bdev_info *devinfo;
- unsigned char *p;
- unsigned char buf[256];
- struct fs_info *fs_info;
-
- if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si))
- continue;
-
- devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info),
- GFP_ATOMIC);
- if (!devinfo) {
- printk("Failed to allocate devinfo struct for swap "
- "device %d.\n", i);
- return -ENOMEM;
- }
-
- devinfo->bdev = si->bdev;
- devinfo->allocator = &toi_swapops;
- devinfo->allocator_index = i;
-
- fs_info = fs_info_from_block_dev(si->bdev);
- if (fs_info && !IS_ERR(fs_info)) {
- memcpy(devinfo->uuid, &fs_info->uuid, 16);
- free_fs_info(fs_info);
- } else
- result = (int) PTR_ERR(fs_info);
-
- if (!fs_info)
- printk("fs_info from block dev returned %d.\n", result);
- devinfo->dev_t = si->bdev->bd_dev;
- devinfo->prio = si->prio;
- devinfo->bmap_shift = 3;
- devinfo->blocks_per_page = 1;
-
- p = d_path(&si->swap_file->f_path, buf, sizeof(buf));
- sprintf(devinfo->name, "swap on %s", p);
-
- toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:"
- " Device %d (%lx), prio %d.", i,
- (unsigned long) devinfo->dev_t, devinfo->prio);
- toi_bio_ops.register_storage(devinfo);
- }
-
- return 0;
-}
-
-static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used)
-{
- struct hibernate_extent *extentpointer = NULL;
- unsigned long extentvalue;
- unsigned long i = 0, first_freed = 0;
-
- toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) {
- i++;
- if (i > used) {
- swap_free((swp_entry_t) { extentvalue });
- if (!first_freed)
- first_freed = extentvalue;
- }
- }
-
- return first_freed;
-}
-
-/*
- * workspace_size
- *
- * Description:
- * Returns the number of bytes of RAM needed for this
- * code to do its work. (Used when calculating whether
- * we have enough memory to be able to hibernate & resume).
- *
- */
-static int toi_swap_memory_needed(void)
-{
- return 1;
-}
-
-/*
- * Print debug info
- *
- * Description:
- */
-static int toi_swap_print_debug_stats(char *buffer, int size)
-{
- int len = 0;
-
- len = scnprintf(buffer, size, "- Swap Allocator enabled.\n");
- if (swapfilename[0])
- len += scnprintf(buffer+len, size-len,
- " Attempting to automatically swapon: %s.\n",
- swapfilename);
-
- si_swapinfo_no_compcache();
-
- len += scnprintf(buffer+len, size-len,
- " Swap available for image: %lu pages.\n",
- swapinfo.freeswap + swap_allocated);
-
- return len;
-}
-
-static int header_locations_read_sysfs(const char *page, int count)
-{
- int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
- struct inode *swapf = NULL;
- int zone;
- char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
- char *path, *output = (char *) page;
- int path_len;
-
- if (!page)
- return 0;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- struct swap_info_struct *si = get_swap_info_struct(i);
-
- if (!si || !(si->flags & SWP_WRITEOK))
- continue;
-
- if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
- haveswap = 1;
- if (!printedpartitionsmessage) {
- len += sprintf(output + len,
- "For swap partitions, simply use the "
- "format: resume=swap:/dev/hda1.\n");
- printedpartitionsmessage = 1;
- }
- } else {
- path_len = 0;
-
- path = d_path(&si->swap_file->f_path, path_page,
- PAGE_SIZE);
- path_len = snprintf(path_page, PAGE_SIZE, "%s", path);
-
- haveswap = 1;
- swapf = si->swap_file->f_mapping->host;
- zone = bmap(swapf, 0);
- if (!zone) {
- len += sprintf(output + len,
- "Swapfile %s has been corrupted. Reuse"
- " mkswap on it and try again.\n",
- path_page);
- } else {
- char name_buffer[BDEVNAME_SIZE];
- len += sprintf(output + len,
- "For swapfile `%s`,"
- " use resume=swap:/dev/%s:0x%x.\n",
- path_page,
- bdevname(si->bdev, name_buffer),
- zone << (swapf->i_blkbits - 9));
- }
- }
- }
-
- if (!haveswap)
- len = sprintf(output, "You need to turn on swap partitions "
- "before examining this file.\n");
-
- toi_free_page(10, (unsigned long) path_page);
- return len;
-}
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL),
- SYSFS_CUSTOM("headerlocations", SYSFS_READONLY,
- header_locations_read_sysfs, NULL, 0, NULL),
- SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0,
- attempt_to_parse_resume_device2),
-};
-
-static struct toi_bio_allocator_ops toi_bio_swapops = {
- .register_storage = toi_swap_register_storage,
- .storage_available = toi_swap_storage_available,
- .allocate_storage = toi_swap_allocate_storage,
- .bmap = get_main_pool_phys_params,
- .free_storage = toi_swap_free_storage,
- .free_unused_storage = toi_swap_free_unused_storage,
-};
-
-static struct toi_module_ops toi_swapops = {
- .type = BIO_ALLOCATOR_MODULE,
- .name = "swap storage",
- .directory = "swap",
- .module = THIS_MODULE,
- .memory_needed = toi_swap_memory_needed,
- .print_debug_info = toi_swap_print_debug_stats,
- .initialise = toi_swap_initialise,
- .cleanup = toi_swap_cleanup,
- .bio_allocator_ops = &toi_bio_swapops,
-
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-/* ---- Registration ---- */
-static __init int toi_swap_load(void)
-{
- return toi_register_module(&toi_swapops);
-}
-
-late_initcall(toi_swap_load);
diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c
deleted file mode 100644
index 79c9315b6..000000000
--- a/kernel/power/tuxonice_sysfs.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * kernel/power/tuxonice_sysfs.c
- *
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * This file contains support for sysfs entries for tuning TuxOnIce.
- *
- * We have a generic handler that deals with the most common cases, and
- * hooks for special handlers to use.
- */
-
-#include <linux/suspend.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice.h"
-#include "tuxonice_storage.h"
-#include "tuxonice_alloc.h"
-
-static int toi_sysfs_initialised;
-
-static void toi_initialise_sysfs(void);
-
-static struct toi_sysfs_data sysfs_params[];
-
-#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr)
-
-static void toi_main_wrapper(void)
-{
- toi_try_hibernate();
-}
-
-static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr,
- char *page)
-{
- struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
- int len = 0;
- int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ;
-
- if (full_prep && toi_start_anything(0))
- return -EBUSY;
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
- toi_prepare_usm();
-
- switch (sysfs_data->type) {
- case TOI_SYSFS_DATA_CUSTOM:
- len = (sysfs_data->data.special.read_sysfs) ?
- (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE)
- : 0;
- break;
- case TOI_SYSFS_DATA_BIT:
- len = sprintf(page, "%d\n",
- -test_bit(sysfs_data->data.bit.bit,
- sysfs_data->data.bit.bit_vector));
- break;
- case TOI_SYSFS_DATA_INTEGER:
- len = sprintf(page, "%d\n",
- *(sysfs_data->data.integer.variable));
- break;
- case TOI_SYSFS_DATA_LONG:
- len = sprintf(page, "%ld\n",
- *(sysfs_data->data.a_long.variable));
- break;
- case TOI_SYSFS_DATA_UL:
- len = sprintf(page, "%lu\n",
- *(sysfs_data->data.ul.variable));
- break;
- case TOI_SYSFS_DATA_STRING:
- len = sprintf(page, "%s\n",
- sysfs_data->data.string.variable);
- break;
- }
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
- toi_cleanup_usm();
-
- if (full_prep)
- toi_finish_anything(0);
-
- return len;
-}
-
-#define BOUND(_variable, _type) do { \
- if (*_variable < sysfs_data->data._type.minimum) \
- *_variable = sysfs_data->data._type.minimum; \
- else if (*_variable > sysfs_data->data._type.maximum) \
- *_variable = sysfs_data->data._type.maximum; \
-} while (0)
-
-static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *my_buf, size_t count)
-{
- int assigned_temp_buffer = 0, result = count;
- struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
-
- if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME)))
- return -EBUSY;
-
- ((char *) my_buf)[count] = 0;
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
- toi_prepare_usm();
-
- switch (sysfs_data->type) {
- case TOI_SYSFS_DATA_CUSTOM:
- if (sysfs_data->data.special.write_sysfs)
- result = (sysfs_data->data.special.write_sysfs)(my_buf,
- count);
- break;
- case TOI_SYSFS_DATA_BIT:
- {
- unsigned long value;
- result = kstrtoul(my_buf, 0, &value);
- if (result)
- break;
- if (value)
- set_bit(sysfs_data->data.bit.bit,
- (sysfs_data->data.bit.bit_vector));
- else
- clear_bit(sysfs_data->data.bit.bit,
- (sysfs_data->data.bit.bit_vector));
- }
- break;
- case TOI_SYSFS_DATA_INTEGER:
- {
- long temp;
- result = kstrtol(my_buf, 0, &temp);
- if (result)
- break;
- *(sysfs_data->data.integer.variable) = (int) temp;
- BOUND(sysfs_data->data.integer.variable, integer);
- break;
- }
- case TOI_SYSFS_DATA_LONG:
- {
- long *variable =
- sysfs_data->data.a_long.variable;
- result = kstrtol(my_buf, 0, variable);
- if (result)
- break;
- BOUND(variable, a_long);
- break;
- }
- case TOI_SYSFS_DATA_UL:
- {
- unsigned long *variable =
- sysfs_data->data.ul.variable;
- result = kstrtoul(my_buf, 0, variable);
- if (result)
- break;
- BOUND(variable, ul);
- break;
- }
- break;
- case TOI_SYSFS_DATA_STRING:
- {
- int copy_len = count;
- char *variable =
- sysfs_data->data.string.variable;
-
- if (sysfs_data->data.string.max_length &&
- (copy_len > sysfs_data->data.string.max_length))
- copy_len = sysfs_data->data.string.max_length;
-
- if (!variable) {
- variable = (char *) toi_get_zeroed_page(31,
- TOI_ATOMIC_GFP);
- sysfs_data->data.string.variable = variable;
- assigned_temp_buffer = 1;
- }
- strncpy(variable, my_buf, copy_len);
- if (copy_len && my_buf[copy_len - 1] == '\n')
- variable[count - 1] = 0;
- variable[count] = 0;
- }
- break;
- }
-
- if (!result)
- result = count;
-
- /* Side effect routine? */
- if (result == count && sysfs_data->write_side_effect)
- sysfs_data->write_side_effect();
-
- /* Free temporary buffers */
- if (assigned_temp_buffer) {
- toi_free_page(31,
- (unsigned long) sysfs_data->data.string.variable);
- sysfs_data->data.string.variable = NULL;
- }
-
- if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
- toi_cleanup_usm();
-
- toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME);
-
- return result;
-}
-
-static struct sysfs_ops toi_sysfs_ops = {
- .show = &toi_attr_show,
- .store = &toi_attr_store,
-};
-
-static struct kobj_type toi_ktype = {
- .sysfs_ops = &toi_sysfs_ops,
-};
-
-struct kobject *tuxonice_kobj;
-
-/* Non-module sysfs entries.
- *
- * This array contains entries that are automatically registered at
- * boot. Modules and the console code register their own entries separately.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
- SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL,
- SYSFS_HIBERNATING, toi_main_wrapper),
- SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL,
- SYSFS_RESUMING, toi_try_resume)
-};
-
-void remove_toi_sysdir(struct kobject *kobj)
-{
- if (!kobj)
- return;
-
- kobject_put(kobj);
-}
-
-struct kobject *make_toi_sysdir(char *name)
-{
- struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj);
-
- if (!kobj) {
- printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs "
- "dir!\n");
- return NULL;
- }
-
- kobj->ktype = &toi_ktype;
-
- return kobj;
-}
-
-/* toi_register_sysfs_file
- *
- * Helper for registering a new /sysfs/tuxonice entry.
- */
-
-int toi_register_sysfs_file(
- struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data)
-{
- int result;
-
- if (!toi_sysfs_initialised)
- toi_initialise_sysfs();
-
- result = sysfs_create_file(kobj, &toi_sysfs_data->attr);
- if (result)
- printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s "
- "returned %d.\n",
- toi_sysfs_data->attr.name, result);
- kobj->ktype = &toi_ktype;
-
- return result;
-}
-
-/* toi_unregister_sysfs_file
- *
- * Helper for removing unwanted /sys/power/tuxonice entries.
- *
- */
-void toi_unregister_sysfs_file(struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data)
-{
- sysfs_remove_file(kobj, &toi_sysfs_data->attr);
-}
-
-void toi_cleanup_sysfs(void)
-{
- int i,
- numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
- if (!toi_sysfs_initialised)
- return;
-
- for (i = 0; i < numfiles; i++)
- toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-
- kobject_put(tuxonice_kobj);
- toi_sysfs_initialised = 0;
-}
-
-/* toi_initialise_sysfs
- *
- * Initialise the /sysfs/tuxonice directory.
- */
-
-static void toi_initialise_sysfs(void)
-{
- int i;
- int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
-
- if (toi_sysfs_initialised)
- return;
-
- /* Make our TuxOnIce directory a child of /sys/power */
- tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj);
- if (!tuxonice_kobj)
- return;
-
- toi_sysfs_initialised = 1;
-
- for (i = 0; i < numfiles; i++)
- toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
-}
-
-int toi_sysfs_init(void)
-{
- toi_initialise_sysfs();
- return 0;
-}
-
-void toi_sysfs_exit(void)
-{
- toi_cleanup_sysfs();
-}
diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h
deleted file mode 100644
index 5b331b19a..000000000
--- a/kernel/power/tuxonice_sysfs.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * kernel/power/tuxonice_sysfs.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/sysfs.h>
-
-struct toi_sysfs_data {
- struct attribute attr;
- int type;
- int flags;
- union {
- struct {
- unsigned long *bit_vector;
- int bit;
- } bit;
- struct {
- int *variable;
- int minimum;
- int maximum;
- } integer;
- struct {
- long *variable;
- long minimum;
- long maximum;
- } a_long;
- struct {
- unsigned long *variable;
- unsigned long minimum;
- unsigned long maximum;
- } ul;
- struct {
- char *variable;
- int max_length;
- } string;
- struct {
- int (*read_sysfs) (const char *buffer, int count);
- int (*write_sysfs) (const char *buffer, int count);
- void *data;
- } special;
- } data;
-
- /* Side effects routine. Used, eg, for reparsing the
- * resume= entry when it changes */
- void (*write_side_effect) (void);
- struct list_head sysfs_data_list;
-};
-
-enum {
- TOI_SYSFS_DATA_NONE = 1,
- TOI_SYSFS_DATA_CUSTOM,
- TOI_SYSFS_DATA_BIT,
- TOI_SYSFS_DATA_INTEGER,
- TOI_SYSFS_DATA_UL,
- TOI_SYSFS_DATA_LONG,
- TOI_SYSFS_DATA_STRING
-};
-
-#define SYSFS_WRITEONLY 0200
-#define SYSFS_READONLY 0444
-#define SYSFS_RW 0644
-
-#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_BIT, \
- .flags = _flags, \
- .data = { .bit = { .bit_vector = _ul, .bit = _bit } } }
-
-#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_INTEGER, \
- .flags = _flags, \
- .data = { .integer = { .variable = _int, .minimum = _min, \
- .maximum = _max } }, \
- .write_side_effect = _wse }
-
-#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_UL, \
- .flags = _flags, \
- .data = { .ul = { .variable = _ul, .minimum = _min, \
- .maximum = _max } } }
-
-#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_LONG, \
- .flags = _flags, \
- .data = { .a_long = { .variable = _long, .minimum = _min, \
- .maximum = _max } } }
-
-#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_STRING, \
- .flags = _flags, \
- .data = { .string = { .variable = _string, .max_length = _max_len } }, \
- .write_side_effect = _wse }
-
-#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \
- .attr = {.name = _name , .mode = _mode }, \
- .type = TOI_SYSFS_DATA_CUSTOM, \
- .flags = _flags, \
- .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \
- .write_side_effect = _wse }
-
-#define SYSFS_NONE(_name, _wse) { \
- .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \
- .type = TOI_SYSFS_DATA_NONE, \
- .write_side_effect = _wse, \
-}
-
-/* Flags */
-#define SYSFS_NEEDS_SM_FOR_READ 1
-#define SYSFS_NEEDS_SM_FOR_WRITE 2
-#define SYSFS_HIBERNATE 4
-#define SYSFS_RESUME 8
-#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME)
-#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE)
-#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE)
-#define SYSFS_NEEDS_SM_FOR_BOTH \
- (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE)
-
-int toi_register_sysfs_file(struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data);
-void toi_unregister_sysfs_file(struct kobject *kobj,
- struct toi_sysfs_data *toi_sysfs_data);
-
-extern struct kobject *tuxonice_kobj;
-
-struct kobject *make_toi_sysdir(char *name);
-void remove_toi_sysdir(struct kobject *obj);
-extern void toi_cleanup_sysfs(void);
-
-extern int toi_sysfs_init(void);
-extern void toi_sysfs_exit(void);
diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c
deleted file mode 100644
index c405f9b9a..000000000
--- a/kernel/power/tuxonice_ui.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * kernel/power/tuxonice_ui.c
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for TuxOnIce's user interface.
- *
- * The user interface code talks to a userspace program via a
- * netlink socket.
- *
- * The kernel side:
- * - starts the userui program;
- * - sends text messages and progress bar status;
- *
- * The user space side:
- * - passes messages regarding user requests (abort, toggle reboot etc)
- *
- */
-
-#define __KERNEL_SYSCALLS__
-
-#include <linux/reboot.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_power_off.h"
-#include "tuxonice_builtin.h"
-
-static char local_printf_buf[1024]; /* Same as printk - should be safe */
-struct ui_ops *toi_current_ui;
-
-/**
- * toi_wait_for_keypress - Wait for keypress via userui or /dev/console.
- *
- * @timeout: Maximum time to wait.
- *
- * Wait for a keypress, either from userui or /dev/console if userui isn't
- * available. The non-userui path is particularly for at boot-time, prior
- * to userui being started, when we have an important warning to give to
- * the user.
- */
-static char toi_wait_for_keypress(int timeout)
-{
- if (toi_current_ui && toi_current_ui->wait_for_key(timeout))
- return ' ';
-
- return toi_wait_for_keypress_dev_console(timeout);
-}
-
-/* toi_early_boot_message()
- * Description: Handle errors early in the process of booting.
- * The user may press C to continue booting, perhaps
- * invalidating the image, or space to reboot.
- * This works from either the serial console or normally
- * attached keyboard.
- *
- * Note that we come in here from init, while the kernel is
- * locked. If we want to get events from the serial console,
- * we need to temporarily unlock the kernel.
- *
- * toi_early_boot_message may also be called post-boot.
- * In this case, it simply printks the message and returns.
- *
- * Arguments: int Whether we are able to erase the image.
- * int default_answer. What to do when we timeout. This
- * will normally be continue, but the user might
- * provide command line options (__setup) to override
- * particular cases.
- * Char *. Pointer to a string explaining why we're moaning.
- */
-
-#define say(message, a...) printk(KERN_EMERG message, ##a)
-
-void toi_early_boot_message(int message_detail, int default_answer,
- char *warning_reason, ...)
-{
-#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
- unsigned long orig_state = get_toi_state(), continue_req = 0;
- unsigned long orig_loglevel = console_loglevel;
- int can_ask = 1;
-#else
- int can_ask = 0;
-#endif
-
- va_list args;
- int printed_len;
-
- if (!toi_wait) {
- set_toi_state(TOI_CONTINUE_REQ);
- can_ask = 0;
- }
-
- if (warning_reason) {
- va_start(args, warning_reason);
- printed_len = vsnprintf(local_printf_buf,
- sizeof(local_printf_buf),
- warning_reason,
- args);
- va_end(args);
- }
-
- if (!test_toi_state(TOI_BOOT_TIME)) {
- printk("TuxOnIce: %s\n", local_printf_buf);
- return;
- }
-
- if (!can_ask) {
- continue_req = !!default_answer;
- goto post_ask;
- }
-
-#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
- console_loglevel = 7;
-
- say("=== TuxOnIce ===\n\n");
- if (warning_reason) {
- say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
- switch (message_detail) {
- case 0:
- say("If you continue booting, note that any image WILL"
- "NOT BE REMOVED.\nTuxOnIce is unable to do so "
- "because the appropriate modules aren't\n"
- "loaded. You should manually remove the image "
- "to avoid any\npossibility of corrupting your "
- "filesystem(s) later.\n");
- break;
- case 1:
- say("If you want to use the current TuxOnIce image, "
- "reboot and try\nagain with the same kernel "
- "that you hibernated from. If you want\n"
- "to forget that image, continue and the image "
- "will be erased.\n");
- break;
- }
- say("Press SPACE to reboot or C to continue booting with "
- "this kernel\n\n");
- if (toi_wait > 0)
- say("Default action if you don't select one in %d "
- "seconds is: %s.\n",
- toi_wait,
- default_answer == TOI_CONTINUE_REQ ?
- "continue booting" : "reboot");
- } else {
- say("BIG FAT WARNING!!\n\n"
- "You have tried to resume from this image before.\n"
- "If it failed once, it may well fail again.\n"
- "Would you like to remove the image and boot "
- "normally?\nThis will be equivalent to entering "
- "noresume on the\nkernel command line.\n\n"
- "Press SPACE to remove the image or C to continue "
- "resuming.\n\n");
- if (toi_wait > 0)
- say("Default action if you don't select one in %d "
- "seconds is: %s.\n", toi_wait,
- !!default_answer ?
- "continue resuming" : "remove the image");
- }
- console_loglevel = orig_loglevel;
-
- set_toi_state(TOI_SANITY_CHECK_PROMPT);
- clear_toi_state(TOI_CONTINUE_REQ);
-
- if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */
- continue_req = !!default_answer;
- else
- continue_req = test_toi_state(TOI_CONTINUE_REQ);
-
-#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */
-
-post_ask:
- if ((warning_reason) && (!continue_req))
- kernel_restart(NULL);
-
- restore_toi_state(orig_state);
- if (continue_req)
- set_toi_state(TOI_CONTINUE_REQ);
-}
-
-#undef say
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
- SYSFS_INT("default_console_level", SYSFS_RW,
- &toi_bkd.toi_default_console_level, 0, 7, 0, NULL),
- SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0,
- 1 << 30, 0),
- SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL,
- 0)
-#endif
-};
-
-static struct toi_module_ops userui_ops = {
- .type = MISC_HIDDEN_MODULE,
- .name = "printk ui",
- .directory = "user_interface",
- .module = THIS_MODULE,
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-int toi_register_ui_ops(struct ui_ops *this_ui)
-{
- if (toi_current_ui) {
- printk(KERN_INFO "Only one TuxOnIce user interface module can "
- "be loaded at a time.");
- return -EBUSY;
- }
-
- toi_current_ui = this_ui;
-
- return 0;
-}
-
-void toi_remove_ui_ops(struct ui_ops *this_ui)
-{
- if (toi_current_ui != this_ui)
- return;
-
- toi_current_ui = NULL;
-}
-
-/* toi_console_sysfs_init
- * Description: Boot time initialisation for user interface.
- */
-
-int toi_ui_init(void)
-{
- return toi_register_module(&userui_ops);
-}
-
-void toi_ui_exit(void)
-{
- toi_unregister_module(&userui_ops);
-}
diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h
deleted file mode 100644
index d71c607f6..000000000
--- a/kernel/power/tuxonice_ui.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * kernel/power/tuxonice_ui.h
- *
- * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- */
-
-enum {
- DONT_CLEAR_BAR,
- CLEAR_BAR
-};
-
-enum {
- /* Userspace -> Kernel */
- USERUI_MSG_ABORT = 0x11,
- USERUI_MSG_SET_STATE = 0x12,
- USERUI_MSG_GET_STATE = 0x13,
- USERUI_MSG_GET_DEBUG_STATE = 0x14,
- USERUI_MSG_SET_DEBUG_STATE = 0x15,
- USERUI_MSG_SPACE = 0x18,
- USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A,
- USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B,
- USERUI_MSG_GET_LOGLEVEL = 0x1C,
- USERUI_MSG_SET_LOGLEVEL = 0x1D,
- USERUI_MSG_PRINTK = 0x1E,
-
- /* Kernel -> Userspace */
- USERUI_MSG_MESSAGE = 0x21,
- USERUI_MSG_PROGRESS = 0x22,
- USERUI_MSG_POST_ATOMIC_RESTORE = 0x25,
-
- USERUI_MSG_MAX,
-};
-
-struct userui_msg_params {
- u32 a, b, c, d;
- char text[255];
-};
-
-struct ui_ops {
- char (*wait_for_key) (int timeout);
- u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...);
- void (*prepare_status) (int clearbar, const char *fmt, ...);
- void (*cond_pause) (int pause, char *message);
- void (*abort)(int result_code, const char *fmt, ...);
- void (*prepare)(void);
- void (*cleanup)(void);
- void (*message)(u32 section, u32 level, u32 normally_logged,
- const char *fmt, ...);
-};
-
-extern struct ui_ops *toi_current_ui;
-
-#define toi_update_status(val, max, fmt, args...) \
- (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \
- max)
-
-#define toi_prepare_console(void) \
- do { if (toi_current_ui) \
- (toi_current_ui->prepare)(); \
- } while (0)
-
-#define toi_cleanup_console(void) \
- do { if (toi_current_ui) \
- (toi_current_ui->cleanup)(); \
- } while (0)
-
-#define abort_hibernate(result, fmt, args...) \
- do { if (toi_current_ui) \
- (toi_current_ui->abort)(result, fmt, ##args); \
- else { \
- set_abort_result(result); \
- } \
- } while (0)
-
-#define toi_cond_pause(pause, message) \
- do { if (toi_current_ui) \
- (toi_current_ui->cond_pause)(pause, message); \
- } while (0)
-
-#define toi_prepare_status(clear, fmt, args...) \
- do { if (toi_current_ui) \
- (toi_current_ui->prepare_status)(clear, fmt, ##args); \
- else \
- printk(KERN_INFO fmt "%s", ##args, "\n"); \
- } while (0)
-
-#define toi_message(sn, lev, log, fmt, a...) \
-do { \
- if (toi_current_ui && (!sn || test_debug_state(sn))) \
- toi_current_ui->message(sn, lev, log, fmt, ##a); \
-} while (0)
-
-__exit void toi_ui_cleanup(void);
-extern int toi_ui_init(void);
-extern void toi_ui_exit(void);
-extern int toi_register_ui_ops(struct ui_ops *this_ui);
-extern void toi_remove_ui_ops(struct ui_ops *this_ui);
diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
deleted file mode 100644
index edc885c72..000000000
--- a/kernel/power/tuxonice_userui.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * kernel/power/user_ui.c
- *
- * Copyright (C) 2005-2007 Bernard Blackham
- * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
- *
- * This file is released under the GPLv2.
- *
- * Routines for TuxOnIce's user interface.
- *
- * The user interface code talks to a userspace program via a
- * netlink socket.
- *
- * The kernel side:
- * - starts the userui program;
- * - sends text messages and progress bar status;
- *
- * The user space side:
- * - passes messages regarding user requests (abort, toggle reboot etc)
- *
- */
-
-#define __KERNEL_SYSCALLS__
-
-#include <linux/suspend.h>
-#include <linux/freezer.h>
-#include <linux/console.h>
-#include <linux/ctype.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>
-#include <linux/reboot.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/vt.h>
-
-#include "tuxonice_sysfs.h"
-#include "tuxonice_modules.h"
-#include "tuxonice.h"
-#include "tuxonice_ui.h"
-#include "tuxonice_netlink.h"
-#include "tuxonice_power_off.h"
-
-static char local_printf_buf[1024]; /* Same as printk - should be safe */
-
-static struct user_helper_data ui_helper_data;
-static struct toi_module_ops userui_ops;
-static int orig_kmsg;
-
-static char lastheader[512];
-static int lastheader_message_len;
-static int ui_helper_changed; /* Used at resume-time so don't overwrite value
- set from initrd/ramfs. */
-
-/* Number of distinct progress amounts that userspace can display */
-static int progress_granularity = 30;
-
-static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
-static int userui_wait_should_wake;
-
-#define toi_stop_waiting_for_userui_key() \
-{ \
- userui_wait_should_wake = true; \
- wake_up_interruptible(&userui_wait_for_key); \
-}
-
-/**
- * ui_nl_set_state - Update toi_action based on a message from userui.
- *
- * @n: The bit (1 << bit) to set.
- */
-static void ui_nl_set_state(int n)
-{
- /* Only let them change certain settings */
- static const u32 toi_action_mask =
- (1 << TOI_REBOOT) | (1 << TOI_PAUSE) |
- (1 << TOI_LOGALL) |
- (1 << TOI_SINGLESTEP) |
- (1 << TOI_PAUSE_NEAR_PAGESET_END);
- static unsigned long new_action;
-
- new_action = (toi_bkd.toi_action & (~toi_action_mask)) |
- (n & toi_action_mask);
-
- printk(KERN_DEBUG "n is %x. Action flags being changed from %lx "
- "to %lx.", n, toi_bkd.toi_action, new_action);
- toi_bkd.toi_action = new_action;
-
- if (!test_action_state(TOI_PAUSE) &&
- !test_action_state(TOI_SINGLESTEP))
- toi_stop_waiting_for_userui_key();
-}
-
-/**
- * userui_post_atomic_restore - Tell userui that atomic restore just happened.
- *
- * Tell userui that atomic restore just occured, so that it can do things like
- * redrawing the screen, re-getting settings and so on.
- */
-static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd)
-{
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0);
-}
-
-/**
- * userui_storage_needed - Report how much memory in image header is needed.
- */
-static int userui_storage_needed(void)
-{
- return sizeof(ui_helper_data.program) + 1 + sizeof(int);
-}
-
-/**
- * userui_save_config_info - Fill buffer with config info for image header.
- *
- * @buf: Buffer into which to put the config info we want to save.
- */
-static int userui_save_config_info(char *buf)
-{
- *((int *) buf) = progress_granularity;
- memcpy(buf + sizeof(int), ui_helper_data.program,
- sizeof(ui_helper_data.program));
- return sizeof(ui_helper_data.program) + sizeof(int) + 1;
-}
-
-/**
- * userui_load_config_info - Restore config info from buffer.
- *
- * @buf: Buffer containing header info loaded.
- * @size: Size of data loaded for this module.
- */
-static void userui_load_config_info(char *buf, int size)
-{
- progress_granularity = *((int *) buf);
- size -= sizeof(int);
-
- /* Don't load the saved path if one has already been set */
- if (ui_helper_changed)
- return;
-
- if (size > sizeof(ui_helper_data.program))
- size = sizeof(ui_helper_data.program);
-
- memcpy(ui_helper_data.program, buf + sizeof(int), size);
- ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
-}
-
-/**
- * set_ui_program_set: Record that userui program was changed.
- *
- * Side effect routine for when the userui program is set. In an initrd or
- * ramfs, the user may set a location for the userui program. If this happens,
- * we don't want to reload the value that was saved in the image header. This
- * routine allows us to flag that we shouldn't restore the program name from
- * the image header.
- */
-static void set_ui_program_set(void)
-{
- ui_helper_changed = 1;
-}
-
-/**
- * userui_memory_needed - Tell core how much memory to reserve for us.
- */
-static int userui_memory_needed(void)
-{
- /* ball park figure of 128 pages */
- return 128 * PAGE_SIZE;
-}
-
-/**
- * userui_update_status - Update the progress bar and (if on) in-bar message.
- *
- * @value: Current progress percentage numerator.
- * @maximum: Current progress percentage denominator.
- * @fmt: Message to be displayed in the middle of the progress bar.
- *
- * Note that a NULL message does not mean that any previous message is erased!
- * For that, you need toi_prepare_status with clearbar on.
- *
- * Returns an unsigned long, being the next numerator (as determined by the
- * maximum and progress granularity) where status needs to be updated.
- * This is to reduce unnecessary calls to update_status.
- */
-static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...)
-{
- static u32 last_step = 9999;
- struct userui_msg_params msg;
- u32 this_step, next_update;
- int bitshift;
-
- if (ui_helper_data.pid == -1)
- return 0;
-
- if ((!maximum) || (!progress_granularity))
- return maximum;
-
- if (value < 0)
- value = 0;
-
- if (value > maximum)
- value = maximum;
-
- /* Try to avoid math problems - we can't do 64 bit math here
- * (and shouldn't need it - anyone got screen resolution
- * of 65536 pixels or more?) */
- bitshift = fls(maximum) - 16;
- if (bitshift > 0) {
- u32 temp_maximum = maximum >> bitshift;
- u32 temp_value = value >> bitshift;
- this_step = (u32)
- (temp_value * progress_granularity / temp_maximum);
- next_update = (((this_step + 1) * temp_maximum /
- progress_granularity) + 1) << bitshift;
- } else {
- this_step = (u32) (value * progress_granularity / maximum);
- next_update = ((this_step + 1) * maximum /
- progress_granularity) + 1;
- }
-
- if (this_step == last_step)
- return next_update;
-
- memset(&msg, 0, sizeof(msg));
-
- msg.a = this_step;
- msg.b = progress_granularity;
-
- if (fmt) {
- va_list args;
- va_start(args, fmt);
- vsnprintf(msg.text, sizeof(msg.text), fmt, args);
- va_end(args);
- msg.text[sizeof(msg.text)-1] = '\0';
- }
-
- toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
- &msg, sizeof(msg));
- last_step = this_step;
-
- return next_update;
-}
-
-/**
- * userui_message - Display a message without necessarily logging it.
- *
- * @section: Type of message. Messages can be filtered by type.
- * @level: Degree of importance of the message. Lower values = higher priority.
- * @normally_logged: Whether logged even if log_everything is off.
- * @fmt: Message (and parameters).
- *
- * This function is intended to do the same job as printk, but without normally
- * logging what is printed. The point is to be able to get debugging info on
- * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
- *
- * It may be called from an interrupt context - can't sleep!
- */
-static void userui_message(u32 section, u32 level, u32 normally_logged,
- const char *fmt, ...)
-{
- struct userui_msg_params msg;
-
- if ((level) && (level > console_loglevel))
- return;
-
- memset(&msg, 0, sizeof(msg));
-
- msg.a = section;
- msg.b = level;
- msg.c = normally_logged;
-
- if (fmt) {
- va_list args;
- va_start(args, fmt);
- vsnprintf(msg.text, sizeof(msg.text), fmt, args);
- va_end(args);
- msg.text[sizeof(msg.text)-1] = '\0';
- }
-
- if (test_action_state(TOI_LOGALL))
- printk(KERN_INFO "%s\n", msg.text);
-
- toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
- &msg, sizeof(msg));
-}
-
-/**
- * wait_for_key_via_userui - Wait for userui to receive a keypress.
- */
-static void wait_for_key_via_userui(void)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue(&userui_wait_for_key, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
-
- wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake);
- userui_wait_should_wake = false;
-
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&userui_wait_for_key, &wait);
-}
-
-/**
- * userui_prepare_status - Display high level messages.
- *
- * @clearbar: Whether to clear the progress bar.
- * @fmt...: New message for the title.
- *
- * Prepare the 'nice display', drawing the header and version, along with the
- * current action and perhaps also resetting the progress bar.
- */
-static void userui_prepare_status(int clearbar, const char *fmt, ...)
-{
- va_list args;
-
- if (fmt) {
- va_start(args, fmt);
- lastheader_message_len = vsnprintf(lastheader, 512, fmt, args);
- va_end(args);
- }
-
- if (clearbar)
- toi_update_status(0, 1, NULL);
-
- if (ui_helper_data.pid == -1)
- printk(KERN_EMERG "%s\n", lastheader);
- else
- toi_message(0, TOI_STATUS, 1, lastheader, NULL);
-}
-
-/**
- * toi_wait_for_keypress - Wait for keypress via userui.
- *
- * @timeout: Maximum time to wait.
- *
- * Wait for a keypress from userui.
- *
- * FIXME: Implement timeout?
- */
-static char userui_wait_for_keypress(int timeout)
-{
- char key = '\0';
-
- if (ui_helper_data.pid != -1) {
- wait_for_key_via_userui();
- key = ' ';
- }
-
- return key;
-}
-
-/**
- * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
- *
- * @result_code: Reason why we're aborting (1 << bit).
- * @fmt: Message to display if telling the user what's going on.
- *
- * Abort a cycle. If this wasn't at the user's request (and we're displaying
- * output), tell the user why and wait for them to acknowledge the message.
- */
-static void userui_abort_hibernate(int result_code, const char *fmt, ...)
-{
- va_list args;
- int printed_len = 0;
-
- set_result_state(result_code);
-
- if (test_result_state(TOI_ABORTED))
- return;
-
- set_result_state(TOI_ABORTED);
-
- if (test_result_state(TOI_ABORT_REQUESTED))
- return;
-
- va_start(args, fmt);
- printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf),
- fmt, args);
- va_end(args);
- if (ui_helper_data.pid != -1)
- printed_len = sprintf(local_printf_buf + printed_len,
- " (Press SPACE to continue)");
-
- toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
-
- if (ui_helper_data.pid != -1)
- userui_wait_for_keypress(0);
-}
-
-/**
- * request_abort_hibernate - Abort hibernating or resuming at user request.
- *
- * Handle the user requesting the cancellation of a hibernation or resume by
- * pressing escape.
- */
-static void request_abort_hibernate(void)
-{
- if (test_result_state(TOI_ABORT_REQUESTED) ||
- !test_action_state(TOI_CAN_CANCEL))
- return;
-
- if (test_toi_state(TOI_NOW_RESUMING)) {
- toi_prepare_status(CLEAR_BAR, "Escape pressed. "
- "Powering down again.");
- set_toi_state(TOI_STOP_RESUME);
- while (!test_toi_state(TOI_IO_STOPPED))
- schedule();
- if (toiActiveAllocator->mark_resume_attempted)
- toiActiveAllocator->mark_resume_attempted(0);
- toi_power_down();
- }
-
- toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
- " ABORTING HIBERNATION ---");
- set_abort_result(TOI_ABORT_REQUESTED);
- toi_stop_waiting_for_userui_key();
-}
-
-/**
- * userui_user_rcv_msg - Receive a netlink message from userui.
- *
- * @skb: skb received.
- * @nlh: Netlink header received.
- */
-static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- int type;
- int *data;
-
- type = nlh->nlmsg_type;
-
- /* A control message: ignore them */
- if (type < NETLINK_MSG_BASE)
- return 0;
-
- /* Unknown message: reply with EINVAL */
- if (type >= USERUI_MSG_MAX)
- return -EINVAL;
-
- /* All operations require privileges, even GET */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- /* Only allow one task to receive NOFREEZE privileges */
- if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
- printk(KERN_INFO "Got NOFREEZE_ME request when "
- "ui_helper_data.pid is %d.\n", ui_helper_data.pid);
- return -EBUSY;
- }
-
- data = (int *) NLMSG_DATA(nlh);
-
- switch (type) {
- case USERUI_MSG_ABORT:
- request_abort_hibernate();
- return 0;
- case USERUI_MSG_GET_STATE:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
- sizeof(toi_bkd.toi_action));
- return 0;
- case USERUI_MSG_GET_DEBUG_STATE:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_DEBUG_STATE,
- &toi_bkd.toi_debug_state,
- sizeof(toi_bkd.toi_debug_state));
- return 0;
- case USERUI_MSG_SET_STATE:
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
- return -EINVAL;
- ui_nl_set_state(*data);
- return 0;
- case USERUI_MSG_SET_DEBUG_STATE:
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
- return -EINVAL;
- toi_bkd.toi_debug_state = (*data);
- return 0;
- case USERUI_MSG_SPACE:
- toi_stop_waiting_for_userui_key();
- return 0;
- case USERUI_MSG_GET_POWERDOWN_METHOD:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_POWERDOWN_METHOD,
- &toi_poweroff_method,
- sizeof(toi_poweroff_method));
- return 0;
- case USERUI_MSG_SET_POWERDOWN_METHOD:
- if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char)))
- return -EINVAL;
- toi_poweroff_method = (unsigned long)(*data);
- return 0;
- case USERUI_MSG_GET_LOGLEVEL:
- toi_send_netlink_message(&ui_helper_data,
- USERUI_MSG_GET_LOGLEVEL,
- &toi_bkd.toi_default_console_level,
- sizeof(toi_bkd.toi_default_console_level));
- return 0;
- case USERUI_MSG_SET_LOGLEVEL:
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
- return -EINVAL;
- toi_bkd.toi_default_console_level = (*data);
- return 0;
- case USERUI_MSG_PRINTK:
- printk(KERN_INFO "%s", (char *) data);
- return 0;
- }
-
- /* Unhandled here */
- return 1;
-}
-
-/**
- * userui_cond_pause - Possibly pause at user request.
- *
- * @pause: Whether to pause or just display the message.
- * @message: Message to display at the start of pausing.
- *
- * Potentially pause and wait for the user to tell us to continue. We normally
- * only pause when @pause is set. While paused, the user can do things like
- * changing the loglevel, toggling the display of debugging sections and such
- * like.
- */
-static void userui_cond_pause(int pause, char *message)
-{
- int displayed_message = 0, last_key = 0;
-
- while (last_key != 32 &&
- ui_helper_data.pid != -1 &&
- ((test_action_state(TOI_PAUSE) && pause) ||
- (test_action_state(TOI_SINGLESTEP)))) {
- if (!displayed_message) {
- toi_prepare_status(DONT_CLEAR_BAR,
- "%s Press SPACE to continue.%s",
- message ? message : "",
- (test_action_state(TOI_SINGLESTEP)) ?
- " Single step on." : "");
- displayed_message = 1;
- }
- last_key = userui_wait_for_keypress(0);
- }
- schedule();
-}
-
-/**
- * userui_prepare_console - Prepare the console for use.
- *
- * Prepare a console for use, saving current kmsg settings and attempting to
- * start userui. Console loglevel changes are handled by userui.
- */
-static void userui_prepare_console(void)
-{
- orig_kmsg = vt_kmsg_redirect(fg_console + 1);
-
- ui_helper_data.pid = -1;
-
- if (!userui_ops.enabled) {
- printk(KERN_INFO "TuxOnIce: Userui disabled.\n");
- return;
- }
-
- if (*ui_helper_data.program)
- toi_netlink_setup(&ui_helper_data);
- else
- printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
-}
-
-/**
- * userui_cleanup_console - Cleanup after a cycle.
- *
- * Tell userui to cleanup, and restore kmsg_redirect to its original value.
- */
-
-static void userui_cleanup_console(void)
-{
- if (ui_helper_data.pid > -1)
- toi_netlink_close(&ui_helper_data);
-
- vt_kmsg_redirect(orig_kmsg);
-}
-
-/*
- * User interface specific /sys/power/tuxonice entries.
- */
-
-static struct toi_sysfs_data sysfs_params[] = {
-#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
- SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action,
- TOI_CAN_CANCEL, 0),
- SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action,
- TOI_PAUSE, 0),
- SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL),
- SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
- 2048, 0, NULL),
- SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0,
- set_ui_program_set),
- SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL)
-#endif
-};
-
-static struct toi_module_ops userui_ops = {
- .type = MISC_MODULE,
- .name = "userui",
- .shared_directory = "user_interface",
- .module = THIS_MODULE,
- .storage_needed = userui_storage_needed,
- .save_config_info = userui_save_config_info,
- .load_config_info = userui_load_config_info,
- .memory_needed = userui_memory_needed,
- .post_atomic_restore = userui_post_atomic_restore,
- .sysfs_data = sysfs_params,
- .num_sysfs_entries = sizeof(sysfs_params) /
- sizeof(struct toi_sysfs_data),
-};
-
-static struct ui_ops my_ui_ops = {
- .update_status = userui_update_status,
- .message = userui_message,
- .prepare_status = userui_prepare_status,
- .abort = userui_abort_hibernate,
- .cond_pause = userui_cond_pause,
- .prepare = userui_prepare_console,
- .cleanup = userui_cleanup_console,
- .wait_for_key = userui_wait_for_keypress,
-};
-
-/**
- * toi_user_ui_init - Boot time initialisation for user interface.
- *
- * Invoked from the core init routine.
- */
-static __init int toi_user_ui_init(void)
-{
- int result;
-
- ui_helper_data.nl = NULL;
- strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255);
- ui_helper_data.pid = -1;
- ui_helper_data.skb_size = sizeof(struct userui_msg_params);
- ui_helper_data.pool_limit = 6;
- ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
- ui_helper_data.name = "userspace ui";
- ui_helper_data.rcv_msg = userui_user_rcv_msg;
- ui_helper_data.interface_version = 8;
- ui_helper_data.must_init = 0;
- ui_helper_data.not_ready = userui_cleanup_console;
- init_completion(&ui_helper_data.wait_for_process);
- result = toi_register_module(&userui_ops);
- if (!result) {
- result = toi_register_ui_ops(&my_ui_ops);
- if (result)
- toi_unregister_module(&userui_ops);
- }
-
- return result;
-}
-
-late_initcall(toi_user_ui_init);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 83cf08088..f62f2d3f9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -33,7 +33,6 @@
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/suspend.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
#include <linux/ratelimit.h>
@@ -86,6 +85,18 @@ static struct lockdep_map console_lock_dep_map = {
#endif
/*
+ * Number of registered extended console drivers.
+ *
+ * If extended consoles are present, in-kernel cont reassembly is disabled
+ * and each fragment is stored as a separate log entry with proper
+ * continuation flag so that every emitted message has full metadata. This
+ * doesn't change the result for regular consoles or /proc/kmsg. For
+ * /dev/kmsg, as long as the reader concatenates messages according to
+ * consecutive continuation flags, the end result should be the same too.
+ */
+static int nr_ext_console_drivers;
+
+/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
*/
@@ -196,14 +207,14 @@ static int console_may_schedule;
* need to be changed in the future, when the requirements change.
*
* /dev/kmsg exports the structured data in the following line format:
- * "level,sequnum,timestamp;<message text>\n"
+ * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
*
* The optional key/value pairs are attached as continuation lines starting
* with a space character and terminated by a newline. All possible
* non-prinatable characters are escaped in the "\xff" notation.
- *
- * Users of the export format should ignore possible additional values
- * separated by ',', and find the message after the ';' character.
*/
enum log_flags {
@@ -269,20 +280,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
-#ifdef CONFIG_TOI_INCREMENTAL
-void toi_set_logbuf_untracked(void)
-{
- int i;
- struct page *log_buf_start_page = virt_to_page(__log_buf);
-
- printk("Not protecting kernel printk log buffer (%p-%p).\n",
- __log_buf, __log_buf + __LOG_BUF_LEN);
-
- for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++)
- SetPageTOI_Untracked(log_buf_start_page + i);
-}
-#endif
-
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -492,13 +489,13 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
* already done the capabilities checks at open time.
*/
- if (from_file && type != SYSLOG_ACTION_OPEN)
+ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
goto ok;
if (syslog_action_restricted(type)) {
@@ -521,6 +518,86 @@ ok:
return security_syslog(type);
}
+static void append_char(char **pp, char *e, char c)
+{
+ if (*pp < e)
+ *(*pp)++ = c;
+}
+
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+ struct printk_log *msg, u64 seq,
+ enum log_flags prev_flags)
+{
+ u64 ts_usec = msg->ts_nsec;
+ char cont = '-';
+
+ do_div(ts_usec, 1000);
+
+ /*
+ * If we couldn't merge continuation line fragments during the print,
+ * export the stored flags to allow an optional external merge of the
+ * records. Merging the records isn't always necessarily correct, like
+ * when we hit a race during printing. In most cases though, it produces
+ * better readable output. 'c' in the record flags mark the first
+ * fragment of a line, '+' the following.
+ */
+ if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT))
+ cont = 'c';
+ else if ((msg->flags & LOG_CONT) ||
+ ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+ cont = '+';
+
+ return scnprintf(buf, size, "%u,%llu,%llu,%c;",
+ (msg->facility << 3) | msg->level, seq, ts_usec, cont);
+}
+
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *dict, size_t dict_len,
+ char *text, size_t text_len)
+{
+ char *p = buf, *e = buf + size;
+ size_t i;
+
+ /* escape non-printable characters */
+ for (i = 0; i < text_len; i++) {
+ unsigned char c = text[i];
+
+ if (c < ' ' || c >= 127 || c == '\\')
+ p += scnprintf(p, e - p, "\\x%02x", c);
+ else
+ append_char(&p, e, c);
+ }
+ append_char(&p, e, '\n');
+
+ if (dict_len) {
+ bool line = true;
+
+ for (i = 0; i < dict_len; i++) {
+ unsigned char c = dict[i];
+
+ if (line) {
+ append_char(&p, e, ' ');
+ line = false;
+ }
+
+ if (c == '\0') {
+ append_char(&p, e, '\n');
+ line = true;
+ continue;
+ }
+
+ if (c < ' ' || c >= 127 || c == '\\') {
+ p += scnprintf(p, e - p, "\\x%02x", c);
+ continue;
+ }
+
+ append_char(&p, e, c);
+ }
+ append_char(&p, e, '\n');
+ }
+
+ return p - buf;
+}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
@@ -528,7 +605,7 @@ struct devkmsg_user {
u32 idx;
enum log_flags prev;
struct mutex lock;
- char buf[8192];
+ char buf[CONSOLE_EXT_LOG_MAX];
};
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
@@ -586,9 +663,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
{
struct devkmsg_user *user = file->private_data;
struct printk_log *msg;
- u64 ts_usec;
- size_t i;
- char cont = '-';
size_t len;
ssize_t ret;
@@ -624,66 +698,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
}
msg = log_from_idx(user->idx);
- ts_usec = msg->ts_nsec;
- do_div(ts_usec, 1000);
+ len = msg_print_ext_header(user->buf, sizeof(user->buf),
+ msg, user->seq, user->prev);
+ len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
- /*
- * If we couldn't merge continuation line fragments during the print,
- * export the stored flags to allow an optional external merge of the
- * records. Merging the records isn't always neccessarily correct, like
- * when we hit a race during printing. In most cases though, it produces
- * better readable output. 'c' in the record flags mark the first
- * fragment of a line, '+' the following.
- */
- if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
- cont = 'c';
- else if ((msg->flags & LOG_CONT) ||
- ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
- cont = '+';
-
- len = sprintf(user->buf, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level,
- user->seq, ts_usec, cont);
user->prev = msg->flags;
-
- /* escape non-printable characters */
- for (i = 0; i < msg->text_len; i++) {
- unsigned char c = log_text(msg)[i];
-
- if (c < ' ' || c >= 127 || c == '\\')
- len += sprintf(user->buf + len, "\\x%02x", c);
- else
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
-
- if (msg->dict_len) {
- bool line = true;
-
- for (i = 0; i < msg->dict_len; i++) {
- unsigned char c = log_dict(msg)[i];
-
- if (line) {
- user->buf[len++] = ' ';
- line = false;
- }
-
- if (c == '\0') {
- user->buf[len++] = '\n';
- line = true;
- continue;
- }
-
- if (c < ' ' || c >= 127 || c == '\\') {
- len += sprintf(user->buf + len, "\\x%02x", c);
- continue;
- }
-
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
- }
-
user->idx = log_next(user->idx);
user->seq++;
raw_spin_unlock_irq(&logbuf_lock);
@@ -1269,13 +1290,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return len;
}
-int do_syslog(int type, char __user *buf, int len, bool from_file)
+int do_syslog(int type, char __user *buf, int len, int source)
{
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
- error = check_syslog_permissions(type, from_file);
+ error = check_syslog_permissions(type, source);
if (error)
goto out;
@@ -1358,7 +1379,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
syslog_prev = 0;
syslog_partial = 0;
}
- if (from_file) {
+ if (source == SYSLOG_FROM_PROC) {
/*
* Short-cut for poll(/"proc/kmsg") which simply checks
* for pending data, not the size; return the count of
@@ -1405,7 +1426,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(int level, const char *text, size_t len)
+static void call_console_drivers(int level,
+ const char *ext_text, size_t ext_len,
+ const char *text, size_t len)
{
struct console *con;
@@ -1426,7 +1449,10 @@ static void call_console_drivers(int level, const char *text, size_t len)
if (!cpu_online(smp_processor_id()) &&
!(con->flags & CON_ANYTIME))
continue;
- con->write(con, text, len, level);
+ if (con->flags & CON_EXTENDED)
+ con->write(con, ext_text, ext_len, level);
+ else
+ con->write(con, text, len, level);
}
}
@@ -1569,8 +1595,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
if (cont.len && cont.flushed)
return false;
- if (cont.len + len > sizeof(cont.buf)) {
- /* the line gets too long, split it up in separate records */
+ /*
+ * If ext consoles are present, flush and skip in-kernel
+ * continuation. See nr_ext_console_drivers definition. Also, if
+ * the line gets too long, split it up in separate records.
+ */
+ if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) {
cont_flush(LOG_CONT);
return false;
}
@@ -1905,9 +1935,19 @@ static struct cont {
u8 level;
bool flushed:1;
} cont;
+static char *log_text(const struct printk_log *msg) { return NULL; }
+static char *log_dict(const struct printk_log *msg) { return NULL; }
static struct printk_log *log_from_idx(u32 idx) { return NULL; }
static u32 log_next(u32 idx) { return 0; }
-static void call_console_drivers(int level, const char *text, size_t len) {}
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+ struct printk_log *msg, u64 seq,
+ enum log_flags prev_flags) { return 0; }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *dict, size_t dict_len,
+ char *text, size_t text_len) { return 0; }
+static void call_console_drivers(int level,
+ const char *ext_text, size_t ext_len,
+ const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
@@ -2160,7 +2200,7 @@ static void console_cont_flush(char *text, size_t size)
len = cont_print_text(text, size);
raw_spin_unlock(&logbuf_lock);
stop_critical_timings();
- call_console_drivers(cont.level, text, len);
+ call_console_drivers(cont.level, NULL, 0, text, len);
start_critical_timings();
local_irq_restore(flags);
return;
@@ -2184,6 +2224,7 @@ out:
*/
void console_unlock(void)
{
+ static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[LOG_LINE_MAX + PREFIX_MAX];
static u64 seen_seq;
unsigned long flags;
@@ -2202,6 +2243,7 @@ void console_unlock(void)
again:
for (;;) {
struct printk_log *msg;
+ size_t ext_len = 0;
size_t len;
int level;
@@ -2247,13 +2289,22 @@ skip:
level = msg->level;
len += msg_print_text(msg, console_prev, false,
text + len, sizeof(text) - len);
+ if (nr_ext_console_drivers) {
+ ext_len = msg_print_ext_header(ext_text,
+ sizeof(ext_text),
+ msg, console_seq, console_prev);
+ ext_len += msg_print_ext_body(ext_text + ext_len,
+ sizeof(ext_text) - ext_len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
+ }
console_idx = log_next(console_idx);
console_seq++;
console_prev = msg->flags;
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(level, text, len);
+ call_console_drivers(level, ext_text, ext_len, text, len);
start_critical_timings();
local_irq_restore(flags);
}
@@ -2509,6 +2560,11 @@ void register_console(struct console *newcon)
newcon->next = console_drivers->next;
console_drivers->next = newcon;
}
+
+ if (newcon->flags & CON_EXTENDED)
+ if (!nr_ext_console_drivers++)
+ pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n");
+
if (newcon->flags & CON_PRINTBUFFER) {
/*
* console_unlock(); will print out the buffered messages
@@ -2581,6 +2637,9 @@ int unregister_console(struct console *console)
}
}
+ if (!res && (console->flags & CON_EXTENDED))
+ nr_ext_console_drivers--;
+
/*
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8dbe27611..59e32684c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p)
struct rcu_torture_ops {
int ttype;
void (*init)(void);
+ void (*cleanup)(void);
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
@@ -477,10 +478,12 @@ static struct rcu_torture_ops rcu_busted_ops = {
*/
DEFINE_STATIC_SRCU(srcu_ctl);
+static struct srcu_struct srcu_ctld;
+static struct srcu_struct *srcu_ctlp = &srcu_ctl;
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
{
- return srcu_read_lock(&srcu_ctl);
+ return srcu_read_lock(srcu_ctlp);
}
static void srcu_read_delay(struct torture_random_state *rrsp)
@@ -499,49 +502,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
rcu_read_delay(rrsp);
}
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
{
- srcu_read_unlock(&srcu_ctl, idx);
+ srcu_read_unlock(srcu_ctlp, idx);
}
static unsigned long srcu_torture_completed(void)
{
- return srcu_batches_completed(&srcu_ctl);
+ return srcu_batches_completed(srcu_ctlp);
}
static void srcu_torture_deferred_free(struct rcu_torture *rp)
{
- call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+ call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
}
static void srcu_torture_synchronize(void)
{
- synchronize_srcu(&srcu_ctl);
+ synchronize_srcu(srcu_ctlp);
}
static void srcu_torture_call(struct rcu_head *head,
void (*func)(struct rcu_head *head))
{
- call_srcu(&srcu_ctl, head, func);
+ call_srcu(srcu_ctlp, head, func);
}
static void srcu_torture_barrier(void)
{
- srcu_barrier(&srcu_ctl);
+ srcu_barrier(srcu_ctlp);
}
static void srcu_torture_stats(void)
{
int cpu;
- int idx = srcu_ctl.completed & 0x1;
+ int idx = srcu_ctlp->completed & 0x1;
pr_alert("%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
long c0, c1;
- c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
- c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
}
pr_cont("\n");
@@ -549,7 +552,7 @@ static void srcu_torture_stats(void)
static void srcu_torture_synchronize_expedited(void)
{
- synchronize_srcu_expedited(&srcu_ctl);
+ synchronize_srcu_expedited(srcu_ctlp);
}
static struct rcu_torture_ops srcu_ops = {
@@ -569,6 +572,38 @@ static struct rcu_torture_ops srcu_ops = {
.name = "srcu"
};
+static void srcu_torture_init(void)
+{
+ rcu_sync_torture_init();
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ srcu_ctlp = &srcu_ctld;
+}
+
+static void srcu_torture_cleanup(void)
+{
+ cleanup_srcu_struct(&srcu_ctld);
+ srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops srcud_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .started = NULL,
+ .completed = srcu_torture_completed,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .name = "srcud"
+};
+
/*
* Definitions for sched torture testing.
*/
@@ -672,8 +707,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head)
struct rcu_boost_inflight *rbip =
container_of(head, struct rcu_boost_inflight, rcu);
- smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
- rbip->inflight = 0;
+ /* Ensure RCU-core accesses precede clearing ->inflight */
+ smp_store_release(&rbip->inflight, 0);
}
static int rcu_torture_boost(void *arg)
@@ -710,9 +745,9 @@ static int rcu_torture_boost(void *arg)
call_rcu_time = jiffies;
while (ULONG_CMP_LT(jiffies, endtime)) {
/* If we don't have a callback in flight, post one. */
- if (!rbi.inflight) {
- smp_mb(); /* RCU core before ->inflight = 1. */
- rbi.inflight = 1;
+ if (!smp_load_acquire(&rbi.inflight)) {
+ /* RCU core before ->inflight = 1. */
+ smp_store_release(&rbi.inflight, 1);
call_rcu(&rbi.rcu, rcu_torture_boost_cb);
if (jiffies - call_rcu_time >
test_boost_duration * HZ - HZ / 2) {
@@ -751,11 +786,10 @@ checkwait: stutter_wait("rcu_torture_boost");
} while (!torture_must_stop());
/* Clean up and exit. */
- while (!kthread_should_stop() || rbi.inflight) {
+ while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
}
- smp_mb(); /* order accesses to ->inflight before stack-frame death. */
destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
@@ -1054,7 +1088,7 @@ static void rcu_torture_timer(unsigned long unused)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
+ srcu_read_lock_held(srcu_ctlp));
if (p == NULL) {
/* Leave because rcu_torture_writer is not yet underway */
cur_ops->readunlock(idx);
@@ -1128,7 +1162,7 @@ rcu_torture_reader(void *arg)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
+ srcu_read_lock_held(srcu_ctlp));
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
@@ -1413,12 +1447,15 @@ static int rcu_torture_barrier_cbs(void *arg)
do {
wait_event(barrier_cbs_wq[myid],
(newphase =
- ACCESS_ONCE(barrier_phase)) != lastphase ||
+ smp_load_acquire(&barrier_phase)) != lastphase ||
torture_must_stop());
lastphase = newphase;
- smp_mb(); /* ensure barrier_phase load before ->call(). */
if (torture_must_stop())
break;
+ /*
+ * The above smp_load_acquire() ensures barrier_phase load
+ * is ordered before the following ->call().
+ */
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
@@ -1439,8 +1476,8 @@ static int rcu_torture_barrier(void *arg)
do {
atomic_set(&barrier_cbs_invoked, 0);
atomic_set(&barrier_cbs_count, n_barrier_cbs);
- smp_mb(); /* Ensure barrier_phase after prior assignments. */
- barrier_phase = !barrier_phase;
+ /* Ensure barrier_phase ordered after prior assignments. */
+ smp_store_release(&barrier_phase, !barrier_phase);
for (i = 0; i < n_barrier_cbs; i++)
wake_up(&barrier_cbs_wq[i]);
wait_event(barrier_wq,
@@ -1588,10 +1625,14 @@ rcu_torture_cleanup(void)
rcutorture_booster_cleanup(i);
}
- /* Wait for all RCU callbacks to fire. */
-
+ /*
+ * Wait for all RCU callbacks to fire, then do flavor-specific
+ * cleanup operations.
+ */
if (cur_ops->cb_barrier != NULL)
cur_ops->cb_barrier();
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
@@ -1668,8 +1709,8 @@ rcu_torture_init(void)
int cpu;
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
- RCUTORTURE_TASKS_OPS
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
+ &sched_ops, RCUTORTURE_TASKS_OPS
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -1701,7 +1742,7 @@ rcu_torture_init(void)
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
- nrealreaders = num_online_cpus() - 1;
+ nrealreaders = num_online_cpus() - 2 - nreaders;
if (nrealreaders <= 0)
nrealreaders = 1;
}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index cad76e76b..fb33d35ee 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
unsigned long t;
for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
sum += t;
}
return sum;
@@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
unsigned long t;
for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
sum += t;
}
return sum;
@@ -265,8 +265,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+ sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
}
return sum;
}
@@ -296,7 +296,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
{
int idx;
- idx = ACCESS_ONCE(sp->completed) & 0x1;
+ idx = READ_ONCE(sp->completed) & 0x1;
preempt_disable();
__this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index ec3086879..c291bd65d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,7 +35,7 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include "rcu.h"
@@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head,
#include "tiny_plugin.h"
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
-
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
/*
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index f94e209a1..e492a5253 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
return;
rcp->ticks_this_gp++;
j = jiffies;
- js = ACCESS_ONCE(rcp->jiffies_stall);
+ js = READ_ONCE(rcp->jiffies_stall);
if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
jiffies - rcp->gp_start, rcp->qlen);
dump_stack();
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
} else if (ULONG_CMP_GE(j, js)) {
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
}
@@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
{
rcp->ticks_this_gp = 0;
rcp->gp_start = jiffies;
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ WRITE_ONCE(rcp->jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8cf7304b2..65137bc28 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -54,7 +54,7 @@
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/suspend.h>
#include "tree.h"
@@ -91,7 +91,7 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
DEFINE_RCU_TPS(sname) \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
.rda = &sname##_data, \
@@ -110,11 +110,18 @@ struct rcu_state sname##_state = { \
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
-static struct rcu_state *rcu_state_p;
+static struct rcu_state *const rcu_state_p;
+static struct rcu_data __percpu *const rcu_data_p;
LIST_HEAD(rcu_struct_flavors);
-/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
-static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+/* Dump rcu_node combining tree at boot to verify correct setup. */
+static bool dump_tree;
+module_param(dump_tree, bool, 0444);
+/* Control rcu_node-tree auto-balancing at boot time. */
+static bool rcu_fanout_exact;
+module_param(rcu_fanout_exact, bool, 0444);
+/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
@@ -159,17 +166,46 @@ static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
/* rcuc/rcub kthread realtime priority */
+#ifdef CONFIG_RCU_KTHREAD_PRIO
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
+static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
+#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);
/* Delay in jiffies for grace-period initialization delays, debug only. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
+static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
+module_param(gp_preinit_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+static const int gp_preinit_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+
#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
module_param(gp_init_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
static const int gp_init_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
+static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
+module_param(gp_cleanup_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static const int gp_cleanup_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+
+/*
+ * Number of grace periods between delays, normalized by the duration of
+ * the delay. The longer the delay, the more the grace periods between
+ * each delay. The reason for this normalization is that it means that,
+ * for non-zero delays, the overall slowdown of grace periods is constant
+ * regardless of the duration of the delay. This arrangement balances
+ * the need for long delays to increase some race probabilities with the
+ * need for fast grace periods to increase other race probabilities.
+ */
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
/*
* Track the rcutorture test sequence number and the update version
@@ -191,17 +227,17 @@ unsigned long rcutorture_vernum;
*/
unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
- return ACCESS_ONCE(rnp->qsmaskinitnext);
+ return READ_ONCE(rnp->qsmaskinitnext);
}
/*
- * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
+ * Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
*/
static int rcu_gp_in_progress(struct rcu_state *rsp)
{
- return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+ return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
}
/*
@@ -278,8 +314,8 @@ static void rcu_momentary_dyntick_idle(void)
if (!(resched_mask & rsp->flavor_mask))
continue;
smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
- if (ACCESS_ONCE(rdp->mynode->completed) !=
- ACCESS_ONCE(rdp->cond_resched_completed))
+ if (READ_ONCE(rdp->mynode->completed) !=
+ READ_ONCE(rdp->cond_resched_completed))
continue;
/*
@@ -491,9 +527,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
break;
}
if (rsp != NULL) {
- *flags = ACCESS_ONCE(rsp->gp_flags);
- *gpnum = ACCESS_ONCE(rsp->gpnum);
- *completed = ACCESS_ONCE(rsp->completed);
+ *flags = READ_ONCE(rsp->gp_flags);
+ *gpnum = READ_ONCE(rsp->gpnum);
+ *completed = READ_ONCE(rsp->completed);
return;
}
*flags = 0;
@@ -539,10 +575,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
static int rcu_future_needs_gp(struct rcu_state *rsp)
{
struct rcu_node *rnp = rcu_get_root(rsp);
- int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
- return ACCESS_ONCE(*fp);
+ return READ_ONCE(*fp);
}
/*
@@ -565,7 +601,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
return 1; /* Yes, this CPU has newly registered callbacks. */
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ ULONG_CMP_LT(READ_ONCE(rsp->completed),
rdp->nxtcompleted[i]))
return 1; /* Yes, CBs for future grace period. */
return 0; /* No grace period needed. */
@@ -585,7 +621,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());
@@ -604,7 +641,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ atomic_read(&rdtp->dynticks) & 0x1);
rcu_dynticks_task_enter();
/*
@@ -630,7 +668,8 @@ static void rcu_eqs_enter(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ (oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
rcu_eqs_enter_common(oldval, user);
@@ -703,7 +742,8 @@ void rcu_irq_exit(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting < 0);
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
@@ -728,10 +768,12 @@ static void rcu_eqs_exit_common(long long oldval, int user)
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ !user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
idle_task(smp_processor_id());
@@ -755,7 +797,7 @@ static void rcu_eqs_exit(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE(oldval < 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
@@ -828,7 +870,8 @@ void rcu_irq_enter(void)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ rdtp->dynticks_nesting == 0);
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
@@ -1011,9 +1054,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
return 1;
} else {
- if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+ if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
- ACCESS_ONCE(rdp->gpwrap) = true;
+ WRITE_ONCE(rdp->gpwrap, true);
return 0;
}
}
@@ -1093,12 +1136,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
if (ULONG_CMP_GE(jiffies,
rdp->rsp->gp_start + jiffies_till_sched_qs) ||
ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
- ACCESS_ONCE(rdp->cond_resched_completed) =
- ACCESS_ONCE(rdp->mynode->completed);
+ if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ WRITE_ONCE(rdp->cond_resched_completed,
+ READ_ONCE(rdp->mynode->completed));
smp_mb(); /* ->cond_resched_completed before *rcrmp. */
- ACCESS_ONCE(*rcrmp) =
- ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ WRITE_ONCE(*rcrmp,
+ READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
rdp->rsp->jiffies_resched += 5; /* Enable beating. */
} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
@@ -1119,9 +1162,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
rsp->gp_start = j;
smp_wmb(); /* Record start time before stall time. */
j1 = rcu_jiffies_till_stall_check();
- ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+ WRITE_ONCE(rsp->jiffies_stall, j + j1);
rsp->jiffies_resched = j + j1 / 2;
- rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+ rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
}
/*
@@ -1133,10 +1176,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
unsigned long j;
j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
+ gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies!\n",
- rsp->name, j - gpa);
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+ rsp->name, j - gpa,
+ rsp->gpnum, rsp->completed, rsp->gp_flags);
}
/*
@@ -1173,12 +1217,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+ delta = jiffies - READ_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -1212,12 +1257,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
if (ndetected) {
rcu_dump_cpu_stacks(rsp);
} else {
- if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
- ACCESS_ONCE(rsp->completed) == gpnum) {
+ if (READ_ONCE(rsp->gpnum) != gpnum ||
+ READ_ONCE(rsp->completed) == gpnum) {
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
+ gpa = READ_ONCE(rsp->gp_activity);
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rsp->name, j - gpa, j, gpa,
jiffies_till_next_fqs,
@@ -1262,9 +1307,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -1307,20 +1352,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
* Given this check, comparisons of jiffies, rsp->jiffies_stall,
* and rsp->gp_start suffice to forestall false positives.
*/
- gpnum = ACCESS_ONCE(rsp->gpnum);
+ gpnum = READ_ONCE(rsp->gpnum);
smp_rmb(); /* Pick up ->gpnum first... */
- js = ACCESS_ONCE(rsp->jiffies_stall);
+ js = READ_ONCE(rsp->jiffies_stall);
smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = ACCESS_ONCE(rsp->gp_start);
+ gps = READ_ONCE(rsp->gp_start);
smp_rmb(); /* ...and finally ->gp_start before ->completed. */
- completed = ACCESS_ONCE(rsp->completed);
+ completed = READ_ONCE(rsp->completed);
if (ULONG_CMP_GE(completed, gpnum) ||
ULONG_CMP_LT(j, js) ||
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
if (rcu_gp_in_progress(rsp) &&
- (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
@@ -1347,7 +1392,7 @@ void rcu_cpu_stall_reset(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+ WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
}
/*
@@ -1457,7 +1502,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+ READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
goto out;
@@ -1542,7 +1587,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
static void rcu_gp_kthread_wake(struct rcu_state *rsp)
{
if (current == rsp->gp_kthread ||
- !ACCESS_ONCE(rsp->gp_flags) ||
+ !READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
wake_up(&rsp->gp_wq);
@@ -1677,7 +1722,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+ !unlikely(READ_ONCE(rdp->gpwrap))) {
/* No grace period end, so just accelerate recent callbacks. */
ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1692,7 +1737,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
}
- if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+ if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1704,7 +1749,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
- ACCESS_ONCE(rdp->gpwrap) = false;
+ WRITE_ONCE(rdp->gpwrap, false);
}
return ret;
}
@@ -1717,9 +1762,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
local_irq_save(flags);
rnp = rdp->mynode;
- if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
- rdp->completed == ACCESS_ONCE(rnp->completed) &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+ if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
+ rdp->completed == READ_ONCE(rnp->completed) &&
+ !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
!raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
@@ -1731,6 +1776,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_gp_kthread_wake(rsp);
}
+static void rcu_gp_slow(struct rcu_state *rsp, int delay)
+{
+ if (delay > 0 &&
+ !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ schedule_timeout_uninterruptible(delay);
+}
+
/*
* Initialize a new grace period. Return 0 if no grace period required.
*/
@@ -1740,15 +1792,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- if (!ACCESS_ONCE(rsp->gp_flags)) {
+ if (!READ_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
return 0;
}
- ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+ WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
/*
@@ -1773,6 +1825,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
* will handle subsequent offline CPUs.
*/
rcu_for_each_leaf_node(rsp, rnp) {
+ rcu_gp_slow(rsp, gp_preinit_delay);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1829,14 +1882,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
* process finishes, because this kthread handles both.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
+ rcu_gp_slow(rsp, gp_init_delay);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
- ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+ WRITE_ONCE(rnp->gpnum, rsp->gpnum);
if (WARN_ON_ONCE(rnp->completed != rsp->completed))
- ACCESS_ONCE(rnp->completed) = rsp->completed;
+ WRITE_ONCE(rnp->completed, rsp->completed);
if (rnp == rdp->mynode)
(void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1845,10 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- if (gp_init_delay > 0 &&
- !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
- schedule_timeout_uninterruptible(gp_init_delay);
+ WRITE_ONCE(rsp->gp_activity, jiffies);
}
return 1;
@@ -1864,7 +1915,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
@@ -1882,11 +1933,11 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+ WRITE_ONCE(rsp->gp_flags,
+ READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1903,7 +1954,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
gp_duration = jiffies - rsp->gp_start;
@@ -1934,7 +1985,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
smp_mb__after_unlock_lock();
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
WARN_ON_ONCE(rnp->qsmask);
- ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+ WRITE_ONCE(rnp->completed, rsp->gpnum);
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
@@ -1942,7 +1993,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
+ rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
@@ -1950,16 +2002,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
- ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+ WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
/* Advance CBs to reduce false positives below. */
needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
if (needgp || cpu_needs_another_gp(rsp, rdp)) {
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+ WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
raw_spin_unlock_irq(&rnp->lock);
@@ -1983,20 +2035,20 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Handle grace-period start. */
for (;;) {
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
wait_event_interruptible(rsp->gp_wq,
- ACCESS_ONCE(rsp->gp_flags) &
+ READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("reqwaitsig"));
}
@@ -2012,39 +2064,39 @@ static int __noreturn rcu_gp_kthread(void *arg)
if (!ret)
rsp->jiffies_force_qs = jiffies + j;
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = ACCESS_ONCE(rsp->gp_flags)) &
+ ((gf = READ_ONCE(rsp->gp_flags)) &
RCU_GP_FLAG_FQS) ||
- (!ACCESS_ONCE(rnp->qsmask) &&
+ (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp)),
j);
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
- if (!ACCESS_ONCE(rnp->qsmask) &&
+ if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
/* If time for quiescent-state forcing, do it. */
if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
(gf & RCU_GP_FLAG_FQS)) {
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqsstart"));
fqs_state = rcu_gp_fqs(rsp, fqs_state);
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqsend"));
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ READ_ONCE(rsp->gpnum),
TPS("fqswaitsig"));
}
j = jiffies_till_next_fqs;
@@ -2086,8 +2138,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
*/
return false;
}
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
- trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+ WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
+ trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
TPS("newreq"));
/*
@@ -2137,6 +2189,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
rcu_gp_kthread_wake(rsp);
}
@@ -2334,8 +2387,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Send the specified CPU's RCU callbacks to the orphanage. The
* specified CPU must be offline, and the caller must hold the
@@ -2346,7 +2397,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
/* No-CBs CPUs do not have orphanable callbacks. */
- if (rcu_is_nocb_cpu(rdp->cpu))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
return;
/*
@@ -2359,7 +2410,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
rsp->qlen += rdp->qlen;
rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen_lazy = 0;
- ACCESS_ONCE(rdp->qlen) = 0;
+ WRITE_ONCE(rdp->qlen, 0);
}
/*
@@ -2405,7 +2456,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
/* No-CBs CPUs are handled specially. */
- if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
return;
/* Do the accounting first. */
@@ -2452,6 +2504,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
RCU_TRACE(mask = rdp->grpmask);
trace_rcu_grace_period(rsp->name,
rnp->gpnum + 1 - !!(rnp->qsmask & mask),
@@ -2480,7 +2535,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
long mask;
struct rcu_node *rnp = rnp_leaf;
- if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
return;
for (;;) {
mask = rnp->grpmask;
@@ -2511,6 +2567,9 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2532,6 +2591,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return;
+
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
@@ -2546,26 +2608,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
cpu, rdp->qlen, rdp->nxtlist);
}
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
-}
-
-static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-}
-
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
/*
* Invoke any RCU callbacks that have made it to the end of their grace
* period. Throttle as specified by rdp->blimit.
@@ -2580,7 +2622,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* If no callbacks are ready, just return. */
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
- trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+ trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
need_resched(), is_idle_task(current),
rcu_is_callbacks_kthread());
return;
@@ -2636,7 +2678,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+ WRITE_ONCE(rdp->qlen, rdp->qlen - count);
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -2730,10 +2772,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if (!rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
if (rnp->qsmask == 0) {
if (rcu_state_p == &rcu_sched_state ||
rsp != rcu_state_p ||
@@ -2763,8 +2801,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
if ((rnp->qsmask & bit) != 0) {
- if ((rnp->qsmaskinit & bit) == 0)
- *isidle = false; /* Pending hotplug. */
if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
mask |= bit;
}
@@ -2793,7 +2829,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
/* Funnel through hierarchy to reduce memory contention. */
rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
- ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+ ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
@@ -2809,13 +2845,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_lock_irqsave(&rnp_old->lock, flags);
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+ WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
rcu_gp_kthread_wake(rsp);
}
@@ -2881,7 +2916,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
- if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+ if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
return;
if (likely(!rsp->boost)) {
rcu_do_batch(rsp, rdp);
@@ -2972,7 +3007,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
- ACCESS_ONCE(head->func) = rcu_leak_callback;
+ WRITE_ONCE(head->func, rcu_leak_callback);
WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
return;
}
@@ -3011,7 +3046,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
if (!likely(rdp->nxtlist))
init_default_callback_list(rdp);
}
- ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+ WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
if (lazy)
rdp->qlen_lazy++;
else
@@ -3287,7 +3322,7 @@ void synchronize_sched_expedited(void)
if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
(ulong)atomic_long_read(&rsp->expedited_done) +
ULONG_MAX / 8)) {
- synchronize_sched();
+ wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_wrap);
return;
}
@@ -3450,14 +3485,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
}
/* Has another RCU grace period completed? */
- if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+ if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
rdp->n_rp_gp_completed++;
return 1;
}
/* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+ if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
+ unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
rdp->n_rp_gp_started++;
return 1;
}
@@ -3493,7 +3528,7 @@ static int rcu_pending(void)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -3564,7 +3599,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
struct rcu_data *rdp;
- unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+ unsigned long snap = READ_ONCE(rsp->n_barrier_done);
unsigned long snap_done;
_rcu_barrier_trace(rsp, "Begin", -1, snap);
@@ -3606,10 +3641,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
/*
* Increment ->n_barrier_done to avoid duplicate work. Use
- * ACCESS_ONCE() to prevent the compiler from speculating
+ * WRITE_ONCE() to prevent the compiler from speculating
* the increment to precede the early-exit check.
*/
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+ WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3645,7 +3680,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
__call_rcu(&rdp->barrier_head,
rcu_barrier_callback, rsp, cpu, 0);
}
- } else if (ACCESS_ONCE(rdp->qlen)) {
+ } else if (READ_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->n_barrier_done);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3665,7 +3700,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Increment ->n_barrier_done to prevent duplicate work. */
smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+ WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3780,7 +3815,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
rdp->passed_quiesce = false;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
rdp->qs_pending = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3924,16 +3959,16 @@ void rcu_scheduler_starting(void)
/*
* Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
*/
static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;
- if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+ if (rcu_fanout_exact) {
rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ rsp->levelspread[i] = RCU_FANOUT;
} else {
int ccur;
int cprv;
@@ -3971,9 +4006,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
- /* Silence gcc 4.8 warning about array index out of range. */
- if (rcu_num_lvls > RCU_NUM_LVLS)
- panic("rcu_init_one: rcu_num_lvls overflow");
+ /* Silence gcc 4.8 false positive about array index out of range. */
+ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls out of range");
/* Initialize the level-tracking arrays. */
@@ -4059,7 +4094,7 @@ static void __init rcu_init_geometry(void)
jiffies_till_next_fqs = d;
/* If the compile-time values are accurate, just leave. */
- if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+ if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
@@ -4073,7 +4108,7 @@ static void __init rcu_init_geometry(void)
rcu_capacity[0] = 1;
rcu_capacity[1] = rcu_fanout_leaf;
for (i = 2; i <= MAX_RCU_LVLS; i++)
- rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+ rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
/*
* The boot-time rcu_fanout_leaf parameter is only permitted
@@ -4083,7 +4118,7 @@ static void __init rcu_init_geometry(void)
* the configured number of CPUs. Complain and fall back to the
* compile-time values if these limits are exceeded.
*/
- if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+ if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
n > rcu_capacity[MAX_RCU_LVLS]) {
WARN_ON(1);
@@ -4109,6 +4144,28 @@ static void __init rcu_init_geometry(void)
rcu_num_nodes -= n;
}
+/*
+ * Dump out the structure of the rcu_node combining tree associated
+ * with the rcu_state structure referenced by rsp.
+ */
+static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
+{
+ int level = 0;
+ struct rcu_node *rnp;
+
+ pr_info("rcu_node tree layout dump\n");
+ pr_info(" ");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp->level != level) {
+ pr_cont("\n");
+ pr_info(" ");
+ level = rnp->level;
+ }
+ pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
+ }
+ pr_cont("\n");
+}
+
void __init rcu_init(void)
{
int cpu;
@@ -4119,6 +4176,8 @@ void __init rcu_init(void)
rcu_init_geometry();
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ if (dump_tree)
+ rcu_dump_rcu_node_tree(&rcu_sched_state);
__rcu_init_preempt();
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a69d3dab2..4adb7ca0b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -35,11 +35,33 @@
* In practice, this did work well going from three levels to four.
* Of course, your mileage may vary.
*/
+
#define MAX_RCU_LVLS 4
-#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#ifdef CONFIG_RCU_FANOUT
+#define RCU_FANOUT CONFIG_RCU_FANOUT
+#else /* #ifdef CONFIG_RCU_FANOUT */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT 64
+# else
+# define RCU_FANOUT 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT */
+
+#ifdef CONFIG_RCU_FANOUT_LEAF
+#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT_LEAF 64
+# else
+# define RCU_FANOUT_LEAF 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
+
+#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
+#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
+#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
#if NR_CPUS <= RCU_FANOUT_1
# define RCU_NUM_LVLS 1
@@ -170,7 +192,6 @@ struct rcu_node {
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there can cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
struct list_head *boost_tasks;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
@@ -208,7 +229,6 @@ struct rcu_node {
unsigned long n_balk_nos;
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
wait_queue_head_t nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -519,14 +539,11 @@ extern struct list_head rcu_struct_flavors;
* RCU implementation internal declarations:
*/
extern struct rcu_state rcu_sched_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
#ifdef CONFIG_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8c0ec0f5a..013485fb2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -43,7 +43,17 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
+ * all uses are in dead code. Provide a definition to keep the compiler
+ * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
+ * This probably needs to be excluded from -rt builds.
+ */
+#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -60,11 +70,11 @@ static void __init rcu_bootup_announce_oddness(void)
{
if (IS_ENABLED(CONFIG_RCU_TRACE))
pr_info("\tRCU debugfs-based tracing is enabled.\n");
- if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
- (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+ if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
+ (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
- CONFIG_RCU_FANOUT);
- if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+ RCU_FANOUT);
+ if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
@@ -76,10 +86,10 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tAdditional per-CPU info printed with stalls.\n");
if (NUM_RCU_LVL_4 != 0)
pr_info("\tFour-level hierarchy is enabled.\n");
- if (CONFIG_RCU_FANOUT_LEAF != 16)
+ if (RCU_FANOUT_LEAF != 16)
pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
- CONFIG_RCU_FANOUT_LEAF);
- if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+ RCU_FANOUT_LEAF);
+ if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
@@ -90,7 +100,8 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -116,11 +127,11 @@ static void __init rcu_bootup_announce(void)
*/
static void rcu_preempt_qs(void)
{
- if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+ if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
- __this_cpu_read(rcu_preempt_data.gpnum),
+ __this_cpu_read(rcu_data_p->gpnum),
TPS("cpuqs"));
- __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+ __this_cpu_write(rcu_data_p->passed_quiesce, 1);
barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
@@ -150,7 +161,7 @@ static void rcu_preempt_note_context_switch(void)
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = this_cpu_ptr(rcu_preempt_state.rda);
+ rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
@@ -180,10 +191,9 @@ static void rcu_preempt_note_context_switch(void)
if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
- if (rnp->boost_tasks != NULL)
+ if (IS_ENABLED(CONFIG_RCU_BOOST) &&
+ rnp->boost_tasks != NULL)
rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
} else {
list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
if (rnp->qsmask & rdp->grpmask)
@@ -263,9 +273,7 @@ void rcu_read_unlock_special(struct task_struct *t)
bool empty_exp_now;
unsigned long flags;
struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
bool drop_boost_mutex = false;
-#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
union rcu_special special;
@@ -307,9 +315,11 @@ void rcu_read_unlock_special(struct task_struct *t)
t->rcu_read_unlock_special.b.blocked = false;
/*
- * Remove this task from the list it blocked on. The
- * task can migrate while we acquire the lock, but at
- * most one time. So at most two passes through loop.
+ * Remove this task from the list it blocked on. The task
+ * now remains queued on the rcu_node corresponding to
+ * the CPU it first blocked on, so the first attempt to
+ * acquire the task's rcu_node's ->lock will succeed.
+ * Keep the loop and add a WARN_ON() out of sheer paranoia.
*/
for (;;) {
rnp = t->rcu_blocked_node;
@@ -317,6 +327,7 @@ void rcu_read_unlock_special(struct task_struct *t)
smp_mb__after_unlock_lock();
if (rnp == t->rcu_blocked_node)
break;
+ WARN_ON_ONCE(1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
@@ -331,12 +342,12 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->gp_tasks = np;
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
- /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
-#endif /* #ifdef CONFIG_RCU_BOOST */
+ if (IS_ENABLED(CONFIG_RCU_BOOST)) {
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
+ /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ }
/*
* If this was the last task on the current list, and if
@@ -353,24 +364,21 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp->grplo,
rnp->grphi,
!!rnp->gp_tasks);
- rcu_report_unblock_qs_rnp(&rcu_preempt_state,
- rnp, flags);
+ rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
} else {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
-#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (drop_boost_mutex)
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
rt_mutex_unlock(&rnp->boost_mtx);
-#endif /* #ifdef CONFIG_RCU_BOOST */
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
- rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+ rcu_report_exp_rnp(rcu_state_p, rnp, true);
} else {
local_irq_restore(flags);
}
@@ -390,7 +398,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- t = list_entry(rnp->gp_tasks,
+ t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
sched_show_task(t);
@@ -447,7 +455,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
if (!rcu_preempt_blocked_readers_cgp(rnp))
return 0;
rcu_print_task_stall_begin(rnp);
- t = list_entry(rnp->gp_tasks,
+ t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
pr_cont(" P%d", t->pid);
@@ -491,8 +499,8 @@ static void rcu_preempt_check_callbacks(void)
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- __this_cpu_read(rcu_preempt_data.qs_pending) &&
- !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+ __this_cpu_read(rcu_data_p->qs_pending) &&
+ !__this_cpu_read(rcu_data_p->passed_quiesce))
t->rcu_read_unlock_special.b.need_qs = true;
}
@@ -500,7 +508,7 @@ static void rcu_preempt_check_callbacks(void)
static void rcu_preempt_do_callbacks(void)
{
- rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+ rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
}
#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -510,7 +518,7 @@ static void rcu_preempt_do_callbacks(void)
*/
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
- __call_rcu(head, func, &rcu_preempt_state, -1, 0);
+ __call_rcu(head, func, rcu_state_p, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -570,7 +578,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
return !rcu_preempted_readers_exp(rnp) &&
- ACCESS_ONCE(rnp->expmask) == 0;
+ READ_ONCE(rnp->expmask) == 0;
}
/*
@@ -711,12 +719,12 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
struct rcu_node *rnp;
- struct rcu_state *rsp = &rcu_preempt_state;
+ struct rcu_state *rsp = rcu_state_p;
unsigned long snap;
int trycount = 0;
smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+ snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
smp_mb(); /* Above access cannot bleed into critical section. */
/*
@@ -740,7 +748,7 @@ void synchronize_rcu_expedited(void)
*/
while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
if (ULONG_CMP_LT(snap,
- ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ READ_ONCE(sync_rcu_preempt_exp_count))) {
put_online_cpus();
goto mb_ret; /* Others did our work for us. */
}
@@ -752,7 +760,7 @@ void synchronize_rcu_expedited(void)
return;
}
}
- if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
put_online_cpus();
goto unlock_mb_ret; /* Others did our work for us. */
}
@@ -780,8 +788,7 @@ void synchronize_rcu_expedited(void)
/* Clean up and exit. */
smp_mb(); /* ensure expedited GP seen before counter increment. */
- ACCESS_ONCE(sync_rcu_preempt_exp_count) =
- sync_rcu_preempt_exp_count + 1;
+ WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
unlock_mb_ret:
mutex_unlock(&sync_rcu_preempt_exp_mutex);
mb_ret:
@@ -799,7 +806,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
*/
void rcu_barrier(void)
{
- _rcu_barrier(&rcu_preempt_state);
+ _rcu_barrier(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
@@ -808,7 +815,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
*/
static void __init __rcu_init_preempt(void)
{
- rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+ rcu_init_one(rcu_state_p, rcu_data_p);
}
/*
@@ -831,7 +838,8 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_PREEMPT_RCU */
-static struct rcu_state *rcu_state_p = &rcu_sched_state;
+static struct rcu_state *const rcu_state_p = &rcu_sched_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
/*
* Tell them what RCU they are running.
@@ -994,8 +1002,8 @@ static int rcu_boost(struct rcu_node *rnp)
struct task_struct *t;
struct list_head *tb;
- if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
- ACCESS_ONCE(rnp->boost_tasks) == NULL)
+ if (READ_ONCE(rnp->exp_tasks) == NULL &&
+ READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1048,8 +1056,8 @@ static int rcu_boost(struct rcu_node *rnp)
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
- return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
- ACCESS_ONCE(rnp->boost_tasks) != NULL;
+ return READ_ONCE(rnp->exp_tasks) != NULL ||
+ READ_ONCE(rnp->boost_tasks) != NULL;
}
/*
@@ -1173,7 +1181,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct sched_param sp;
struct task_struct *t;
- if (&rcu_preempt_state != rsp)
+ if (rcu_state_p != rsp)
return 0;
if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
@@ -1367,13 +1375,12 @@ static void rcu_prepare_kthreads(int cpu)
* Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
* any flavor of RCU.
*/
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
- *delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(NULL);
+ *nextevt = KTIME_MAX;
+ return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
+ ? 0 : rcu_cpu_has_callbacks(NULL);
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1432,8 +1439,6 @@ module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);
-extern int tick_nohz_active;
-
/*
* Try to advance callbacks for all flavors of RCU on the current CPU, but
* only if it has been awhile since the last time we did so. Afterwards,
@@ -1462,7 +1467,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* callbacks not yet ready to invoke.
*/
if ((rdp->completed != rnp->completed ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+ unlikely(READ_ONCE(rdp->gpwrap))) &&
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
note_gp_changes(rsp, rdp);
@@ -1480,17 +1485,22 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
*
* The caller must have disabled interrupts.
*/
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *dj)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ unsigned long dj;
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
+ *nextevt = KTIME_MAX;
+ return 0;
+ }
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
- *dj = ULONG_MAX;
+ *nextevt = KTIME_MAX;
return 0;
}
@@ -1504,14 +1514,14 @@ int rcu_needs_cpu(unsigned long *dj)
/* Request timer delay depending on laziness, and round. */
if (!rdtp->all_lazy) {
- *dj = round_up(rcu_idle_gp_delay + jiffies,
+ dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
} else {
- *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+ dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
+ *nextevt = basemono + (u64)dj * TICK_NSEC; /* Widen first: no 32-bit wrap. */
return 0;
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1525,7 +1535,6 @@ int rcu_needs_cpu(unsigned long *dj)
*/
static void rcu_prepare_for_idle(void)
{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
bool needwake;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -1533,8 +1542,11 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+ return;
+
/* Handle nohz enablement switches conservatively. */
- tne = ACCESS_ONCE(tick_nohz_active);
+ tne = READ_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
@@ -1580,7 +1592,6 @@ static void rcu_prepare_for_idle(void)
if (needwake)
rcu_gp_kthread_wake(rsp);
}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1590,12 +1601,11 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+ rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1760,7 +1770,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
- ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+ READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
fast_no_hz);
}
@@ -1898,11 +1908,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ if (!READ_ONCE(rdp_leader->nocb_kthread))
return;
- if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
- ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
wake_up(&rdp_leader->nocb_wq);
}
}
@@ -1934,14 +1944,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
ret = atomic_long_read(&rdp->nocb_q_count);
#ifdef CONFIG_PROVE_RCU
- rhp = ACCESS_ONCE(rdp->nocb_head);
+ rhp = READ_ONCE(rdp->nocb_head);
if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+ rhp = READ_ONCE(rdp->nocb_gp_head);
if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+ rhp = READ_ONCE(rdp->nocb_follower_head);
/* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+ if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
rcu_scheduler_fully_active) {
/* RCU callback enqueued before CPU first came online??? */
pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@ -1975,12 +1985,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
atomic_long_add(rhcount, &rdp->nocb_q_count);
/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
old_rhpp = xchg(&rdp->nocb_tail, rhtp);
- ACCESS_ONCE(*old_rhpp) = rhp;
+ WRITE_ONCE(*old_rhpp, rhp);
atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
/* If we are not being polled and there is a kthread, awaken it ... */
- t = ACCESS_ONCE(rdp->nocb_kthread);
+ t = READ_ONCE(rdp->nocb_kthread);
if (rcu_nocb_poll || !t) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeNotPoll"));
@@ -2118,7 +2128,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
for (;;) {
wait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
- (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+ (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
break;
WARN_ON(signal_pending(current));
@@ -2145,7 +2155,7 @@ wait_again:
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq,
- !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+ !READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2159,12 +2169,12 @@ wait_again:
*/
gotcbs = false;
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
continue; /* No CBs here, try next follower. */
/* Move callbacks to wait-for-GP list, which is empty. */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
+ WRITE_ONCE(rdp->nocb_head, NULL);
rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
gotcbs = true;
}
@@ -2184,7 +2194,7 @@ wait_again:
my_rdp->nocb_leader_sleep = true;
smp_mb(); /* Ensure _sleep true before scan. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (ACCESS_ONCE(rdp->nocb_head)) {
+ if (READ_ONCE(rdp->nocb_head)) {
/* Found CB, so short-circuit next wait. */
my_rdp->nocb_leader_sleep = false;
break;
@@ -2205,7 +2215,7 @@ wait_again:
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (ACCESS_ONCE(rdp->nocb_head))
+ if (READ_ONCE(rdp->nocb_head))
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
@@ -2241,7 +2251,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
wait_event_interruptible(rdp->nocb_wq,
- ACCESS_ONCE(rdp->nocb_follower_head));
+ READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
firsttime = false;
@@ -2282,10 +2292,10 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = ACCESS_ONCE(rdp->nocb_follower_head);
+ list = READ_ONCE(rdp->nocb_follower_head);
BUG_ON(!list);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ WRITE_ONCE(rdp->nocb_follower_head, NULL);
tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
/* Each pass through the following loop invokes a callback. */
@@ -2324,7 +2334,7 @@ static int rcu_nocb_kthread(void *arg)
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
- return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+ return READ_ONCE(rdp->nocb_defer_wakeup);
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
@@ -2334,8 +2344,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
- ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
- ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+ ndw = READ_ONCE(rdp->nocb_defer_wakeup);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2448,7 +2458,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
t = kthread_run(rcu_nocb_kthread, rdp_spawn,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
- ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+ WRITE_ONCE(rdp_spawn->nocb_kthread, t);
}
/*
@@ -2663,7 +2673,7 @@ static void rcu_sysidle_enter(int irq)
/* Record start of fully idle period. */
j = jiffies;
- ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+ WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic();
@@ -2681,7 +2691,7 @@ static void rcu_sysidle_enter(int irq)
*/
void rcu_sysidle_force_exit(void)
{
- int oldstate = ACCESS_ONCE(full_sysidle_state);
+ int oldstate = READ_ONCE(full_sysidle_state);
int newoldstate;
/*
@@ -2794,7 +2804,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
smp_mb(); /* Read counters before timestamps. */
/* Pick up timestamps. */
- j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+ j = READ_ONCE(rdtp->dynticks_idle_jiffies);
/* If this CPU entered idle more recently, update maxj timestamp. */
if (ULONG_CMP_LT(*maxj, j))
*maxj = j;
@@ -2831,11 +2841,11 @@ static unsigned long rcu_sysidle_delay(void)
static void rcu_sysidle(unsigned long j)
{
/* Check the current state. */
- switch (ACCESS_ONCE(full_sysidle_state)) {
+ switch (READ_ONCE(full_sysidle_state)) {
case RCU_SYSIDLE_NOT:
/* First time all are idle, so note a short idle period. */
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+ WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
break;
case RCU_SYSIDLE_SHORT:
@@ -2873,7 +2883,7 @@ static void rcu_sysidle_cancel(void)
{
smp_mb();
if (full_sysidle_state > RCU_SYSIDLE_SHORT)
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+ WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
}
/*
@@ -2925,7 +2935,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
smp_mb(); /* grace period precedes setting inuse. */
rshp = container_of(rhp, struct rcu_sysidle_head, rh);
- ACCESS_ONCE(rshp->inuse) = 0;
+ WRITE_ONCE(rshp->inuse, 0);
}
/*
@@ -2936,7 +2946,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
bool rcu_sys_is_idle(void)
{
static struct rcu_sysidle_head rsh;
- int rss = ACCESS_ONCE(full_sysidle_state);
+ int rss = READ_ONCE(full_sysidle_state);
if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
return false;
@@ -2964,7 +2974,7 @@ bool rcu_sys_is_idle(void)
}
rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
oldrss = rss;
- rss = ACCESS_ONCE(full_sysidle_state);
+ rss = READ_ONCE(full_sysidle_state);
}
}
@@ -3048,10 +3058,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(smp_processor_id()) &&
(!rcu_gp_in_progress(rsp) ||
- ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
- return 1;
+ ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
+ return true;
#endif /* #ifdef CONFIG_NO_HZ_FULL */
- return 0;
+ return false;
}
/*
@@ -3077,7 +3087,7 @@ static void rcu_bind_gp_kthread(void)
static void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
@@ -3085,6 +3095,6 @@ static void rcu_dynticks_task_enter(void)
static void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index f92361efd..3ea7ffc7d 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -277,7 +277,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+ READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
@@ -323,8 +323,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
struct rcu_node *rnp = &rsp->node[0];
raw_spin_lock_irqsave(&rnp->lock, flags);
- completed = ACCESS_ONCE(rsp->completed);
- gpnum = ACCESS_ONCE(rsp->gpnum);
+ completed = READ_ONCE(rsp->completed);
+ gpnum = READ_ONCE(rsp->gpnum);
if (completed == gpnum)
gpage = 0;
else
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1f133350d..afaecb7a7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -150,14 +150,14 @@ void __rcu_read_unlock(void)
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
barrier(); /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
}
#ifdef CONFIG_PROVE_LOCKING
{
- int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+ int rrln = READ_ONCE(t->rcu_read_lock_nesting);
WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
}
@@ -389,17 +389,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644);
int rcu_jiffies_till_stall_check(void)
{
- int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+ int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
/*
* Limit check must be consistent with the Kconfig limits
* for CONFIG_RCU_CPU_STALL_TIMEOUT.
*/
if (till_stall_check < 3) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ WRITE_ONCE(rcu_cpu_stall_timeout, 3);
till_stall_check = 3;
} else if (till_stall_check > 300) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ WRITE_ONCE(rcu_cpu_stall_timeout, 300);
till_stall_check = 300;
}
return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
@@ -550,12 +550,12 @@ static void check_holdout_task(struct task_struct *t,
{
int cpu;
- if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
- t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
- !ACCESS_ONCE(t->on_rq) ||
+ if (!READ_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+ !READ_ONCE(t->on_rq) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
!is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
- ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+ WRITE_ONCE(t->rcu_tasks_holdout, false);
list_del_init(&t->rcu_tasks_holdout_list);
put_task_struct(t);
return;
@@ -639,11 +639,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
*/
rcu_read_lock();
for_each_process_thread(g, t) {
- if (t != current && ACCESS_ONCE(t->on_rq) &&
+ if (t != current && READ_ONCE(t->on_rq) &&
!is_idle_task(t)) {
get_task_struct(t);
- t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
- ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+ t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+ WRITE_ONCE(t->rcu_tasks_holdout, true);
list_add(&t->rcu_tasks_holdout_list,
&rcu_tasks_holdouts);
}
@@ -672,7 +672,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct task_struct *t1;
schedule_timeout_interruptible(HZ);
- rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+ rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 &&
time_after(jiffies, lastreport + rtst);
if (needreport)
@@ -728,7 +728,7 @@ static void rcu_spawn_tasks_kthread(void)
static struct task_struct *rcu_tasks_kthread_ptr;
struct task_struct *t;
- if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+ if (READ_ONCE(rcu_tasks_kthread_ptr)) {
smp_mb(); /* Ensure caller sees full kthread. */
return;
}
@@ -740,7 +740,7 @@ static void rcu_spawn_tasks_kthread(void)
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
- ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+ WRITE_ONCE(rcu_tasks_kthread_ptr, t);
mutex_unlock(&rcu_tasks_kthread_mutex);
}
diff --git a/kernel/relay.c b/kernel/relay.c
index e9dbaeb8f..0b4570cfa 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -81,10 +81,7 @@ static struct page **relay_alloc_page_array(unsigned int n_pages)
*/
static void relay_free_page_array(struct page **array)
{
- if (is_vmalloc_addr(array))
- vfree(array);
- else
- kfree(array);
+ kvfree(array);
}
/**
diff --git a/kernel/resource.c b/kernel/resource.c
index 90552aab5..fed052a1b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -504,13 +504,13 @@ int region_is_ram(resource_size_t start, unsigned long size)
{
struct resource *p;
resource_size_t end = start + size - 1;
- int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
const char *name = "System RAM";
int ret = -1;
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
- if (end < p->start)
+ if (p->end < start)
continue;
if (p->start <= start && end <= p->end) {
@@ -521,7 +521,7 @@ int region_is_ram(resource_size_t start, unsigned long size)
ret = 1;
break;
}
- if (p->end < start)
+ if (end < p->start)
break; /* not found */
}
read_unlock(&resource_lock);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54b88a1c0..67687973c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,16 +11,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-ifdef CONFIG_SCHED_BFS
-obj-y += bfs.o clock.o
-else
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-$(CONFIG_SMP) += cpudeadline.o
+obj-y += wait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-endif
-obj-y += wait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd6..750ed601d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
#include "sched.h"
#include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ if (!READ_ONCE(sysctl_sched_autogroup_enabled))
goto out;
for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142..890c95f25 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
deleted file mode 100644
index 5366182bd..000000000
--- a/kernel/sched/bfs.c
+++ /dev/null
@@ -1,7420 +0,0 @@
-/*
- * kernel/sched/bfs.c, was kernel/sched.c
- *
- * Kernel scheduler and related syscalls
- *
- * Copyright (C) 1991-2002 Linus Torvalds
- *
- * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
- * make semaphores SMP safe
- * 1998-11-19 Implemented schedule_timeout() and related stuff
- * by Andrea Arcangeli
- * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
- * hybrid priority-list and round-robin design with
- * an array-switch method of distributing timeslices
- * and per-CPU runqueues. Cleanups and useful suggestions
- * by Davide Libenzi, preemptible kernel bits by Robert Love.
- * 2003-09-03 Interactivity tuning by Con Kolivas.
- * 2004-04-02 Scheduler domains code by Nick Piggin
- * 2007-04-15 Work begun on replacing all interactivity tuning with a
- * fair scheduling design by Con Kolivas.
- * 2007-05-05 Load balancing (smp-nice) and other improvements
- * by Peter Williams
- * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
- * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
- * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
- * Thomas Gleixner, Mike Kravetz
- * now Brainfuck deadline scheduling policy by Con Kolivas deletes
- * a whole lot of those previous things.
- */
-
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <asm/uaccess.h>
-#include <linux/highmem.h>
-#include <asm/mmu_context.h>
-#include <linux/interrupt.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/kernel_stat.h>
-#include <linux/debug_locks.h>
-#include <linux/perf_event.h>
-#include <linux/security.h>
-#include <linux/notifier.h>
-#include <linux/profile.h>
-#include <linux/freezer.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/timer.h>
-#include <linux/rcupdate.h>
-#include <linux/cpu.h>
-#include <linux/cpuset.h>
-#include <linux/cpumask.h>
-#include <linux/percpu.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/syscalls.h>
-#include <linux/sched/sysctl.h>
-#include <linux/times.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kprobes.h>
-#include <linux/delayacct.h>
-#include <linux/log2.h>
-#include <linux/bootmem.h>
-#include <linux/ftrace.h>
-#include <linux/slab.h>
-#include <linux/init_task.h>
-#include <linux/binfmts.h>
-#include <linux/context_tracking.h>
-#include <linux/sched/prio.h>
-
-#include <asm/irq_regs.h>
-#include <asm/switch_to.h>
-#include <asm/tlb.h>
-#include <asm/unistd.h>
-#include <asm/mutex.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
-#include "cpupri.h"
-#include "../workqueue_internal.h"
-#include "../smpboot.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
-#include "bfs_sched.h"
-
-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
-#define rt_task(p) rt_prio((p)->prio)
-#define rt_queue(rq) rt_prio((rq)->rq_prio)
-#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
- (policy) == SCHED_RR)
-#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
-
-#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO)
-#define idleprio_task(p) unlikely(is_idle_policy((p)->policy))
-#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO)
-#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy)))
-
-#define is_iso_policy(policy) ((policy) == SCHED_ISO)
-#define iso_task(p) unlikely(is_iso_policy((p)->policy))
-#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy))
-#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO)
-#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO)
-
-#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
-
-#define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
-
-#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO)
-#define STOP_PRIO (MAX_RT_PRIO - 1)
-
-/*
- * Some helpers for converting to/from various scales. Use shifts to get
- * approximate multiples of ten for less overhead.
- */
-#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
-#define JIFFY_NS (1000000000 / HZ)
-#define HALF_JIFFY_NS (1000000000 / HZ / 2)
-#define HALF_JIFFY_US (1000000 / HZ / 2)
-#define MS_TO_NS(TIME) ((TIME) << 20)
-#define MS_TO_US(TIME) ((TIME) << 10)
-#define NS_TO_MS(TIME) ((TIME) >> 20)
-#define NS_TO_US(TIME) ((TIME) >> 10)
-
-#define RESCHED_US (100) /* Reschedule if less than this many μs left */
-
-void print_scheduler_version(void)
-{
- printk(KERN_INFO "BFS CPU scheduler v0.464 by Con Kolivas.\n");
-}
-
-/*
- * This is the time all tasks within the same priority round robin.
- * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
- * Tunable via /proc interface.
- */
-#ifdef CONFIG_PCK_INTERACTIVE
-int rr_interval __read_mostly = 3;
-#else
-int rr_interval __read_mostly = 6;
-#endif
-
-/*
- * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
- * are allowed to run five seconds as real time tasks. This is the total over
- * all online cpus.
- */
-#ifdef CONFIG_PCK_INTERACTIVE
-int sched_iso_cpu __read_mostly = 25;
-#else
-int sched_iso_cpu __read_mostly = 70;
-#endif
-
-/*
- * The relative length of deadline for each priority(nice) level.
- */
-static int prio_ratios[NICE_WIDTH] __read_mostly;
-
-/*
- * The quota handed out to tasks of all priority levels when refilling their
- * time_slice.
- */
-static inline int timeslice(void)
-{
- return MS_TO_US(rr_interval);
-}
-
-/*
- * The global runqueue data that all CPUs work off. Data is protected either
- * by the global grq lock, or the discrete lock that precedes the data in this
- * struct.
- */
-struct global_rq {
- raw_spinlock_t lock;
- unsigned long nr_running;
- unsigned long nr_uninterruptible;
- unsigned long long nr_switches;
- struct list_head queue[PRIO_LIMIT];
- DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
- unsigned long qnr; /* queued not running */
-#ifdef CONFIG_SMP
- cpumask_t cpu_idle_map;
- bool idle_cpus;
-#endif
- int noc; /* num_online_cpus stored and updated when it changes */
- u64 niffies; /* Nanosecond jiffies */
- unsigned long last_jiffy; /* Last jiffy we updated niffies */
-
- raw_spinlock_t iso_lock;
- int iso_ticks;
- bool iso_refractory;
-};
-
-#ifdef CONFIG_SMP
-/*
- * We add the notion of a root-domain which will be used to define per-domain
- * variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
- * exclusive cpuset is created, we also create and attach a new root-domain
- * object.
- *
- */
-struct root_domain {
- atomic_t refcount;
- atomic_t rto_count;
- struct rcu_head rcu;
- cpumask_var_t span;
- cpumask_var_t online;
-
- /*
- * The "RT overload" flag: it gets set if a CPU has more than
- * one runnable RT task.
- */
- cpumask_var_t rto_mask;
- struct cpupri cpupri;
-};
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-static struct root_domain def_root_domain;
-
-#endif /* CONFIG_SMP */
-
-/* There can be only one */
-static struct global_rq grq;
-
-static DEFINE_MUTEX(sched_hotcpu_mutex);
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#ifdef CONFIG_SMP
-struct rq *cpu_rq(int cpu)
-{
- return &per_cpu(runqueues, (cpu));
-}
-#define task_rq(p) cpu_rq(task_cpu(p))
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-/*
- * sched_domains_mutex serialises calls to init_sched_domains,
- * detach_destroy_domains and partition_sched_domains.
- */
-static DEFINE_MUTEX(sched_domains_mutex);
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-static struct root_domain def_root_domain;
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
-}
-#else
-struct rq *uprq;
-#endif /* CONFIG_SMP */
-
-static inline void update_rq_clock(struct rq *rq);
-
-/*
- * Sanity check should sched_clock return bogus values. We make sure it does
- * not appear to go backwards, and use jiffies to determine the maximum and
- * minimum it could possibly have increased, and round down to the nearest
- * jiffy when it falls outside this.
- */
-static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
-{
- unsigned long min_diff, max_diff;
-
- if (jiff_diff > 1)
- min_diff = JIFFIES_TO_NS(jiff_diff - 1);
- else
- min_diff = 1;
- /* Round up to the nearest tick for maximum */
- max_diff = JIFFIES_TO_NS(jiff_diff + 1);
-
- if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
- *niff_diff = min_diff;
-}
-
-#ifdef CONFIG_SMP
-static inline int cpu_of(struct rq *rq)
-{
- return rq->cpu;
-}
-
-/*
- * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
- * clock is updated with the grq.lock held, it is an opportunity to update the
- * niffies value. Any CPU can update it by adding how much its clock has
- * increased since it last updated niffies, minus any added niffies by other
- * CPUs.
- */
-static inline void update_clocks(struct rq *rq)
-{
- s64 ndiff;
- long jdiff;
-
- update_rq_clock(rq);
- ndiff = rq->clock - rq->old_clock;
- /* old_clock is only updated when we are updating niffies */
- rq->old_clock = rq->clock;
- ndiff -= grq.niffies - rq->last_niffy;
- jdiff = jiffies - grq.last_jiffy;
- niffy_diff(&ndiff, jdiff);
- grq.last_jiffy += jdiff;
- grq.niffies += ndiff;
- rq->last_niffy = grq.niffies;
-}
-#else /* CONFIG_SMP */
-static inline int cpu_of(struct rq *rq)
-{
- return 0;
-}
-
-static inline void update_clocks(struct rq *rq)
-{
- s64 ndiff;
- long jdiff;
-
- update_rq_clock(rq);
- ndiff = rq->clock - rq->old_clock;
- rq->old_clock = rq->clock;
- jdiff = jiffies - grq.last_jiffy;
- niffy_diff(&ndiff, jdiff);
- grq.last_jiffy += jdiff;
- grq.niffies += ndiff;
-}
-#endif
-
-#include "stats.h"
-
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next) do { } while (0)
-#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
-#endif
-#ifndef finish_arch_post_lock_switch
-# define finish_arch_post_lock_switch() do { } while (0)
-#endif
-
-/*
- * All common locking functions performed on grq.lock. rq->clock is local to
- * the CPU accessing it so it can be modified just with interrupts disabled
- * when we're not updating niffies.
- * Looking up task_rq must be done under grq.lock to be safe.
- */
-static void update_rq_clock_task(struct rq *rq, s64 delta);
-
-static inline void update_rq_clock(struct rq *rq)
-{
- s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
-
- if (unlikely(delta < 0))
- return;
- rq->clock += delta;
- update_rq_clock_task(rq, delta);
-}
-
-static inline bool task_running(struct task_struct *p)
-{
- return p->on_cpu;
-}
-
-static inline void grq_lock(void)
- __acquires(grq.lock)
-{
- raw_spin_lock(&grq.lock);
-}
-
-static inline void grq_unlock(void)
- __releases(grq.lock)
-{
- raw_spin_unlock(&grq.lock);
-}
-
-static inline void grq_lock_irq(void)
- __acquires(grq.lock)
-{
- raw_spin_lock_irq(&grq.lock);
-}
-
-static inline void time_lock_grq(struct rq *rq)
- __acquires(grq.lock)
-{
- grq_lock();
- update_clocks(rq);
-}
-
-static inline void grq_unlock_irq(void)
- __releases(grq.lock)
-{
- raw_spin_unlock_irq(&grq.lock);
-}
-
-static inline void grq_lock_irqsave(unsigned long *flags)
- __acquires(grq.lock)
-{
- raw_spin_lock_irqsave(&grq.lock, *flags);
-}
-
-static inline void grq_unlock_irqrestore(unsigned long *flags)
- __releases(grq.lock)
-{
- raw_spin_unlock_irqrestore(&grq.lock, *flags);
-}
-
-static inline struct rq
-*task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
-{
- grq_lock_irqsave(flags);
- return task_rq(p);
-}
-
-static inline struct rq
-*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
-{
- struct rq *rq = task_grq_lock(p, flags);
- update_clocks(rq);
- return rq;
-}
-
-static inline struct rq *task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock_irq();
- return task_rq(p);
-}
-
-static inline void time_task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- struct rq *rq = task_grq_lock_irq(p);
- update_clocks(rq);
-}
-
-static inline void task_grq_unlock_irq(void)
- __releases(grq.lock)
-{
- grq_unlock_irq();
-}
-
-static inline void task_grq_unlock(unsigned long *flags)
- __releases(grq.lock)
-{
- grq_unlock_irqrestore(flags);
-}
-
-/**
- * grunqueue_is_locked
- *
- * Returns true if the global runqueue is locked.
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-bool grunqueue_is_locked(void)
-{
- return raw_spin_is_locked(&grq.lock);
-}
-
-void grq_unlock_wait(void)
- __releases(grq.lock)
-{
- smp_mb(); /* spin-unlock-wait is not a full memory barrier */
- raw_spin_unlock_wait(&grq.lock);
-}
-
-static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
- __acquires(grq.lock)
-{
- local_irq_save(*flags);
- time_lock_grq(rq);
-}
-
-static inline struct rq *__task_grq_lock(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock();
- return task_rq(p);
-}
-
-static inline void __task_grq_unlock(void)
- __releases(grq.lock)
-{
- grq_unlock();
-}
-
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_DEBUG_SPINLOCK
- /* this is a valid case when another task releases the spinlock */
- grq.lock.owner = current;
-#endif
- /*
- * If we are tracking spinlock dependencies then we have to
- * fix up the runqueue lock - which gets 'carried over' from
- * prev into current:
- */
- spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
-
- grq_unlock_irq();
-}
-
-static inline bool deadline_before(u64 deadline, u64 time)
-{
- return (deadline < time);
-}
-
-static inline bool deadline_after(u64 deadline, u64 time)
-{
- return (deadline > time);
-}
-
-/*
- * A task that is queued but not running will be on the grq run list.
- * A task that is not running or queued will not be on the grq run list.
- * A task that is currently running will have ->on_cpu set but not on the
- * grq run list.
- */
-static inline bool task_queued(struct task_struct *p)
-{
- return (!list_empty(&p->run_list));
-}
-
-/*
- * Removing from the global runqueue. Enter with grq locked.
- */
-static void dequeue_task(struct task_struct *p)
-{
- list_del_init(&p->run_list);
- if (list_empty(grq.queue + p->prio))
- __clear_bit(p->prio, grq.prio_bitmap);
- sched_info_dequeued(task_rq(p), p);
-}
-
-/*
- * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
- * an idle task, we ensure none of the following conditions are met.
- */
-static bool idleprio_suitable(struct task_struct *p)
-{
- return (!freezing(p) && !signal_pending(p) &&
- !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
-}
-
-/*
- * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
- * that the iso_refractory flag is not set.
- */
-static bool isoprio_suitable(void)
-{
- return !grq.iso_refractory;
-}
-
-/*
- * Adding to the global runqueue. Enter with grq locked.
- */
-static void enqueue_task(struct task_struct *p, struct rq *rq)
-{
- if (!rt_task(p)) {
- /* Check it hasn't gotten rt from PI */
- if ((idleprio_task(p) && idleprio_suitable(p)) ||
- (iso_task(p) && isoprio_suitable()))
- p->prio = p->normal_prio;
- else
- p->prio = NORMAL_PRIO;
- }
- __set_bit(p->prio, grq.prio_bitmap);
- list_add_tail(&p->run_list, grq.queue + p->prio);
- sched_info_queued(rq, p);
-}
-
-static inline void requeue_task(struct task_struct *p)
-{
- sched_info_queued(task_rq(p), p);
-}
-
-/*
- * Returns the relative length of deadline all compared to the shortest
- * deadline which is that of nice -20.
- */
-static inline int task_prio_ratio(struct task_struct *p)
-{
- return prio_ratios[TASK_USER_PRIO(p)];
-}
-
-/*
- * task_timeslice - all tasks of all priorities get the exact same timeslice
- * length. CPU distribution is handled by giving different deadlines to
- * tasks of different priorities. Use 128 as the base value for fast shifts.
- */
-static inline int task_timeslice(struct task_struct *p)
-{
- return (rr_interval * task_prio_ratio(p) / 128);
-}
-
-static void resched_task(struct task_struct *p);
-
-static inline void resched_curr(struct rq *rq)
-{
- resched_task(rq->curr);
-}
-
-/*
- * qnr is the "queued but not running" count which is the total number of
- * tasks on the global runqueue list waiting for cpu time but not actually
- * currently running on a cpu.
- */
-static inline void inc_qnr(void)
-{
- grq.qnr++;
-}
-
-static inline void dec_qnr(void)
-{
- grq.qnr--;
-}
-
-static inline int queued_notrunning(void)
-{
- return grq.qnr;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
- * allow easy lookup of whether any suitable idle CPUs are available.
- * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
- * idle_cpus variable than to do a full bitmask check when we are busy.
- */
-static inline void set_cpuidle_map(int cpu)
-{
- if (likely(cpu_online(cpu))) {
- cpumask_set_cpu(cpu, &grq.cpu_idle_map);
- grq.idle_cpus = true;
- }
-}
-
-static inline void clear_cpuidle_map(int cpu)
-{
- cpumask_clear_cpu(cpu, &grq.cpu_idle_map);
- if (cpumask_empty(&grq.cpu_idle_map))
- grq.idle_cpus = false;
-}
-
-static bool suitable_idle_cpus(struct task_struct *p)
-{
- if (!grq.idle_cpus)
- return false;
- return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
-}
-
-#define CPUIDLE_DIFF_THREAD (1)
-#define CPUIDLE_DIFF_CORE (2)
-#define CPUIDLE_CACHE_BUSY (4)
-#define CPUIDLE_DIFF_CPU (8)
-#define CPUIDLE_THREAD_BUSY (16)
-#define CPUIDLE_THROTTLED (32)
-#define CPUIDLE_DIFF_NODE (64)
-
-static inline bool scaling_rq(struct rq *rq);
-
-/*
- * The best idle CPU is chosen according to the CPUIDLE ranking above where the
- * lowest value would give the most suitable CPU to schedule p onto next. The
- * order works out to be the following:
- *
- * Same core, idle or busy cache, idle or busy threads
- * Other core, same cache, idle or busy cache, idle threads.
- * Same node, other CPU, idle cache, idle threads.
- * Same node, other CPU, busy cache, idle threads.
- * Other core, same cache, busy threads.
- * Same node, other CPU, busy threads.
- * Other node, other CPU, idle cache, idle threads.
- * Other node, other CPU, busy cache, idle threads.
- * Other node, other CPU, busy threads.
- */
-static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
-{
- int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED |
- CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY |
- CPUIDLE_DIFF_CORE | CPUIDLE_DIFF_THREAD;
- int cpu_tmp;
-
- if (cpumask_test_cpu(best_cpu, tmpmask))
- goto out;
-
- for_each_cpu(cpu_tmp, tmpmask) {
- int ranking, locality;
- struct rq *tmp_rq;
-
- ranking = 0;
- tmp_rq = cpu_rq(cpu_tmp);
-
- locality = rq->cpu_locality[cpu_tmp];
-#ifdef CONFIG_NUMA
- if (locality > 3)
- ranking |= CPUIDLE_DIFF_NODE;
- else
-#endif
- if (locality > 2)
- ranking |= CPUIDLE_DIFF_CPU;
-#ifdef CONFIG_SCHED_MC
- else if (locality == 2)
- ranking |= CPUIDLE_DIFF_CORE;
- if (!(tmp_rq->cache_idle(cpu_tmp)))
- ranking |= CPUIDLE_CACHE_BUSY;
-#endif
-#ifdef CONFIG_SCHED_SMT
- if (locality == 1)
- ranking |= CPUIDLE_DIFF_THREAD;
- if (!(tmp_rq->siblings_idle(cpu_tmp)))
- ranking |= CPUIDLE_THREAD_BUSY;
-#endif
- if (scaling_rq(tmp_rq))
- ranking |= CPUIDLE_THROTTLED;
-
- if (ranking < best_ranking) {
- best_cpu = cpu_tmp;
- best_ranking = ranking;
- }
- }
-out:
- return best_cpu;
-}
-
-static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
-{
- best_cpu = best_mask_cpu(best_cpu, rq, tmpmask);
- resched_curr(cpu_rq(best_cpu));
-}
-
-bool cpus_share_cache(int this_cpu, int that_cpu)
-{
- struct rq *this_rq = cpu_rq(this_cpu);
-
- return (this_rq->cpu_locality[that_cpu] < 3);
-}
-
-#ifdef CONFIG_SCHED_SMT
-#ifdef CONFIG_SMT_NICE
-static const cpumask_t *thread_cpumask(int cpu);
-
-/* Find the best real time priority running on any SMT siblings of cpu and if
- * none are running, the static priority of the best deadline task running.
- * The lookups to the other runqueues is done lockless as the occasional wrong
- * value would be harmless. */
-static int best_smt_bias(int cpu)
-{
- int other_cpu, best_bias = 0;
-
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
- struct rq *rq;
-
- if (other_cpu == cpu)
- continue;
- rq = cpu_rq(other_cpu);
- if (rq_idle(rq))
- continue;
- if (!rq->online)
- continue;
- if (!rq->rq_mm)
- continue;
- if (likely(rq->rq_smt_bias > best_bias))
- best_bias = rq->rq_smt_bias;
- }
- return best_bias;
-}
-
-static int task_prio_bias(struct task_struct *p)
-{
- if (rt_task(p))
- return 1 << 30;
- else if (task_running_iso(p))
- return 1 << 29;
- else if (task_running_idle(p))
- return 0;
- return MAX_PRIO - p->static_prio;
-}
-
-/* We've already decided p can run on CPU, now test if it shouldn't for SMT
- * nice reasons. */
-static bool smt_should_schedule(struct task_struct *p, int cpu)
-{
- int best_bias, task_bias;
-
- /* Kernel threads always run */
- if (unlikely(!p->mm))
- return true;
- if (rt_task(p))
- return true;
- if (!idleprio_suitable(p))
- return true;
- best_bias = best_smt_bias(cpu);
- /* The smt siblings are all idle or running IDLEPRIO */
- if (best_bias < 1)
- return true;
- task_bias = task_prio_bias(p);
- if (task_bias < 1)
- return false;
- if (task_bias >= best_bias)
- return true;
- /* Dither 25% cpu of normal tasks regardless of nice difference */
- if (best_bias % 4 == 1)
- return true;
- /* Sorry, you lose */
- return false;
-}
-#endif
-#endif
-
-static bool resched_best_idle(struct task_struct *p)
-{
- cpumask_t tmpmask;
- int best_cpu;
-
- cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
- best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
-#ifdef CONFIG_SMT_NICE
- if (!smt_should_schedule(p, best_cpu))
- return false;
-#endif
- resched_curr(cpu_rq(best_cpu));
- return true;
-}
-
-static inline void resched_suitable_idle(struct task_struct *p)
-{
- if (suitable_idle_cpus(p))
- resched_best_idle(p);
-}
-/*
- * Flags to tell us whether this CPU is running a CPU frequency governor that
- * has slowed its speed or not. No locking required as the very rare wrongly
- * read value would be harmless.
- */
-void cpu_scaling(int cpu)
-{
- cpu_rq(cpu)->scaling = true;
-}
-
-void cpu_nonscaling(int cpu)
-{
- cpu_rq(cpu)->scaling = false;
-}
-
-static inline bool scaling_rq(struct rq *rq)
-{
- return rq->scaling;
-}
-
-static inline int locality_diff(struct task_struct *p, struct rq *rq)
-{
- return rq->cpu_locality[task_cpu(p)];
-}
-#else /* CONFIG_SMP */
-static inline void set_cpuidle_map(int cpu)
-{
-}
-
-static inline void clear_cpuidle_map(int cpu)
-{
-}
-
-static inline bool suitable_idle_cpus(struct task_struct *p)
-{
- return uprq->curr == uprq->idle;
-}
-
-static inline void resched_suitable_idle(struct task_struct *p)
-{
-}
-
-void cpu_scaling(int __unused)
-{
-}
-
-void cpu_nonscaling(int __unused)
-{
-}
-
-/*
- * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
- * always returns 0.
- */
-static inline bool scaling_rq(struct rq *rq)
-{
- return false;
-}
-
-static inline int locality_diff(struct task_struct *p, struct rq *rq)
-{
- return 0;
-}
-#endif /* CONFIG_SMP */
-EXPORT_SYMBOL_GPL(cpu_scaling);
-EXPORT_SYMBOL_GPL(cpu_nonscaling);
-
-static inline int normal_prio(struct task_struct *p)
-{
- if (has_rt_policy(p))
- return MAX_RT_PRIO - 1 - p->rt_priority;
- if (idleprio_task(p))
- return IDLE_PRIO;
- if (iso_task(p))
- return ISO_PRIO;
- return NORMAL_PRIO;
-}
-
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks as it will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
-{
- p->normal_prio = normal_prio(p);
- /*
- * If we are RT tasks or we were boosted to RT priority,
- * keep the priority unchanged. Otherwise, update priority
- * to the normal priority:
- */
- if (!rt_prio(p->prio))
- return p->normal_prio;
- return p->prio;
-}
-
-/*
- * activate_task - move a task to the runqueue. Enter with grq locked.
- */
-static void activate_task(struct task_struct *p, struct rq *rq)
-{
- update_clocks(rq);
-
- /*
- * Sleep time is in units of nanosecs, so shift by 20 to get a
- * milliseconds-range estimation of the amount of time that the task
- * spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- if (p->state == TASK_UNINTERRUPTIBLE)
- profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
- (rq->clock_task - p->last_ran) >> 20);
- }
-
- p->prio = effective_prio(p);
- if (task_contributes_to_load(p))
- grq.nr_uninterruptible--;
- enqueue_task(p, rq);
- rq->soft_affined++;
- p->on_rq = 1;
- grq.nr_running++;
- inc_qnr();
-}
-
-static inline void clear_sticky(struct task_struct *p);
-
-/*
- * deactivate_task - If it's running, it's not on the grq and we can just
- * decrement the nr_running. Enter with grq locked.
- */
-static inline void deactivate_task(struct task_struct *p, struct rq *rq)
-{
- if (task_contributes_to_load(p))
- grq.nr_uninterruptible++;
- rq->soft_affined--;
- p->on_rq = 0;
- grq.nr_running--;
- clear_sticky(p);
-}
-
-#ifdef CONFIG_SMP
-void set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-#ifdef CONFIG_LOCKDEP
- /*
- * The caller should hold grq lock.
- */
- WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
-#endif
- if (task_cpu(p) == cpu)
- return;
- trace_sched_migrate_task(p, cpu);
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
-
- /*
- * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
- * successfully executed on another CPU. We must ensure that updates of
- * per-task data have been completed by this moment.
- */
- smp_wmb();
- if (p->on_rq) {
- task_rq(p)->soft_affined--;
- cpu_rq(cpu)->soft_affined++;
- }
- task_thread_info(p)->cpu = cpu;
-}
-
-static inline void clear_sticky(struct task_struct *p)
-{
- p->sticky = false;
-}
-
-static inline bool task_sticky(struct task_struct *p)
-{
- return p->sticky;
-}
-
-/* Reschedule the best idle CPU that is not this one. */
-static void
-resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
-{
- cpumask_t tmpmask;
-
- cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
- cpumask_clear_cpu(cpu, &tmpmask);
- if (cpumask_empty(&tmpmask))
- return;
- resched_best_mask(cpu, rq, &tmpmask);
-}
-
-/*
- * We set the sticky flag on a task that is descheduled involuntarily meaning
- * it is awaiting further CPU time. If the last sticky task is still sticky
- * but unlucky enough to not be the next task scheduled, we unstick it and try
- * to find it an idle CPU. Realtime tasks do not stick to minimise their
- * latency at all times.
- */
-static inline void
-swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
-{
- if (rq->sticky_task) {
- if (rq->sticky_task == p) {
- p->sticky = true;
- return;
- }
- if (task_sticky(rq->sticky_task)) {
- clear_sticky(rq->sticky_task);
- resched_closest_idle(rq, cpu, rq->sticky_task);
- }
- }
- if (!rt_task(p)) {
- p->sticky = true;
- rq->sticky_task = p;
- } else {
- resched_closest_idle(rq, cpu, p);
- rq->sticky_task = NULL;
- }
-}
-
-static inline void unstick_task(struct rq *rq, struct task_struct *p)
-{
- rq->sticky_task = NULL;
- clear_sticky(p);
-}
-#else
-static inline void clear_sticky(struct task_struct *p)
-{
-}
-
-static inline bool task_sticky(struct task_struct *p)
-{
- return false;
-}
-
-static inline void
-swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
-{
-}
-
-static inline void unstick_task(struct rq *rq, struct task_struct *p)
-{
-}
-#endif
-
-/*
- * Move a task off the global queue and take it to a cpu for it will
- * become the running task.
- */
-static inline void take_task(int cpu, struct task_struct *p)
-{
- set_task_cpu(p, cpu);
- dequeue_task(p);
- clear_sticky(p);
- dec_qnr();
-}
-
-/*
- * Returns a descheduling task to the grq runqueue unless it is being
- * deactivated.
- */
-static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate)
-{
- if (deactivate)
- deactivate_task(p, rq);
- else {
- inc_qnr();
- enqueue_task(p, rq);
- }
-}
-
-/* Enter with grq lock held. We know p is on the local cpu */
-static inline void __set_tsk_resched(struct task_struct *p)
-{
- set_tsk_need_resched(p);
- set_preempt_need_resched();
-}
-
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-void resched_task(struct task_struct *p)
-{
- int cpu;
-
- lockdep_assert_held(&grq.lock);
-
- if (test_tsk_need_resched(p))
- return;
-
- set_tsk_need_resched(p);
-
- cpu = task_cpu(p);
- if (cpu == smp_processor_id()) {
- set_preempt_need_resched();
- return;
- }
-
- smp_send_reschedule(cpu);
-}
-
-/**
- * task_curr - is this task currently executing on a CPU?
- * @p: the task in question.
- *
- * Return: 1 if the task is currently executing. 0 otherwise.
- */
-inline int task_curr(const struct task_struct *p)
-{
- return cpu_curr(task_cpu(p)) == p;
-}
-
-#ifdef CONFIG_SMP
-struct migration_req {
- struct task_struct *task;
- int dest_cpu;
-};
-
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
-{
- unsigned long flags;
- bool running, on_rq;
- unsigned long ncsw;
- struct rq *rq;
-
- for (;;) {
- rq = task_rq(p);
-
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since this will return false
- * if the runqueue has changed and p is actually now
- * running somewhere else!
- */
- while (task_running(p) && p == rq->curr) {
- if (match_state && unlikely(p->state != match_state))
- return 0;
- cpu_relax();
- }
-
- /*
- * Ok, time to look more closely! We need the grq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_grq_lock(p, &flags);
- trace_sched_wait_task(p);
- running = task_running(p);
- on_rq = p->on_rq;
- ncsw = 0;
- if (!match_state || p->state == match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_grq_unlock(&flags);
-
- /*
- * If it changed from the expected state, bail out now.
- */
- if (unlikely(!ncsw))
- break;
-
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- continue;
- }
-
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it was still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(on_rq)) {
- ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
- continue;
- }
-
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
- break;
- }
-
- return ncsw;
-}
-
-/***
- * kick_process - kick a running thread to enter/exit the kernel
- * @p: the to-be-kicked thread
- *
- * Cause a process which is running on another CPU to enter
- * kernel-mode, without any delay. (to get signals handled.)
- *
- * NOTE: this function doesn't have to take the runqueue lock,
- * because all it wants to ensure is that the remote task enters
- * the kernel. If the IPI races and the task has been migrated
- * to another CPU then no harm is done and the purpose has been
- * achieved as well.
- */
-void kick_process(struct task_struct *p)
-{
- int cpu;
-
- preempt_disable();
- cpu = task_cpu(p);
- if ((cpu != smp_processor_id()) && task_curr(p))
- smp_send_reschedule(cpu);
- preempt_enable();
-}
-EXPORT_SYMBOL_GPL(kick_process);
-#endif
-
-/*
- * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
- * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
- * between themselves, they cooperatively multitask. An idle rq scores as
- * prio PRIO_LIMIT so it is always preempted.
- */
-static inline bool
-can_preempt(struct task_struct *p, int prio, u64 deadline)
-{
- /* Better static priority RT task or better policy preemption */
- if (p->prio < prio)
- return true;
- if (p->prio > prio)
- return false;
- /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
- if (!deadline_before(p->deadline, deadline))
- return false;
- return true;
-}
-
-#ifdef CONFIG_SMP
-#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Check to see if there is a task that is affined only to offline CPUs but
- * still wants runtime. This happens to kernel threads during suspend/halt and
- * disabling of CPUs.
- */
-static inline bool online_cpus(struct task_struct *p)
-{
- return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed)));
-}
-#else /* CONFIG_HOTPLUG_CPU */
-/* All available CPUs are always online without hotplug. */
-static inline bool online_cpus(struct task_struct *p)
-{
- return true;
-}
-#endif
-
-/*
- * Check to see if p can run on cpu, and if not, whether there are any online
- * CPUs it can run on instead.
- */
-static inline bool needs_other_cpu(struct task_struct *p, int cpu)
-{
- if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
- return true;
- return false;
-}
-
-/*
- * When all else is equal, still prefer this_rq.
- */
-static void try_preempt(struct task_struct *p, struct rq *this_rq)
-{
- struct rq *highest_prio_rq = NULL;
- int cpu, highest_prio;
- u64 latest_deadline;
- cpumask_t tmp;
-
- /*
- * We clear the sticky flag here because for a task to have called
- * try_preempt with the sticky flag enabled means some complicated
- * re-scheduling has occurred and we should ignore the sticky flag.
- */
- clear_sticky(p);
-
- if (suitable_idle_cpus(p) && resched_best_idle(p))
- return;
-
- /* IDLEPRIO tasks never preempt anything but idle */
- if (p->policy == SCHED_IDLEPRIO)
- return;
-
- if (likely(online_cpus(p)))
- cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
- else
- return;
-
- highest_prio = latest_deadline = 0;
-
- for_each_cpu(cpu, &tmp) {
- struct rq *rq;
- int rq_prio;
-
- rq = cpu_rq(cpu);
- rq_prio = rq->rq_prio;
- if (rq_prio < highest_prio)
- continue;
-
- if (rq_prio > highest_prio ||
- deadline_after(rq->rq_deadline, latest_deadline)) {
- latest_deadline = rq->rq_deadline;
- highest_prio = rq_prio;
- highest_prio_rq = rq;
- }
- }
-
- if (likely(highest_prio_rq)) {
-#ifdef CONFIG_SMT_NICE
- cpu = cpu_of(highest_prio_rq);
- if (!smt_should_schedule(p, cpu))
- return;
-#endif
- if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
- resched_curr(highest_prio_rq);
- }
-}
-#else /* CONFIG_SMP */
-static inline bool needs_other_cpu(struct task_struct *p, int cpu)
-{
- return false;
-}
-
-static void try_preempt(struct task_struct *p, struct rq *this_rq)
-{
- if (p->policy == SCHED_IDLEPRIO)
- return;
- if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
- resched_curr(uprq);
-}
-#endif /* CONFIG_SMP */
-
-static void
-ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
-{
-#ifdef CONFIG_SCHEDSTATS
- struct rq *rq = this_rq();
-
-#ifdef CONFIG_SMP
- int this_cpu = smp_processor_id();
-
- if (cpu == this_cpu)
- schedstat_inc(rq, ttwu_local);
- else {
- struct sched_domain *sd;
-
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
- }
- rcu_read_unlock();
- }
-
-#endif /* CONFIG_SMP */
-
- schedstat_inc(rq, ttwu_count);
-#endif /* CONFIG_SCHEDSTATS */
-}
-
-void wake_up_if_idle(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- rcu_read_lock();
-
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
-
- grq_lock_irqsave(&flags);
- if (likely(is_idle_task(rq->curr)))
- smp_send_reschedule(cpu);
- /* Else cpu is not in idle, do nothing here */
- grq_unlock_irqrestore(&flags);
-
-out:
- rcu_read_unlock();
-}
-
-#ifdef CONFIG_SMP
-void scheduler_ipi(void)
-{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- preempt_fold_need_resched();
-}
-#endif
-
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- bool is_sync)
-{
- activate_task(p, rq);
-
- /*
- * Sync wakeups (i.e. those types of wakeups where the waker
- * has indicated that it will leave the CPU in short order)
- * don't trigger a preemption if there are no idle cpus,
- * instead waiting for current to deschedule.
- */
- if (!is_sync || suitable_idle_cpus(p))
- try_preempt(p, rq);
-}
-
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- bool success)
-{
- trace_sched_wakeup(p, success);
- p->state = TASK_RUNNING;
-
- /*
- * if a worker is waking up, notify workqueue. Note that on BFS, we
- * don't really know what cpu it will be, so we fake it for
- * wq_worker_waking_up :/
- */
- if ((p->flags & PF_WQ_WORKER) && success)
- wq_worker_waking_up(p, cpu_of(rq));
-}
-
-/*
- * wake flags
- */
-#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
-#define WF_FORK 0x02 /* child wakeup after fork */
-#define WF_MIGRATED 0x4 /* internal use, task got migrated */
-
-/***
- * try_to_wake_up - wake up a thread
- * @p: the thread to be awakened
- * @state: the mask of task states that can be woken
- * @wake_flags: wake modifier flags (WF_*)
- *
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
- *
- * Return: %true if @p was woken up, %false if it was already running.
- * or @state didn't match @p's state.
- */
-static bool try_to_wake_up(struct task_struct *p, unsigned int state,
- int wake_flags)
-{
- bool success = false;
- unsigned long flags;
- struct rq *rq;
- int cpu;
-
- get_cpu();
-
- /*
- * If we are going to wake up a thread waiting for CONDITION we
- * need to ensure that CONDITION=1 done by the caller can not be
- * reordered with p->state check below. This pairs with mb() in
- * set_current_state() the waiting thread does.
- */
- smp_mb__before_spinlock();
-
- /*
- * No need to do time_lock_grq as we only need to update the rq clock
- * if we activate the task
- */
- rq = task_grq_lock(p, &flags);
- cpu = task_cpu(p);
-
- /* state is a volatile long, どうして、分からない */
- if (!((unsigned int)p->state & state))
- goto out_unlock;
-
- if (task_queued(p) || task_running(p))
- goto out_running;
-
- ttwu_activate(p, rq, wake_flags & WF_SYNC);
- success = true;
-
-out_running:
- ttwu_post_activation(p, rq, success);
-out_unlock:
- task_grq_unlock(&flags);
-
- ttwu_stat(p, cpu, wake_flags);
-
- put_cpu();
-
- return success;
-}
-
-/**
- * try_to_wake_up_local - try to wake up a local task with grq lock held
- * @p: the thread to be awakened
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that grq is locked and, @p is not the current task.
- * grq stays locked over invocation.
- */
-static void try_to_wake_up_local(struct task_struct *p)
-{
- struct rq *rq = task_rq(p);
- bool success = false;
-
- lockdep_assert_held(&grq.lock);
-
- if (!(p->state & TASK_NORMAL))
- return;
-
- if (!task_queued(p)) {
- if (likely(!task_running(p))) {
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(rq, ttwu_local);
- }
- ttwu_activate(p, rq, false);
- ttwu_stat(p, smp_processor_id(), 0);
- success = true;
- }
- ttwu_post_activation(p, rq, success);
-}
-
-/**
- * wake_up_process - Wake up a specific process
- * @p: The process to be woken up.
- *
- * Attempt to wake up the nominated process and move it to the set of runnable
- * processes.
- *
- * Return: 1 if the process was woken up, 0 if it was already running.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-int wake_up_process(struct task_struct *p)
-{
- WARN_ON(task_is_stopped_or_traced(p));
- return try_to_wake_up(p, TASK_NORMAL, 0);
-}
-EXPORT_SYMBOL(wake_up_process);
-
-int wake_up_state(struct task_struct *p, unsigned int state)
-{
- return try_to_wake_up(p, state, 0);
-}
-
-static void time_slice_expired(struct task_struct *p);
-
-/*
- * Perform scheduler related setup for a newly forked process p.
- * p is forked by current.
- */
-int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
-{
-#ifdef CONFIG_PREEMPT_NOTIFIERS
- INIT_HLIST_HEAD(&p->preempt_notifiers);
-#endif
- /*
- * The process state is set to the same value of the process executing
- * do_fork() code. That is running. This guarantees that nobody will
- * actually run it, and a signal or other external event cannot wake
- * it up and insert it on the runqueue either.
- */
-
- /* Should be reset in fork.c but done here for ease of bfs patching */
- p->on_rq =
- p->utime =
- p->stime =
- p->utimescaled =
- p->stimescaled =
- p->sched_time =
- p->stime_pc =
- p->utime_pc = 0;
-
- /*
- * Revert to default priority/policy on fork if requested.
- */
- if (unlikely(p->sched_reset_on_fork)) {
- if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
- p->policy = SCHED_NORMAL;
- p->normal_prio = normal_prio(p);
- }
-
- if (PRIO_TO_NICE(p->static_prio) < 0) {
- p->static_prio = NICE_TO_PRIO(0);
- p->normal_prio = p->static_prio;
- }
-
- /*
- * We don't need the reset flag anymore after the fork. It has
- * fulfilled its duty:
- */
- p->sched_reset_on_fork = 0;
- }
-
- INIT_LIST_HEAD(&p->run_list);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
- if (unlikely(sched_info_on()))
- memset(&p->sched_info, 0, sizeof(p->sched_info));
-#endif
- p->on_cpu = false;
- clear_sticky(p);
- init_task_preempt_count(p);
- return 0;
-}
-
-/*
- * wake_up_new_task - wake up a newly created task for the first time.
- *
- * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created context, then puts the task
- * on the runqueue and wakes it.
- */
-void wake_up_new_task(struct task_struct *p)
-{
- struct task_struct *parent;
- unsigned long flags;
- struct rq *rq;
-
- parent = p->parent;
- rq = task_grq_lock(p, &flags);
-
- /*
- * Reinit new task deadline as its creator deadline could have changed
- * since call to dup_task_struct().
- */
- p->deadline = rq->rq_deadline;
-
- /*
- * If the task is a new process, current and parent are the same. If
- * the task is a new thread in the thread group, it will have much more
- * in common with current than with the parent.
- */
- set_task_cpu(p, task_cpu(rq->curr));
-
- /*
- * Make sure we do not leak PI boosting priority to the child.
- */
- p->prio = rq->curr->normal_prio;
-
- activate_task(p, rq);
- trace_sched_wakeup_new(p, 1);
- if (unlikely(p->policy == SCHED_FIFO))
- goto after_ts_init;
-
- /*
- * Share the timeslice between parent and child, thus the
- * total amount of pending timeslices in the system doesn't change,
- * resulting in more scheduling fairness. If it's negative, it won't
- * matter since that's the same as being 0. current's time_slice is
- * actually in rq_time_slice when it's running, as is its last_ran
- * value. rq->rq_deadline is only modified within schedule() so it
- * is always equal to current->deadline.
- */
- p->last_ran = rq->rq_last_ran;
- if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
- rq->rq_time_slice /= 2;
- p->time_slice = rq->rq_time_slice;
-after_ts_init:
- if (rq->curr == parent && !suitable_idle_cpus(p)) {
- /*
- * The VM isn't cloned, so we're in a good position to
- * do child-runs-first in anticipation of an exec. This
- * usually avoids a lot of COW overhead.
- */
- __set_tsk_resched(parent);
- } else
- try_preempt(p, rq);
- } else {
- if (rq->curr == parent) {
- /*
- * Forking task has run out of timeslice. Reschedule it and
- * start its child with a new time slice and deadline. The
- * child will end up running first because its deadline will
- * be slightly earlier.
- */
- rq->rq_time_slice = 0;
- __set_tsk_resched(parent);
- }
- time_slice_expired(p);
- }
- task_grq_unlock(&flags);
-}
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-
-/**
- * preempt_notifier_register - tell me when current is being preempted & rescheduled
- * @notifier: notifier struct to register
- */
-void preempt_notifier_register(struct preempt_notifier *notifier)
-{
- hlist_add_head(&notifier->link, &current->preempt_notifiers);
-}
-EXPORT_SYMBOL_GPL(preempt_notifier_register);
-
-/**
- * preempt_notifier_unregister - no longer interested in preemption notifications
- * @notifier: notifier struct to unregister
- *
- * This is safe to call from within a preemption notifier.
- */
-void preempt_notifier_unregister(struct preempt_notifier *notifier)
-{
- hlist_del(&notifier->link);
-}
-EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
-{
- struct preempt_notifier *notifier;
-
- hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
- notifier->ops->sched_in(notifier, raw_smp_processor_id());
-}
-
-static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
-{
- struct preempt_notifier *notifier;
-
- hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
- notifier->ops->sched_out(notifier, next);
-}
-
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
-
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
-{
-}
-
-static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
-{
-}
-
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
-
-/**
- * prepare_task_switch - prepare to switch tasks
- * @rq: the runqueue preparing to switch
- * @next: the task we are going to switch to.
- *
- * This is called with the rq lock held and interrupts off. It must
- * be paired with a subsequent finish_task_switch after the context
- * switch.
- *
- * prepare_task_switch sets up locking and calls architecture specific
- * hooks.
- */
-static inline void
-prepare_task_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
-{
- sched_info_switch(rq, prev, next);
- perf_event_task_sched_out(prev, next);
- fire_sched_out_preempt_notifiers(prev, next);
- prepare_lock_switch(rq, next);
- prepare_arch_switch(next);
- trace_sched_switch(prev, next);
-}
-
-/**
- * finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
- * @prev: the thread we just switched away from.
- *
- * finish_task_switch must be called after the context switch, paired
- * with a prepare_task_switch call before the context switch.
- * finish_task_switch will reconcile locking set up by prepare_task_switch,
- * and do any other architecture-specific cleanup actions.
- *
- * Note that we may have delayed dropping an mm in context_switch(). If
- * so, we finish that here outside of the runqueue lock. (Doing it
- * with the lock held can cause deadlocks; see schedule() for
- * details.)
- *
- * The context switch have flipped the stack from under us and restored the
- * local variables which were saved when this task called schedule() in the
- * past. prev == current is still correct but we need to recalculate this_rq
- * because prev may have moved to another CPU.
- */
-static struct rq *finish_task_switch(struct task_struct *prev)
- __releases(grq.lock)
-{
- struct rq *rq = this_rq();
- struct mm_struct *mm = rq->prev_mm;
- long prev_state;
-
- rq->prev_mm = NULL;
-
- /*
- * A task struct has one reference for the use as "current".
- * If a task dies, then it sets TASK_DEAD in tsk->state and calls
- * schedule one last time. The schedule call will never return, and
- * the scheduled task must drop that reference.
- * The test for TASK_DEAD must occur while the runqueue locks are
- * still held, otherwise prev could be scheduled on another cpu, die
- * there before we look at prev->state, and then the reference would
- * be dropped twice.
- * Manfred Spraul <manfred@colorfullife.com>
- */
- prev_state = prev->state;
- vtime_task_switch(prev);
- finish_arch_switch(prev);
- perf_event_task_sched_in(prev, current);
- finish_lock_switch(rq, prev);
- finish_arch_post_lock_switch();
-
- fire_sched_in_preempt_notifiers(current);
- if (mm)
- mmdrop(mm);
- if (unlikely(prev_state == TASK_DEAD)) {
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
- put_task_struct(prev);
- }
- return rq;
-}
-
-/**
- * schedule_tail - first thing a freshly forked thread must call.
- * @prev: the thread we just switched away from.
- */
-asmlinkage __visible void schedule_tail(struct task_struct *prev)
- __releases(grq.lock)
-{
- struct rq *rq;
-
- /* finish_task_switch() drops rq->lock and enables preemption */
- preempt_disable();
- rq = finish_task_switch(prev);
- preempt_enable();
-
- if (current->set_child_tid)
- put_user(task_pid_vnr(current), current->set_child_tid);
-}
-
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
-{
- struct mm_struct *mm, *oldmm;
-
- prepare_task_switch(rq, prev, next);
-
- mm = next->mm;
- oldmm = prev->active_mm;
- /*
- * For paravirt, this is coupled with an exit in switch_to to
- * combine the page table reload and the switch backend into
- * one hypercall.
- */
- arch_start_context_switch(prev);
-
- if (!mm) {
- next->active_mm = oldmm;
- atomic_inc(&oldmm->mm_count);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm(oldmm, mm, next);
-
- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
- }
- /*
- * Since the runqueue lock will be released by the next
- * task (which is an invalid locking op but in the case
- * of the scheduler it's an obvious special-case), so we
- * do an early lockdep release here:
- */
- spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
-
- /* Here we just switch the register state and the stack. */
- context_tracking_task_switch(prev, next);
- switch_to(prev, next, prev);
-
- barrier();
-
- return finish_task_switch(prev);
-}
-
-/*
- * nr_running, nr_uninterruptible and nr_context_switches:
- *
- * externally visible scheduler statistics: current number of runnable
- * threads, total number of context switches performed since bootup. All are
- * measured without grabbing the grq lock but the occasional inaccurate result
- * doesn't matter so long as it's positive.
- */
-unsigned long nr_running(void)
-{
- long nr = grq.nr_running;
-
- if (unlikely(nr < 0))
- nr = 0;
- return (unsigned long)nr;
-}
-
-static unsigned long nr_uninterruptible(void)
-{
- long nu = grq.nr_uninterruptible;
-
- if (unlikely(nu < 0))
- nu = 0;
- return nu;
-}
-
-/*
- * Check if only the current task is running on the cpu.
- */
-bool single_task_running(void)
-{
- if (cpu_rq(smp_processor_id())->soft_affined == 1)
- return true;
- else
- return false;
-}
-EXPORT_SYMBOL(single_task_running);
-
-unsigned long long nr_context_switches(void)
-{
- long long ns = grq.nr_switches;
-
- /* This is of course impossible */
- if (unlikely(ns < 0))
- ns = 1;
- return (unsigned long long)ns;
-}
-
-unsigned long nr_iowait(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
-
- return sum;
-}
-
-unsigned long nr_iowait_cpu(int cpu)
-{
- struct rq *this = cpu_rq(cpu);
- return atomic_read(&this->nr_iowait);
-}
-
-unsigned long nr_active(void)
-{
- return nr_running() + nr_uninterruptible();
-}
-
-/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we
- * base it on the number of running or queued tasks with their ->rq pointer
- * set to this cpu as being the CPU they're more likely to run on. */
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
-{
- struct rq *this = this_rq();
-
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->soft_affined;
-}
-
-/* Variables and functions for calc_load */
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
-
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
-
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- return load >> FSHIFT;
-}
-
-/*
- * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
- */
-void calc_global_load(unsigned long ticks)
-{
- long active;
-
- if (time_before(jiffies, calc_load_update))
- return;
- active = nr_active() * FIXED_1;
-
- avenrun[0] = calc_load(avenrun[0], EXP_1, active);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
- calc_load_update = jiffies + LOAD_FREQ;
-}
-
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
-
-EXPORT_PER_CPU_SYMBOL(kstat);
-EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- u64 irq_time;
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void irqtime_account_irq(struct task_struct *curr)
-{
- unsigned long flags;
- s64 delta;
- int cpu;
-
- if (!sched_clock_irqtime)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
-
- irq_time_write_begin();
- /*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
- */
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
-
- irq_time_write_end();
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
- if (unlikely(steal > NSEC_PER_SEC))
- return div_u64(steal, TICK_NSEC);
-
- return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}irq
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
-
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_key_false((&paravirt_steal_rq_enabled))) {
- s64 steal = paravirt_steal_clock(cpu_of(rq));
-
- steal -= rq->prev_steal_time_rq;
-
- if (unlikely(steal > delta))
- steal = delta;
-
- rq->prev_steal_time_rq += steal;
-
- delta -= steal;
- }
-#endif
-
- rq->clock_task += delta;
-}
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static void irqtime_account_hi_si(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- u64 latest_ns;
-
- latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
- if (latest_ns > cpustat[CPUTIME_IRQ])
- cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
-
- latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
- if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
- cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime (0)
-
-static inline void irqtime_account_hi_si(void)
-{
-}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
- if (static_key_false(&paravirt_steal_enabled)) {
- u64 steal;
- cputime_t steal_ct;
-
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
-
- /*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
- * granularity and account the rest on the next rounds.
- */
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
-
- account_steal_time(steal_ct);
- return steal_ct;
- }
-#endif
- return false;
-}
-
-/*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct signal_struct *sig = tsk->signal;
- cputime_t utime, stime;
- struct task_struct *t;
- unsigned int seq, nextseq;
- unsigned long flags;
-
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- do {
- seq = nextseq;
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
-
- for_each_thread(tsk, t) {
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- }
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- rcu_read_unlock();
-}
-
-/*
- * On each tick, see what percentage of that tick was attributed to each
- * component and add the percentage to the _pc values. Once a _pc value has
- * accumulated one tick's worth, account for that. This means the total
- * percentage of load components will always be 128 (pseudo 100) per tick.
- */
-static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- if (atomic_read(&rq->nr_iowait) > 0) {
- rq->iowait_pc += pc;
- if (rq->iowait_pc >= 128) {
- cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
- rq->iowait_pc %= 128;
- }
- } else {
- rq->idle_pc += pc;
- if (rq->idle_pc >= 128) {
- cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
- rq->idle_pc %= 128;
- }
- }
- acct_update_integrals(idle);
-}
-
-static void
-pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
- unsigned long pc, unsigned long ns)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-
- p->stime_pc += pc;
- if (p->stime_pc >= 128) {
- int jiffs = p->stime_pc / 128;
-
- p->stime_pc %= 128;
- p->stime += (__force u64)cputime_one_jiffy * jiffs;
- p->stimescaled += one_jiffy_scaled * jiffs;
- account_group_system_time(p, cputime_one_jiffy * jiffs);
- }
- p->sched_time += ns;
- account_group_exec_runtime(p, ns);
-
- if (hardirq_count() - hardirq_offset) {
- rq->irq_pc += pc;
- if (rq->irq_pc >= 128) {
- cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
- rq->irq_pc %= 128;
- }
- } else if (in_serving_softirq()) {
- rq->softirq_pc += pc;
- if (rq->softirq_pc >= 128) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
- rq->softirq_pc %= 128;
- }
- } else {
- rq->system_pc += pc;
- if (rq->system_pc >= 128) {
- cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
- rq->system_pc %= 128;
- }
- }
- acct_update_integrals(p);
-}
-
-static void pc_user_time(struct rq *rq, struct task_struct *p,
- unsigned long pc, unsigned long ns)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-
- p->utime_pc += pc;
- if (p->utime_pc >= 128) {
- int jiffs = p->utime_pc / 128;
-
- p->utime_pc %= 128;
- p->utime += (__force u64)cputime_one_jiffy * jiffs;
- p->utimescaled += one_jiffy_scaled * jiffs;
- account_group_user_time(p, cputime_one_jiffy * jiffs);
- }
- p->sched_time += ns;
- account_group_exec_runtime(p, ns);
-
- if (this_cpu_ksoftirqd() == p) {
- /*
- * ksoftirqd time do not get accounted in cpu_softirq_time.
- * So, we have to handle it separately here.
- */
- rq->softirq_pc += pc;
- if (rq->softirq_pc >= 128) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
- rq->softirq_pc %= 128;
- }
- }
-
- if (task_nice(p) > 0 || idleprio_task(p)) {
- rq->nice_pc += pc;
- if (rq->nice_pc >= 128) {
- cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
- rq->nice_pc %= 128;
- }
- } else {
- rq->user_pc += pc;
- if (rq->user_pc >= 128) {
- cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
- rq->user_pc %= 128;
- }
- }
- acct_update_integrals(p);
-}
-
-/*
- * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
- * shifts instead of 100
- */
-#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS)
-
-/*
- * This is called on clock ticks.
- * Bank in p->sched_time the ns elapsed since the last tick or switch.
- * CPU scheduler quota accounting is also performed here in microseconds.
- */
-static void
-update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
-{
- long account_ns = rq->clock_task - rq->rq_last_ran;
- struct task_struct *idle = rq->idle;
- unsigned long account_pc;
-
- if (unlikely(account_ns < 0) || steal_account_process_tick())
- goto ts_account;
-
- account_pc = NS_TO_PC(account_ns);
-
- /* Accurate tick timekeeping */
- if (user_mode(get_irq_regs()))
- pc_user_time(rq, p, account_pc, account_ns);
- else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
- pc_system_time(rq, p, HARDIRQ_OFFSET,
- account_pc, account_ns);
- else
- pc_idle_time(rq, idle, account_pc);
-
- if (sched_clock_irqtime)
- irqtime_account_hi_si();
-
-ts_account:
- /* time_slice accounting is done in usecs to avoid overflow on 32bit */
- if (rq->rq_policy != SCHED_FIFO && p != idle) {
- s64 time_diff = rq->clock - rq->timekeep_clock;
-
- niffy_diff(&time_diff, 1);
- rq->rq_time_slice -= NS_TO_US(time_diff);
- }
-
- rq->rq_last_ran = rq->clock_task;
- rq->timekeep_clock = rq->clock;
-}
-
-/*
- * This is called on context switches.
- * Bank in p->sched_time the ns elapsed since the last tick or switch.
- * CPU scheduler quota accounting is also performed here in microseconds.
- */
-static void
-update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
-{
- long account_ns = rq->clock_task - rq->rq_last_ran;
- struct task_struct *idle = rq->idle;
- unsigned long account_pc;
-
- if (unlikely(account_ns < 0))
- goto ts_account;
-
- account_pc = NS_TO_PC(account_ns);
-
- /* Accurate subtick timekeeping */
- if (p != idle) {
- pc_user_time(rq, p, account_pc, account_ns);
- }
- else
- pc_idle_time(rq, idle, account_pc);
-
-ts_account:
- /* time_slice accounting is done in usecs to avoid overflow on 32bit */
- if (rq->rq_policy != SCHED_FIFO && p != idle) {
- s64 time_diff = rq->clock - rq->timekeep_clock;
-
- niffy_diff(&time_diff, 1);
- rq->rq_time_slice -= NS_TO_US(time_diff);
- }
-
- rq->rq_last_ran = rq->clock_task;
- rq->timekeep_clock = rq->clock;
-}
-
-/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_grq_lock() held.
- */
-static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
- u64 ns = 0;
-
- /*
- * Must be ->curr _and_ ->on_rq. If dequeued, we would
- * project cycles that may never be accounted to this
- * thread, breaking clock_gettime().
- */
- if (p == rq->curr && p->on_rq) {
- update_clocks(rq);
- ns = rq->clock_task - rq->rq_last_ran;
- if (unlikely((s64)ns < 0))
- ns = 0;
- }
-
- return ns;
-}
-
-/*
- * Return accounted runtime for the task.
- * Return separately the current's pending runtime that have not been
- * accounted yet.
- *
- */
-unsigned long long task_sched_runtime(struct task_struct *p)
-{
- unsigned long flags;
- struct rq *rq;
- u64 ns;
-
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
- /*
- * 64-bit doesn't need locks to atomically read a 64bit value.
- * So we have a optimization chance when the task's delta_exec is 0.
- * Reading ->on_cpu is racy, but this is ok.
- *
- * If we race with it leaving cpu, we'll take a lock. So we're correct.
- * If we race with it entering cpu, unaccounted time is 0. This is
- * indistinguishable from the read occurring a few cycles earlier.
- * If we see ->on_cpu without ->on_rq, the task is leaving, and has
- * been accounted, so we're correct here as well.
- */
- if (!p->on_cpu || !p->on_rq)
- return tsk_seruntime(p);
-#endif
-
- rq = task_grq_lock(p, &flags);
- ns = p->sched_time + do_task_delta_exec(p, rq);
- task_grq_unlock(&flags);
-
- return ns;
-}
-
-/* Compatibility crap */
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
-}
-
-void account_idle_time(cputime_t cputime)
-{
-}
-
-void update_cpu_load_nohz(void)
-{
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-void calc_load_enter_idle(void)
-{
-}
-
-void calc_load_exit_idle(void)
-{
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- /* Add guest time to process. */
- p->utime += (__force u64)cputime;
- p->utimescaled += (__force u64)cputime_scaled;
- account_group_user_time(p, cputime);
- p->gtime += (__force u64)cputime;
-
- /* Add guest time to cpustat. */
- if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64)cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
- } else {
- cpustat[CPUTIME_USER] += (__force u64)cputime;
- cpustat[CPUTIME_GUEST] += (__force u64)cputime;
- }
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, cputime64_t *target_cputime64)
-{
- /* Add system time to process. */
- p->stime += (__force u64)cputime;
- p->stimescaled += (__force u64)cputime_scaled;
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- *target_cputime64 += (__force u64)cputime;
-
- /* Account for system time used */
- acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * This is for guest only now.
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
-{
-
- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
- account_guest_time(p, cputime, cputime_scaled);
-}
-
-/*
- * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- cpustat[CPUTIME_STEAL] += (__force u64)cputime;
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-static void account_idle_times(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
-
- if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
- else
- cpustat[CPUTIME_IDLE] += (__force u64)cputime;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-
-void account_process_tick(struct task_struct *p, int user_tick)
-{
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
- account_idle_times(jiffies_to_cputime(ticks));
-}
-#endif
-
-static inline void grq_iso_lock(void)
- __acquires(grq.iso_lock)
-{
- raw_spin_lock(&grq.iso_lock);
-}
-
-static inline void grq_iso_unlock(void)
- __releases(grq.iso_lock)
-{
- raw_spin_unlock(&grq.iso_lock);
-}
-
-/*
- * Functions to test for when SCHED_ISO tasks have used their allocated
- * quota as real time scheduling and convert them back to SCHED_NORMAL.
- * Where possible, the data is tested lockless, to avoid grabbing iso_lock
- * because the occasional inaccurate result won't matter. However the
- * tick data is only ever modified under lock. iso_refractory is only simply
- * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
- */
-static bool set_iso_refractory(void)
-{
- grq.iso_refractory = true;
- return grq.iso_refractory;
-}
-
-static bool clear_iso_refractory(void)
-{
- grq.iso_refractory = false;
- return grq.iso_refractory;
-}
-
-/*
- * Test if SCHED_ISO tasks have run longer than their alloted period as RT
- * tasks and set the refractory flag if necessary. There is 10% hysteresis
- * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
- * slow division.
- */
-static bool test_ret_isorefractory(struct rq *rq)
-{
- if (likely(!grq.iso_refractory)) {
- if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
- return set_iso_refractory();
- } else {
- if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
- return clear_iso_refractory();
- }
- return grq.iso_refractory;
-}
-
-static void iso_tick(void)
-{
- grq_iso_lock();
- grq.iso_ticks += 100;
- grq_iso_unlock();
-}
-
-/* No SCHED_ISO task was running so decrease rq->iso_ticks */
-static inline void no_iso_tick(void)
-{
- if (grq.iso_ticks) {
- grq_iso_lock();
- grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
- if (unlikely(grq.iso_refractory && grq.iso_ticks <
- ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
- clear_iso_refractory();
- grq_iso_unlock();
- }
-}
-
-/* This manages tasks that have run out of timeslice during a scheduler_tick */
-static void task_running_tick(struct rq *rq)
-{
- struct task_struct *p;
-
- /*
- * If a SCHED_ISO task is running we increment the iso_ticks. In
- * order to prevent SCHED_ISO tasks from causing starvation in the
- * presence of true RT tasks we account those as iso_ticks as well.
- */
- if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
- if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
- iso_tick();
- } else
- no_iso_tick();
-
- if (iso_queue(rq)) {
- if (unlikely(test_ret_isorefractory(rq))) {
- if (rq_running_iso(rq)) {
- /*
- * SCHED_ISO task is running as RT and limit
- * has been hit. Force it to reschedule as
- * SCHED_NORMAL by zeroing its time_slice
- */
- rq->rq_time_slice = 0;
- }
- }
- }
-
- /* SCHED_FIFO tasks never run out of timeslice. */
- if (rq->rq_policy == SCHED_FIFO)
- return;
- /*
- * Tasks that were scheduled in the first half of a tick are not
- * allowed to run into the 2nd half of the next tick if they will
- * run out of time slice in the interim. Otherwise, if they have
- * less than RESCHED_US μs of time slice left they will be rescheduled.
- */
- if (rq->dither) {
- if (rq->rq_time_slice > HALF_JIFFY_US)
- return;
- else
- rq->rq_time_slice = 0;
- } else if (rq->rq_time_slice >= RESCHED_US)
- return;
-
- /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
- p = rq->curr;
-
- grq_lock();
- requeue_task(p);
- __set_tsk_resched(p);
- grq_unlock();
-}
-
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled. The data modified is all
- * local to struct rq so we don't need to grab grq lock.
- */
-void scheduler_tick(void)
-{
- int cpu __maybe_unused = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
-
- sched_clock_tick();
- /* grq lock not grabbed, so only update rq clock */
- update_rq_clock(rq);
- update_cpu_clock_tick(rq, rq->curr);
- if (!rq_idle(rq))
- task_running_tick(rq);
- else
- no_iso_tick();
- rq->last_tick = rq->clock;
- perf_event_task_tick();
-}
-
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
- if (in_lock_functions(addr)) {
- addr = CALLER_ADDR2;
- if (in_lock_functions(addr))
- addr = CALLER_ADDR3;
- }
- return addr;
-}
-
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
- defined(CONFIG_PREEMPT_TRACER))
-void preempt_count_add(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Underflow?
- */
- if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
- return;
-#endif
- __preempt_count_add(val);
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Spinlock count overflowing soon?
- */
- DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
- PREEMPT_MASK - 10);
-#endif
- if (preempt_count() == val) {
- unsigned long ip = get_parent_ip(CALLER_ADDR1);
-#ifdef CONFIG_DEBUG_PREEMPT
- current->preempt_disable_ip = ip;
-#endif
- trace_preempt_off(CALLER_ADDR0, ip);
- }
-}
-EXPORT_SYMBOL(preempt_count_add);
-NOKPROBE_SYMBOL(preempt_count_add);
-
-void preempt_count_sub(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Underflow?
- */
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
- return;
- /*
- * Is the spinlock portion underflowing?
- */
- if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
- !(preempt_count() & PREEMPT_MASK)))
- return;
-#endif
-
- if (preempt_count() == val)
- trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
- __preempt_count_sub(val);
-}
-EXPORT_SYMBOL(preempt_count_sub);
-NOKPROBE_SYMBOL(preempt_count_sub);
-#endif
-
-/*
- * Deadline is "now" in niffies + (offset by priority). Setting the deadline
- * is the key to everything. It distributes cpu fairly amongst tasks of the
- * same nice value, it proportions cpu according to nice level, it means the
- * task that last woke up the longest ago has the earliest deadline, thus
- * ensuring that interactive tasks get low latency on wake up. The CPU
- * proportion works out to the square of the virtual deadline difference, so
- * this equation will give nice 19 3% CPU compared to nice 0.
- */
-static inline u64 prio_deadline_diff(int user_prio)
-{
- return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
-}
-
-static inline u64 task_deadline_diff(struct task_struct *p)
-{
- return prio_deadline_diff(TASK_USER_PRIO(p));
-}
-
-static inline u64 static_deadline_diff(int static_prio)
-{
- return prio_deadline_diff(USER_PRIO(static_prio));
-}
-
-static inline int longest_deadline_diff(void)
-{
- return prio_deadline_diff(39);
-}
-
-static inline int ms_longest_deadline_diff(void)
-{
- return NS_TO_MS(longest_deadline_diff());
-}
-
-/*
- * The time_slice is only refilled when it is empty and that is when we set a
- * new deadline.
- */
-static void time_slice_expired(struct task_struct *p)
-{
- p->time_slice = timeslice();
- p->deadline = grq.niffies + task_deadline_diff(p);
-#ifdef CONFIG_SMT_NICE
- if (!p->mm)
- p->smt_bias = 0;
- else if (rt_task(p))
- p->smt_bias = 1 << 30;
- else if (task_running_iso(p))
- p->smt_bias = 1 << 29;
- else if (idleprio_task(p)) {
- if (task_running_idle(p))
- p->smt_bias = 0;
- else
- p->smt_bias = 1;
- } else if (--p->smt_bias < 1)
- p->smt_bias = MAX_PRIO - p->static_prio;
-#endif
-}
-
-/*
- * Timeslices below RESCHED_US are considered as good as expired as there's no
- * point rescheduling when there's so little time left. SCHED_BATCH tasks
- * have been flagged be not latency sensitive and likely to be fully CPU
- * bound so every time they're rescheduled they have their time_slice
- * refilled, but get a new later deadline to have little effect on
- * SCHED_NORMAL tasks.
-
- */
-static inline void check_deadline(struct task_struct *p)
-{
- if (p->time_slice < RESCHED_US || batch_task(p))
- time_slice_expired(p);
-}
-
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
-
-/*
- * Scheduler queue bitmap specific find next bit.
- */
-static inline unsigned long
-next_sched_bit(const unsigned long *addr, unsigned long offset)
-{
- const unsigned long *p;
- unsigned long result;
- unsigned long size;
- unsigned long tmp;
-
- size = PRIO_LIMIT;
- if (offset >= size)
- return size;
-
- p = addr + BITOP_WORD(offset);
- result = offset & ~(BITS_PER_LONG-1);
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-}
-
-/*
- * O(n) lookup of all tasks in the global runqueue. The real brainfuck
- * of lock contention and O(n). It's not really O(n) as only the queued,
- * but not running tasks are scanned, and is O(n) queued in the worst case
- * scenario only because the right task can be found before scanning all of
- * them.
- * Tasks are selected in this order:
- * Real time tasks are selected purely by their static priority and in the
- * order they were queued, so the lowest value idx, and the first queued task
- * of that priority value is chosen.
- * If no real time tasks are found, the SCHED_ISO priority is checked, and
- * all SCHED_ISO tasks have the same priority value, so they're selected by
- * the earliest deadline value.
- * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
- * earliest deadline.
- * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
- * selected by the earliest deadline.
- */
-static inline struct
-task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
-{
- struct task_struct *edt = NULL;
- unsigned long idx = -1;
-
- do {
- struct list_head *queue;
- struct task_struct *p;
- u64 earliest_deadline;
-
- idx = next_sched_bit(grq.prio_bitmap, ++idx);
- if (idx >= PRIO_LIMIT)
- return idle;
- queue = grq.queue + idx;
-
- if (idx < MAX_RT_PRIO) {
- /* We found an rt task */
- list_for_each_entry(p, queue, run_list) {
- /* Make sure cpu affinity is ok */
- if (needs_other_cpu(p, cpu))
- continue;
- edt = p;
- goto out_take;
- }
- /*
- * None of the RT tasks at this priority can run on
- * this cpu
- */
- continue;
- }
-
- /*
- * No rt tasks. Find the earliest deadline task. Now we're in
- * O(n) territory.
- */
- earliest_deadline = ~0ULL;
- list_for_each_entry(p, queue, run_list) {
- u64 dl;
-
- /* Make sure cpu affinity is ok */
- if (needs_other_cpu(p, cpu))
- continue;
-
-#ifdef CONFIG_SMT_NICE
- if (!smt_should_schedule(p, cpu))
- continue;
-#endif
- /*
- * Soft affinity happens here by not scheduling a task
- * with its sticky flag set that ran on a different CPU
- * last when the CPU is scaling, or by greatly biasing
- * against its deadline when not, based on cpu cache
- * locality.
- */
- if (task_sticky(p) && task_rq(p) != rq) {
- if (scaling_rq(rq))
- continue;
- dl = p->deadline << locality_diff(p, rq);
- } else
- dl = p->deadline;
-
- if (deadline_before(dl, earliest_deadline)) {
- earliest_deadline = dl;
- edt = p;
- }
- }
- } while (!edt);
-
-out_take:
- take_task(cpu, edt);
- return edt;
-}
-
-
-/*
- * Print scheduling while atomic bug:
- */
-static noinline void __schedule_bug(struct task_struct *prev)
-{
- if (oops_in_progress)
- return;
-
- printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
- prev->comm, prev->pid, preempt_count());
-
- debug_show_held_locks(prev);
- print_modules();
- if (irqs_disabled())
- print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (in_atomic_preempt_off()) {
- pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
- pr_cont("\n");
- }
-#endif
- dump_stack();
- add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
-}
-
-/*
- * Various schedule()-time debugging checks and statistics:
- */
-static inline void schedule_debug(struct task_struct *prev)
-{
-#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(unlikely(task_stack_end_corrupted(prev)));
-#endif
- /*
- * Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path. Otherwise whine
- * if we are scheduling when we should not.
- */
- if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
- __schedule_bug(prev);
- rcu_sleep_check();
-
- profile_hit(SCHED_PROFILING, __builtin_return_address(0));
-
- schedstat_inc(this_rq(), sched_count);
-}
-
-/*
- * The currently running task's information is all stored in rq local data
- * which is only modified by the local CPU, thereby allowing the data to be
- * changed without grabbing the grq lock.
- */
-static inline void set_rq_task(struct rq *rq, struct task_struct *p)
-{
- rq->rq_time_slice = p->time_slice;
- rq->rq_deadline = p->deadline;
- rq->rq_last_ran = p->last_ran = rq->clock_task;
- rq->rq_policy = p->policy;
- rq->rq_prio = p->prio;
-#ifdef CONFIG_SMT_NICE
- rq->rq_mm = p->mm;
- rq->rq_smt_bias = p->smt_bias;
-#endif
- if (p != rq->idle)
- rq->rq_running = true;
- else
- rq->rq_running = false;
-}
-
-static void reset_rq_task(struct rq *rq, struct task_struct *p)
-{
- rq->rq_policy = p->policy;
- rq->rq_prio = p->prio;
-#ifdef CONFIG_SMT_NICE
- rq->rq_smt_bias = p->smt_bias;
-#endif
-}
-
-#ifdef CONFIG_SMT_NICE
-/* Iterate over smt siblings when we've scheduled a process on cpu and decide
- * whether they should continue running or be descheduled. */
-static void check_smt_siblings(int cpu)
-{
- int other_cpu;
-
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
- struct task_struct *p;
- struct rq *rq;
-
- if (other_cpu == cpu)
- continue;
- rq = cpu_rq(other_cpu);
- if (rq_idle(rq))
- continue;
- if (!rq->online)
- continue;
- p = rq->curr;
- if (!smt_should_schedule(p, cpu)) {
- set_tsk_need_resched(p);
- smp_send_reschedule(other_cpu);
- }
- }
-}
-
-static void wake_smt_siblings(int cpu)
-{
- int other_cpu;
-
- if (!queued_notrunning())
- return;
-
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
- struct rq *rq;
-
- if (other_cpu == cpu)
- continue;
- rq = cpu_rq(other_cpu);
- if (rq_idle(rq)) {
- struct task_struct *p = rq->curr;
-
- set_tsk_need_resched(p);
- smp_send_reschedule(other_cpu);
- }
- }
-}
-#else
-static void check_smt_siblings(int __maybe_unused cpu) {}
-static void wake_smt_siblings(int __maybe_unused cpu) {}
-#endif
-
-/*
- * schedule() is the main scheduler function.
- *
- * The main means of driving the scheduler and thus entering this function are:
- *
- * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
- *
- * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
- * paths. For example, see arch/x86/entry_64.S.
- *
- * To drive preemption between tasks, the scheduler sets the flag in timer
- * interrupt handler scheduler_tick().
- *
- * 3. Wakeups don't really cause entry into schedule(). They add a
- * task to the run-queue and that's it.
- *
- * Now, if the new task added to the run-queue preempts the current
- * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
- * called on the nearest possible occasion:
- *
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
- *
- * - in syscall or exception context, at the next outmost
- * preempt_enable(). (this might be as soon as the wake_up()'s
- * spin_unlock()!)
- *
- * - in IRQ context, return from interrupt-handler to
- * preemptible context
- *
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
- * then at the next:
- *
- * - cond_resched() call
- * - explicit schedule() call
- * - return from syscall or exception to user-space
- * - return from interrupt-handler to user-space
- *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
- */
-static void __sched __schedule(void)
-{
- struct task_struct *prev, *next, *idle;
- unsigned long *switch_count;
- bool deactivate;
- struct rq *rq;
- int cpu;
-
-need_resched:
- deactivate = false;
- preempt_disable();
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
- rcu_note_context_switch();
- prev = rq->curr;
-
- schedule_debug(prev);
-
- /*
- * Make sure that signal_pending_state()->signal_pending() below
- * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
- * done by the caller to avoid the race with signal_wake_up().
- */
- smp_mb__before_spinlock();
- grq_lock_irq();
-
- switch_count = &prev->nivcsw;
- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- if (unlikely(signal_pending_state(prev->state, prev))) {
- prev->state = TASK_RUNNING;
- } else {
- deactivate = true;
- prev->on_rq = 0;
-
- /*
- * If a worker is going to sleep, notify and
- * ask workqueue whether it wants to wake up a
- * task to maintain concurrency. If so, wake
- * up the task.
- */
- if (prev->flags & PF_WQ_WORKER) {
- struct task_struct *to_wakeup;
-
- to_wakeup = wq_worker_sleeping(prev, cpu);
- if (to_wakeup) {
- /* This shouldn't happen, but does */
- if (unlikely(to_wakeup == prev))
- deactivate = false;
- else
- try_to_wake_up_local(to_wakeup);
- }
- }
- }
- switch_count = &prev->nvcsw;
- }
-
- /*
- * If we are going to sleep and we have plugged IO queued, make
- * sure to submit it to avoid deadlocks.
- */
- if (unlikely(deactivate && blk_needs_flush_plug(prev))) {
- grq_unlock_irq();
- preempt_enable_no_resched();
- blk_schedule_flush_plug(prev);
- goto need_resched;
- }
-
- update_clocks(rq);
- update_cpu_clock_switch(rq, prev);
- if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
- rq->dither = false;
- else
- rq->dither = true;
-
- clear_tsk_need_resched(prev);
- clear_preempt_need_resched();
-
- idle = rq->idle;
- if (idle != prev) {
- /* Update all the information stored on struct rq */
- prev->time_slice = rq->rq_time_slice;
- prev->deadline = rq->rq_deadline;
- check_deadline(prev);
- prev->last_ran = rq->clock_task;
-
- /* Task changed affinity off this CPU */
- if (likely(!needs_other_cpu(prev, cpu))) {
- if (!deactivate) {
- if (!queued_notrunning()) {
- /*
- * We now know prev is the only thing that is
- * awaiting CPU so we can bypass rechecking for
- * the earliest deadline task and just run it
- * again.
- */
- set_rq_task(rq, prev);
- check_smt_siblings(cpu);
- grq_unlock_irq();
- goto rerun_prev_unlocked;
- } else
- swap_sticky(rq, cpu, prev);
- }
- }
- return_task(prev, rq, deactivate);
- }
-
- if (unlikely(!queued_notrunning())) {
- /*
- * This CPU is now truly idle as opposed to when idle is
- * scheduled as a high priority task in its own right.
- */
- next = idle;
- schedstat_inc(rq, sched_goidle);
- set_cpuidle_map(cpu);
- } else {
- next = earliest_deadline_task(rq, cpu, idle);
- if (likely(next->prio != PRIO_LIMIT))
- clear_cpuidle_map(cpu);
- else
- set_cpuidle_map(cpu);
- }
-
- if (likely(prev != next)) {
- /*
- * Don't reschedule an idle task or deactivated tasks
- */
- if (prev != idle && !deactivate)
- resched_suitable_idle(prev);
- /*
- * Don't stick tasks when a real time task is going to run as
- * they may literally get stuck.
- */
- if (rt_task(next))
- unstick_task(rq, prev);
- set_rq_task(rq, next);
- if (next != idle)
- check_smt_siblings(cpu);
- else
- wake_smt_siblings(cpu);
- grq.nr_switches++;
- prev->on_cpu = false;
- next->on_cpu = true;
- rq->curr = next;
- ++*switch_count;
-
- rq = context_switch(rq, prev, next); /* unlocks the grq */
- cpu = cpu_of(rq);
- idle = rq->idle;
- } else {
- check_smt_siblings(cpu);
- grq_unlock_irq();
- }
-
-rerun_prev_unlocked:
- sched_preempt_enable_no_resched();
-}
-
-asmlinkage __visible void __sched schedule(void)
-{
- do {
- __schedule();
- } while (need_resched());
-}
-
-EXPORT_SYMBOL(schedule);
-
-#ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage __visible void __sched schedule_user(void)
-{
- /*
- * If we come here after a random call to set_need_resched(),
- * or we have been woken up remotely but the IPI has not yet arrived,
- * we haven't yet exited the RCU idle mode. Do it here manually until
- * we find a better solution.
- *
- * NB: There are buggy callers of this function. Ideally we
- * should warn if prev_state != IN_USER, but that will trigger
- * too frequently to make sense yet.
- */
- enum ctx_state prev_state = exception_enter();
- schedule();
- exception_exit(prev_state);
-}
-#endif
-
-/**
- * schedule_preempt_disabled - called with preemption disabled
- *
- * Returns with preemption disabled. Note: preempt_count must be 1
- */
-void __sched schedule_preempt_disabled(void)
-{
- sched_preempt_enable_no_resched();
- schedule();
- preempt_disable();
-}
-
-static void __sched notrace preempt_schedule_common(void)
-{
- do {
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
- } while (need_resched());
-}
-
-#ifdef CONFIG_PREEMPT
-/*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
- */
-asmlinkage __visible void __sched notrace preempt_schedule(void)
-{
- /*
- * If there is a non-zero preempt_count or interrupts are disabled,
- * we do not want to preempt the current task. Just return..
- */
- if (likely(!preemptible()))
- return;
-
- preempt_schedule_common();
-}
-NOKPROBE_SYMBOL(preempt_schedule);
-EXPORT_SYMBOL(preempt_schedule);
-
-#ifdef CONFIG_CONTEXT_TRACKING
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
-{
- enum ctx_state prev_ctx;
-
- if (likely(!preemptible()))
- return;
-
- do {
- __preempt_count_add(PREEMPT_ACTIVE);
- /*
- * Needs preempt disabled in case user_exit() is traced
- * and the tracer calls preempt_enable_notrace() causing
- * an infinite recursion.
- */
- prev_ctx = exception_enter();
- __schedule();
- exception_exit(prev_ctx);
-
- __preempt_count_sub(PREEMPT_ACTIVE);
- barrier();
- } while (need_resched());
-}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
-
-#endif /* CONFIG_PREEMPT */
-
-/*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
- */
-asmlinkage __visible void __sched preempt_schedule_irq(void)
-{
- enum ctx_state prev_state;
-
- /* Catch callers which need to be fixed */
- BUG_ON(preempt_count() || !irqs_disabled());
-
- prev_state = exception_enter();
-
- do {
- __preempt_count_add(PREEMPT_ACTIVE);
- local_irq_enable();
- schedule();
- local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
- } while (need_resched());
-
- exception_exit(prev_state);
-}
-
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
- void *key)
-{
- return try_to_wake_up(curr->private, mode, wake_flags);
-}
-EXPORT_SYMBOL(default_wake_function);
-
-#ifdef CONFIG_RT_MUTEXES
-
-/*
- * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
- *
- * This function changes the 'effective' priority of a task. It does
- * not touch ->normal_prio like __setscheduler().
- *
- * Used by the rt_mutex code to implement priority inheritance
- * logic. Call site only calls if the priority of the task changed.
- */
-void rt_mutex_setprio(struct task_struct *p, int prio)
-{
- unsigned long flags;
- int queued, oldprio;
- struct rq *rq;
-
- BUG_ON(prio < 0 || prio > MAX_PRIO);
-
- rq = task_grq_lock(p, &flags);
-
- /*
- * Idle task boosting is a nono in general. There is one
- * exception, when PREEMPT_RT and NOHZ is active:
- *
- * The idle task calls get_next_timer_interrupt() and holds
- * the timer wheel base->lock on the CPU and another CPU wants
- * to access the timer (probably to cancel it). We can safely
- * ignore the boosting request, as the idle CPU runs this code
- * with interrupts disabled and will complete the lock
- * protected section without being interrupted. So there is no
- * real need to boost.
- */
- if (unlikely(p == rq->idle)) {
- WARN_ON(p != rq->curr);
- WARN_ON(p->pi_blocked_on);
- goto out_unlock;
- }
-
- trace_sched_pi_setprio(p, prio);
- oldprio = p->prio;
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
- p->prio = prio;
- if (task_running(p) && prio > oldprio)
- resched_task(p);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
-
-out_unlock:
- task_grq_unlock(&flags);
-}
-
-#endif
-
-/*
- * Adjust the deadline for when the priority is to change, before it's
- * changed.
- */
-static inline void adjust_deadline(struct task_struct *p, int new_prio)
-{
- p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
-}
-
-void set_user_nice(struct task_struct *p, long nice)
-{
- int queued, new_static, old_static;
- unsigned long flags;
- struct rq *rq;
-
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
- return;
- new_static = NICE_TO_PRIO(nice);
- /*
- * We have to be careful, if called from sys_setpriority(),
- * the task might be in the middle of scheduling on another CPU.
- */
- rq = time_task_grq_lock(p, &flags);
- /*
- * The RT priorities are set via sched_setscheduler(), but we still
- * allow the 'normal' nice value to be set - but as expected
- * it wont have any effect on scheduling until the task is
- * not SCHED_NORMAL/SCHED_BATCH:
- */
- if (has_rt_policy(p)) {
- p->static_prio = new_static;
- goto out_unlock;
- }
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
-
- adjust_deadline(p, new_static);
- old_static = p->static_prio;
- p->static_prio = new_static;
- p->prio = effective_prio(p);
-
- if (queued) {
- enqueue_task(p, rq);
- if (new_static < old_static)
- try_preempt(p, rq);
- } else if (task_running(p)) {
- reset_rq_task(rq, p);
- if (old_static < new_static)
- resched_task(p);
- }
-out_unlock:
- task_grq_unlock(&flags);
-}
-EXPORT_SYMBOL(set_user_nice);
-
-/*
- * can_nice - check if a task can reduce its nice value
- * @p: task
- * @nice: nice value
- */
-int can_nice(const struct task_struct *p, const int nice)
-{
- /* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = nice_to_rlimit(nice);
-
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
-}
-
-#ifdef __ARCH_WANT_SYS_NICE
-
-/*
- * sys_nice - change the priority of the current process.
- * @increment: priority increment
- *
- * sys_setpriority is a more generic, but much slower function that
- * does similar things.
- */
-SYSCALL_DEFINE1(nice, int, increment)
-{
- long nice, retval;
-
- /*
- * Setpriority might change our priority at the same moment.
- * We don't have to worry. Conceptually one call occurs first
- * and we have a single winner.
- */
-
- increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
- nice = task_nice(current) + increment;
-
- nice = clamp_val(nice, MIN_NICE, MAX_NICE);
- if (increment < 0 && !can_nice(current, nice))
- return -EPERM;
-
- retval = security_task_setnice(current, nice);
- if (retval)
- return retval;
-
- set_user_nice(current, nice);
- return 0;
-}
-
-#endif
-
-/**
- * task_prio - return the priority value of a given task.
- * @p: the task in question.
- *
- * Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
- * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
- */
-int task_prio(const struct task_struct *p)
-{
- int delta, prio = p->prio - MAX_RT_PRIO;
-
- /* rt tasks and iso tasks */
- if (prio <= 0)
- goto out;
-
- /* Convert to ms to avoid overflows */
- delta = NS_TO_MS(p->deadline - grq.niffies);
- delta = delta * 40 / ms_longest_deadline_diff();
- if (delta > 0 && delta <= 80)
- prio += delta;
- if (idleprio_task(p))
- prio += 40;
-out:
- return prio;
-}
-
-/**
- * idle_cpu - is a given cpu idle currently?
- * @cpu: the processor in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int idle_cpu(int cpu)
-{
- return cpu_curr(cpu) == cpu_rq(cpu)->idle;
-}
-
-/**
- * idle_task - return the idle task for a given cpu.
- * @cpu: the processor in question.
- *
- * Return: The idle task for the cpu @cpu.
- */
-struct task_struct *idle_task(int cpu)
-{
- return cpu_rq(cpu)->idle;
-}
-
-/**
- * find_process_by_pid - find a process with a matching PID value.
- * @pid: the pid in question.
- *
- * The task of @pid, if found. %NULL otherwise.
- */
-static inline struct task_struct *find_process_by_pid(pid_t pid)
-{
- return pid ? find_task_by_vpid(pid) : current;
-}
-
-/* Actually do priority change: must hold grq lock. */
-static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
- int prio, bool keep_boost)
-{
- int oldrtprio, oldprio;
-
- p->policy = policy;
- oldrtprio = p->rt_priority;
- p->rt_priority = prio;
- p->normal_prio = normal_prio(p);
- oldprio = p->prio;
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
- else
- p->prio = p->normal_prio;
- if (task_running(p)) {
- reset_rq_task(rq, p);
- /* Resched only if we might now be preempted */
- if (p->prio > oldprio || p->rt_priority > oldrtprio)
- resched_task(p);
- }
-}
-
-/*
- * check the target process has a UID that matches the current process's
- */
-static bool check_same_owner(struct task_struct *p)
-{
- const struct cred *cred = current_cred(), *pcred;
- bool match;
-
- rcu_read_lock();
- pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
-}
-
-static int __sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool user)
-{
- struct sched_param zero_param = { .sched_priority = 0 };
- int queued, retval, oldpolicy = -1;
- unsigned long flags, rlim_rtprio = 0;
- int reset_on_fork;
- struct rq *rq;
-
- /* may grab non-irq protected spin_locks */
- BUG_ON(in_interrupt());
-
- if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
- unsigned long lflags;
-
- if (!lock_task_sighand(p, &lflags))
- return -ESRCH;
- rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
- unlock_task_sighand(p, &lflags);
- if (rlim_rtprio)
- goto recheck;
- /*
- * If the caller requested an RT policy without having the
- * necessary rights, we downgrade the policy to SCHED_ISO.
- * We also set the parameter to zero to pass the checks.
- */
- policy = SCHED_ISO;
- param = &zero_param;
- }
-recheck:
- /* double check policy once rq lock held */
- if (policy < 0) {
- reset_on_fork = p->sched_reset_on_fork;
- policy = oldpolicy = p->policy;
- } else {
- reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
- policy &= ~SCHED_RESET_ON_FORK;
-
- if (!SCHED_RANGE(policy))
- return -EINVAL;
- }
-
- /*
- * Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
- * SCHED_BATCH is 0.
- */
- if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
- (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
- return -EINVAL;
- if (is_rt_policy(policy) != (param->sched_priority != 0))
- return -EINVAL;
-
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (is_rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* can't set/change the rt policy */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* can't increase priority */
- if (param->sched_priority > p->rt_priority &&
- param->sched_priority > rlim_rtprio)
- return -EPERM;
- } else {
- switch (p->policy) {
- /*
- * Can only downgrade policies but not back to
- * SCHED_NORMAL
- */
- case SCHED_ISO:
- if (policy == SCHED_ISO)
- goto out;
- if (policy == SCHED_NORMAL)
- return -EPERM;
- break;
- case SCHED_BATCH:
- if (policy == SCHED_BATCH)
- goto out;
- if (policy != SCHED_IDLEPRIO)
- return -EPERM;
- break;
- case SCHED_IDLEPRIO:
- if (policy == SCHED_IDLEPRIO)
- goto out;
- return -EPERM;
- default:
- break;
- }
- }
-
- /* can't change other user's priorities */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
- if (user) {
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
- }
-
- /*
- * make sure no PI-waiters arrive (or leave) while we are
- * changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
- * To be able to change p->policy safely, the grunqueue lock must be
- * held.
- */
- rq = __task_grq_lock(p);
-
- /*
- * Changing the policy of the stop threads its a very bad idea
- */
- if (p == rq->stop) {
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- return -EINVAL;
- }
-
- /*
- * If not changing anything there's no need to proceed further:
- */
- if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
- param->sched_priority == p->rt_priority))) {
-
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- return 0;
- }
-
- /* recheck policy now with rq lock held */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- goto recheck;
- }
- update_clocks(rq);
- p->sched_reset_on_fork = reset_on_fork;
-
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
- __setscheduler(p, rq, policy, param->sched_priority, true);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
- rt_mutex_adjust_pi(p);
-out:
- return 0;
-}
-
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return __sched_setscheduler(p, policy, param, true);
-}
-
-EXPORT_SYMBOL_GPL(sched_setscheduler);
-
-int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
-{
- const struct sched_param param = { .sched_priority = attr->sched_priority };
- int policy = attr->sched_policy;
-
- return __sched_setscheduler(p, policy, &param, true);
-}
-EXPORT_SYMBOL_GPL(sched_setattr);
-
-/**
- * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Just like sched_setscheduler, only don't bother checking if the
- * current context has permission. For example, this is needed in
- * stop_machine(): we create temporary high priority worker threads,
- * but our caller might not have that capability.
- *
- * Return: 0 on success. An error code otherwise.
- */
-int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return __sched_setscheduler(p, policy, param, false);
-}
-
-static int
-do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-{
- struct sched_param lparam;
- struct task_struct *p;
- int retval;
-
- if (!param || pid < 0)
- return -EINVAL;
- if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
- return -EFAULT;
-
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setscheduler(p, policy, &lparam);
- rcu_read_unlock();
-
- return retval;
-}
-
-/*
- * Mimics kernel/events/core.c perf_copy_attr().
- */
-static int sched_copy_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr)
-{
- u32 size;
- int ret;
-
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
- return -EFAULT;
-
- /*
- * zero the full structure, so that a short copy will be nice.
- */
- memset(attr, 0, sizeof(*attr));
-
- ret = get_user(size, &uattr->size);
- if (ret)
- return ret;
-
- if (size > PAGE_SIZE) /* silly large */
- goto err_size;
-
- if (!size) /* abi compat */
- size = SCHED_ATTR_SIZE_VER0;
-
- if (size < SCHED_ATTR_SIZE_VER0)
- goto err_size;
-
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
- }
-
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
-
- /*
- * XXX: do we want to be lenient like existing syscalls; or do we want
- * to be strict and return an error on out-of-bounds values?
- */
- attr->sched_nice = clamp(attr->sched_nice, -20, 19);
-
- /* sched/core.c uses zero here but we already know ret is zero */
- return 0;
-
-err_size:
- put_user(sizeof(*attr), &uattr->size);
- return -E2BIG;
-}
-
-/**
- * sys_sched_setscheduler - set/change the scheduler policy and RT priority
- * @pid: the pid in question.
- * @policy: new policy.
- *
- * Return: 0 on success. An error code otherwise.
- * @param: structure containing the new RT priority.
- */
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
- struct sched_param __user *param)
-{
- /* negative values for policy are not valid */
- if (policy < 0)
- return -EINVAL;
-
- return do_sched_setscheduler(pid, policy, param);
-}
-
-/*
- * sched_setparam() passes in -1 for its policy, to let the functions
- * it calls know not to change it.
- */
-#define SETPARAM_POLICY -1
-
-/**
- * sys_sched_setparam - set/change the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
- return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-}
-
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, flags)
-{
- struct sched_attr attr;
- struct task_struct *p;
- int retval;
-
- if (!uattr || pid < 0 || flags)
- return -EINVAL;
-
- retval = sched_copy_attr(uattr, &attr);
- if (retval)
- return retval;
-
- if ((int)attr.sched_policy < 0)
- return -EINVAL;
-
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setattr(p, &attr);
- rcu_read_unlock();
-
- return retval;
-}
-
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
- struct task_struct *p;
- int retval = -EINVAL;
-
- if (pid < 0)
- goto out_nounlock;
-
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy;
- }
- rcu_read_unlock();
-
-out_nounlock:
- return retval;
-}
-
-/**
- * sys_sched_getscheduler - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
- struct sched_param lp = { .sched_priority = 0 };
- struct task_struct *p;
- int retval = -EINVAL;
-
- if (!param || pid < 0)
- goto out_nounlock;
-
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- if (has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
-
- /*
- * This one might sleep, we cannot do it with a spinlock held ...
- */
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
-out_nounlock:
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
-}
-
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
-{
- int ret;
-
- if (!access_ok(VERIFY_WRITE, uattr, usize))
- return -EFAULT;
-
- /*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
- */
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
-
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
-
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
-
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
- return -EFAULT;
-
- /* sched/core.c uses zero here but we already know ret is zero */
- return ret;
-}
-
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
-{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
- struct task_struct *p;
- int retval;
-
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
- return -EINVAL;
-
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- attr.sched_policy = p->policy;
- if (rt_task(p))
- attr.sched_priority = p->rt_priority;
- else
- attr.sched_nice = task_nice(p);
-
- rcu_read_unlock();
-
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
-}
-
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
-{
- cpumask_var_t cpus_allowed, new_mask;
- struct task_struct *p;
- int retval;
-
- get_online_cpus();
- rcu_read_lock();
-
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
- put_online_cpus();
- return -ESRCH;
- }
-
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
-
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
- retval = -EPERM;
- if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- goto out_unlock;
- }
- rcu_read_unlock();
- }
-
- retval = security_task_setscheduler(p);
- if (retval)
- goto out_unlock;
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
-again:
- retval = set_cpus_allowed_ptr(p, new_mask);
-
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
- }
- }
-out_unlock:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
-out_put_task:
- put_task_struct(p);
- put_online_cpus();
- return retval;
-}
-
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
- cpumask_t *new_mask)
-{
- if (len < sizeof(cpumask_t)) {
- memset(new_mask, 0, sizeof(cpumask_t));
- } else if (len > sizeof(cpumask_t)) {
- len = sizeof(cpumask_t);
- }
- return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
-}
-
-
-/**
- * sys_sched_setaffinity - set the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new cpu mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
-{
- cpumask_var_t new_mask;
- int retval;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
- if (retval == 0)
- retval = sched_setaffinity(pid, new_mask);
- free_cpumask_var(new_mask);
- return retval;
-}
-
-long sched_getaffinity(pid_t pid, cpumask_t *mask)
-{
- struct task_struct *p;
- unsigned long flags;
- int retval;
-
- get_online_cpus();
- rcu_read_lock();
-
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- grq_lock_irqsave(&flags);
- cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
- grq_unlock_irqrestore(&flags);
-
-out_unlock:
- rcu_read_unlock();
- put_online_cpus();
-
- return retval;
-}
-
-/**
- * sys_sched_getaffinity - get the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current cpu mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
-{
- int ret;
- cpumask_var_t mask;
-
- if ((len * BITS_PER_BYTE) < nr_cpu_ids)
- return -EINVAL;
- if (len & (sizeof(unsigned long)-1))
- return -EINVAL;
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- ret = sched_getaffinity(pid, mask);
- if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
-
- if (copy_to_user(user_mask_ptr, mask, retlen))
- ret = -EFAULT;
- else
- ret = retlen;
- }
- free_cpumask_var(mask);
-
- return ret;
-}
-
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. It does this by
- * scheduling away the current task. If it still has the earliest deadline
- * it will be scheduled again as the next task.
- *
- * Return: 0.
- */
-SYSCALL_DEFINE0(sched_yield)
-{
- struct task_struct *p;
-
- p = current;
- grq_lock_irq();
- schedstat_inc(task_rq(p), yld_count);
- requeue_task(p);
-
- /*
- * Since we are going to call schedule() anyway, there's
- * no need to preempt or enable interrupts:
- */
- __release(grq.lock);
- spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
- do_raw_spin_unlock(&grq.lock);
- sched_preempt_enable_no_resched();
-
- schedule();
-
- return 0;
-}
-
-int __sched _cond_resched(void)
-{
- if (should_resched()) {
- preempt_schedule_common();
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(_cond_resched);
-
-/*
- * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-int __cond_resched_lock(spinlock_t *lock)
-{
- int resched = should_resched();
- int ret = 0;
-
- lockdep_assert_held(lock);
-
- if (spin_needbreak(lock) || resched) {
- spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
- cpu_relax();
- ret = 1;
- spin_lock(lock);
- }
- return ret;
-}
-EXPORT_SYMBOL(__cond_resched_lock);
-
-int __sched __cond_resched_softirq(void)
-{
- BUG_ON(!in_softirq());
-
- if (should_resched()) {
- local_bh_enable();
- preempt_schedule_common();
- local_bh_disable();
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
-/**
- * yield - yield the current processor to other threads.
- *
- * Do not ever use this function, there's a 99% chance you're doing it wrong.
- *
- * The scheduler is at all times free to pick the calling task as the most
- * eligible task to run, if removing the yield() call from your code breaks
- * it, its already broken.
- *
- * Typical broken usage is:
- *
- * while (!event)
- * yield();
- *
- * where one assumes that yield() will let 'the other' process run that will
- * make event true. If the current task is a SCHED_FIFO task that will never
- * happen. Never use yield() as a progress guarantee!!
- *
- * If you want to use yield() to wait for something, use wait_event().
- * If you want to use yield() to be 'nice' for others, use cond_resched().
- * If you still want to use yield(), do not!
- */
-void __sched yield(void)
-{
- set_current_state(TASK_RUNNING);
- sys_sched_yield();
-}
-EXPORT_SYMBOL(yield);
-
-/**
- * yield_to - yield the current processor to another thread in
- * your thread group, or accelerate that thread toward the
- * processor it's on.
- * @p: target task
- * @preempt: whether task preemption is allowed or not
- *
- * It's the caller's job to ensure that the target task struct
- * can't go away on us before we can do any checks.
- *
- * Return:
- * true (>0) if we indeed boosted the target task.
- * false (0) if we failed to boost the target.
- * -ESRCH if there's no task to yield to.
- */
-int __sched yield_to(struct task_struct *p, bool preempt)
-{
- struct rq *rq, *p_rq;
- unsigned long flags;
- int yielded = 0;
-
- rq = this_rq();
- grq_lock_irqsave(&flags);
- if (task_running(p) || p->state) {
- yielded = -ESRCH;
- goto out_unlock;
- }
-
- p_rq = task_rq(p);
- yielded = 1;
- if (p->deadline > rq->rq_deadline)
- p->deadline = rq->rq_deadline;
- p->time_slice += rq->rq_time_slice;
- rq->rq_time_slice = 0;
- if (p->time_slice > timeslice())
- p->time_slice = timeslice();
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
-out_unlock:
- grq_unlock_irqrestore(&flags);
-
- if (yielded > 0)
- schedule();
- return yielded;
-}
-EXPORT_SYMBOL_GPL(yield_to);
-
-/*
- * This task is about to go to sleep on IO. Increment rq->nr_iowait so
- * that process accounting knows that this is a task in IO wait state.
- *
- * But don't do that if it is a deliberate, throttling IO wait (this task
- * has set its backing_dev_info: the queue against which it should throttle)
- */
-
-long __sched io_schedule_timeout(long timeout)
-{
- int old_iowait = current->in_iowait;
- struct rq *rq;
- long ret;
-
- current->in_iowait = 1;
- blk_schedule_flush_plug(current);
-
- delayacct_blkio_start();
- rq = raw_rq();
- atomic_inc(&rq->nr_iowait);
- ret = schedule_timeout(timeout);
- current->in_iowait = old_iowait;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-
- return ret;
-}
-EXPORT_SYMBOL(io_schedule_timeout);
-
-/**
- * sys_sched_get_priority_max - return maximum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the maximum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = MAX_USER_RT_PRIO-1;
- break;
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_ISO:
- case SCHED_IDLEPRIO:
- ret = 0;
- break;
- }
- return ret;
-}
-
-/**
- * sys_sched_get_priority_min - return minimum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the minimum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = 1;
- break;
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_ISO:
- case SCHED_IDLEPRIO:
- ret = 0;
- break;
- }
- return ret;
-}
-
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
-{
- struct task_struct *p;
- unsigned int time_slice;
- unsigned long flags;
- int retval;
- struct timespec t;
-
- if (pid < 0)
- return -EINVAL;
-
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- grq_lock_irqsave(&flags);
- time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
- grq_unlock_irqrestore(&flags);
-
- rcu_read_unlock();
- t = ns_to_timespec(time_slice);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
-}
-
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
-void sched_show_task(struct task_struct *p)
-{
- unsigned long free = 0;
- int ppid;
- unsigned long state = p->state;
-
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running ");
- else
- printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
- else
- printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
-#ifdef CONFIG_DEBUG_STACK_USAGE
- free = stack_not_used(p);
-#endif
- ppid = 0;
- rcu_read_lock();
- if (pid_alive(p))
- ppid = task_pid_nr(rcu_dereference(p->real_parent));
- rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
-
- print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
-}
-
-void show_state_filter(unsigned long state_filter)
-{
- struct task_struct *g, *p;
-
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
- rcu_read_lock();
- for_each_process_thread(g, p) {
- /*
- * reset the NMI-timeout, listing all files on a slow
- * console might take a lot of time:
- */
- touch_nmi_watchdog();
- if (!state_filter || (p->state & state_filter))
- sched_show_task(p);
- }
-
- touch_all_softlockup_watchdogs();
-
- rcu_read_unlock();
- /*
- * Only show locks if all tasks are dumped:
- */
- if (!state_filter)
- debug_show_all_locks();
-}
-
-void dump_cpu_task(int cpu)
-{
- pr_info("Task dump for CPU %d:\n", cpu);
- sched_show_task(cpu_curr(cpu));
-}
-
-#ifdef CONFIG_SMP
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- cpumask_copy(tsk_cpus_allowed(p), new_mask);
-}
-#endif
-
-/**
- * init_idle - set up an idle thread for a given CPU
- * @idle: task in question
- * @cpu: cpu the idle task belongs to
- *
- * NOTE: this function does not set the idle thread's NEED_RESCHED
- * flag, to make booting more robust.
- */
-void init_idle(struct task_struct *idle, int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- time_grq_lock(rq, &flags);
- idle->last_ran = rq->clock_task;
- idle->state = TASK_RUNNING;
- /* Setting prio to illegal value shouldn't matter when never queued */
- idle->prio = PRIO_LIMIT;
-#ifdef CONFIG_SMT_NICE
- idle->smt_bias = 0;
-#endif
- set_rq_task(rq, idle);
- do_set_cpus_allowed(idle, get_cpu_mask(cpu));
- /* Silence PROVE_RCU */
- rcu_read_lock();
- set_task_cpu(idle, cpu);
- rcu_read_unlock();
- rq->curr = rq->idle = idle;
- idle->on_cpu = 1;
- grq_unlock_irqrestore(&flags);
-
- /* Set the preempt count _outside_ the spinlocks! */
- init_idle_preempt_count(idle, cpu);
-
- ftrace_graph_init_idle_task(idle, cpu);
-#if defined(CONFIG_SMP)
- sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
-}
-
-int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
- const struct cpumask __maybe_unused *trial)
-{
- return 1;
-}
-
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_cpus_allowed)
-{
- int ret = 0;
-
- /*
- * Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
- * affinity and isolating such threads by their set of
- * allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for
- * success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
- */
- if (p->flags & PF_NO_SETAFFINITY)
- ret = -EINVAL;
-
- return ret;
-}
-
-void resched_cpu(int cpu)
-{
- unsigned long flags;
-
- grq_lock_irqsave(&flags);
- resched_task(cpu_curr(cpu));
- grq_unlock_irqrestore(&flags);
-}
-
-#ifdef CONFIG_SMP
-#ifdef CONFIG_NO_HZ_COMMON
-void nohz_balance_enter_idle(int cpu)
-{
-}
-
-void select_nohz_load_balancer(int stop_tick)
-{
-}
-
-void set_cpu_sd_state_idle(void) {}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu: The cpu whose lowest level of sched domain is to
- * be returned.
- * @flag: The flag to check for the lowest sched_domain
- * for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
- struct sched_domain *sd;
-
- for_each_domain(cpu, sd)
- if (sd && (sd->flags & flag))
- break;
-
- return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu: The cpu whose domains we're iterating over.
- * @sd: variable holding the value of the power_savings_sd
- * for cpu.
- * @flag: The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
- for (sd = lowest_flag_domain(cpu, flag); \
- (sd && (sd->flags & flag)); sd = sd->parent)
-
-#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-
-/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu. This is good for power-savings.
- *
- * We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
- */
-int get_nohz_timer_target(int pinned)
-{
- int cpu = smp_processor_id();
- int i;
- struct sched_domain *sd;
-
- if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
- return cpu;
-
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
- }
- }
-unlock:
- rcu_read_unlock();
- return cpu;
-}
-
-/*
- * When add_timer_on() enqueues a timer into the timer wheel of an
- * idle CPU then this timer might expire before the next timer event
- * which is scheduled to wake up that CPU. In case of a completely
- * idle system the next event might even be infinite time into the
- * future. wake_up_idle_cpu() ensures that the CPU is woken up and
- * leaves the inner idle loop so the newly added timer is taken into
- * account when the CPU goes back to idle and evaluates the timer
- * wheel for the next timer event.
- */
-void wake_up_idle_cpu(int cpu)
-{
- if (cpu == smp_processor_id())
- return;
-
- set_tsk_need_resched(cpu_rq(cpu)->idle);
- smp_send_reschedule(cpu);
-}
-
-void wake_up_nohz_cpu(int cpu)
-{
- wake_up_idle_cpu(cpu);
-}
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- bool running_wrong = false;
- bool queued = false;
- unsigned long flags;
- struct rq *rq;
- int ret = 0;
-
- rq = task_grq_lock(p, &flags);
-
- if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- queued = task_queued(p);
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- if (task_running(p)) {
- /* Task is running on the wrong cpu now, reschedule it. */
- if (rq == this_rq()) {
- set_tsk_need_resched(p);
- running_wrong = true;
- } else
- resched_task(p);
- } else
- set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
-
-out:
- if (queued)
- try_preempt(p, rq);
- task_grq_unlock(&flags);
-
- if (running_wrong)
- preempt_schedule_common();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-#ifdef CONFIG_HOTPLUG_CPU
-extern struct task_struct *cpu_stopper_task;
-/* Run through task list and find tasks affined to the dead cpu, then remove
- * that cpu from the list, enable cpu0 and set the zerobound flag. */
-static void bind_zero(int src_cpu)
-{
- struct task_struct *p, *t, *stopper;
- int bound = 0;
-
- if (src_cpu == 0)
- return;
-
- stopper = per_cpu(cpu_stopper_task, src_cpu);
- do_each_thread(t, p) {
- if (p != stopper && cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
- cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p));
- cpumask_set_cpu(0, tsk_cpus_allowed(p));
- p->zerobound = true;
- bound++;
- }
- clear_sticky(p);
- } while_each_thread(t, p);
-
- if (bound) {
- printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n",
- bound, src_cpu);
- }
-}
-
-/* Find processes with the zerobound flag and reenable their affinity for the
- * CPU coming alive. */
-static void unbind_zero(int src_cpu)
-{
- int unbound = 0, zerobound = 0;
- struct task_struct *p, *t;
-
- if (src_cpu == 0)
- return;
-
- do_each_thread(t, p) {
- if (!p->mm)
- p->zerobound = false;
- if (p->zerobound) {
- unbound++;
- cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p));
- /* Once every CPU affinity has been re-enabled, remove
- * the zerobound flag */
- if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) {
- p->zerobound = false;
- zerobound++;
- }
- }
- } while_each_thread(t, p);
-
- if (unbound) {
- printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
- unbound, src_cpu);
- }
- if (zerobound) {
- printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
- zerobound);
- }
-}
-
-/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
- */
-void idle_task_exit(void)
-{
- struct mm_struct *mm = current->active_mm;
-
- BUG_ON(cpu_online(smp_processor_id()));
-
- if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
- finish_arch_post_lock_switch();
- }
- mmdrop(mm);
-}
-#else /* CONFIG_HOTPLUG_CPU */
-static void unbind_zero(int src_cpu) {}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-void sched_set_stop_task(int cpu, struct task_struct *stop)
-{
- struct sched_param stop_param = { .sched_priority = STOP_PRIO };
- struct sched_param start_param = { .sched_priority = 0 };
- struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
- if (stop) {
- /*
- * Make it appear like a SCHED_FIFO task, its something
- * userspace knows about and won't get confused about.
- *
- * Also, it will make PI more or less work without too
- * much confusion -- but then, stop work should not
- * rely on PI working anyway.
- */
- sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
- }
-
- cpu_rq(cpu)->stop = stop;
-
- if (old_stop) {
- /*
- * Reset it back to a normal scheduling policy so that
- * it can die in pieces.
- */
- sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
- }
-}
-
-
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
- {
- .procname = "sched_domain",
- .mode = 0555,
- },
- {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = sd_ctl_dir,
- },
- {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
- struct ctl_table *entry =
- kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
- return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
- struct ctl_table *entry;
-
- /*
- * In the intermediate directories, both the child directory and
- * procname are dynamically allocated and could fail but the mode
- * will always be set. In the lowest directory the names are
- * static strings and all have proc handlers.
- */
- for (entry = *tablep; entry->mode; entry++) {
- if (entry->child)
- sd_free_ctl_entry(&entry->child);
- if (entry->proc_handler == NULL)
- kfree(entry->procname);
- }
-
- kfree(*tablep);
- *tablep = NULL;
-}
-
-static void
-set_table_entry(struct ctl_table *entry,
- const char *procname, void *data, int maxlen,
- mode_t mode, proc_handler *proc_handler)
-{
- entry->procname = procname;
- entry->data = data;
- entry->maxlen = maxlen;
- entry->mode = mode;
- entry->proc_handler = proc_handler;
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
-
- if (table == NULL)
- return NULL;
-
- set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[9], "cache_nice_tries",
- &sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[11], "max_newidle_lb_cost",
- &sd->max_newidle_lb_cost,
- sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[12], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring);
- /* &table[13] is terminator */
-
- return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
- struct ctl_table *entry, *table;
- struct sched_domain *sd;
- int domain_num = 0, i;
- char buf[32];
-
- for_each_domain(cpu, sd)
- domain_num++;
- entry = table = sd_alloc_ctl_entry(domain_num + 1);
- if (table == NULL)
- return NULL;
-
- i = 0;
- for_each_domain(cpu, sd) {
- snprintf(buf, 32, "domain%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
- entry++;
- i++;
- }
- return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
- char buf[32];
-
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
-
- if (entry == NULL)
- return;
-
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
- }
-
- WARN_ON(sd_sysctl_header);
- sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
- if (sd_sysctl_header)
- unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif
-
-static void set_rq_online(struct rq *rq)
-{
- if (!rq->online) {
- cpumask_set_cpu(cpu_of(rq), rq->rd->online);
- rq->online = true;
- }
-}
-
-static void set_rq_offline(struct rq *rq)
-{
- if (rq->online) {
- cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
- rq->online = false;
- }
-}
-
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (long)hcpu;
- unsigned long flags;
- struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_HOTPLUG_CPU
- struct task_struct *idle = rq->idle;
-#endif
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
- return NOTIFY_OK;
- case CPU_UP_PREPARE:
- break;
-
- case CPU_ONLINE:
- /* Update our root-domain */
- grq_lock_irqsave(&flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
- set_rq_online(rq);
- }
- unbind_zero(cpu);
- grq.noc = num_online_cpus();
- grq_unlock_irqrestore(&flags);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- grq_lock_irq();
- set_rq_task(rq, idle);
- update_clocks(rq);
- grq_unlock_irq();
- break;
-
- case CPU_DYING:
- /* Update our root-domain */
- grq_lock_irqsave(&flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- bind_zero(cpu);
- grq.noc = num_online_cpus();
- grq_unlock_irqrestore(&flags);
- break;
-#endif
- }
- return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else. This has to be lower priority than
- * the notifier in the perf_counter subsystem, though.
- */
-static struct notifier_block migration_notifier = {
- .notifier_call = migration_call,
- .priority = CPU_PRI_MIGRATION,
-};
-
-static int sched_cpu_active(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_FAILED:
- set_cpu_active((long)hcpu, true);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- set_cpu_active((long)hcpu, false);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-int __init migration_init(void)
-{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- /* Initialise migration for the boot CPU */
- err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
- BUG_ON(err == NOTIFY_BAD);
- migration_call(&migration_notifier, CPU_ONLINE, cpu);
- register_cpu_notifier(&migration_notifier);
-
- /* Register cpu active notifiers */
- cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
- cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
- return 0;
-}
-early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
-
-static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
-
-#ifdef CONFIG_SCHED_DEBUG
-
-static __read_mostly int sched_debug_enabled;
-
-static int __init sched_debug_setup(char *str)
-{
- sched_debug_enabled = 1;
-
- return 0;
-}
-early_param("sched_debug", sched_debug_setup);
-
-static inline bool sched_debug(void)
-{
- return sched_debug_enabled;
-}
-
-static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
- struct cpumask *groupmask)
-{
- cpumask_clear(groupmask);
-
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
- return -1;
- }
-
- printk(KERN_CONT "span %*pbl level %s\n",
- cpumask_pr_args(sched_domain_span(sd)), sd->name);
-
- if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- printk(KERN_ERR "ERROR: domain->span does not contain "
- "CPU%d\n", cpu);
- }
-
- printk(KERN_CONT "\n");
-
- if (!cpumask_equal(sched_domain_span(sd), groupmask))
- printk(KERN_ERR "ERROR: groups don't span domain->span\n");
-
- if (sd->parent &&
- !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
- printk(KERN_ERR "ERROR: parent span is not a superset "
- "of domain->span\n");
- return 0;
-}
-
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
-{
- int level = 0;
-
- if (!sched_debug_enabled)
- return;
-
- if (!sd) {
- printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
- return;
- }
-
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
-
- for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
- break;
- level++;
- sd = sd->parent;
- if (!sd)
- break;
- }
-}
-#else /* !CONFIG_SCHED_DEBUG */
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
-
-static int sd_degenerate(struct sched_domain *sd)
-{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
-
- /* Following flags don't use groups */
- if (sd->flags & (SD_WAKE_AFFINE))
- return 0;
-
- return 1;
-}
-
-static int
-sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
-{
- unsigned long cflags = sd->flags, pflags = parent->flags;
-
- if (sd_degenerate(parent))
- return 1;
-
- if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
- return 0;
-
- if (~cflags & pflags)
- return 0;
-
- return 1;
-}
-
-static void free_rootdomain(struct rcu_head *rcu)
-{
- struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
-
- cpupri_cleanup(&rd->cpupri);
- free_cpumask_var(rd->rto_mask);
- free_cpumask_var(rd->online);
- free_cpumask_var(rd->span);
- kfree(rd);
-}
-
-static void rq_attach_root(struct rq *rq, struct root_domain *rd)
-{
- struct root_domain *old_rd = NULL;
- unsigned long flags;
-
- grq_lock_irqsave(&flags);
-
- if (rq->rd) {
- old_rd = rq->rd;
-
- if (cpumask_test_cpu(rq->cpu, old_rd->online))
- set_rq_offline(rq);
-
- cpumask_clear_cpu(rq->cpu, old_rd->span);
-
- /*
- * If we dont want to free the old_rd yet then
- * set old_rd to NULL to skip the freeing later
- * in this function:
- */
- if (!atomic_dec_and_test(&old_rd->refcount))
- old_rd = NULL;
- }
-
- atomic_inc(&rd->refcount);
- rq->rd = rd;
-
- cpumask_set_cpu(rq->cpu, rd->span);
- if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
- set_rq_online(rq);
-
- grq_unlock_irqrestore(&flags);
-
- if (old_rd)
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
-}
-
-static int init_rootdomain(struct root_domain *rd)
-{
- memset(rd, 0, sizeof(*rd));
-
- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
- goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
- goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
- goto free_online;
-
- if (cpupri_init(&rd->cpupri) != 0)
- goto free_rto_mask;
- return 0;
-
-free_rto_mask:
- free_cpumask_var(rd->rto_mask);
-free_online:
- free_cpumask_var(rd->online);
-free_span:
- free_cpumask_var(rd->span);
-out:
- return -ENOMEM;
-}
-
-static void init_defrootdomain(void)
-{
- init_rootdomain(&def_root_domain);
-
- atomic_set(&def_root_domain.refcount, 1);
-}
-
-static struct root_domain *alloc_rootdomain(void)
-{
- struct root_domain *rd;
-
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
- if (!rd)
- return NULL;
-
- if (init_rootdomain(rd) != 0) {
- kfree(rd);
- return NULL;
- }
-
- return rd;
-}
-
-static void free_sched_domain(struct rcu_head *rcu)
-{
- struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
- kfree(sd);
-}
-
-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
-{
- call_rcu(&sd->rcu, free_sched_domain);
-}
-
-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
-{
- for (; sd; sd = sd->parent)
- destroy_sched_domain(sd, cpu);
-}
-
-/*
- * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
- * hold the hotplug lock.
- */
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- struct sched_domain *tmp;
-
- /* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; ) {
- struct sched_domain *parent = tmp->parent;
- if (!parent)
- break;
-
- if (sd_parent_degenerate(tmp, parent)) {
- tmp->parent = parent->parent;
- if (parent->parent)
- parent->parent->child = tmp;
- /*
- * Transfer SD_PREFER_SIBLING down in case of a
- * degenerate parent; the spans match for this
- * so the property transfers.
- */
- if (parent->flags & SD_PREFER_SIBLING)
- tmp->flags |= SD_PREFER_SIBLING;
- destroy_sched_domain(parent, cpu);
- } else
- tmp = tmp->parent;
- }
-
- if (sd && sd_degenerate(sd)) {
- tmp = sd;
- sd = sd->parent;
- destroy_sched_domain(tmp, cpu);
- if (sd)
- sd->child = NULL;
- }
-
- sched_domain_debug(sd, cpu);
-
- rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp, cpu);
-}
-
-/* cpus with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
- cpulist_parse(str, cpu_isolated_map);
- return 1;
-}
-
-__setup("isolcpus=", isolated_cpu_setup);
-
-struct s_data {
- struct sched_domain ** __percpu sd;
- struct root_domain *rd;
-};
-
-enum s_alloc {
- sa_rootdomain,
- sa_sd,
- sa_sd_storage,
- sa_none,
-};
-
-/*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- */
-
-static int default_relax_domain_level = -1;
-int sched_domain_level_max;
-
-static int __init setup_relax_domain_level(char *str)
-{
- if (kstrtoint(str, 0, &default_relax_domain_level))
- pr_warn("Unable to set relax_domain_level\n");
-
- return 1;
-}
-__setup("relax_domain_level=", setup_relax_domain_level);
-
-static void set_domain_attribute(struct sched_domain *sd,
- struct sched_domain_attr *attr)
-{
- int request;
-
- if (!attr || attr->relax_domain_level < 0) {
- if (default_relax_domain_level < 0)
- return;
- else
- request = default_relax_domain_level;
- } else
- request = attr->relax_domain_level;
- if (request < sd->level) {
- /* turn off idle balance on this domain */
- sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- } else {
- /* turn on idle balance on this domain */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- }
-}
-
-static void __sdt_free(const struct cpumask *cpu_map);
-static int __sdt_alloc(const struct cpumask *cpu_map);
-
-static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
- const struct cpumask *cpu_map)
-{
- switch (what) {
- case sa_rootdomain:
- if (!atomic_read(&d->rd->refcount))
- free_rootdomain(&d->rd->rcu); /* fall through */
- case sa_sd:
- free_percpu(d->sd); /* fall through */
- case sa_sd_storage:
- __sdt_free(cpu_map); /* fall through */
- case sa_none:
- break;
- }
-}
-
-static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
- const struct cpumask *cpu_map)
-{
- memset(d, 0, sizeof(*d));
-
- if (__sdt_alloc(cpu_map))
- return sa_sd_storage;
- d->sd = alloc_percpu(struct sched_domain *);
- if (!d->sd)
- return sa_sd_storage;
- d->rd = alloc_rootdomain();
- if (!d->rd)
- return sa_sd;
- return sa_rootdomain;
-}
-
-/*
- * NULL the sd_data elements we've used to build the sched_domain
- * structure so that the subsequent __free_domain_allocs()
- * will not free the data we're using.
- */
-static void claim_allocations(int cpu, struct sched_domain *sd)
-{
- struct sd_data *sdd = sd->private;
-
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
-}
-
-#ifdef CONFIG_NUMA
-static int sched_domains_numa_levels;
-static int *sched_domains_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
-#endif
-
-/*
- * SD_flags allowed in topology descriptions.
- *
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
- *
- * Odd one out:
- * SD_ASYM_PACKING - describes SMT quirks
- */
-#define TOPOLOGY_SD_FLAGS \
- (SD_SHARE_CPUCAPACITY | \
- SD_SHARE_PKG_RESOURCES | \
- SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
-
-static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
-{
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int sd_weight, sd_flags = 0;
-
-#ifdef CONFIG_NUMA
- /*
- * Ugly hack to pass state to sd_numa_mask()...
- */
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
-
- if (tl->sd_flags)
- sd_flags = (*tl->sd_flags)();
- if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
- "wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
-
- *sd = (struct sched_domain){
- .min_interval = sd_weight,
- .max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
-
- .cache_nice_tries = 0,
- .busy_idx = 0,
- .idle_idx = 0,
- .newidle_idx = 0,
- .wake_idx = 0,
- .forkexec_idx = 0,
-
- .flags = 1*SD_LOAD_BALANCE
- | 1*SD_BALANCE_NEWIDLE
- | 1*SD_BALANCE_EXEC
- | 1*SD_BALANCE_FORK
- | 0*SD_BALANCE_WAKE
- | 1*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUCAPACITY
- | 0*SD_SHARE_PKG_RESOURCES
- | 0*SD_SERIALIZE
- | 0*SD_PREFER_SIBLING
- | 0*SD_NUMA
- | sd_flags
- ,
-
- .last_balance = jiffies,
- .balance_interval = sd_weight,
- .smt_gain = 0,
- .max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
-#ifdef CONFIG_SCHED_DEBUG
- .name = tl->name,
-#endif
- };
-
- /*
- * Convert topological properties into behaviour.
- */
-
- if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
- sd->imbalance_pct = 110;
- sd->smt_gain = 1178; /* ~15% */
-
- } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->imbalance_pct = 117;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
-
-#ifdef CONFIG_NUMA
- } else if (sd->flags & SD_NUMA) {
- sd->cache_nice_tries = 2;
- sd->busy_idx = 3;
- sd->idle_idx = 2;
-
- sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
- sd->flags &= ~(SD_BALANCE_EXEC |
- SD_BALANCE_FORK |
- SD_WAKE_AFFINE);
- }
-
-#endif
- } else {
- sd->flags |= SD_PREFER_SIBLING;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
- sd->idle_idx = 1;
- }
-
- sd->private = &tl->data;
-
- return sd;
-}
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
- { NULL, },
-};
-
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->mask; tl++)
-
-void set_sched_topology(struct sched_domain_topology_level *tl)
-{
- sched_domain_topology = tl;
-}
-
-#ifdef CONFIG_NUMA
-
-static const struct cpumask *sd_numa_mask(int cpu)
-{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
-}
-
-static void sched_numa_warn(const char *str)
-{
- static int done = false;
- int i,j;
-
- if (done)
- return;
-
- done = true;
-
- printk(KERN_WARNING "ERROR: %s\n\n", str);
-
- for (i = 0; i < nr_node_ids; i++) {
- printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
- printk(KERN_CONT "\n");
- }
- printk(KERN_WARNING "\n");
-}
-
-static bool find_numa_distance(int distance)
-{
- int i;
-
- if (distance == node_distance(0, 0))
- return true;
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
- }
-
- return false;
-}
-
-static void sched_init_numa(void)
-{
- int next_distance, curr_distance = node_distance(0, 0);
- struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
-
- sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
-
- /*
- * O(nr_nodes^2) deduplicating selection sort -- in order to find the
- * unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
- */
- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
-
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
- }
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
- }
-
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
- }
- /*
- * 'level' contains the number of unique distances, excluding the
- * identity distance node_distance(i,i).
- *
- * The sched_domains_numa_distance[] array includes the actual distance
- * numbers.
- */
-
- /*
- * Here, we should temporarily reset sched_domains_numa_levels to 0.
- * If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
- * dangerous when we use it to iterate array sched_domains_numa_masks[][]
- * in other functions.
- *
- * We reset it to 'level' at the end of this function.
- */
- sched_domains_numa_levels = 0;
-
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
- if (!sched_domains_numa_masks)
- return;
-
- /*
- * Now for each level, construct a mask per node which contains all
- * cpus of nodes that are that many hops away from us.
- */
- for (i = 0; i < level; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
- return;
-
- for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!mask)
- return;
-
- sched_domains_numa_masks[i][j] = mask;
-
- for (k = 0; k < nr_node_ids; k++) {
- if (node_distance(j, k) > sched_domains_numa_distance[i])
- continue;
-
- cpumask_or(mask, mask, cpumask_of_node(k));
- }
- }
- }
-
- /* Compute default topology size */
- for (i = 0; sched_domain_topology[i].mask; i++);
-
- tl = kzalloc((i + level + 1) *
- sizeof(struct sched_domain_topology_level), GFP_KERNEL);
- if (!tl)
- return;
-
- /*
- * Copy the default topology bits..
- */
- for (i = 0; sched_domain_topology[i].mask; i++)
- tl[i] = sched_domain_topology[i];
-
- /*
- * .. and append 'j' levels of NUMA goodness.
- */
- for (j = 0; j < level; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
- }
-
- sched_domain_topology = tl;
-
- sched_domains_numa_levels = level;
-}
-
-static void sched_domains_numa_masks_set(int cpu)
-{
- int i, j;
- int node = cpu_to_node(cpu);
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
- cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
- }
-}
-
-static void sched_domains_numa_masks_clear(int cpu)
-{
- int i, j;
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
-}
-
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- sched_domains_numa_masks_set(cpu);
- break;
-
- case CPU_DEAD:
- sched_domains_numa_masks_clear(cpu);
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
- return NOTIFY_OK;
-}
-#else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- return 0;
-}
-#endif /* CONFIG_NUMA */
-
-static int __sdt_alloc(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- sdd->sd = alloc_percpu(struct sched_domain *);
- if (!sdd->sd)
- return -ENOMEM;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
-
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sd)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sd, j) = sd;
- }
- }
-
- return 0;
-}
-
-static void __sdt_free(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
-
- if (sdd->sd) {
- sd = *per_cpu_ptr(sdd->sd, j);
- kfree(*per_cpu_ptr(sdd->sd, j));
- }
- }
- free_percpu(sdd->sd);
- sdd->sd = NULL;
- }
-}
-
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int cpu)
-{
- struct sched_domain *sd = sd_init(tl, cpu);
- if (!sd)
- return child;
-
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- if (child) {
- sd->level = child->level + 1;
- sched_domain_level_max = max(sched_domain_level_max, sd->level);
- child->parent = sd;
- sd->child = child;
-
- if (!cpumask_subset(sched_domain_span(child),
- sched_domain_span(sd))) {
- pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
- pr_err(" the %s domain not a subset of the %s domain\n",
- child->name, sd->name);
-#endif
- /* Fixup, ensure @sd has at least @child cpus. */
- cpumask_or(sched_domain_span(sd),
- sched_domain_span(sd),
- sched_domain_span(child));
- }
-
- }
- set_domain_attribute(sd, attr);
-
- return sd;
-}
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
-{
- enum s_alloc alloc_state;
- struct sched_domain *sd;
- struct s_data d;
- int i, ret = -ENOMEM;
-
- alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
- if (alloc_state != sa_rootdomain)
- goto error;
-
- /* Set up domains for cpus specified by the cpu_map. */
- for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl;
-
- sd = NULL;
- for_each_sd_topology(tl) {
- sd = build_sched_domain(tl, cpu_map, attr, sd, i);
- if (tl == sched_domain_topology)
- *per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
- }
- }
-
- /* Calculate CPU capacity for physical packages and nodes */
- for (i = nr_cpumask_bits-1; i >= 0; i--) {
- if (!cpumask_test_cpu(i, cpu_map))
- continue;
-
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- claim_allocations(i, sd);
- }
- }
-
- /* Attach the domains */
- rcu_read_lock();
- for_each_cpu(i, cpu_map) {
- sd = *per_cpu_ptr(d.sd, i);
- cpu_attach_domain(sd, d.rd, i);
- }
- rcu_read_unlock();
-
- ret = 0;
-error:
- __free_domain_allocs(&d, alloc_state, cpu_map);
- return ret;
-}
-
-static cpumask_var_t *doms_cur; /* current sched domains */
-static int ndoms_cur; /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;
- /* attribues of custom domains in 'doms_cur' */
-
-/*
- * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask) fails, then fallback to a single sched domain,
- * as determined by the single cpumask fallback_doms.
- */
-static cpumask_var_t fallback_doms;
-
-/*
- * arch_update_cpu_topology lets virtualized architectures update the
- * cpu core maps. It is supposed to return 1 if the topology changed
- * or 0 if it stayed the same.
- */
-int __weak arch_update_cpu_topology(void)
-{
- return 0;
-}
-
-cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
-{
- int i;
- cpumask_var_t *doms;
-
- doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
- if (!doms)
- return NULL;
- for (i = 0; i < ndoms; i++) {
- if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
- free_sched_domains(doms, i);
- return NULL;
- }
- }
- return doms;
-}
-
-void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
-{
- unsigned int i;
- for (i = 0; i < ndoms; i++)
- free_cpumask_var(doms[i]);
- kfree(doms);
-}
-
-/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
-static int init_sched_domains(const struct cpumask *cpu_map)
-{
- int err;
-
- arch_update_cpu_topology();
- ndoms_cur = 1;
- doms_cur = alloc_sched_domains(ndoms_cur);
- if (!doms_cur)
- doms_cur = &fallback_doms;
- cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- err = build_sched_domains(doms_cur[0], NULL);
- register_sched_domain_sysctl();
-
- return err;
-}
-
-/*
- * Detach sched domains from a group of cpus specified in cpu_map
- * These cpus will now be attached to the NULL domain
- */
-static void detach_destroy_domains(const struct cpumask *cpu_map)
-{
- int i;
-
- rcu_read_lock();
- for_each_cpu(i, cpu_map)
- cpu_attach_domain(NULL, &def_root_domain, i);
- rcu_read_unlock();
-}
-
-/* handle null as "default" */
-static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
- struct sched_domain_attr *new, int idx_new)
-{
- struct sched_domain_attr tmp;
-
- /* fast path */
- if (!new && !cur)
- return 1;
-
- tmp = SD_ATTR_INIT;
- return !memcmp(cur ? (cur + idx_cur) : &tmp,
- new ? (new + idx_new) : &tmp,
- sizeof(struct sched_domain_attr));
-}
-
-/*
- * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks. This compares
- * doms_new[] to the current sched domain partitioning, doms_cur[].
- * It destroys each deleted domain and builds each new domain.
- *
- * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap.) We should setup one
- * sched domain for each mask. CPUs not in any of the cpumasks will
- * not be load balanced. If the same cpumask appears both in the
- * current 'doms_cur' domains and in the new 'doms_new', we can leave
- * it as it is.
- *
- * The passed in 'doms_new' should be allocated using
- * alloc_sched_domains. This routine takes ownership of it and will
- * free_sched_domains it when done with it. If the caller failed the
- * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
- *
- * If doms_new == NULL it will be replaced with cpu_online_mask.
- * ndoms_new == 0 is a special case for destroying existing domains,
- * and it will not create the default domain.
- *
- * Call with hotplug lock held
- */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
- int i, j, n;
- int new_topology;
-
- mutex_lock(&sched_domains_mutex);
-
- /* always unregister in case we don't destroy any domains */
- unregister_sched_domain_sysctl();
-
- /* Let architecture update cpu core mappings. */
- new_topology = arch_update_cpu_topology();
-
- n = doms_new ? ndoms_new : 0;
-
- /* Destroy deleted domains */
- for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_cur[i], doms_new[j])
- && dattrs_equal(dattr_cur, i, dattr_new, j))
- goto match1;
- }
- /* no match - a current sched domain not in new doms_new[] */
- detach_destroy_domains(doms_cur[i]);
-match1:
- ;
- }
-
- n = ndoms_cur;
- if (doms_new == NULL) {
- n = 0;
- doms_new = &fallback_doms;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
- }
-
- /* Build new domains */
- for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_new[i], doms_cur[j])
- && dattrs_equal(dattr_new, i, dattr_cur, j))
- goto match2;
- }
- /* no match - add a new doms_new */
- build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
-match2:
- ;
- }
-
- /* Remember the new sched domains */
- if (doms_cur != &fallback_doms)
- free_sched_domains(doms_cur, ndoms_cur);
- kfree(dattr_cur); /* kfree(NULL) is safe */
- doms_cur = doms_new;
- dattr_cur = dattr_new;
- ndoms_cur = ndoms_new;
-
- register_sched_domain_sysctl();
-
- mutex_unlock(&sched_domains_mutex);
-}
-
-static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
-
-/*
- * Update cpusets according to cpu_active mask. If cpusets are
- * disabled, cpuset_update_active_cpus() becomes a simple wrapper
- * around partition_sched_domains().
- *
- * If we come here as part of a suspend/resume, don't touch cpusets because we
- * want to restore it back to its original state upon resume anyway.
- */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- switch (action) {
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED_FROZEN:
-
- /*
- * num_cpus_frozen tracks how many CPUs are involved in suspend
- * resume sequence. As long as this is not the last online
- * operation in the resume sequence, just build a single sched
- * domain, ignoring cpusets.
- */
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
- break;
- }
-
- /*
- * This is the last CPU online operation. So fall through and
- * restore the original sched domains by considering the
- * cpuset configurations.
- */
-
- case CPU_ONLINE:
- cpuset_update_active_cpus(true);
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- switch (action) {
- case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus(false);
- break;
- case CPU_DOWN_PREPARE_FROZEN:
- num_cpus_frozen++;
- partition_sched_domains(1, NULL, NULL);
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
-/*
- * Cheaper version of the below functions in case support for SMT and MC is
- * compiled in but CPUs have no siblings.
- */
-static bool sole_cpu_idle(int cpu)
-{
- return rq_idle(cpu_rq(cpu));
-}
-#endif
-#ifdef CONFIG_SCHED_SMT
-static const cpumask_t *thread_cpumask(int cpu)
-{
- return topology_thread_cpumask(cpu);
-}
-/* All this CPU's SMT siblings are idle */
-static bool siblings_cpu_idle(int cpu)
-{
- return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map);
-}
-#endif
-#ifdef CONFIG_SCHED_MC
-static const cpumask_t *core_cpumask(int cpu)
-{
- return topology_core_cpumask(cpu);
-}
-/* All this CPU's shared cache siblings are idle */
-static bool cache_cpu_idle(int cpu)
-{
- return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map);
-}
-#endif
-
-enum sched_domain_level {
- SD_LV_NONE = 0,
- SD_LV_SIBLING,
- SD_LV_MC,
- SD_LV_BOOK,
- SD_LV_CPU,
- SD_LV_NODE,
- SD_LV_ALLNODES,
- SD_LV_MAX
-};
-
-void __init sched_init_smp(void)
-{
- struct sched_domain *sd;
- int cpu, other_cpu;
-
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-
- sched_init_numa();
-
- /*
- * There's no userspace yet to cause hotplug operations; hence all the
- * cpu masks are stable and all blatant races in the below code cannot
- * happen.
- */
- mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
- mutex_unlock(&sched_domains_mutex);
-
- hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
- hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
- hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
- /* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
- BUG();
- free_cpumask_var(non_isolated_cpus);
-
- grq_lock_irq();
- /*
- * Set up the relative cache distance of each online cpu from each
- * other in a simple array for quick lookup. Locality is determined
- * by the closest sched_domain that CPUs are separated by. CPUs with
- * shared cache in SMT and MC are treated as local. Separate CPUs
- * (within the same package or physically) within the same node are
- * treated as not local. CPUs not even in the same domain (different
- * nodes) are treated as very distant.
- */
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
-
- /* First check if this cpu is in the same node */
- for_each_domain(cpu, sd) {
- if (sd->level > SD_LV_NODE)
- continue;
- /* Set locality to local node if not already found lower */
- for_each_cpu(other_cpu, sched_domain_span(sd)) {
- if (rq->cpu_locality[other_cpu] > 3)
- rq->cpu_locality[other_cpu] = 3;
- }
- }
-
- /*
- * Each runqueue has its own function in case it doesn't have
- * siblings of its own allowing mixed topologies.
- */
-#ifdef CONFIG_SCHED_MC
- for_each_cpu(other_cpu, core_cpumask(cpu)) {
- if (rq->cpu_locality[other_cpu] > 2)
- rq->cpu_locality[other_cpu] = 2;
- }
- if (cpumask_weight(core_cpumask(cpu)) > 1)
- rq->cache_idle = cache_cpu_idle;
-#endif
-#ifdef CONFIG_SCHED_SMT
- for_each_cpu(other_cpu, thread_cpumask(cpu))
- rq->cpu_locality[other_cpu] = 1;
- if (cpumask_weight(thread_cpumask(cpu)) > 1)
- rq->siblings_idle = siblings_cpu_idle;
-#endif
- }
- grq_unlock_irq();
-
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- for_each_online_cpu(other_cpu) {
- if (other_cpu <= cpu)
- continue;
- printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
- }
- }
-}
-#else
-void __init sched_init_smp(void)
-{
-}
-#endif /* CONFIG_SMP */
-
-unsigned int sysctl_timer_migration = 1;
-
-int in_sched_functions(unsigned long addr)
-{
- return in_lock_functions(addr) ||
- (addr >= (unsigned long)__sched_text_start
- && addr < (unsigned long)__sched_text_end);
-}
-
-void __init sched_init(void)
-{
-#ifdef CONFIG_SMP
- int cpu_ids;
-#endif
- int i;
- struct rq *rq;
-
- prio_ratios[0] = 128;
- for (i = 1 ; i < NICE_WIDTH ; i++)
- prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
-
- raw_spin_lock_init(&grq.lock);
- grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
- grq.niffies = 0;
- grq.last_jiffy = jiffies;
- raw_spin_lock_init(&grq.iso_lock);
- grq.iso_ticks = 0;
- grq.iso_refractory = false;
- grq.noc = 1;
-#ifdef CONFIG_SMP
- init_defrootdomain();
- grq.qnr = grq.idle_cpus = 0;
- cpumask_clear(&grq.cpu_idle_map);
-#else
- uprq = &per_cpu(runqueues, 0);
-#endif
- for_each_possible_cpu(i) {
- rq = cpu_rq(i);
- rq->grq_lock = &grq.lock;
- rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
- rq->iowait_pc = rq->idle_pc = 0;
- rq->dither = false;
-#ifdef CONFIG_SMP
- rq->sticky_task = NULL;
- rq->last_niffy = 0;
- rq->sd = NULL;
- rq->rd = NULL;
- rq->online = false;
- rq->cpu = i;
- rq_attach_root(rq, &def_root_domain);
-#endif
- atomic_set(&rq->nr_iowait, 0);
- }
-
-#ifdef CONFIG_SMP
- cpu_ids = i;
- /*
- * Set the base locality for cpu cache distance calculation to
- * "distant" (3). Make sure the distance from a CPU to itself is 0.
- */
- for_each_possible_cpu(i) {
- int j;
-
- rq = cpu_rq(i);
-#ifdef CONFIG_SCHED_SMT
- rq->siblings_idle = sole_cpu_idle;
-#endif
-#ifdef CONFIG_SCHED_MC
- rq->cache_idle = sole_cpu_idle;
-#endif
- rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC);
- for_each_possible_cpu(j) {
- if (i == j)
- rq->cpu_locality[j] = 0;
- else
- rq->cpu_locality[j] = 4;
- }
- }
-#endif
-
- for (i = 0; i < PRIO_LIMIT; i++)
- INIT_LIST_HEAD(grq.queue + i);
- /* delimiter for bitsearch */
- __set_bit(PRIO_LIMIT, grq.prio_bitmap);
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
- INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
- /*
- * The boot idle thread does lazy MMU switching as well:
- */
- atomic_inc(&init_mm.mm_count);
- enter_lazy_tlb(&init_mm, current);
-
- /*
- * Make us the idle thread. Technically, schedule() should not be
- * called from this thread, however somewhere below it might be,
- * but because we are the idle thread, we just pick up running again
- * when this runqueue becomes "idle".
- */
- init_idle(current, smp_processor_id());
-
-#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
- idle_thread_set_boot_cpu();
-#endif /* SMP */
-}
-
-#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
- int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
-
- return (nested == preempt_offset);
-}
-
-void __might_sleep(const char *file, int line, int preempt_offset)
-{
- /*
- * Blocking primitives will set (and therefore destroy) current->state,
- * since we will exit with TASK_RUNNING make sure we enter with it,
- * otherwise we will destroy state.
- */
- WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
- "do not call blocking ops when !TASK_RUNNING; "
- "state=%lx set at [<%p>] %pS\n",
- current->state,
- (void *)current->task_state_change,
- (void *)current->task_state_change);
-
- ___might_sleep(file, line, preempt_offset);
-}
-EXPORT_SYMBOL(__might_sleep);
-
-void ___might_sleep(const char *file, int line, int preempt_offset)
-{
- static unsigned long prev_jiffy; /* ratelimiting */
-
- rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
- return;
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
-
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
- current->pid, current->comm);
-
- if (task_stack_end_corrupted(current))
- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
-
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (!preempt_count_equals(preempt_offset)) {
- pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
- pr_cont("\n");
- }
-#endif
- dump_stack();
-}
-EXPORT_SYMBOL(___might_sleep);
-#endif
-
-#ifdef CONFIG_MAGIC_SYSRQ
-void normalize_rt_tasks(void)
-{
- struct task_struct *g, *p;
- unsigned long flags;
- struct rq *rq;
- int queued;
-
- read_lock(&tasklist_lock);
- for_each_process_thread(g, p) {
- if (!rt_task(p) && !iso_task(p))
- continue;
-
- rq = task_grq_lock(p, &flags);
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
- __setscheduler(p, rq, SCHED_NORMAL, 0, false);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
-
- task_grq_unlock(&flags);
- }
- read_unlock(&tasklist_lock);
-}
-#endif /* CONFIG_MAGIC_SYSRQ */
-
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
-/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
- *
- * They can only be called when the whole system has been
- * stopped - every CPU needs to be quiescent, and no scheduling
- * activity can take place. Using them for anything else would
- * be a serious bug, and as a result, they aren't even visible
- * under any other configuration.
- */
-
-/**
- * curr_task - return the current task for a given cpu.
- * @cpu: the processor in question.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- *
- * Return: The current task for @cpu.
- */
-struct task_struct *curr_task(int cpu)
-{
- return cpu_curr(cpu);
-}
-
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * set_curr_task - set the current task for a given cpu.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner. This function
- * must be called with all CPU's synchronised, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void set_curr_task(int cpu, struct task_struct *p)
-{
- cpu_curr(cpu) = p;
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- vtime_account_system(tsk);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_system(prev);
-
- vtime_account_user(prev);
- arch_vtime_task_switch(prev);
-}
-#endif
-
-#else
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime) {
- u64 tmp = rtime; rtime = stime; stime = tmp;
- }
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return (__force cputime_t) scaled;
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
- */
-static void cputime_adjust(struct task_cputime *curr,
- struct cputime *prev,
- cputime_t *ut, cputime_t *st)
-{
- cputime_t rtime, stime, utime, total;
-
- stime = curr->stime;
- total = stime + curr->utime;
-
- /*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
- *
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
-
- /*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
- */
- if (prev->stime + prev->utime >= rtime)
- goto out;
-
- if (total) {
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
- } else {
- stime = rtime;
- utime = 0;
- }
-
- /*
- * If the tick based count grows faster than the scheduler one,
- * the result of the scaling may go backward.
- * Let's enforce monotonicity.
- */
- prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, utime);
-
-out:
- *ut = prev->utime;
- *st = prev->stime;
-}
-
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime = {
- .sum_exec_runtime = tsk_seruntime(p),
- };
-
- task_cputime(p, &cputime.utime, &cputime.stime);
- cputime_adjust(&cputime, &p->prev_cputime, ut, st);
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
- cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
-}
-#endif
-
-void init_idle_bootup_task(struct task_struct *idle)
-{}
-
-#ifdef CONFIG_SCHED_DEBUG
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
-{}
-
-void proc_sched_set_task(struct task_struct *p)
-{}
-#endif
-
-#ifdef CONFIG_SMP
-#define SCHED_LOAD_SHIFT (10)
-#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
-
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- return SCHED_LOAD_SCALE;
-}
-
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
-{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
- unsigned long smt_gain = sd->smt_gain;
-
- smt_gain /= weight;
-
- return smt_gain;
-}
-#endif
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h
deleted file mode 100644
index 876969fff..000000000
--- a/kernel/sched/bfs_sched.h
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <linux/sched.h>
-#include <linux/cpuidle.h>
-
-#ifndef BFS_SCHED_H
-#define BFS_SCHED_H
-
-/*
- * This is the main, per-CPU runqueue data structure.
- * This data should only be modified by the local cpu.
- */
-struct rq {
- struct task_struct *curr, *idle, *stop;
- struct mm_struct *prev_mm;
-
- /* Pointer to grq spinlock */
- raw_spinlock_t *grq_lock;
-
- /* Stored data about rq->curr to work outside grq lock */
- u64 rq_deadline;
- unsigned int rq_policy;
- int rq_time_slice;
- u64 rq_last_ran;
- int rq_prio;
- bool rq_running; /* There is a task running */
- int soft_affined; /* Running or queued tasks with this set as their rq */
-#ifdef CONFIG_SMT_NICE
- struct mm_struct *rq_mm;
- int rq_smt_bias; /* Policy/nice level bias across smt siblings */
-#endif
- /* Accurate timekeeping data */
- u64 timekeep_clock;
- unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
- iowait_pc, idle_pc;
- atomic_t nr_iowait;
-
-#ifdef CONFIG_SMP
- int cpu; /* cpu of this runqueue */
- bool online;
- bool scaling; /* This CPU is managed by a scaling CPU freq governor */
- struct task_struct *sticky_task;
-
- struct root_domain *rd;
- struct sched_domain *sd;
- int *cpu_locality; /* CPU relative cache distance */
-#ifdef CONFIG_SCHED_SMT
- bool (*siblings_idle)(int cpu);
- /* See if all smt siblings are idle */
-#endif /* CONFIG_SCHED_SMT */
-#ifdef CONFIG_SCHED_MC
- bool (*cache_idle)(int cpu);
- /* See if all cache siblings are idle */
-#endif /* CONFIG_SCHED_MC */
- u64 last_niffy; /* Last time this RQ updated grq.niffies */
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- u64 prev_irq_time;
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-#ifdef CONFIG_PARAVIRT
- u64 prev_steal_time;
-#endif /* CONFIG_PARAVIRT */
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- u64 prev_steal_time_rq;
-#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
-
- u64 clock, old_clock, last_tick;
- u64 clock_task;
- bool dither;
-
-#ifdef CONFIG_SCHEDSTATS
-
- /* latency stats */
- struct sched_info rq_sched_info;
- unsigned long long rq_cpu_time;
- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
-
- /* sys_sched_yield() stats */
- unsigned int yld_count;
-
- /* schedule() stats */
- unsigned int sched_switch;
- unsigned int sched_count;
- unsigned int sched_goidle;
-
- /* try_to_wake_up() stats */
- unsigned int ttwu_count;
- unsigned int ttwu_local;
-#endif /* CONFIG_SCHEDSTATS */
-#ifdef CONFIG_CPU_IDLE
- /* Must be inspected within a rcu lock section */
- struct cpuidle_state *idle_state;
-#endif
-};
-
-#ifdef CONFIG_SMP
-struct rq *cpu_rq(int cpu);
-#endif
-
-#ifndef CONFIG_SMP
-extern struct rq *uprq;
-#define cpu_rq(cpu) (uprq)
-#define this_rq() (uprq)
-#define raw_rq() (uprq)
-#define task_rq(p) (uprq)
-#define cpu_curr(cpu) ((uprq)->curr)
-#else /* CONFIG_SMP */
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#define this_rq() this_cpu_ptr(&runqueues)
-#define raw_rq() raw_cpu_ptr(&runqueues)
-#endif /* CONFIG_SMP */
-
-static inline u64 __rq_clock_broken(struct rq *rq)
-{
- return ACCESS_ONCE(rq->clock);
-}
-
-static inline u64 rq_clock(struct rq *rq)
-{
- lockdep_assert_held(rq->grq_lock);
- return rq->clock;
-}
-
-static inline u64 rq_clock_task(struct rq *rq)
-{
- lockdep_assert_held(rq->grq_lock);
- return rq->clock_task;
-}
-
-#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), \
- lockdep_is_held(&sched_domains_mutex))
-
-/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
- *
- * The domain tree of any CPU may only be accessed from within
- * preempt-disabled sections.
- */
-#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
-
-static inline void sched_ttwu_pending(void) { }
-
-static inline int task_on_rq_queued(struct task_struct *p)
-{
- return p->on_rq;
-}
-
-#ifdef CONFIG_CPU_IDLE
-static inline void idle_set_state(struct rq *rq,
- struct cpuidle_state *idle_state)
-{
- rq->idle_state = idle_state;
-}
-
-static inline struct cpuidle_state *idle_get_state(struct rq *rq)
-{
- WARN_ON(!rcu_read_lock_held());
- return rq->idle_state;
-}
-#else
-static inline void idle_set_state(struct rq *rq,
- struct cpuidle_state *idle_state)
-{
-}
-
-static inline struct cpuidle_state *idle_get_state(struct rq *rq)
-{
- return NULL;
-}
-#endif
-#endif /* BFS_SCHED_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291..78b4bad10 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,26 +90,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
- unsigned long delta;
- ktime_t soft, hard, now;
-
- for (;;) {
- if (hrtimer_active(period_timer))
- break;
-
- now = hrtimer_cb_get_time(period_timer);
- hrtimer_forward(period_timer, now, period);
-
- soft = hrtimer_get_softexpires(period_timer);
- hard = hrtimer_get_expires(period_timer);
- delta = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(period_timer, soft, delta,
- HRTIMER_MODE_ABS_PINNED, 0);
- }
-}
-
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
#ifdef CONFIG_SMP
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time = hrtimer_get_softexpires(timer);
- return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}
/*
@@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense. Rely on vruntime for fairness.
*/
delay = max_t(u64, delay, 10000LL);
- __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
}
static inline void init_hrtick(void)
@@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * it's already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * This cmpxchg() implies a full barrier, which pairs with the write
+ * barrier implied by the wakeup in wake_up_q().
+ */
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() implies a wmb() to pair with the queueing
+ * in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
+
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
@@ -593,13 +618,12 @@ void resched_cpu(int cpu)
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
{
- int cpu = smp_processor_id();
- int i;
+ int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ if (!idle_cpu(cpu))
return cpu;
rcu_read_lock();
@@ -976,7 +1000,11 @@ inline int task_curr(const struct task_struct *p)
}
/*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * This means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
*/
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
@@ -985,7 +1013,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
- /* Possble rq->lock 'hole'. */
+
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1017,6 +1045,177 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, new_cpu);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+{
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+
+ /* Affinity changed (again). */
+ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return rq;
+
+ rq = move_queued_task(rq, p, dest_cpu);
+
+ return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+ struct task_struct *p = arg->task;
+ struct rq *rq = this_rq();
+
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ /*
+ * If task_rq(p) != rq, it cannot be migrated here, because we're
+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+ * we're holding p->pi_lock.
+ */
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, p, arg->dest_cpu);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(&p->pi_lock);
+
+ local_irq_enable();
+ return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ unsigned long flags;
+ struct rq *rq;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &flags);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p)) {
+ /*
+ * OK, since we're going to drop the lock immediately
+ * afterwards anyway.
+ */
+ lockdep_unpin_lock(&rq->lock);
+ rq = move_queued_task(rq, p, dest_cpu);
+ lockdep_pin_lock(&rq->lock);
+ }
+out:
+ task_rq_unlock(rq, p, &flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -1049,7 +1248,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+ perf_event_task_migrate(p);
}
__set_task_cpu(p, new_cpu);
@@ -1157,13 +1356,6 @@ out:
return ret;
}
-struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1296,9 +1488,7 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
@@ -1378,6 +1568,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
+ lockdep_assert_held(&p->pi_lock);
+
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
@@ -1403,7 +1595,7 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
-#endif
+#endif /* CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1466,8 +1658,15 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Our task @p is fully woken up and running; so it's safe to
+ * drop the rq->lock, hereafter rq is only used for statistics.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -1486,6 +1685,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ lockdep_assert_held(&rq->lock);
+
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
@@ -1530,6 +1731,7 @@ void sched_ttwu_pending(void)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
+ lockdep_pin_lock(&rq->lock);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1537,6 +1739,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1633,7 +1836,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
#endif
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
ttwu_do_activate(rq, p, 0);
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}
@@ -1728,9 +1933,17 @@ static void try_to_wake_up_local(struct task_struct *p)
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet picked a replacement task.
+ */
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
}
if (!(p->state & TASK_NORMAL))
@@ -1951,7 +2164,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
set_task_cpu(p, cpu);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
@@ -2105,12 +2318,29 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+ static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+ static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
/**
* preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
+ if (!static_key_false(&preempt_notifier_key))
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,7 +2349,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
* preempt_notifier_unregister - no longer interested in preemption notifications
* @notifier: notifier struct to unregister
*
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
@@ -2127,7 +2357,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier)
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
@@ -2135,9 +2365,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
{
struct preempt_notifier *notifier;
@@ -2145,13 +2381,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
#else /* !CONFIG_PREEMPT_NOTIFIERS */
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
-static void
+static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
@@ -2252,23 +2496,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
#ifdef CONFIG_SMP
/* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
{
- if (rq->post_schedule) {
- unsigned long flags;
+ struct callback_head *head, *next;
+ void (*func)(struct rq *rq);
+ unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->curr->sched_class->post_schedule)
- rq->curr->sched_class->post_schedule(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ head = rq->balance_callback;
+ rq->balance_callback = NULL;
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
- rq->post_schedule = 0;
+ func(rq);
}
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+ if (unlikely(rq->balance_callback))
+ __balance_callback(rq);
}
#else
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
{
}
@@ -2286,7 +2542,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/* finish_task_switch() drops rq->lock and enables preemtion */
preempt_disable();
rq = finish_task_switch(prev);
- post_schedule(rq);
+ balance_callback(rq);
preempt_enable();
if (current->set_child_tid)
@@ -2330,9 +2586,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
+ lockdep_unpin_lock(&rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
@@ -2397,9 +2653,9 @@ unsigned long nr_iowait_cpu(int cpu)
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
}
#ifdef CONFIG_SMP
@@ -2497,6 +2753,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2525,7 +2782,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);
next = rq->last_sched_tick + HZ;
@@ -2726,9 +2983,7 @@ again:
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
*/
static void __sched __schedule(void)
{
@@ -2737,7 +2992,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch();
@@ -2755,6 +3009,7 @@ static void __sched __schedule(void)
*/
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -2797,12 +3052,12 @@ static void __sched __schedule(void)
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
- } else
+ } else {
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
+ }
- post_schedule(rq);
-
- sched_preempt_enable_no_resched();
+ balance_callback(rq);
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +3078,9 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
+ preempt_disable();
__schedule();
+ sched_preempt_enable_no_resched();
} while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2862,15 +3119,14 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ preempt_active_exit();
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
- barrier();
} while (need_resched());
}
@@ -2894,9 +3150,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
/**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +3164,7 @@ EXPORT_SYMBOL(preempt_schedule);
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
@@ -2917,7 +3172,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Use raw __preempt_count() ops that don't call functions.
+ * We can't call functions before disabling preemption which
+ * disarm preemption tracing recursions.
+ */
+ __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ barrier();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +3188,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
__schedule();
exception_exit(prev_ctx);
- __preempt_count_sub(PREEMPT_ACTIVE);
barrier();
+ __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
} while (need_resched());
}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#endif /* CONFIG_PREEMPT */
@@ -2952,17 +3212,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
local_irq_enable();
__schedule();
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
+ preempt_active_exit();
} while (need_resched());
exception_exit(prev_state);
@@ -3040,7 +3294,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
@@ -3068,7 +3321,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
+ preempt_disable(); /* avoid rq from going away on us */
__task_rq_unlock(rq);
+
+ balance_callback(rq);
+ preempt_enable();
}
#endif
@@ -3406,7 +3663,7 @@ static bool dl_param_changed(struct task_struct *p,
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
- bool user)
+ bool user, bool pi)
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3592,18 +3849,20 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboosts
+ * itself.
+ */
+ new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
}
queued = task_on_rq_queued(p);
@@ -3614,7 +3873,7 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, true);
+ __setscheduler(rq, p, attr, pi);
if (running)
p->sched_class->set_curr_task(rq);
@@ -3627,9 +3886,17 @@ change:
}
check_class_changed(rq, p, prev_class, oldprio);
+ preempt_disable(); /* avoid rq from going away on us */
task_rq_unlock(rq, p, &flags);
- rt_mutex_adjust_pi(p);
+ if (pi)
+ rt_mutex_adjust_pi(p);
+
+ /*
+ * Run balance callbacks after we've adjusted the PI chain.
+ */
+ balance_callback(rq);
+ preempt_enable();
return 0;
}
@@ -3650,7 +3917,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
attr.sched_policy = policy;
}
- return __sched_setscheduler(p, &attr, check);
+ return __sched_setscheduler(p, &attr, check, true);
}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3671,7 +3938,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
- return __sched_setscheduler(p, attr, true);
+ return __sched_setscheduler(p, attr, true, true);
}
EXPORT_SYMBOL_GPL(sched_setattr);
@@ -4719,149 +4986,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
- struct rq *rq = task_rq(p);
-
- lockdep_assert_held(&rq->lock);
-
- dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
-
- rq = cpu_rq(new_cpu);
-
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
-
- return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- * stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- * off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- * it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- * is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- unsigned long flags;
- struct rq *rq;
- unsigned int dest_cpu;
- int ret = 0;
-
- rq = task_rq_lock(p, &flags);
-
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- } else if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-out:
- task_rq_unlock(rq, p, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- struct rq *rq;
- int ret = 0;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return ret;
-
- rq = cpu_rq(src_cpu);
-
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- /* Already moved. */
- if (task_cpu(p) != src_cpu)
- goto done;
-
- /* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- goto fail;
-
- /*
- * If we're not on a rq, the next wake-up will ensure we're
- * placed properly.
- */
- if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-done:
- ret = 1;
-fail:
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock(&p->pi_lock);
- return ret;
-}
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -4909,35 +5033,9 @@ void sched_setnuma(struct task_struct *p, int nid)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
- struct migration_arg *arg = data;
-
- /*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
- */
- local_irq_disable();
- /*
- * We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- local_irq_enable();
- return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -4993,9 +5091,9 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
{
- struct rq *rq = cpu_rq(dead_cpu);
+ struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
@@ -5017,7 +5115,7 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
update_rq_clock(rq);
- for ( ; ; ) {
+ for (;;) {
/*
* There's this thread running, bail when that's the only
* remaining thread.
@@ -5025,22 +5123,29 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
+ /*
+ * Ensure rq->lock covers the entire task selection
+ * until the migration.
+ */
+ lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
/* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_cpu, next);
- raw_spin_unlock(&rq->lock);
-
- __migrate_task(next, dead_cpu, dest_cpu);
-
- raw_spin_lock(&rq->lock);
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+
+ lockdep_unpin_lock(&rq->lock);
+ rq = __migrate_task(rq, next, dest_cpu);
+ if (rq != dead_rq) {
+ raw_spin_unlock(&rq->lock);
+ rq = dead_rq;
+ raw_spin_lock(&rq->lock);
+ }
}
rq->stop = stop;
}
-
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5219,7 +5324,7 @@ static void register_sched_domain_sysctl(void)
static void unregister_sched_domain_sysctl(void)
{
}
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq)
{
@@ -5288,7 +5393,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(cpu);
+ migrate_tasks(rq);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -5314,7 +5419,7 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
@@ -5366,9 +5471,6 @@ static int __init migration_init(void)
return 0;
}
early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
@@ -6594,7 +6696,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
struct sched_group *sg;
struct sched_group_capacity *sgc;
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
@@ -7032,6 +7134,9 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ /* nohz_full won't take effect without isolating the cpus. */
+ tick_nohz_full_add_cpus_to(cpu_isolated_map);
+
sched_init_numa();
/*
@@ -7068,8 +7173,6 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
-const_debug unsigned int sysctl_timer_migration = 1;
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -7199,7 +7302,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->post_schedule = 0;
+ rq->balance_callback = NULL;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -7329,32 +7432,12 @@ EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
{
- const struct sched_class *prev_class = p->sched_class;
+ struct task_struct *g, *p;
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
};
- int old_prio = p->prio;
- int queued;
-
- queued = task_on_rq_queued(p);
- if (queued)
- dequeue_task(rq, p, 0);
- __setscheduler(rq, p, &attr, false);
- if (queued) {
- enqueue_task(rq, p, 0);
- resched_curr(rq);
- }
-
- check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
- struct task_struct *g, *p;
- unsigned long flags;
- struct rq *rq;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7381,9 +7464,7 @@ void normalize_rt_tasks(void)
continue;
}
- rq = task_rq_lock(p, &flags);
- normalize_task(rq, p);
- task_rq_unlock(rq, p, &flags);
+ __sched_setscheduler(p, &attr, false, false);
}
read_unlock(&tasklist_lock);
}
@@ -7734,11 +7815,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;
- rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -8105,10 +8186,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
__refill_cfs_bandwidth_runtime(cfs_b);
/* restart the period timer (if active) to handle new period expiry */
- if (runtime_enabled && cfs_b->timer_active) {
- /* force a reprogram */
- __start_cfs_bandwidth(cfs_b, true);
- }
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
raw_spin_unlock_irq(&cfs_b->lock);
for_each_online_cpu(i) {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee6..f5a64ffad 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
{
cputime_t old;
- while (new > (old = ACCESS_ONCE(*counter)))
+ while (new > (old = READ_ONCE(*counter)))
cmpxchg_cputime(counter, old, new);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e9514508..0a17af356 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,14 +213,28 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return dl_task(prev);
}
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+
+static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
+
+/*
+ * Queue push_dl_tasks() as a balance callback on @rq, using this rq's
+ * per-CPU dl_push_head; does nothing when @rq has no pushable -deadline
+ * tasks.
+ */
+static inline void queue_push_tasks(struct rq *rq)
+{
+	if (has_pushable_dl_tasks(rq)) {
+		queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu),
+				       push_dl_tasks);
+	}
+}
+
+static inline void queue_pull_task(struct rq *rq)
{
- rq->post_schedule = has_pushable_dl_tasks(rq);
+ queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
}
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
bool fallback = false;
@@ -254,14 +268,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
double_lock_balance(rq, later_rq);
}
+ /*
+ * By now the task is replenished and enqueued; migrate it.
+ */
deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
- activate_task(later_rq, p, ENQUEUE_REPLENISH);
+ activate_task(later_rq, p, 0);
if (!fallback)
resched_curr(later_rq);
- double_unlock_balance(rq, later_rq);
+ double_unlock_balance(later_rq, rq);
+
+ return later_rq;
}
#else
@@ -291,12 +310,15 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return false;
}
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
{
- return 0;
}
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
+{
+}
+
+static inline void queue_pull_task(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -498,24 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->dl_timer;
+ struct rq *rq = task_rq(p);
ktime_t now, act;
- ktime_t soft, hard;
- unsigned long range;
s64 delta;
- if (boosted)
- return 0;
+ lockdep_assert_held(&rq->lock);
+
/*
* We want the timer to fire at the deadline, but considering
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
act = ns_to_ktime(dl_se->deadline);
- now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -527,15 +548,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
if (ktime_us_delta(act, now) < 0)
return 0;
- hrtimer_set_expires(&dl_se->dl_timer, act);
-
- soft = hrtimer_get_softexpires(&dl_se->dl_timer);
- hard = hrtimer_get_expires(&dl_se->dl_timer);
- range = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
- range, HRTIMER_MODE_ABS, 0);
+ /*
+ * !enqueued will guarantee another callback; even if one is already in
+ * progress. This ensures a balanced {get,put}_task_struct().
+ *
+ * The race against __run_timer() clearing the enqueued state is
+ * harmless because we're holding task_rq()->lock, therefore the timer
+ * expiring after we've done the check will wait on its task_rq_lock()
+ * and observe our state.
+ */
+ if (!hrtimer_is_queued(timer)) {
+ get_task_struct(p);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ }
- return hrtimer_active(&dl_se->dl_timer);
+ return 1;
}
/*
@@ -563,35 +590,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
rq = task_rq_lock(p, &flags);
/*
- * We need to take care of several possible races here:
- *
- * - the task might have changed its scheduling policy
- * to something different than SCHED_DEADLINE
- * - the task might have changed its reservation parameters
- * (through sched_setattr())
- * - the task might have been boosted by someone else and
- * might be in the boosting/deboosting path
+ * The task might have changed its scheduling policy to something
+ * different than SCHED_DEADLINE (through switched_from_dl()).
+ */
+ if (!dl_task(p)) {
+ __dl_clear_params(p);
+ goto unlock;
+ }
+
+ /*
+ * This is possible if switched_from_dl() raced against a running
+ * callback that took the above !dl_task() path and we've since then
+ * switched back into SCHED_DEADLINE.
*
- * In all this cases we bail out, as the task is already
- * in the runqueue or is going to be enqueued back anyway.
+ * There's nothing to do except drop our task reference.
*/
- if (!dl_task(p) || dl_se->dl_new ||
- dl_se->dl_boosted || !dl_se->dl_throttled)
+ if (dl_se->dl_new)
goto unlock;
- sched_clock_tick();
- update_rq_clock(rq);
+ /*
+ * The task might have been boosted by someone else and might be in the
+ * boosting/deboosting path; it's not throttled.
+ */
+ if (dl_se->dl_boosted)
+ goto unlock;
-#ifdef CONFIG_SMP
/*
- * If we find that the rq the task was on is no longer
- * available, we need to select a new rq.
+ * Spurious timer due to start_dl_timer() race; or we already received
+ * a replenishment from rt_mutex_setprio().
*/
- if (unlikely(!rq->online)) {
- dl_task_offline_migration(rq, p);
+ if (!dl_se->dl_throttled)
goto unlock;
- }
-#endif
+
+ sched_clock_tick();
+ update_rq_clock(rq);
/*
* If the throttle happened during sched-out; like:
@@ -617,17 +649,38 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
+
#ifdef CONFIG_SMP
/*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
+ * Perform balancing operations here; after the replenishments. We
+ * cannot drop rq->lock before this, otherwise the assertion in
+ * start_dl_timer() about not missing updates is not true.
+ *
+ * If we find that the rq the task was on is no longer available, we
+ * need to select a new rq.
+ *
+ * XXX figure out if select_task_rq_dl() deals with offline cpus.
+ */
+ if (unlikely(!rq->online))
+ rq = dl_task_offline_migration(rq, p);
+
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
*/
if (has_pushable_dl_tasks(rq))
push_dl_task(rq);
#endif
+
unlock:
task_rq_unlock(rq, p, &flags);
+ /*
+ * This can free the task_struct, including this hrtimer, do not touch
+ * anything related to that after this.
+ */
+ put_task_struct(p);
+
return HRTIMER_NORESTART;
}
@@ -640,7 +693,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
}
static
-int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
return (dl_se->runtime <= 0);
}
@@ -684,10 +737,10 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- if (dl_runtime_exceeded(rq, dl_se)) {
+ if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -995,7 +1048,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If we are dealing with a -deadline task, we must
@@ -1012,7 +1065,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
(p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);
- if (target != -1)
+ if (target != -1 &&
+ dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr))
cpu = target;
}
rcu_read_unlock();
@@ -1042,8 +1097,6 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
-static int pull_dl_task(struct rq *this_rq);
-
#endif /* CONFIG_SMP */
/*
@@ -1100,7 +1153,15 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
dl_rq = &rq->dl;
if (need_pull_dl_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we're
+ * being very careful to re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
pull_dl_task(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
@@ -1134,7 +1195,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
- set_post_schedule(rq);
+ queue_push_tasks(rq);
return p;
}
@@ -1171,7 +1232,6 @@ static void task_fork_dl(struct task_struct *p)
static void task_dead_dl(struct task_struct *p)
{
- struct hrtimer *timer = &p->dl.dl_timer;
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
/*
@@ -1181,8 +1241,6 @@ static void task_dead_dl(struct task_struct *p)
/* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
-
- hrtimer_cancel(timer);
}
static void set_curr_task_dl(struct rq *rq)
@@ -1230,6 +1288,32 @@ next_node:
return NULL;
}
+/*
+ * Return the earliest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise:
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
+{
+	struct rb_node *node;
+
+	/* Nothing pushable on this rq: no candidate at all. */
+	if (!has_pushable_dl_tasks(rq))
+		return NULL;
+
+	/*
+	 * Walk the pushable tree from its leftmost node onwards and return
+	 * the first task that pick_dl_task() accepts for @cpu.
+	 */
+	for (node = rq->dl.pushable_dl_tasks_leftmost; node; node = rb_next(node)) {
+		struct task_struct *candidate;
+
+		candidate = rb_entry(node, struct task_struct, pushable_dl_tasks);
+		if (pick_dl_task(rq, candidate, cpu))
+			return candidate;
+	}
+
+	/* Ran off the end of the tree without finding a match. */
+	return NULL;
+}
+
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
@@ -1333,6 +1417,17 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
+ if (!dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr)) {
+ /*
+ * Target rq has tasks of equal or earlier deadline,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ later_rq = NULL;
+ break;
+ }
+
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
@@ -1473,15 +1568,16 @@ static void push_dl_tasks(struct rq *rq)
;
}
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
+ int this_cpu = this_rq->cpu, cpu;
struct task_struct *p;
+ bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
if (likely(!dl_overloaded(this_rq)))
- return 0;
+ return;
/*
* Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1514,7 +1610,7 @@ static int pull_dl_task(struct rq *this_rq)
if (src_rq->dl.dl_nr_running <= 1)
goto skip;
- p = pick_next_earliest_dl_task(src_rq, this_cpu);
+ p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
/*
* We found a task to be pulled if:
@@ -1536,7 +1632,7 @@ static int pull_dl_task(struct rq *this_rq)
src_rq->curr->dl.deadline))
goto skip;
- ret = 1;
+ resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
@@ -1549,12 +1645,8 @@ skip:
double_unlock_balance(this_rq, src_rq);
}
- return ret;
-}
-
-static void post_schedule_dl(struct rq *rq)
-{
- push_dl_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -1659,7 +1751,7 @@ static void rq_offline_dl(struct rq *rq)
cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
-void init_sched_dl_class(void)
+void __init init_sched_dl_class(void)
{
unsigned int i;
@@ -1670,37 +1762,16 @@ void init_sched_dl_class(void)
#endif /* CONFIG_SMP */
-/*
- * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
- struct hrtimer *dl_timer = &p->dl.dl_timer;
-
- /* Nobody will change task's class if pi_lock is held */
- lockdep_assert_held(&p->pi_lock);
-
- if (hrtimer_active(dl_timer)) {
- int ret = hrtimer_try_to_cancel(dl_timer);
-
- if (unlikely(ret == -1)) {
- /*
- * Note, p may migrate OR new deadline tasks
- * may appear in rq when we are unlocking it.
- * A caller of us must be fine with that.
- */
- raw_spin_unlock(&rq->lock);
- hrtimer_cancel(dl_timer);
- raw_spin_lock(&rq->lock);
- }
- }
-}
-
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- /* XXX we should retain the bw until 0-lag */
- cancel_dl_timer(rq, p);
- __dl_clear_params(p);
+ /*
+ * Start the deadline timer; if we switch back to dl before this we'll
+ * continue consuming our current CBS slice. If we stay outside of
+ * SCHED_DEADLINE until the deadline passes, the timer will reset the
+ * task.
+ */
+ if (!start_dl_timer(p))
+ __dl_clear_params(p);
/*
* Since this might be the only -deadline task on the rq,
@@ -1710,8 +1781,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;
- if (pull_dl_task(rq))
- resched_curr(rq);
+ queue_pull_task(rq);
}
/*
@@ -1720,21 +1790,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
-
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
- push_dl_task(rq) && rq != task_rq(p))
- /* Only reschedule if pushing failed */
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched) {
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
- }
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+ queue_push_tasks(rq);
+#else
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+#endif
}
}
@@ -1754,15 +1819,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
* or lowering its prio, so...
*/
if (!rq->dl.overloaded)
- pull_dl_task(rq);
+ queue_pull_task(rq);
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
* runqueue.
*/
- if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
- rq->curr == p)
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
#else
/*
@@ -1792,7 +1856,6 @@ const struct sched_class dl_sched_class = {
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a245c1fc6..4222ec50a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,15 +132,17 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
p->prio);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
- SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
- 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ 0LL, 0L,
+ SPLIT_NS(p->se.sum_exec_runtime),
+ 0LL, 0L);
#endif
#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d", task_node(p));
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -156,7 +158,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
" task PID tree-key switches prio"
- " exec-runtime sum-exec sum-sleep\n"
+ " wait-time sum-exec sum-sleep\n"
"------------------------------------------------------"
"----------------------------------------------------\n");
@@ -230,8 +232,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
- SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
- cfs_rq->tg->cfs_bandwidth.timer_active);
SEQ_printf(m, " .%-30s: %d\n", "throttled",
cfs_rq->throttled);
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
@@ -517,11 +517,21 @@ __initcall(init_sched_debug_procfs);
SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Emit one line of NUMA fault counters for @node into @m.
+ *
+ * Going by the parameter names, @tsf/@tpf are the task's shared/private
+ * fault counts and @gsf/@gpf the group's shared/private fault counts
+ * (NOTE(review): confirm against task_faults_idx()'s 'priv' argument).
+ * Each value is printed next to the label that matches its name, i.e. the
+ * private counter after "..._private=" and the shared one after
+ * "..._shared=".
+ */
+void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+		unsigned long tpf, unsigned long gsf, unsigned long gpf)
+{
+	SEQ_printf(m, "numa_faults node=%d ", node);
+	SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
+	SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
+}
+#endif
+
+
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
struct mempolicy *pol;
- int node, i;
if (p->mm)
P(mm->numa_scan_seq);
@@ -533,26 +543,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
mpol_get(pol);
task_unlock(p);
- SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
-
- for_each_online_node(node) {
- for (i = 0; i < 2; i++) {
- unsigned long nr_faults = -1;
- int cpu_current, home_node;
-
- if (p->numa_faults)
- nr_faults = p->numa_faults[2*node + i];
-
- cpu_current = !i ? (task_node(p) == node) :
- (pol && node_isset(node, pol->v.nodes));
-
- home_node = (p->numa_preferred_nid == node);
-
- SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
- i, node, cpu_current, home_node, nr_faults);
- }
- }
-
+ P(numa_pages_migrated);
+ P(numa_preferred_nid);
+ P(total_numa_faults);
+ SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
+ task_node(p), task_numa_group_id(p));
+ show_numa_stats(p, m);
mpol_put(pol);
#endif
}
@@ -582,6 +578,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
+ PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 936664319..134314406 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -166,9 +166,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
{
- unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
@@ -601,7 +601,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int factor = get_update_sysctl_factor();
+ unsigned int factor = get_update_sysctl_factor();
if (ret || !write)
return ret;
@@ -859,7 +859,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
@@ -1223,11 +1223,9 @@ static void task_numa_assign(struct task_numa_env *env,
static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
+ long imb, old_imb;
+ long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
- long orig_src_load;
- long load_a, load_b;
- long moved_load;
- long imb;
/*
* The load is corrected for the CPU capacity available on each node.
@@ -1240,39 +1238,30 @@ static bool load_too_imbalanced(long src_load, long dst_load,
dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
- load_a = dst_load;
- load_b = src_load;
- if (load_a < load_b)
- swap(load_a, load_b);
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
/* Is the difference below the threshold? */
- imb = load_a * src_capacity * 100 -
- load_b * dst_capacity * env->imbalance_pct;
+ imb = dst_load * src_capacity * 100 -
+ src_load * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
/*
* The imbalance is above the allowed threshold.
- * Allow a move that brings us closer to a balanced situation,
- * without moving things past the point of balance.
+ * Compare it with the old imbalance.
*/
orig_src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
- /*
- * In a task swap, there will be one load moving from src to dst,
- * and another moving back. This is the net sum of both moves.
- * A simple task move will always have a positive value.
- * Allow the move if it brings the system closer to a balanced
- * situation, without crossing over the balance point.
- */
- moved_load = orig_src_load - src_load;
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
- if (moved_load > 0)
- /* Moving src -> dst. Did we overshoot balance? */
- return src_load * dst_capacity < dst_load * src_capacity;
- else
- /* Moving dst -> src. Did we overshoot balance? */
- return dst_load * src_capacity < src_load * dst_capacity;
+ old_imb = orig_dst_load * src_capacity * 100 -
+ orig_src_load * dst_capacity * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (imb > old_imb);
}
/*
@@ -1434,6 +1423,30 @@ static void task_numa_find_cpu(struct task_numa_env *env,
}
}
+/*
+ * Tell whether the destination node described by @env is a sensible target:
+ * tasks are only moved to a NUMA node that is less busy than the source node.
+ */
+static bool numa_has_capacity(struct task_numa_env *env)
+{
+	struct numa_stats *from = &env->src_stats;
+	struct numa_stats *to = &env->dst_stats;
+
+	/* Source has free capacity while the destination has none: refuse. */
+	if (from->has_free_capacity && !to->has_free_capacity)
+		return false;
+
+	/*
+	 * The source must carry a higher load than the destination once both
+	 * are corrected for the CPU capacity of their node. The two ratios
+	 *
+	 *      from->load                  to->load
+	 * ------------------------  vs  ----------------------
+	 * from->compute_capacity        to->compute_capacity
+	 *
+	 * are compared by cross-multiplication, so no division is needed.
+	 */
+	return from->load * to->compute_capacity >
+	       to->load * from->compute_capacity;
+}
+
static int task_numa_migrate(struct task_struct *p)
{
struct task_numa_env env = {
@@ -1488,7 +1501,8 @@ static int task_numa_migrate(struct task_struct *p)
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
- task_numa_find_cpu(&env, taskimp, groupimp);
+ if (numa_has_capacity(&env))
+ task_numa_find_cpu(&env, taskimp, groupimp);
/*
* Look at other nodes in these cases:
@@ -1519,7 +1533,8 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
- task_numa_find_cpu(&env, taskimp, groupimp);
+ if (numa_has_capacity(&env))
+ task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -1819,7 +1834,12 @@ static void task_numa_placement(struct task_struct *p)
u64 runtime, period;
spinlock_t *group_lock = NULL;
- seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
+ seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
@@ -1963,7 +1983,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
}
rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
@@ -2132,7 +2152,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
static void reset_ptenuma_scan(struct task_struct *p)
{
- ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access
+ * and the update is not guaranteed to be atomic. That's not
+ * much of an issue though, since this is just used for
+ * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+ * expensive, to avoid any form of compiler optimizations:
+ */
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
@@ -3501,16 +3529,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
- /*
- * If the bandwidth pool has become inactive, then at least one
- * period must have elapsed since the last consumption.
- * Refresh the global state and ensure bandwidth timer becomes
- * active.
- */
- if (!cfs_b->timer_active) {
- __refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b, false);
- }
+ start_cfs_bandwidth(cfs_b);
if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
@@ -3659,6 +3678,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, dequeue = 1;
+ bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -3688,13 +3708,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
+ empty = list_empty(&cfs_b->throttled_cfs_rq);
+
/*
* Add to the _head_ of the list, so that an already-started
* distribute_cfs_runtime will not see us
*/
list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- if (!cfs_b->timer_active)
- __start_cfs_bandwidth(cfs_b, false);
+
+ /*
+ * If we're the first throttled task, make sure the bandwidth
+ * timer is running.
+ */
+ if (empty)
+ start_cfs_bandwidth(cfs_b);
+
raw_spin_unlock(&cfs_b->lock);
}
@@ -3809,13 +3837,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (cfs_b->idle && !throttled)
goto out_deactivate;
- /*
- * if we have relooped after returning idle once, we need to update our
- * status as actually running, so that other cpus doing
- * __start_cfs_bandwidth will stop trying to cancel us.
- */
- cfs_b->timer_active = 1;
-
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
@@ -3860,7 +3881,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
return 0;
out_deactivate:
- cfs_b->timer_active = 0;
return 1;
}
@@ -3875,7 +3895,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
* Are we near the end of the current quota period?
*
* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
- * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * hrtimer base being cleared by hrtimer_start. In the case of
* migrate_hrtimers, base is never cleared, so we are fine.
*/
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
@@ -3903,8 +3923,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
if (runtime_refresh_within(cfs_b, min_left))
return;
- start_bandwidth_timer(&cfs_b->slack_timer,
- ns_to_ktime(cfs_bandwidth_slack_period));
+ hrtimer_start(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period),
+ HRTIMER_MODE_REL);
}
/* we know any runtime found here is valid as update_curr() precedes return */
@@ -4024,6 +4045,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, slack_timer);
+
do_sched_cfs_slack_timer(cfs_b);
return HRTIMER_NORESTART;
@@ -4033,20 +4055,19 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
- ktime_t now;
int overrun;
int idle = 0;
raw_spin_lock(&cfs_b->lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+ overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ if (idle)
+ cfs_b->period_active = 0;
raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -4060,7 +4081,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period = ns_to_ktime(default_cfs_period());
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@ -4072,28 +4093,15 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- /*
- * The timer may be active because we're trying to set a new bandwidth
- * period or because we're racing with the tear-down path
- * (timer_active==0 becomes visible before the hrtimer call-back
- * terminates). In either case we ensure that it's re-programmed
- */
- while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
- hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
- /* bounce the lock to allow do_sched_cfs_period_timer to run */
- raw_spin_unlock(&cfs_b->lock);
- cpu_relax();
- raw_spin_lock(&cfs_b->lock);
- /* if someone else restarted the timer then we're done */
- if (!force && cfs_b->timer_active)
- return;
- }
+ lockdep_assert_held(&cfs_b->lock);
- cfs_b->timer_active = 1;
- start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+ if (!cfs_b->period_active) {
+ cfs_b->period_active = 1;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+ }
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4348,6 +4356,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' array crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+
+/* Per load idx: number of missed ticks from which the load counts as zero. */
+static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {
+	0, 8, 32, 64, 128
+};
+
+/*
+ * degrade_factor[idx][k] is the decay factor, on a 128 point scale, that
+ * applies to load index idx after 2^k missed ticks. Every row spells out
+ * all DEGRADE_SHIFT + 1 columns, trailing zeros included.
+ */
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+	{   0,   0,   0,   0,   0,   0,   0,   0 },
+	{  64,  32,   8,   0,   0,   0,   0,   0 },
+	{  96,  72,  40,  12,   1,   0,   0,   0 },
+	{ 112,  98,  75,  43,  15,   1,   0,   0 },
+	{ 120, 112,  98,  76,  45,  16,   2,   0 },
+};
+
+/*
+ * Age @load by @missed_updates ticks for load index @idx without adding any
+ * new load; used to catch up on ticks skipped due to tickless idle.
+ *
+ * Returns @load untouched when nothing was missed and 0 once the backlog
+ * reaches degrade_zero_ticks[@idx]. Index 1 is a plain right shift; every
+ * other index applies one degrade_factor[] column per set bit of
+ * @missed_updates.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int col;
+
+	if (missed_updates == 0)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	/* Walk the bits of missed_updates, lowest first; col is the bit number. */
+	for (col = 0; missed_updates != 0; col++, missed_updates >>= 1) {
+		if (missed_updates & 1)
+			load = (load * degrade_factor[idx][col]) >> DEGRADE_SHIFT;
+	}
+
+	return load;
+}
+
+/*
+ * Refresh the rq->cpu_load[] statistics from @this_load. Normally invoked on
+ * every scheduler tick (TICK_NSEC); with tickless idle some ticks are skipped,
+ * and @pending_updates tells how many ticks this call stands for (derived
+ * from jiffies by the callers).
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
+{
+	int idx;
+
+	this_rq->nr_load_updates++;
+
+	/* Index 0 simply mirrors the current load, no averaging involved. */
+	this_rq->cpu_load[0] = this_load;
+
+	for (idx = 1; idx < CPU_LOAD_IDX_MAX; idx++) {
+		/* scale == 2^idx, so '>> idx' below divides by scale. */
+		const int scale = 1 << idx;
+		unsigned long decayed, sample = this_load;
+
+		/* First age the stored value over the ticks that were missed. */
+		decayed = decay_load_missed(this_rq->cpu_load[idx],
+					    pending_updates - 1, idx);
+
+		/*
+		 * When the load is rising, round the averaging division up;
+		 * otherwise a load of 10 could leave the average stuck at 9.
+		 */
+		if (sample > decayed)
+			sample += scale - 1;
+
+		this_rq->cpu_load[idx] = (decayed * (scale - 1) + sample) >> idx;
+	}
+
+	sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to bring the load ratings up to date before
+ * the idle balance runs.
+ *
+ * Nothing happens when the cfs runnable load is non-zero or when
+ * last_load_update_tick already equals the sampled jiffies value.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long now = READ_ONCE(jiffies);
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long missed;
+
+	/* There is load on this runqueue: bail. */
+	if (load)
+		return;
+
+	/* We are actually up-to-date: bail. */
+	if (now == this_rq->last_load_update_tick)
+		return;
+
+	missed = now - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = now;
+
+	__update_cpu_load(this_rq, load, missed);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- attempts to account for the ticks that
+ * were missed while idle.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *rq = this_rq();
+	unsigned long now = READ_ONCE(jiffies);
+	unsigned long missed;
+
+	/* Unlocked check: the tick stamp already matches the sampled jiffies. */
+	if (now == rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&rq->lock);
+
+	/* Compute the delta again, this time with rq->lock held. */
+	missed = now - rq->last_load_update_tick;
+	if (!missed)
+		goto unlock;
+
+	rq->last_load_update_tick = now;
+	/*
+	 * We were idle, which means load 0; the current load might be !0
+	 * because of remote wakeups and the like.
+	 */
+	__update_cpu_load(rq, 0, missed);
+
+unlock:
+	raw_spin_unlock(&rq->lock);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Called from scheduler_tick().
+ *
+ * Stamps the runqueue with the current jiffies value and feeds the cfs
+ * runnable load into __update_cpu_load() as exactly one pending update.
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+	/*
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+
+	__update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1);
+}
+
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
@@ -4400,7 +4591,7 @@ static unsigned long capacity_orig_of(int cpu)
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+ unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = rq->cfs.runnable_load_avg;
if (nr_running)
@@ -5151,18 +5342,21 @@ again:
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
- if (curr && curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
- /*
- * This call to check_cfs_rq_runtime() will do the throttle and
- * dequeue its entity in the parent(s). Therefore the 'simple'
- * nr_running test will indeed be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto simple;
+ /*
+ * This call to check_cfs_rq_runtime() will do the
+ * throttle and dequeue its entity in the parent(s).
+ * Therefore the 'simple' nr_running test will indeed
+ * be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+ }
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
@@ -5223,7 +5417,15 @@ simple:
return p;
idle:
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
new_tasks = idle_balance(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -5492,10 +5694,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
}
#ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5509,29 +5716,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
- if (numa_group) {
- /* Task is already in the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return false;
-
- /* Task is moving into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return true;
-
- return group_faults(p, dst_nid) > group_faults(p, src_nid);
- }
-
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
return true;
- return task_faults(p, dst_nid) > task_faults(p, src_nid);
+ /* Migrating away from the preferred node is bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return false;
+
+ if (numa_group) {
+ src_faults = group_faults(p, src_nid);
+ dst_faults = group_faults(p, dst_nid);
+ } else {
+ src_faults = task_faults(p, src_nid);
+ dst_faults = task_faults(p, dst_nid);
+ }
+
+ return dst_faults > src_faults;
}
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5546,23 +5754,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
- if (numa_group) {
- /* Task is moving within/into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return false;
+ /* Migrating away from the preferred node is bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return true;
- /* Task is moving out of the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return true;
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return false;
- return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ if (numa_group) {
+ src_faults = group_faults(p, src_nid);
+ dst_faults = group_faults(p, dst_nid);
+ } else {
+ src_faults = task_faults(p, src_nid);
+ dst_faults = task_faults(p, dst_nid);
}
- /* Migrating away from the preferred node is always bad. */
- if (src_nid == p->numa_preferred_nid)
- return true;
-
- return task_faults(p, dst_nid) < task_faults(p, src_nid);
+ return dst_faults < src_faults;
}
#else
@@ -6062,8 +6270,8 @@ static unsigned long scale_rt_capacity(int cpu)
* Since we're reading these variables without serialization make sure
* we read them once before doing sanity checks on them.
*/
- age_stamp = ACCESS_ONCE(rq->age_stamp);
- avg = ACCESS_ONCE(rq->rt_avg);
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
delta = __rq_clock_broken(rq) - age_stamp;
if (unlikely(delta < 0))
@@ -7251,9 +7459,6 @@ static int idle_balance(struct rq *this_rq)
goto out;
}
- /*
- * Drop the rq->lock, but keep IRQ/preempt disabled.
- */
raw_spin_unlock(&this_rq->lock);
update_blocked_averages(this_cpu);
@@ -8293,7 +8498,27 @@ void print_cfs_stats(struct seq_file *m, int cpu)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Print the NUMA_MEM fault counters of task @p, and of its numa group when
+ * it has one, to @m: one print_numa_stats() line per online node.
+ *
+ * Index 0 / index 1 of task_faults_idx() are passed on as the "shared" /
+ * "private" values respectively (tsf/tpf for the task, gsf/gpf for the
+ * group). A counter whose backing array is absent is reported as 0.
+ */
+void show_numa_stats(struct task_struct *p, struct seq_file *m)
+{
+	int node;
+
+	for_each_online_node(node) {
+		/*
+		 * Start from zero for every node so that a value read for one
+		 * node can never be reported for another.
+		 */
+		unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+
+		if (p->numa_faults) {
+			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+		}
+		if (p->numa_group) {
+			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)];
+			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+		}
+		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+	}
+}
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 70e698d02..594275ed2 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -13,11 +13,16 @@
#include <trace/events/power.h>
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
-#else
#include "sched.h"
-#endif
+
+/**
+ * sched_idle_set_state - Record idle state for the current CPU.
+ * @idle_state: State to record.
+ *
+ * Hands @idle_state to idle_set_state() for the runqueue returned by
+ * this_rq().
+ */
+void sched_idle_set_state(struct cpuidle_state *idle_state)
+{
+	struct rq *rq = this_rq();
+
+	idle_set_state(rq, idle_state);
+}
static int __read_mostly cpu_idle_force_poll;
@@ -72,6 +77,46 @@ void __weak arch_cpu_idle(void)
}
/**
+ * default_idle_call - Default CPU idle routine.
+ *
+ * To use when the cpuidle framework cannot be used.
+ */
+void default_idle_call(void)
+{
+	/*
+	 * When current_clr_polling_and_test() reports true, skip the arch
+	 * idle routine and only re-enable local interrupts.
+	 */
+	if (!current_clr_polling_and_test())
+		arch_cpu_idle();
+	else
+		local_irq_enable();
+}
+
+/*
+ * Enter idle state @next_state of @drv on @dev.
+ *
+ * Returns @next_state itself when it is negative (after running the default
+ * idle routine), -EBUSY when current_clr_polling_and_test() reports true,
+ * and otherwise the result of cpuidle_enter().
+ */
+static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+			int next_state)
+{
+	int ret;
+
+	if (next_state < 0) {
+		/* Fall back to the default arch idle method on errors. */
+		default_idle_call();
+		ret = next_state;
+	} else if (current_clr_polling_and_test()) {
+		/*
+		 * The idle task must be scheduled, so going idle is
+		 * pointless: record zero idle residency and return.
+		 */
+		dev->last_residency = 0;
+		local_irq_enable();
+		ret = -EBUSY;
+	} else {
+		/*
+		 * Enter the idle state the governor decided on earlier. This
+		 * blocks until an interrupt occurs and takes care of
+		 * re-enabling the local interrupts.
+		 */
+		ret = cpuidle_enter(drv, dev, next_state);
+	}
+
+	return ret;
+}
+
+/**
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
@@ -85,7 +130,6 @@ static void cpuidle_idle_call(void)
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
- bool reflect;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -109,8 +153,10 @@ static void cpuidle_idle_call(void)
*/
rcu_idle_enter();
- if (cpuidle_not_available(drv, dev))
- goto use_default;
+ if (cpuidle_not_available(drv, dev)) {
+ default_idle_call();
+ goto exit_idle;
+ }
/*
* Suspend-to-idle ("freeze") is a system state in which all user space
@@ -128,52 +174,19 @@ static void cpuidle_idle_call(void)
goto exit_idle;
}
- reflect = false;
next_state = cpuidle_find_deepest_state(drv, dev);
+ call_cpuidle(drv, dev, next_state);
} else {
- reflect = true;
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
next_state = cpuidle_select(drv, dev);
- }
- /* Fall back to the default arch idle method on errors. */
- if (next_state < 0)
- goto use_default;
-
- /*
- * The idle task must be scheduled, it is pointless to
- * go to idle, just update no idle residency and get
- * out of this function
- */
- if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
- entered_state = next_state;
- local_irq_enable();
- goto exit_idle;
- }
-
- /* Take note of the planned idle state. */
- idle_set_state(this_rq(), &drv->states[next_state]);
-
- /*
- * Enter the idle state previously returned by the governor decision.
- * This function will block until an interrupt occurs and will take
- * care of re-enabling the local interrupts
- */
- entered_state = cpuidle_enter(drv, dev, next_state);
-
- /* The cpu is no longer idle or about to enter idle. */
- idle_set_state(this_rq(), NULL);
-
- if (entered_state == -EBUSY)
- goto use_default;
-
- /*
- * Give the governor an opportunity to reflect on the outcome
- */
- if (reflect)
+ entered_state = call_cpuidle(drv, dev, next_state);
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
cpuidle_reflect(dev, entered_state);
+ }
exit_idle:
__current_set_polling();
@@ -186,19 +199,6 @@ exit_idle:
rcu_idle_exit();
start_critical_timings();
- return;
-
-use_default:
- /*
- * We can't use the cpuidle framework, let's use the default
- * idle routine.
- */
- if (current_clr_polling_and_test())
- local_irq_enable();
- else
- arch_cpu_idle();
-
- goto exit_idle;
}
DEFINE_PER_CPU(bool, cpu_dead_idle);
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe..ef7159012 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
/*
- * kernel/sched/proc.c
+ * kernel/sched/loadavg.c
*
- * Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
*/
#include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
long nr_active, delta = 0;
nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
delta = calc_load_fold_active(this_rq);
if (delta) {
int idx = calc_load_write_idx();
+
atomic_long_add(delta, &calc_load_idle[idx]);
}
}
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
unsigned long result = 1UL << frac_bits;
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
}
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
}
return result;
@@ -285,7 +290,6 @@ static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
unsigned long active, unsigned int n)
{
-
return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
*/
void calc_global_load(unsigned long ticks)
{
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
}
/*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
* active count.
*/
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
{
long delta;
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
this_rq->calc_load_update += LOAD_FREQ;
}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long load = get_rq_runnable_load(this_rq);
- unsigned long pending_updates;
-
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (load || curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
- struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long pending_updates;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
- raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
- unsigned long load = get_rq_runnable_load(this_rq);
- /*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
-
- calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3..0d193a243 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -18,19 +18,22 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
container_of(timer, struct rt_bandwidth, rt_period_timer);
- ktime_t now;
- int overrun;
int idle = 0;
+ int overrun;
+ raw_spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+ overrun = hrtimer_forward_now(timer, rt_b->rt_period);
if (!overrun)
break;
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
idle = do_sched_rt_period_timer(rt_b, overrun);
+ raw_spin_lock(&rt_b->rt_runtime_lock);
}
+ if (idle)
+ rt_b->rt_period_active = 0;
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -52,11 +55,12 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
- if (hrtimer_active(&rt_b->rt_period_timer))
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
- start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+ if (!rt_b->rt_period_active) {
+ rt_b->rt_period_active = 1;
+ hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+ hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+ }
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -256,7 +260,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
@@ -350,13 +354,23 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+
+static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
{
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
+ if (!has_pushable_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+ queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -408,12 +422,11 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
return false;
}
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
{
- return 0;
}
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -1323,7 +1336,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
/*
* If the current task on @p's runqueue is an RT task, then
@@ -1465,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
struct rt_rq *rt_rq = &rq->rt;
if (need_pull_rt_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance, and preemption/IRQs are still
+ * disabled, avoiding further scheduler activity on it, and we're
+ * being very careful to re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
pull_rt_task(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a dl or stop task can slip in, in which case we need
@@ -1493,7 +1514,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
- set_post_schedule(rq);
+ queue_push_tasks(rq);
return p;
}
@@ -1948,14 +1969,15 @@ static void push_irq_work_func(struct irq_work *work)
}
#endif /* HAVE_RT_PUSH_IPI */
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
+ int this_cpu = this_rq->cpu, cpu;
+ bool resched = false;
struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
- return 0;
+ return;
/*
* Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1966,7 +1988,7 @@ static int pull_rt_task(struct rq *this_rq)
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
- return 0;
+ return;
}
#endif
@@ -2019,7 +2041,7 @@ static int pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio)
goto skip;
- ret = 1;
+ resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
@@ -2035,12 +2057,8 @@ skip:
double_unlock_balance(this_rq, src_rq);
}
- return ret;
-}
-
-static void post_schedule_rt(struct rq *rq)
-{
- push_rt_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -2136,8 +2154,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
- if (pull_rt_task(rq))
- resched_curr(rq);
+ queue_pull_task(rq);
}
void __init init_sched_rt_class(void)
@@ -2158,8 +2175,6 @@ void __init init_sched_rt_class(void)
*/
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
-
/*
* If we are already running, then there's nothing
* that needs to be done. But if we are not running
@@ -2169,13 +2184,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
- /* Don't resched if we changed runqueues */
- push_rt_task(rq) && rq != task_rq(p))
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched && p->prio < rq->curr->prio)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ queue_push_tasks(rq);
+#else
+ if (p->prio < rq->curr->prio)
resched_curr(rq);
+#endif /* CONFIG_SMP */
}
}
@@ -2196,14 +2210,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* may need to pull tasks to this runqueue.
*/
if (oldprio < p->prio)
- pull_rt_task(rq);
+ queue_pull_task(rq);
+
/*
* If there's a higher priority task waiting to run
- * then reschedule. Note, the above pull_rt_task
- * can release the rq lock and p could migrate.
- * Only reschedule if p is still on the same runqueue.
+ * then reschedule.
*/
- if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+ if (p->prio > rq->rt.highest_prio.curr)
resched_curr(rq);
#else
/* For UP simply resched on drop of prio */
@@ -2314,7 +2327,6 @@ const struct sched_class rt_sched_class = {
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993..84d48790b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
+extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
/*
* Helpers for converting nanosecond timing to jiffy resolution
@@ -131,6 +137,7 @@ struct rt_bandwidth {
ktime_t rt_period;
u64 rt_runtime;
struct hrtimer rt_period_timer;
+ unsigned int rt_period_active;
};
void __dl_clear_params(struct task_struct *p);
@@ -215,7 +222,7 @@ struct cfs_bandwidth {
s64 hierarchical_quota;
u64 runtime_expires;
- int idle, timer_active;
+ int idle, period_active;
struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;
@@ -306,7 +313,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
extern void free_rt_sched_group(struct task_group *tg);
@@ -617,9 +624,10 @@ struct rq {
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
+ struct callback_head *balance_callback;
+
unsigned char idle_balance;
/* For active balancing */
- int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
@@ -707,7 +715,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static inline u64 __rq_clock_broken(struct rq *rq)
{
- return ACCESS_ONCE(rq->clock);
+ return READ_ONCE(rq->clock);
}
static inline u64 rq_clock(struct rq *rq)
@@ -760,6 +768,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+static inline void
+queue_balance_callback(struct rq *rq,
+ struct callback_head *head,
+ void (*func)(struct rq *rq))
+{
+ lockdep_assert_held(&rq->lock);
+
+ if (unlikely(head->next))
+ return;
+
+ head->func = (void (*)(struct callback_head *))func;
+ head->next = rq->balance_callback;
+ rq->balance_callback = head;
+}
+
extern void sched_ttwu_pending(void);
#define rcu_dereference_check_sched_domain(p) \
@@ -1185,7 +1208,6 @@ struct sched_class {
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
- void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1284,7 +1306,6 @@ extern void update_max_interval(void);
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void init_sched_dl_class(void);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
@@ -1298,8 +1319,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
-extern void update_idle_cpu_load(struct rq *this_rq);
-
extern void init_task_runnable_average(struct task_struct *p);
static inline void add_nr_running(struct rq *rq, unsigned count)
@@ -1406,8 +1425,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -1421,8 +1438,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ lockdep_pin_lock(&rq->lock);
return rq;
+ }
raw_spin_unlock(&rq->lock);
while (unlikely(task_on_rq_migrating(p)))
@@ -1459,8 +1478,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
* If we observe the new cpu in task_rq_lock, the acquire will
* pair with the WMB to ensure we must then also see migrating.
*/
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ lockdep_pin_lock(&rq->lock);
return rq;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
@@ -1472,6 +1493,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
static inline void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}
@@ -1480,6 +1502,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
__releases(p->pi_lock)
{
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
@@ -1666,9 +1689,22 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_SCHED_DEBUG
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void
+show_numa_stats(struct task_struct *p, struct seq_file *m);
+extern void
+print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 7466a0bb2..87e2c9f0c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,11 +4,7 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
-#ifndef CONFIG_SCHED_BFS
#include "sched.h"
-#else
-#include "bfs_sched.h"
-#endif
/*
* bump this up when changing the output format or the meaning of an existing
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339..b0fbc7632 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
# define schedstat_set(var, val) do { } while (0)
#endif
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
@@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq,
#define sched_info_depart(rq, t) do { } while (0)
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+#endif /* CONFIG_SCHED_INFO */
/*
* The following are functions that support scheduler-internal time accounting.
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- if (!cputimer->running)
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running))
return false;
/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->cputime_atomic.utime);
}
/**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime += cputime;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(cputime, &cputimer->cputime_atomic.stime);
}
/**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer_running(tsk))
return;
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- raw_spin_unlock(&cputimer->lock);
+ atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79..052e02672 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
* condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
* an event.
*/
- set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+ smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
return timeout;
}
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
* doesn't imply write barrier and the users expects write
* barrier semantics on wakeup functions. The following
* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
- * and is paired with set_mb() in wait_woken().
+ * and is paired with smp_store_mb() in wait_woken().
*/
smp_wmb(); /* C */
wait->flags |= WQ_FLAG_WOKEN;
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word)
{
- unsigned long now = ACCESS_ONCE(jiffies);
+ unsigned long now = READ_ONCE(jiffies);
if (signal_pending_state(current->state, current))
return 1;
if (time_after_eq(now, word->timeout))
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4f4402894..245df6b32 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -346,16 +346,13 @@ static inline void seccomp_sync_threads(void)
*/
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
- struct seccomp_filter *filter;
- unsigned long fp_size;
- struct sock_filter *fp;
- int new_len;
- long ret;
+ struct seccomp_filter *sfilter;
+ int ret;
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
+
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
- fp_size = fprog->len * sizeof(struct sock_filter);
/*
* Installing a seccomp filter requires that the task has
@@ -368,60 +365,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
CAP_SYS_ADMIN) != 0)
return ERR_PTR(-EACCES);
- fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
- if (!fp)
- return ERR_PTR(-ENOMEM);
-
- /* Copy the instructions from fprog. */
- ret = -EFAULT;
- if (copy_from_user(fp, fprog->filter, fp_size))
- goto free_prog;
-
- /* Check and rewrite the fprog via the skb checker */
- ret = bpf_check_classic(fp, fprog->len);
- if (ret)
- goto free_prog;
-
- /* Check and rewrite the fprog for seccomp use */
- ret = seccomp_check_filter(fp, fprog->len);
- if (ret)
- goto free_prog;
-
- /* Convert 'sock_filter' insns to 'bpf_insn' insns */
- ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
- if (ret)
- goto free_prog;
-
/* Allocate a new seccomp_filter */
- ret = -ENOMEM;
- filter = kzalloc(sizeof(struct seccomp_filter),
- GFP_KERNEL|__GFP_NOWARN);
- if (!filter)
- goto free_prog;
-
- filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
- if (!filter->prog)
- goto free_filter;
-
- ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
- if (ret)
- goto free_filter_prog;
-
- kfree(fp);
- atomic_set(&filter->usage, 1);
- filter->prog->len = new_len;
+ sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
+ if (!sfilter)
+ return ERR_PTR(-ENOMEM);
- bpf_prog_select_runtime(filter->prog);
+ ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
+ seccomp_check_filter);
+ if (ret < 0) {
+ kfree(sfilter);
+ return ERR_PTR(ret);
+ }
- return filter;
+ atomic_set(&sfilter->usage, 1);
-free_filter_prog:
- __bpf_prog_free(filter->prog);
-free_filter:
- kfree(filter);
-free_prog:
- kfree(fp);
- return ERR_PTR(ret);
+ return sfilter;
}
/**
diff --git a/kernel/signal.c b/kernel/signal.c
index 0206be728..0f6bbbe77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
* RETURNS:
* %true if @mask is set, %false if made noop because @task was dying.
*/
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~JOBCTL_PENDING_MASK);
@@ -414,21 +414,16 @@ void flush_sigqueue(struct sigpending *queue)
}
/*
- * Flush all pending signals for a task.
+ * Flush all pending signals for this kthread.
*/
-void __flush_signals(struct task_struct *t)
-{
- clear_tsk_thread_flag(t, TIF_SIGPENDING);
- flush_sigqueue(&t->pending);
- flush_sigqueue(&t->signal->shared_pending);
-}
-
void flush_signals(struct task_struct *t)
{
unsigned long flags;
spin_lock_irqsave(&t->sighand->siglock, flags);
- __flush_signals(t);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ flush_sigqueue(&t->pending);
+ flush_sigqueue(&t->signal->shared_pending);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
@@ -2000,7 +1995,7 @@ static bool do_signal_stop(int signr)
struct signal_struct *sig = current->signal;
if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
- unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+ unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
/* signr will be recorded in task->jobctl for retries */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index bdcc6c018..7c434c39f 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -173,7 +173,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
if (tsk)
return 0;
- td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu));
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
if (!td)
return -ENOMEM;
td->cpu = cpu;
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu)
mutex_lock(&smpboot_threads_lock);
list_for_each_entry(cur, &hotplug_threads, list)
- smpboot_unpark_thread(cur, cpu);
+ if (cpumask_test_cpu(cpu, cur->cpumask))
+ smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
}
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
unsigned int cpu;
+ /* Unpark any threads that were voluntarily parked. */
+ for_each_cpu_not(cpu, ht->cpumask) {
+ if (cpu_online(cpu)) {
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ if (tsk)
+ kthread_unpark(tsk);
+ }
+ }
+
/* We need to destroy also the parked threads of offline cpus */
for_each_possible_cpu(cpu) {
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
+ if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
+ return -ENOMEM;
+ cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
@@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
+ free_cpumask_var(plug_thread->cpumask);
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+/**
+ * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
+ * @plug_thread: Hotplug thread descriptor
+ * @new: Revised mask to use
+ *
+ * The cpumask field in the smp_hotplug_thread must not be updated directly
+ * by the client, but only by calling this function.
+ * This function can only be called on a registered smp_hotplug_thread.
+ */
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *new)
+{
+ struct cpumask *old = plug_thread->cpumask;
+ cpumask_var_t tmp;
+ unsigned int cpu;
+
+ if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
+ return -ENOMEM;
+
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+
+ /* Park threads that were exclusively enabled on the old mask. */
+ cpumask_andnot(tmp, old, new);
+ for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ smpboot_park_thread(plug_thread, cpu);
+
+ /* Unpark threads that are exclusively enabled on the new mask. */
+ cpumask_andnot(tmp, new, old);
+ for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ smpboot_unpark_thread(plug_thread, cpu);
+
+ cpumask_copy(old, new);
+
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+
+ free_cpumask_var(tmp);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
+
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 263b0e1ad..fd643d8c4 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -41,8 +41,7 @@ struct cpu_stopper {
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
-DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
-
+static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
/*
@@ -212,25 +211,6 @@ static int multi_cpu_stop(void *data)
return err;
}
-struct irq_cpu_stop_queue_work_info {
- int cpu1;
- int cpu2;
- struct cpu_stop_work *work1;
- struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
- struct irq_cpu_stop_queue_work_info *info = arg;
- cpu_stop_queue_work(info->cpu1, info->work1);
- cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
/**
* stop_two_cpus - stops two cpus
* @cpu1: the cpu to stop
@@ -246,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
{
struct cpu_stop_done done;
struct cpu_stop_work work1, work2;
- struct irq_cpu_stop_queue_work_info call_args;
struct multi_stop_data msdata;
preempt_disable();
@@ -263,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
.done = &done
};
- call_args = (struct irq_cpu_stop_queue_work_info){
- .cpu1 = cpu1,
- .cpu2 = cpu2,
- .work1 = &work1,
- .work2 = &work2,
- };
-
cpu_stop_init_done(&done, 2);
set_state(&msdata, MULTI_STOP_PREPARE);
@@ -286,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
return -ENOENT;
}
- lg_local_lock(&stop_cpus_lock);
- /*
- * Queuing needs to be done by the lowest numbered CPU, to ensure
- * that works are always queued in the same order on every CPU.
- * This prevents deadlocks.
- */
- smp_call_function_single(min(cpu1, cpu2),
- &irq_cpu_stop_queue_work,
- &call_args, 1);
- lg_local_unlock(&stop_cpus_lock);
+ lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+ cpu_stop_queue_work(cpu1, &work1);
+ cpu_stop_queue_work(cpu2, &work2);
+ lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
preempt_enable();
wait_for_completion(&done.completion);
diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b79..259fda25e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -92,10 +92,10 @@
# define SET_TSC_CTL(a) (-EINVAL)
#endif
#ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
+# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
#endif
#ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
+# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a) (-EINVAL)
@@ -1722,7 +1722,6 @@ exit_err:
goto exit;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
/*
* WARNING: we don't require any capability here so be very careful
* in what is allowed for modification from userspace.
@@ -1818,6 +1817,7 @@ out:
return error;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
{
struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
@@ -1902,10 +1902,41 @@ out:
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ /*
+ * This doesn't move the auxiliary vector itself since it's pinned to
+ * mm_struct, but it permits filling the vector with new values. It's
+ * up to the caller to provide sane values here, otherwise userspace
+ * tools which use this vector might be unhappy.
+ *
+ * @addr is treated as a userspace pointer and @len as a byte count.
+ * The data is first staged in the on-stack array below and then
+ * copied into mm->saved_auxv under task_lock(current).
+ *
+ * Returns 0 on success, -EINVAL if @len is larger than the staging
+ * array, or -EFAULT if copy_from_user() fails.
+ */
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+
+ if (len > sizeof(user_auxv))
+ return -EINVAL;
+
+ if (copy_from_user(user_auxv, (const void __user *)addr, len))
+ return -EFAULT;
+
+ /*
+ * Make sure the last entry is always AT_NULL
+ *
+ * NOTE(review): this zeroes the last two words of the local copy,
+ * but the memcpy() below transfers only the first 'len' bytes. For
+ * len < sizeof(user_auxv) those two words are therefore not written
+ * to mm->saved_auxv, whose tail keeps its previous contents --
+ * confirm this is the intended behaviour.
+ */
+ user_auxv[AT_VECTOR_SIZE - 2] = 0;
+ user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+ task_lock(current);
+ memcpy(mm->saved_auxv, user_auxv, len);
+ task_unlock(current);
+
+ return 0;
+}
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
struct mm_struct *mm = current->mm;
+ struct prctl_mm_map prctl_map;
struct vm_area_struct *vma;
int error;
@@ -1925,6 +1956,9 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (opt == PR_SET_MM_EXE_FILE)
return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+ if (opt == PR_SET_MM_AUXV)
+ return prctl_set_auxv(mm, addr, arg4);
+
if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
@@ -1933,42 +1967,64 @@ static int prctl_set_mm(int opt, unsigned long addr,
down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
+ prctl_map.start_code = mm->start_code;
+ prctl_map.end_code = mm->end_code;
+ prctl_map.start_data = mm->start_data;
+ prctl_map.end_data = mm->end_data;
+ prctl_map.start_brk = mm->start_brk;
+ prctl_map.brk = mm->brk;
+ prctl_map.start_stack = mm->start_stack;
+ prctl_map.arg_start = mm->arg_start;
+ prctl_map.arg_end = mm->arg_end;
+ prctl_map.env_start = mm->env_start;
+ prctl_map.env_end = mm->env_end;
+ prctl_map.auxv = NULL;
+ prctl_map.auxv_size = 0;
+ prctl_map.exe_fd = -1;
+
switch (opt) {
case PR_SET_MM_START_CODE:
- mm->start_code = addr;
+ prctl_map.start_code = addr;
break;
case PR_SET_MM_END_CODE:
- mm->end_code = addr;
+ prctl_map.end_code = addr;
break;
case PR_SET_MM_START_DATA:
- mm->start_data = addr;
+ prctl_map.start_data = addr;
break;
case PR_SET_MM_END_DATA:
- mm->end_data = addr;
+ prctl_map.end_data = addr;
+ break;
+ case PR_SET_MM_START_STACK:
+ prctl_map.start_stack = addr;
break;
-
case PR_SET_MM_START_BRK:
- if (addr <= mm->end_data)
- goto out;
-
- if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
- mm->end_data, mm->start_data))
- goto out;
-
- mm->start_brk = addr;
+ prctl_map.start_brk = addr;
break;
-
case PR_SET_MM_BRK:
- if (addr <= mm->end_data)
- goto out;
-
- if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
- mm->end_data, mm->start_data))
- goto out;
-
- mm->brk = addr;
+ prctl_map.brk = addr;
break;
+ case PR_SET_MM_ARG_START:
+ prctl_map.arg_start = addr;
+ break;
+ case PR_SET_MM_ARG_END:
+ prctl_map.arg_end = addr;
+ break;
+ case PR_SET_MM_ENV_START:
+ prctl_map.env_start = addr;
+ break;
+ case PR_SET_MM_ENV_END:
+ prctl_map.env_end = addr;
+ break;
+ default:
+ goto out;
+ }
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ goto out;
+ switch (opt) {
/*
* If command line arguments and environment
* are placed somewhere else on stack, we can
@@ -1985,52 +2041,20 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EFAULT;
goto out;
}
- if (opt == PR_SET_MM_START_STACK)
- mm->start_stack = addr;
- else if (opt == PR_SET_MM_ARG_START)
- mm->arg_start = addr;
- else if (opt == PR_SET_MM_ARG_END)
- mm->arg_end = addr;
- else if (opt == PR_SET_MM_ENV_START)
- mm->env_start = addr;
- else if (opt == PR_SET_MM_ENV_END)
- mm->env_end = addr;
- break;
-
- /*
- * This doesn't move auxiliary vector itself
- * since it's pinned to mm_struct, but allow
- * to fill vector with new values. It's up
- * to a caller to provide sane values here
- * otherwise user space tools which use this
- * vector might be unhappy.
- */
- case PR_SET_MM_AUXV: {
- unsigned long user_auxv[AT_VECTOR_SIZE];
-
- if (arg4 > sizeof(user_auxv))
- goto out;
- up_read(&mm->mmap_sem);
-
- if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
- return -EFAULT;
-
- /* Make sure the last entry is always AT_NULL */
- user_auxv[AT_VECTOR_SIZE - 2] = 0;
- user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
- BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
- task_lock(current);
- memcpy(mm->saved_auxv, user_auxv, arg4);
- task_unlock(current);
-
- return 0;
- }
- default:
- goto out;
}
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
error = 0;
out:
up_read(&mm->mmap_sem);
@@ -2230,12 +2254,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_MPX_ENABLE_MANAGEMENT:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = MPX_ENABLE_MANAGEMENT(me);
+ error = MPX_ENABLE_MANAGEMENT();
break;
case PR_MPX_DISABLE_MANAGEMENT:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = MPX_DISABLE_MANAGEMENT(me);
+ error = MPX_DISABLE_MANAGEMENT();
break;
case PR_SET_FP_MODE:
error = SET_FP_MODE(me, arg2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7f45887fa..19b62b522 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -124,12 +124,7 @@ static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
-static int __maybe_unused one_hundred = 100;
-#ifdef CONFIG_SCHED_BFS
-extern int rr_interval;
-extern int sched_iso_cpu;
-static int __read_mostly one_thousand = 1000;
-#endif
+static int one_hundred = 100;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -264,7 +259,7 @@ static struct ctl_table sysctl_base_table[] = {
{ }
};
-#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
+#ifdef CONFIG_SCHED_DEBUG
static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
@@ -281,7 +276,6 @@ static int max_extfrag_threshold = 1000;
#endif
static struct ctl_table kern_table[] = {
-#ifndef CONFIG_SCHED_BFS
{
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
@@ -355,15 +349,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "timer_migration",
- .data = &sysctl_timer_migration,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
- },
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
@@ -448,7 +433,6 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
-#endif /* !CONFIG_SCHED_BFS */
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -888,6 +872,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
{
+ .procname = "watchdog_cpumask",
+ .data = &watchdog_cpumask_bits,
+ .maxlen = NR_CPUS,
+ .mode = 0644,
+ .proc_handler = proc_watchdog_cpumask,
+ },
+ {
.procname = "softlockup_panic",
.data = &softlockup_panic,
.maxlen = sizeof(int),
@@ -979,26 +970,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_SCHED_BFS
- {
- .procname = "rr_interval",
- .data = &rr_interval,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &one_thousand,
- },
- {
- .procname = "iso_cpu",
- .data = &sched_iso_cpu,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one_hundred,
- },
-#endif
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
{
.procname = "spin_retry",
@@ -1159,6 +1130,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ {
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = timer_migration_handler,
+ },
+#endif
{ }
};
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 7ceb68656..579ce1b92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -89,7 +89,7 @@ config NO_HZ_IDLE
config NO_HZ_FULL
bool "Full dynticks system (tickless)"
# NO_HZ_COMMON dependency
- depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
# We need at least one periodic CPU for timekeeping
depends on SMP
# RCU_USER_QS dependency
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 01f031241..49eca0bee 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -12,20 +12,3 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
-
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
- cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
- $(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
- cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
- $(call if_changed,bc)
-
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1b001ed1e..7fbba635a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init);
* @alarm: ptr to alarm to set
* @start: time to run the alarm
*/
-int alarm_start(struct alarm *alarm, ktime_t start)
+void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
- int ret;
spin_lock_irqsave(&base->lock, flags);
alarm->node.expires = start;
alarmtimer_enqueue(base, alarm);
- ret = hrtimer_start(&alarm->timer, alarm->node.expires,
- HRTIMER_MODE_ABS);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
- return ret;
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start);
* @alarm: ptr to alarm to set
* @start: time relative to now to run the alarm
*/
-int alarm_start_relative(struct alarm *alarm, ktime_t start)
+void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
start = ktime_add(start, base->gettime());
- return alarm_start(alarm, start);
+ alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
*/
static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
- clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
-
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- return hrtimer_get_res(baseid, tp);
+ tp->tv_sec = 0;
+ tp->tv_nsec = hrtimer_resolution;
+ return 0;
}
/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 637a09461..50eb107f1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,8 +94,8 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
-static int __clockevents_set_state(struct clock_event_device *dev,
- enum clock_event_state state)
+static int __clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
/* Transition with legacy set_mode() callback */
if (dev->set_mode) {
@@ -120,19 +120,37 @@ static int __clockevents_set_state(struct clock_event_device *dev,
/* The clockevent device is getting replaced. Shut it down. */
case CLOCK_EVT_STATE_SHUTDOWN:
- return dev->set_state_shutdown(dev);
+ if (dev->set_state_shutdown)
+ return dev->set_state_shutdown(dev);
+ return 0;
case CLOCK_EVT_STATE_PERIODIC:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
return -ENOSYS;
- return dev->set_state_periodic(dev);
+ if (dev->set_state_periodic)
+ return dev->set_state_periodic(dev);
+ return 0;
case CLOCK_EVT_STATE_ONESHOT:
/* Core internal bug */
if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
return -ENOSYS;
- return dev->set_state_oneshot(dev);
+ if (dev->set_state_oneshot)
+ return dev->set_state_oneshot(dev);
+ return 0;
+
+ case CLOCK_EVT_STATE_ONESHOT_STOPPED:
+ /* Core internal bug */
+ if (WARN_ONCE(!clockevent_state_oneshot(dev),
+ "Current state: %d\n",
+ clockevent_get_state(dev)))
+ return -EINVAL;
+
+ if (dev->set_state_oneshot_stopped)
+ return dev->set_state_oneshot_stopped(dev);
+ else
+ return -ENOSYS;
default:
return -ENOSYS;
@@ -140,26 +158,26 @@ static int __clockevents_set_state(struct clock_event_device *dev,
}
/**
- * clockevents_set_state - set the operating state of a clock event device
+ * clockevents_switch_state - set the operating state of a clock event device
* @dev: device to modify
* @state: new state
*
* Must be called with interrupts disabled !
*/
-void clockevents_set_state(struct clock_event_device *dev,
- enum clock_event_state state)
+void clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state)
{
- if (dev->state != state) {
- if (__clockevents_set_state(dev, state))
+ if (clockevent_get_state(dev) != state) {
+ if (__clockevents_switch_state(dev, state))
return;
- dev->state = state;
+ clockevent_set_state(dev, state);
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
- if (state == CLOCK_EVT_STATE_ONESHOT) {
+ if (clockevent_state_oneshot(dev)) {
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
@@ -174,7 +192,7 @@ void clockevents_set_state(struct clock_event_device *dev,
*/
void clockevents_shutdown(struct clock_event_device *dev)
{
- clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event.tv64 = KTIME_MAX;
}
@@ -248,7 +266,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
dev->retries++;
@@ -285,7 +303,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
delta = dev->min_delta_ns;
dev->next_event = ktime_add_ns(ktime_get(), delta);
- if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
dev->retries++;
@@ -317,9 +335,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
dev->next_event = expires;
- if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ if (clockevent_state_shutdown(dev))
return 0;
+ /* We must be in ONESHOT state here */
+ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
+ clockevent_get_state(dev));
+
/* Shortcut for clockevent devices that can deal with ktime. */
if (dev->features & CLOCK_EVT_FEAT_KTIME)
return dev->set_next_ktime(expires, dev);
@@ -362,7 +384,7 @@ static int clockevents_replace(struct clock_event_device *ced)
struct clock_event_device *dev, *newdev = NULL;
list_for_each_entry(dev, &clockevent_devices, list) {
- if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
+ if (dev == ced || !clockevent_state_detached(dev))
continue;
if (!tick_check_replacement(newdev, dev))
@@ -388,7 +410,7 @@ static int clockevents_replace(struct clock_event_device *ced)
static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
/* Fast track. Device is unused */
- if (ced->state == CLOCK_EVT_STATE_DETACHED) {
+ if (clockevent_state_detached(ced)) {
list_del_init(&ced->list);
return 0;
}
@@ -445,7 +467,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev)
if (dev->set_mode) {
/* We shouldn't be supporting new modes now */
WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
- dev->set_state_shutdown || dev->tick_resume);
+ dev->set_state_shutdown || dev->tick_resume ||
+ dev->set_state_oneshot_stopped);
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
return 0;
@@ -454,18 +477,6 @@ static int clockevents_sanity_check(struct clock_event_device *dev)
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
- /* New state-specific callbacks */
- if (!dev->set_state_shutdown)
- return -EINVAL;
-
- if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
- !dev->set_state_periodic)
- return -EINVAL;
-
- if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
- !dev->set_state_oneshot)
- return -EINVAL;
-
return 0;
}
@@ -480,7 +491,7 @@ void clockevents_register_device(struct clock_event_device *dev)
BUG_ON(clockevents_sanity_check(dev));
/* Initialize state to DETACHED */
- dev->state = CLOCK_EVT_STATE_DETACHED;
+ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
@@ -545,11 +556,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
clockevents_config(dev, freq);
- if (dev->state == CLOCK_EVT_STATE_ONESHOT)
+ if (clockevent_state_oneshot(dev))
return clockevents_program_event(dev, dev->next_event, false);
- if (dev->state == CLOCK_EVT_STATE_PERIODIC)
- return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+ if (clockevent_state_periodic(dev))
+ return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
return 0;
}
@@ -603,13 +614,13 @@ void clockevents_exchange_device(struct clock_event_device *old,
*/
if (old) {
module_put(old->owner);
- clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
+ clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
}
if (new) {
- BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
+ BUG_ON(!clockevent_state_detached(new));
clockevents_shutdown(new);
}
}
@@ -622,7 +633,7 @@ void clockevents_suspend(void)
struct clock_event_device *dev;
list_for_each_entry_reverse(dev, &clockevent_devices, list)
- if (dev->suspend)
+ if (dev->suspend && !clockevent_state_detached(dev))
dev->suspend(dev);
}
@@ -634,7 +645,7 @@ void clockevents_resume(void)
struct clock_event_device *dev;
list_for_each_entry(dev, &clockevent_devices, list)
- if (dev->resume)
+ if (dev->resume && !clockevent_state_detached(dev))
dev->resume(dev);
}
@@ -665,7 +676,7 @@ void tick_cleanup_dead_cpu(int cpu)
if (cpumask_test_cpu(cpu, dev->cpumask) &&
cpumask_weight(dev->cpumask) == 1 &&
!tick_is_broadcast_device(dev)) {
- BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
+ BUG_ON(!clockevent_state_detached(dev));
list_del(&dev->list);
}
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 15facb1b9..841b72f72 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,6 +23,8 @@
* o Allow clocksource drivers to be unregistered
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/device.h>
#include <linux/clocksource.h>
#include <linux/init.h>
@@ -216,10 +218,11 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
- pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
- pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
+ cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, csnow, cslast, cs->mask);
__clocksource_unstable(cs);
continue;
@@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur)
*/
if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
/* Override clocksource cannot be used. */
- printk(KERN_WARNING "Override clocksource %s is not "
- "HRT compatible. Cannot switch while in "
- "HRT/NOHZ mode\n", cs->name);
+ pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+ cs->name);
override_name[0] = 0;
} else
/* Override clocksource can be used. */
@@ -708,8 +710,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocksource_update_max_deferment(cs);
- pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
- cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
+ pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+ cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
@@ -1008,12 +1010,10 @@ __setup("clocksource=", boot_override_clocksource);
static int __init boot_override_clock(char* str)
{
if (!strcmp(str, "pmtmr")) {
- printk("Warning: clock=pmtmr is deprecated. "
- "Use clocksource=acpi_pm.\n");
+ pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
return boot_override_clocksource("acpi_pm");
}
- printk("Warning! clock= boot option is deprecated. "
- "Use clocksource=xyz\n");
+ pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
return boot_override_clocksource(str);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 93ef7190b..5c7ae4b64 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -66,33 +66,29 @@
*/
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
-
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+ .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
.get_time = &ktime_get,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
- .resolution = KTIME_LOW_RES,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
- .resolution = KTIME_LOW_RES,
},
}
};
@@ -109,27 +105,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
return hrtimer_clock_to_base_table[clock_id];
}
-
-/*
- * Get the coarse grained time at the softirq based on xtime and
- * wall_to_monotonic.
- */
-static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
-{
- ktime_t xtim, mono, boot, tai;
- ktime_t off_real, off_boot, off_tai;
-
- mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
- boot = ktime_add(mono, off_boot);
- xtim = ktime_add(mono, off_real);
- tai = ktime_add(mono, off_tai);
-
- base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
- base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
- base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
- base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
-}
-
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -137,6 +112,18 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
#ifdef CONFIG_SMP
/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ *
+ * Only .seq and the first clock base are initialised explicitly; that
+ * clock base's cpu_base pointer refers back to migration_cpu_base itself.
+ * The migration_base macro below is shorthand for that first clock base.
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+ .seq = SEQCNT_ZERO(migration_cpu_base),
+ .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base migration_cpu_base.clock_base[0]
+
+/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
@@ -145,8 +132,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
* be found on the lists/queues.
*
* When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
*/
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -156,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
for (;;) {
base = timer->base;
- if (likely(base != NULL)) {
+ if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
@@ -190,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
#endif
}
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+ int pinned)
+{
+ if (pinned || !base->migration_enabled)
+ return this_cpu_ptr(&hrtimer_bases);
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+}
+#else
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+ int pinned)
+{
+ return this_cpu_ptr(&hrtimer_bases);
+}
+#endif
+
/*
* Switch the timer base to the current CPU when possible.
*/
@@ -197,14 +202,13 @@ static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
int pinned)
{
+ struct hrtimer_cpu_base *new_cpu_base, *this_base;
struct hrtimer_clock_base *new_base;
- struct hrtimer_cpu_base *new_cpu_base;
- int this_cpu = smp_processor_id();
- int cpu = get_nohz_timer_target(pinned);
int basenum = base->index;
+ this_base = this_cpu_ptr(&hrtimer_bases);
+ new_cpu_base = get_target_base(this_base, pinned);
again:
- new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[basenum];
if (base != new_base) {
@@ -220,22 +224,24 @@ again:
if (unlikely(hrtimer_callback_running(timer)))
return base;
- /* See the comment in lock_timer_base() */
- timer->base = NULL;
+ /* See the comment in lock_hrtimer_base() */
+ timer->base = &migration_base;
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_base &&
+ hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
+ new_cpu_base = this_base;
timer->base = base;
goto again;
}
timer->base = new_base;
} else {
- if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
- cpu = this_cpu;
+ if (new_cpu_base != this_base &&
+ hrtimer_check_target(timer, new_base)) {
+ new_cpu_base = this_base;
goto again;
}
}
@@ -443,24 +449,35 @@ static inline void debug_deactivate(struct hrtimer *timer)
}
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer *timer)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ cpu_base->next_timer = timer;
+#endif
+}
+
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
{
struct hrtimer_clock_base *base = cpu_base->clock_base;
ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
- int i;
+ unsigned int active = cpu_base->active_bases;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ hrtimer_update_next_timer(cpu_base, NULL);
+ for (; active; base++, active >>= 1) {
struct timerqueue_node *next;
struct hrtimer *timer;
- next = timerqueue_getnext(&base->active);
- if (!next)
+ if (!(active & 0x01))
continue;
+ next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires.tv64 < expires_next.tv64)
+ if (expires.tv64 < expires_next.tv64) {
expires_next = expires;
+ hrtimer_update_next_timer(cpu_base, timer);
+ }
}
/*
* clock_was_set() might have changed base->offset of any of
@@ -473,6 +490,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
}
#endif
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+ ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+ return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+ offs_real, offs_boot, offs_tai);
+}
+
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -480,6 +507,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
* High resolution timer enabled ?
*/
static int hrtimer_hres_enabled __read_mostly = 1;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
/*
* Enable / Disable high resolution mode
@@ -508,9 +537,14 @@ static inline int hrtimer_is_hres_enabled(void)
/*
* Is the high resolution mode active ?
*/
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+ return cpu_base->hres_active;
+}
+
static inline int hrtimer_hres_active(void)
{
- return __this_cpu_read(hrtimer_bases.hres_active);
+ return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}
/*
@@ -521,7 +555,12 @@ static inline int hrtimer_hres_active(void)
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
- ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+ ktime_t expires_next;
+
+ if (!cpu_base->hres_active)
+ return;
+
+ expires_next = __hrtimer_get_next_event(cpu_base);
if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
return;
@@ -545,63 +584,53 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
if (cpu_base->hang_detected)
return;
- if (cpu_base->expires_next.tv64 != KTIME_MAX)
- tick_program_event(cpu_base->expires_next, 1);
+ tick_program_event(cpu_base->expires_next, 1);
}
/*
- * Shared reprogramming for clock_realtime and clock_monotonic
- *
* When a timer is enqueued and expires earlier than the already enqueued
* timers, we have to check, whether it expires earlier than the timer for
* which the clock event device was armed.
*
- * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
- * and no expiry check happens. The timer gets enqueued into the rbtree. The
- * reprogramming and expiry check is done in the hrtimer_interrupt or in the
- * softirq.
- *
* Called with interrupts disabled and base->cpu_base.lock held
*/
-static int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+static void hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- int res;
WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
/*
- * When the callback is running, we do not reprogram the clock event
- * device. The timer callback is either running on a different CPU or
- * the callback is executed in the hrtimer_interrupt context. The
- * reprogramming is handled either by the softirq, which called the
- * callback or at the end of the hrtimer_interrupt.
+ * If the timer is not on the current cpu, we cannot reprogram
+ * the other cpus clock event device.
*/
- if (hrtimer_callback_running(timer))
- return 0;
+ if (base->cpu_base != cpu_base)
+ return;
+
+ /*
+ * If the hrtimer interrupt is running, then it will
+ * reevaluate the clock bases and reprogram the clock event
+ * device. The callbacks are always executed in hard interrupt
+ * context so we don't need an extra check for a running
+ * callback.
+ */
+ if (cpu_base->in_hrtirq)
+ return;
/*
* CLOCK_REALTIME timer might be requested with an absolute
- * expiry time which is less than base->offset. Nothing wrong
- * about that, just avoid to call into the tick code, which
- * has now objections against negative expiry values.
+ * expiry time which is less than base->offset. Set it to 0.
*/
if (expires.tv64 < 0)
- return -ETIME;
+ expires.tv64 = 0;
if (expires.tv64 >= cpu_base->expires_next.tv64)
- return 0;
+ return;
- /*
- * When the target cpu of the timer is currently executing
- * hrtimer_interrupt(), then we do not touch the clock event
- * device. hrtimer_interrupt() will reevaluate all clock bases
- * before reprogramming the device.
- */
- if (cpu_base->in_hrtirq)
- return 0;
+ /* Update the pointer to the next expiring timer */
+ cpu_base->next_timer = timer;
/*
* If a hang was detected in the last timer interrupt then we
@@ -610,15 +639,14 @@ static int hrtimer_reprogram(struct hrtimer *timer,
* to make progress.
*/
if (cpu_base->hang_detected)
- return 0;
+ return;
/*
- * Clockevents returns -ETIME, when the event was in the past.
+ * Program the timer hardware. We enforce the expiry for
+ * events which are already in the past.
*/
- res = tick_program_event(expires, 0);
- if (!IS_ERR_VALUE(res))
- cpu_base->expires_next = expires;
- return res;
+ cpu_base->expires_next = expires;
+ tick_program_event(expires, 1);
}
/*
@@ -630,15 +658,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
base->hres_active = 0;
}
-static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
-{
- ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
- ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
- ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-
- return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
-}
-
/*
* Retrigger next event is called after clock was set
*
@@ -648,7 +667,7 @@ static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
- if (!hrtimer_hres_active())
+ if (!base->hres_active)
return;
raw_spin_lock(&base->lock);
@@ -662,29 +681,19 @@ static void retrigger_next_event(void *arg)
*/
static int hrtimer_switch_to_hres(void)
{
- int i, cpu = smp_processor_id();
- struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
- unsigned long flags;
-
- if (base->hres_active)
- return 1;
-
- local_irq_save(flags);
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
- local_irq_restore(flags);
printk(KERN_WARNING "Could not switch to high resolution "
- "mode on CPU %d\n", cpu);
+ "mode on CPU %d\n", base->cpu);
return 0;
}
base->hres_active = 1;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
- base->clock_base[i].resolution = KTIME_HIGH_RES;
+ hrtimer_resolution = HIGH_RES_NSEC;
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
- local_irq_restore(flags);
return 1;
}
@@ -706,6 +715,7 @@ void clock_was_set_delayed(void)
#else
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline int hrtimer_switch_to_hres(void) { return 0; }
@@ -803,6 +813,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
*
* Forward the timer expiry so it will expire in the future.
* Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
@@ -814,8 +832,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
if (delta.tv64 < 0)
return 0;
- if (interval.tv64 < timer->base->resolution.tv64)
- interval.tv64 = timer->base->resolution.tv64;
+ if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+ return 0;
+
+ if (interval.tv64 < hrtimer_resolution)
+ interval.tv64 = hrtimer_resolution;
if (unlikely(delta.tv64 >= interval.tv64)) {
s64 incr = ktime_to_ns(interval);
@@ -849,16 +870,11 @@ static int enqueue_hrtimer(struct hrtimer *timer,
{
debug_activate(timer);
- timerqueue_add(&base->active, &timer->node);
base->cpu_base->active_bases |= 1 << base->index;
- /*
- * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
- * state of a possibly running callback.
- */
- timer->state |= HRTIMER_STATE_ENQUEUED;
+ timer->state = HRTIMER_STATE_ENQUEUED;
- return (&timer->node == base->active.next);
+ return timerqueue_add(&base->active, &timer->node);
}
/*
@@ -875,39 +891,38 @@ static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
unsigned long newstate, int reprogram)
{
- struct timerqueue_node *next_timer;
- if (!(timer->state & HRTIMER_STATE_ENQUEUED))
- goto out;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+ unsigned int state = timer->state;
+
+ timer->state = newstate;
+ if (!(state & HRTIMER_STATE_ENQUEUED))
+ return;
+
+ if (!timerqueue_del(&base->active, &timer->node))
+ cpu_base->active_bases &= ~(1 << base->index);
- next_timer = timerqueue_getnext(&base->active);
- timerqueue_del(&base->active, &timer->node);
- if (&timer->node == next_timer) {
#ifdef CONFIG_HIGH_RES_TIMERS
- /* Reprogram the clock event device. if enabled */
- if (reprogram && hrtimer_hres_active()) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (base->cpu_base->expires_next.tv64 == expires.tv64)
- hrtimer_force_reprogram(base->cpu_base, 1);
- }
+ /*
+ * Note: If reprogram is false we do not update
+ * cpu_base->next_timer. This happens when we remove the first
+ * timer on a remote cpu. No harm as we never dereference
+ * cpu_base->next_timer. So the worst thing that can happen is
+ * a superfluous call to hrtimer_force_reprogram() on the
+ * remote cpu later on if the same timer gets enqueued again.
+ */
+ if (reprogram && timer == cpu_base->next_timer)
+ hrtimer_force_reprogram(cpu_base, 1);
#endif
- }
- if (!timerqueue_getnext(&base->active))
- base->cpu_base->active_bases &= ~(1 << base->index);
-out:
- timer->state = newstate;
}
/*
* remove hrtimer, called with base lock held
*/
static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state;
+ unsigned long state = timer->state;
int reprogram;
/*
@@ -921,30 +936,35 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * We must preserve the CALLBACK state flag here,
- * otherwise we could move the timer base in
- * switch_hrtimer_base.
- */
- state = timer->state & HRTIMER_STATE_CALLBACK;
+
+ if (!restart)
+ state = HRTIMER_STATE_INACTIVE;
+
__remove_hrtimer(timer, base, state, reprogram);
return 1;
}
return 0;
}
-int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode,
- int wakeup)
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
- int ret, leftmost;
+ int leftmost;
base = lock_hrtimer_base(timer, &flags);
/* Remove an active timer from the queue: */
- ret = remove_hrtimer(timer, base);
+ remove_hrtimer(timer, base, true);
if (mode & HRTIMER_MODE_REL) {
tim = ktime_add_safe(tim, base->get_time());
@@ -956,7 +976,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* timeouts. This will go away with the GTOD framework.
*/
#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add_safe(tim, base->resolution);
+ tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
#endif
}
@@ -968,85 +988,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
timer_stats_hrtimer_set_start_info(timer);
leftmost = enqueue_hrtimer(timer, new_base);
-
- if (!leftmost) {
- unlock_hrtimer_base(timer, &flags);
- return ret;
- }
+ if (!leftmost)
+ goto unlock;
if (!hrtimer_is_hres_active(timer)) {
/*
* Kick to reschedule the next tick to handle the new timer
* on dynticks target.
*/
- wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
- hrtimer_reprogram(timer, new_base)) {
- /*
- * Only allow reprogramming if the new base is on this CPU.
- * (it might still be on another CPU if the timer was pending)
- *
- * XXX send_remote_softirq() ?
- */
- if (wakeup) {
- /*
- * We need to drop cpu_base->lock to avoid a
- * lock ordering issue vs. rq->lock.
- */
- raw_spin_unlock(&new_base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- local_irq_restore(flags);
- return ret;
- } else {
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- }
+ if (new_base->cpu_base->nohz_active)
+ wake_up_nohz_cpu(new_base->cpu_base->cpu);
+ } else {
+ hrtimer_reprogram(timer, new_base);
}
-
+unlock:
unlock_hrtimer_base(timer, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
-
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode)
-{
- return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
/**
- * hrtimer_start - (re)start an hrtimer on the current CPU
- * @timer: the timer to be added
- * @tim: expiry time
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
- *
- * Returns:
- * 0 on success
- * 1 when the timer was active
- */
-int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
-{
- return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
-}
-EXPORT_SYMBOL_GPL(hrtimer_start);
-
-
-/**
* hrtimer_try_to_cancel - try to deactivate a timer
* @timer: hrtimer to stop
*
@@ -1062,10 +1022,19 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
unsigned long flags;
int ret = -1;
+ /*
+ * Check lockless first. If the timer is not active (neither
+ * enqueued nor running the callback), nothing to do here. The
+ * base lock does not serialize against a concurrent enqueue,
+ * so we can avoid taking it.
+ */
+ if (!hrtimer_active(timer))
+ return 0;
+
base = lock_hrtimer_base(timer, &flags);
if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base);
+ ret = remove_hrtimer(timer, base, false);
unlock_hrtimer_base(timer, &flags);
@@ -1115,26 +1084,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
/**
* hrtimer_get_next_event - get the time until next expiry event
*
- * Returns the delta to the next expiry event or KTIME_MAX if no timer
- * is pending.
+ * Returns the next expiry time or KTIME_MAX if no timer is pending.
*/
-ktime_t hrtimer_get_next_event(void)
+u64 hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t mindelta = { .tv64 = KTIME_MAX };
+ u64 expires = KTIME_MAX;
unsigned long flags;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
- if (!hrtimer_hres_active())
- mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
- ktime_get());
+ if (!__hrtimer_hres_active(cpu_base))
+ expires = __hrtimer_get_next_event(cpu_base).tv64;
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
- if (mindelta.tv64 < 0)
- mindelta.tv64 = 0;
- return mindelta;
+ return expires;
}
#endif
@@ -1176,37 +1141,73 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
}
EXPORT_SYMBOL_GPL(hrtimer_init);
-/**
- * hrtimer_get_res - get the timer resolution for a clock
- * @which_clock: which clock to query
- * @tp: pointer to timespec variable to store the resolution
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
*
- * Store the resolution of the clock selected by @which_clock in the
- * variable pointed to by @tp.
+ * It is important for this function to not return a false negative.
*/
-int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+bool hrtimer_active(const struct hrtimer *timer)
{
struct hrtimer_cpu_base *cpu_base;
- int base = hrtimer_clockid_to_base(which_clock);
+ unsigned int seq;
- cpu_base = raw_cpu_ptr(&hrtimer_bases);
- *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
+ do {
+ cpu_base = READ_ONCE(timer->base->cpu_base);
+ seq = raw_read_seqcount_begin(&cpu_base->seq);
- return 0;
+ if (timer->state != HRTIMER_STATE_INACTIVE ||
+ cpu_base->running == timer)
+ return true;
+
+ } while (read_seqcount_retry(&cpu_base->seq, seq) ||
+ cpu_base != READ_ONCE(timer->base->cpu_base));
+
+ return false;
}
-EXPORT_SYMBOL_GPL(hrtimer_get_res);
+EXPORT_SYMBOL_GPL(hrtimer_active);
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ * - queued: the timer is queued
+ * - callback: the timer is being run
+ * - post: the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consecutive
+ * __run_hrtimer() invocations.
+ */
+
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+ struct hrtimer_clock_base *base,
+ struct hrtimer *timer, ktime_t *now)
{
- struct hrtimer_clock_base *base = timer->base;
- struct hrtimer_cpu_base *cpu_base = base->cpu_base;
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ cpu_base->running = timer;
+
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
+
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
@@ -1222,58 +1223,43 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
raw_spin_lock(&cpu_base->lock);
/*
- * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+ * Note: We clear the running state after enqueue_hrtimer and
* we do not reprogramm the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
+ *
+ * Note: Because we dropped the cpu_base->lock above,
+ * hrtimer_start_range_ns() can have popped in and enqueued the timer
+ * for us already.
*/
- if (restart != HRTIMER_NORESTART) {
- BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+ if (restart != HRTIMER_NORESTART &&
+ !(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base);
- }
- WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+ /*
+ * Separate the ->running assignment from the ->state assignment.
+ *
+ * As with a regular write barrier, this ensures the read side in
+ * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * timer->state == INACTIVE.
+ */
+ raw_write_seqcount_barrier(&cpu_base->seq);
- timer->state &= ~HRTIMER_STATE_CALLBACK;
+ WARN_ON_ONCE(cpu_base->running != timer);
+ cpu_base->running = NULL;
}
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer interrupt
- * Called with interrupts disabled
- */
-void hrtimer_interrupt(struct clock_event_device *dev)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
{
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t expires_next, now, entry_time, delta;
- int i, retries = 0;
-
- BUG_ON(!cpu_base->hres_active);
- cpu_base->nr_events++;
- dev->next_event.tv64 = KTIME_MAX;
-
- raw_spin_lock(&cpu_base->lock);
- entry_time = now = hrtimer_update_base(cpu_base);
-retry:
- cpu_base->in_hrtirq = 1;
- /*
- * We set expires_next to KTIME_MAX here with cpu_base->lock
- * held to prevent that a timer is enqueued in our queue via
- * the migration code. This does not affect enqueueing of
- * timers which run their callback and need to be requeued on
- * this CPU.
- */
- cpu_base->expires_next.tv64 = KTIME_MAX;
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ unsigned int active = cpu_base->active_bases;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- struct hrtimer_clock_base *base;
+ for (; active; base++, active >>= 1) {
struct timerqueue_node *node;
ktime_t basenow;
- if (!(cpu_base->active_bases & (1 << i)))
+ if (!(active & 0x01))
continue;
- base = cpu_base->clock_base + i;
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1296,9 +1282,42 @@ retry:
if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
break;
- __run_hrtimer(timer, &basenow);
+ __run_hrtimer(cpu_base, base, timer, &basenow);
}
}
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires_next, now, entry_time, delta;
+ int retries = 0;
+
+ BUG_ON(!cpu_base->hres_active);
+ cpu_base->nr_events++;
+ dev->next_event.tv64 = KTIME_MAX;
+
+ raw_spin_lock(&cpu_base->lock);
+ entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+ cpu_base->in_hrtirq = 1;
+ /*
+ * We set expires_next to KTIME_MAX here with cpu_base->lock
+ * held to prevent that a timer is enqueued in our queue via
+ * the migration code. This does not affect enqueueing of
+ * timers which run their callback and need to be requeued on
+ * this CPU.
+ */
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
+ __hrtimer_run_queues(cpu_base, now);
+
/* Reevaluate the clock bases for the next expiry */
expires_next = __hrtimer_get_next_event(cpu_base);
/*
@@ -1310,8 +1329,7 @@ retry:
raw_spin_unlock(&cpu_base->lock);
/* Reprogramming necessary ? */
- if (expires_next.tv64 == KTIME_MAX ||
- !tick_program_event(expires_next, 0)) {
+ if (!tick_program_event(expires_next, 0)) {
cpu_base->hang_detected = 0;
return;
}
@@ -1344,8 +1362,8 @@ retry:
cpu_base->hang_detected = 1;
raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
- if (delta.tv64 > cpu_base->max_hang_time.tv64)
- cpu_base->max_hang_time = delta;
+ if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+ cpu_base->max_hang_time = (unsigned int) delta.tv64;
/*
* Limit it to a sensible value as we enforce a longer
* delay. Give the CPU at least 100ms to catch up.
@@ -1363,7 +1381,7 @@ retry:
* local version of hrtimer_peek_ahead_timers() called with interrupts
* disabled.
*/
-static void __hrtimer_peek_ahead_timers(void)
+static inline void __hrtimer_peek_ahead_timers(void)
{
struct tick_device *td;
@@ -1375,29 +1393,6 @@ static void __hrtimer_peek_ahead_timers(void)
hrtimer_interrupt(td->evtdev);
}
-/**
- * hrtimer_peek_ahead_timers -- run soft-expired timers now
- *
- * hrtimer_peek_ahead_timers will peek at the timer queue of
- * the current cpu and check if there are any timers for which
- * the soft expires time has passed. If any such timers exist,
- * they are run immediately and then removed from the timer queue.
- *
- */
-void hrtimer_peek_ahead_timers(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __hrtimer_peek_ahead_timers();
- local_irq_restore(flags);
-}
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
- hrtimer_peek_ahead_timers();
-}
-
#else /* CONFIG_HIGH_RES_TIMERS */
static inline void __hrtimer_peek_ahead_timers(void) { }
@@ -1405,66 +1400,32 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
+ * Called from run_local_timers in hardirq context every jiffy
*/
-void hrtimer_run_pending(void)
+void hrtimer_run_queues(void)
{
- if (hrtimer_hres_active())
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t now;
+
+ if (__hrtimer_hres_active(cpu_base))
return;
/*
- * This _is_ ugly: We have to check in the softirq context,
- * whether we can switch to highres and / or nohz mode. The
- * clocksource switch happens in the timer interrupt with
- * xtime_lock held. Notification from there only sets the
- * check bit in the tick_oneshot code, otherwise we might
- * deadlock vs. xtime_lock.
+ * This _is_ ugly: We have to check periodically, whether we
+ * can switch to highres and / or nohz mode. The clocksource
+ * switch happens with xtime_lock held. Notification from
+ * there only sets the check bit in the tick_oneshot code,
+ * otherwise we might deadlock vs. xtime_lock.
*/
- if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
hrtimer_switch_to_hres();
-}
-
-/*
- * Called from hardirq context every jiffy
- */
-void hrtimer_run_queues(void)
-{
- struct timerqueue_node *node;
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *base;
- int index, gettime = 1;
-
- if (hrtimer_hres_active())
return;
-
- for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
- base = &cpu_base->clock_base[index];
- if (!timerqueue_getnext(&base->active))
- continue;
-
- if (gettime) {
- hrtimer_get_softirq_time(cpu_base);
- gettime = 0;
- }
-
- raw_spin_lock(&cpu_base->lock);
-
- while ((node = timerqueue_getnext(&base->active))) {
- struct hrtimer *timer;
-
- timer = container_of(node, struct hrtimer, node);
- if (base->softirq_time.tv64 <=
- hrtimer_get_expires_tv64(timer))
- break;
-
- __run_hrtimer(timer, &base->softirq_time);
- }
- raw_spin_unlock(&cpu_base->lock);
}
+
+ raw_spin_lock(&cpu_base->lock);
+ now = hrtimer_update_base(cpu_base);
+ __hrtimer_run_queues(cpu_base, now);
+ raw_spin_unlock(&cpu_base->lock);
}
/*
@@ -1497,8 +1458,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start_expires(&t->timer, mode);
- if (!hrtimer_active(&t->timer))
- t->task = NULL;
if (likely(t->task))
freezable_schedule();
@@ -1642,11 +1601,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
debug_deactivate(timer);
/*
- * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -1657,9 +1616,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* event device.
*/
enqueue_hrtimer(timer, new_base);
-
- /* Clear the migration state bit */
- timer->state &= ~HRTIMER_STATE_MIGRATE;
}
}
@@ -1731,9 +1687,6 @@ void __init hrtimers_init(void)
hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
- open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
}
/**
@@ -1772,8 +1725,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
hrtimer_init_sleeper(&t, current);
hrtimer_start_expires(&t.timer, mode);
- if (!hrtimer_active(&t.timer))
- t.task = NULL;
if (likely(t.task))
schedule();
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7a6810030..fb4d98c7f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -35,6 +35,7 @@ unsigned long tick_nsec;
static u64 tick_length;
static u64 tick_length_base;
+#define SECS_PER_DAY 86400
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -76,6 +77,9 @@ static long time_adjust;
/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64 ntp_tick_adj;
+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t ntp_next_leap_sec = TIME64_MAX;
+
#ifdef CONFIG_NTP_PPS
/*
@@ -349,6 +353,7 @@ void ntp_clear(void)
tick_length = tick_length_base;
time_offset = 0;
+ ntp_next_leap_sec = TIME64_MAX;
/* Clear PPS state variables */
pps_clear();
}
@@ -359,6 +364,21 @@ u64 ntp_tick_length(void)
return tick_length;
}
+/**
+ * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ *
+ * Provides the time of the next leapsecond against CLOCK_REALTIME in
+ * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ */
+ktime_t ntp_get_next_leap(void)
+{
+ ktime_t ret;
+
+ if ((time_state == TIME_INS) && (time_status & STA_INS))
+ return ktime_set(ntp_next_leap_sec, 0);
+ ret.tv64 = KTIME_MAX;
+ return ret;
+}
/*
* this routine handles the overflow of the microsecond field
@@ -382,15 +402,21 @@ int second_overflow(unsigned long secs)
*/
switch (time_state) {
case TIME_OK:
- if (time_status & STA_INS)
+ if (time_status & STA_INS) {
time_state = TIME_INS;
- else if (time_status & STA_DEL)
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ (secs % SECS_PER_DAY);
+ } else if (time_status & STA_DEL) {
time_state = TIME_DEL;
+ ntp_next_leap_sec = secs + SECS_PER_DAY -
+ ((secs+1) % SECS_PER_DAY);
+ }
break;
case TIME_INS:
- if (!(time_status & STA_INS))
+ if (!(time_status & STA_INS)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if (secs % 86400 == 0) {
+ } else if (secs % SECS_PER_DAY == 0) {
leap = -1;
time_state = TIME_OOP;
printk(KERN_NOTICE
@@ -398,19 +424,21 @@ int second_overflow(unsigned long secs)
}
break;
case TIME_DEL:
- if (!(time_status & STA_DEL))
+ if (!(time_status & STA_DEL)) {
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_OK;
- else if ((secs + 1) % 86400 == 0) {
+ } else if ((secs + 1) % SECS_PER_DAY == 0) {
leap = 1;
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
}
break;
case TIME_OOP:
+ ntp_next_leap_sec = TIME64_MAX;
time_state = TIME_WAIT;
break;
-
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
@@ -547,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
+ ntp_next_leap_sec = TIME64_MAX;
/* restart PPS frequency calibration */
pps_reset_freq_interval();
}
@@ -711,6 +740,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
+ /* Handle leapsec adjustments */
+ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
+ if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+ result = TIME_OOP;
+ txc->tai++;
+ txc->time.tv_sec--;
+ }
+ if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+ result = TIME_WAIT;
+ txc->tai--;
+ txc->time.tv_sec++;
+ }
+ if ((time_state == TIME_OOP) &&
+ (ts->tv_sec == ntp_next_leap_sec)) {
+ result = TIME_WAIT;
+ }
+ }
+
return result;
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index bbd102ad9..65430504c 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -5,6 +5,7 @@ extern void ntp_init(void);
extern void ntp_clear(void);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
extern u64 ntp_tick_length(void);
+extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(unsigned long secs);
extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0ac829b48..892e3dae0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
return 0;
}
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
{
- if (b->utime > a->utime)
- a->utime = b->utime;
+ u64 curr_cputime;
+retry:
+ curr_cputime = atomic64_read(cputime);
+ if (sum_cputime > curr_cputime) {
+ if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+ goto retry;
+ }
+}
- if (b->stime > a->stime)
- a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+ __update_gt_cputime(&cputime_atomic->utime, sum->utime);
+ __update_gt_cputime(&cputime_atomic->stime, sum->stime);
+ __update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
- if (b->sum_exec_runtime > a->sum_exec_runtime)
- a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_times", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+ struct task_cputime_atomic *atomic_times)
+{
+ times->utime = atomic64_read(&atomic_times->utime);
+ times->stime = atomic64_read(&atomic_times->stime);
+ times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
}
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
struct task_cputime sum;
- unsigned long flags;
- if (!cputimer->running) {
+ /* Check if cputimer isn't running. This is accessed without locking. */
+ if (!READ_ONCE(cputimer->running)) {
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
- * to synchronize the timer to the clock every time we start
- * it.
+ * to synchronize the timer to the clock every time we start it.
*/
thread_group_cputime(tsk, &sum);
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 1;
- update_gt_cputime(&cputimer->cputime, &sum);
- } else
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- *times = cputimer->cputime;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+ /*
+ * We're setting cputimer->running without a lock. Ensure
+ * this only gets written to in one operation. We set
+ * running after update_gt_cputime() as a small optimization,
+ * but barriers are not required because update_gt_cputime()
+ * can handle concurrent updates.
+ */
+ WRITE_ONCE(cputimer->running, 1);
+ }
+ sample_cputime_atomic(times, &cputimer->cputime_atomic);
}
/*
@@ -425,7 +448,7 @@ static void cleanup_timers(struct list_head *head)
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
- add_device_randomness((const void*) &tsk_seruntime(tsk),
+ add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
sizeof(unsigned long long));
cleanup_timers(tsk->cpu_timers);
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
if (!task_cputime_zero(&tsk->cputime_expires))
return false;
- if (tsk->signal->cputimer.running)
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(tsk->signal->cputimer.running))
return false;
return true;
@@ -847,18 +871,18 @@ static void check_thread_timers(struct task_struct *tsk,
tsk_expires->virt_exp = expires_to_cputime(expires);
tsk_expires->sched_exp = check_timers_list(++timers, firing,
- tsk_seruntime(tsk));
+ tsk->se.sum_exec_runtime);
/*
* Check for the special case thread timers.
*/
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
if (hard != RLIM_INFINITY &&
- tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+ tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
@@ -866,7 +890,7 @@ static void check_thread_timers(struct task_struct *tsk,
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
- if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+ if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
}
}
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
{
struct thread_group_cputimer *cputimer = &sig->cputimer;
- unsigned long flags;
- raw_spin_lock_irqsave(&cputimer->lock, flags);
- cputimer->running = 0;
- raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+ /* Turn off cputimer->running. This is done without locking. */
+ WRITE_ONCE(cputimer->running, 0);
}
static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
SIGPROF);
check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
SIGVTALRM);
- soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (soft != RLIM_INFINITY) {
unsigned long psecs = cputime_to_secs(ptime);
unsigned long hard =
- ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
cputime_t x;
if (psecs >= hard) {
/*
@@ -1103,7 +1125,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
struct task_cputime task_sample = {
.utime = utime,
.stime = stime,
- .sum_exec_runtime = tsk_seruntime(tsk)
+ .sum_exec_runtime = tsk->se.sum_exec_runtime
};
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
}
sig = tsk->signal;
- if (sig->cputimer.running) {
+ /* Check if cputimer is running. This is accessed without locking. */
+ if (READ_ONCE(sig->cputimer.running)) {
struct task_cputime group_sample;
- raw_spin_lock(&sig->cputimer.lock);
- group_sample = sig->cputimer.cputime;
- raw_spin_unlock(&sig->cputimer.lock);
+ sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* If there are any active process wide timers (POSIX 1.b, itimers,
* RLIMIT_CPU) cputimer must be running.
*/
- if (tsk->signal->cputimer.running)
+ if (READ_ONCE(tsk->signal->cputimer.running))
check_process_timers(tsk, &firing);
/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31ea01f42..31d11ac9f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
return 0;
}
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+{
+ tp->tv_sec = 0;
+ tp->tv_nsec = hrtimer_resolution;
+ return 0;
+}
+
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
{
struct k_clock clock_realtime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_clock_realtime_get,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
@@ -290,7 +297,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_monotonic = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_ktime_get_ts,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
@@ -300,7 +307,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_monotonic_raw = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_monotonic_raw,
};
struct k_clock clock_realtime_coarse = {
@@ -312,7 +319,7 @@ static __init int init_posix_timers(void)
.clock_get = posix_get_monotonic_coarse,
};
struct k_clock clock_tai = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_tai,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
@@ -322,7 +329,7 @@ static __init int init_posix_timers(void)
.timer_del = common_timer_del,
};
struct k_clock clock_boottime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = posix_get_hrtimer_res,
.clock_get = posix_get_boottime,
.nsleep = common_nsleep,
.nsleep_restart = hrtimer_nanosleep_restart,
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 6aac4beed..3e7db49a2 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -22,6 +22,7 @@ static void bc_set_mode(enum clock_event_mode mode,
struct clock_event_device *bc)
{
switch (mode) {
+ case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
/*
* Note, we cannot cancel the timer here as we might
@@ -66,9 +67,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
* hrtimer_{start/cancel} functions call into tracing,
* calls to these functions must be bound within RCU_NONIDLE.
*/
- RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
- !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
- 0);
+ RCU_NONIDLE({
+ bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
+ if (bc_moved)
+ hrtimer_start(&bctimer, expires,
+ HRTIMER_MODE_ABS_PINNED);});
if (bc_moved) {
/* Bind the "device" to the cpu */
bc->bound_on = smp_processor_id();
@@ -99,10 +102,13 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+ switch (ce_broadcast_hrtimer.mode) {
+ case CLOCK_EVT_MODE_ONESHOT:
+ if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+ return HRTIMER_RESTART;
+ default: /* also reached by fall-through from ONESHOT when next_event is KTIME_MAX */
return HRTIMER_NORESTART;
-
- return HRTIMER_RESTART;
+ }
}
void tick_setup_hrtimer_broadcast(void)
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7e8ca4f44..f6aae7977 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
struct clock_event_device *bc = tick_broadcast_device.evtdev;
unsigned long flags;
- int ret;
+ int ret = 0;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
* If we kept the cpu in the broadcast mask,
* tell the caller to leave the per cpu device
* in shutdown state. The periodic interrupt
- * is delivered by the broadcast device.
+ * is delivered by the broadcast device, if
+ * the broadcast device exists and is not
+ * hrtimer based.
*/
- ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
+ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
break;
default:
- /* Nothing to do */
- ret = 0;
break;
}
}
@@ -255,18 +256,32 @@ int tick_receive_broadcast(void)
/*
* Broadcast the event to the cpus, which are set in the mask (mangled).
*/
-static void tick_do_broadcast(struct cpumask *mask)
+static bool tick_do_broadcast(struct cpumask *mask)
{
int cpu = smp_processor_id();
struct tick_device *td;
+ bool local = false;
/*
* Check, if the current cpu is in the mask
*/
if (cpumask_test_cpu(cpu, mask)) {
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
cpumask_clear_cpu(cpu, mask);
- td = &per_cpu(tick_cpu_device, cpu);
- td->evtdev->event_handler(td->evtdev);
+ /*
+ * We only run the local handler, if the broadcast
+ * device is not hrtimer based. Otherwise we run into
+ * a hrtimer recursion.
+ *
+ * local timer_interrupt()
+ * local_handler()
+ * expire_hrtimers()
+ * bc_handler()
+ * local_handler()
+ * expire_hrtimers()
+ */
+ local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER);
}
if (!cpumask_empty(mask)) {
@@ -279,16 +294,17 @@ static void tick_do_broadcast(struct cpumask *mask)
td = &per_cpu(tick_cpu_device, cpumask_first(mask));
td->evtdev->broadcast(mask);
}
+ return local;
}
/*
* Periodic broadcast:
* - invoke the broadcast handlers
*/
-static void tick_do_periodic_broadcast(void)
+static bool tick_do_periodic_broadcast(void)
{
cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
- tick_do_broadcast(tmpmask);
+ return tick_do_broadcast(tmpmask);
}
/*
@@ -296,34 +312,33 @@ static void tick_do_periodic_broadcast(void)
*/
static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
- ktime_t next;
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
- tick_do_periodic_broadcast();
+ /* Handle spurious interrupts gracefully */
+ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
+ raw_spin_unlock(&tick_broadcast_lock);
+ return;
+ }
- /*
- * The device is in periodic mode. No reprogramming necessary:
- */
- if (dev->state == CLOCK_EVT_STATE_PERIODIC)
- goto unlock;
+ bc_local = tick_do_periodic_broadcast();
- /*
- * Setup the next period for devices, which do not have
- * periodic mode. We read dev->next_event first and add to it
- * when the event already expired. clockevents_program_event()
- * sets dev->next_event only when the event is really
- * programmed to the device.
- */
- for (next = dev->next_event; ;) {
- next = ktime_add(next, tick_period);
+ if (clockevent_state_oneshot(dev)) {
+ ktime_t next = ktime_add(dev->next_event, tick_period);
- if (!clockevents_program_event(dev, next, false))
- goto unlock;
- tick_do_periodic_broadcast();
+ clockevents_program_event(dev, next, true);
}
-unlock:
raw_spin_unlock(&tick_broadcast_lock);
+
+ /*
+ * We run the handler of the local cpu after dropping
+ * tick_broadcast_lock because the handler might deadlock when
+ * trying to switch to oneshot mode.
+ */
+ if (bc_local)
+ td->evtdev->event_handler(td->evtdev);
}
/**
@@ -366,8 +381,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
- if (tick_broadcast_device.mode ==
- TICKDEV_MODE_PERIODIC)
+ /*
+ * Only shutdown the cpu local device, if:
+ *
+ * - the broadcast device exists
+ * - the broadcast device is not a hrtimer based one
+ * - the broadcast device is in periodic mode to
+ * avoid a hiccup during switch to oneshot mode
+ */
+ if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
+ tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
}
break;
@@ -386,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
break;
}
- if (cpumask_empty(tick_broadcast_mask)) {
- if (!bc_stopped)
- clockevents_shutdown(bc);
- } else if (bc_stopped) {
- if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- tick_broadcast_start_periodic(bc);
- else
- tick_broadcast_setup_oneshot(bc);
+ if (bc) {
+ if (cpumask_empty(tick_broadcast_mask)) {
+ if (!bc_stopped)
+ clockevents_shutdown(bc);
+ } else if (bc_stopped) {
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ }
}
raw_spin_unlock(&tick_broadcast_lock);
}
@@ -532,23 +557,19 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc,
irq_set_affinity(bc->irq, bc->cpumask);
}
-static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
- ktime_t expires, int force)
+static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+ ktime_t expires)
{
- int ret;
-
- if (bc->state != CLOCK_EVT_STATE_ONESHOT)
- clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+ if (!clockevent_state_oneshot(bc))
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- ret = clockevents_program_event(bc, expires, force);
- if (!ret)
- tick_broadcast_set_affinity(bc, cpumask_of(cpu));
- return ret;
+ clockevents_program_event(bc, expires, 1);
+ tick_broadcast_set_affinity(bc, cpumask_of(cpu));
}
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
- clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
}
/*
@@ -566,7 +587,7 @@ void tick_check_oneshot_broadcast_this_cpu(void)
* switched over, leave the device alone.
*/
if (td->mode == TICKDEV_MODE_ONESHOT) {
- clockevents_set_state(td->evtdev,
+ clockevents_switch_state(td->evtdev,
CLOCK_EVT_STATE_ONESHOT);
}
}
@@ -580,9 +601,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
struct tick_device *td;
ktime_t now, next_event;
int cpu, next_cpu = 0;
+ bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
-again:
dev->next_event.tv64 = KTIME_MAX;
next_event.tv64 = KTIME_MAX;
cpumask_clear(tmpmask);
@@ -624,7 +645,7 @@ again:
/*
* Wakeup the cpus which have an expired event.
*/
- tick_do_broadcast(tmpmask);
+ bc_local = tick_do_broadcast(tmpmask);
/*
* Two reasons for reprogram:
@@ -636,15 +657,15 @@ again:
* - There are pending events on sleeping CPUs which were not
* in the event mask
*/
- if (next_event.tv64 != KTIME_MAX) {
- /*
- * Rearm the broadcast device. If event expired,
- * repeat the above
- */
- if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
- goto again;
- }
+ if (next_event.tv64 != KTIME_MAX)
+ tick_broadcast_set_event(dev, next_cpu, next_event);
+
raw_spin_unlock(&tick_broadcast_lock);
+
+ if (bc_local) {
+ td = this_cpu_ptr(&tick_cpu_device);
+ td->evtdev->event_handler(td->evtdev);
+ }
}
static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
@@ -670,77 +691,88 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
if (dev->next_event.tv64 < bc->next_event.tv64)
return;
}
- clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
}
-/**
- * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
- * @state: The target state (enter/exit)
- *
- * The system enters/leaves a state, where affected devices might stop
- * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
- *
- * Called with interrupts disabled, so clockevents_lock is not
- * required here because the local clock event device cannot go away
- * under us.
- */
-int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
{
struct clock_event_device *bc, *dev;
- struct tick_device *td;
int cpu, ret = 0;
ktime_t now;
/*
- * Periodic mode does not care about the enter/exit of power
- * states
+ * If there is no broadcast device, tell the caller not to go
+ * into deep idle.
*/
- if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- return 0;
+ if (!tick_broadcast_device.evtdev)
+ return -EBUSY;
- /*
- * We are called with preemtion disabled from the depth of the
- * idle code, so we can't be moved away.
- */
- td = this_cpu_ptr(&tick_cpu_device);
- dev = td->evtdev;
-
- if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- return 0;
+ dev = this_cpu_ptr(&tick_cpu_device)->evtdev;
raw_spin_lock(&tick_broadcast_lock);
bc = tick_broadcast_device.evtdev;
cpu = smp_processor_id();
if (state == TICK_BROADCAST_ENTER) {
+ /*
+ * If the current CPU owns the hrtimer broadcast
+ * mechanism, it cannot go deep idle and we do not add
+ * the CPU to the broadcast mask. We don't have to go
+ * through the EXIT path as the local timer is not
+ * shutdown.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret)
+ goto out;
+
+ /*
+ * If the broadcast device is in periodic mode, we
+ * return.
+ */
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+ /* If it is a hrtimer based broadcast, return busy */
+ if (bc->features & CLOCK_EVT_FEAT_HRTIMER)
+ ret = -EBUSY;
+ goto out;
+ }
+
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
+
+ /* Conditionally shut down the local timer. */
broadcast_shutdown_local(bc, dev);
+
/*
* We only reprogram the broadcast timer if we
* did not mark ourself in the force mask and
* if the cpu local event is earlier than the
* broadcast event. If the current CPU is in
* the force mask, then we are going to be
- * woken by the IPI right away.
+ * woken by the IPI right away; we return
+ * busy, so the CPU does not try to go deep
+ * idle.
*/
- if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
- dev->next_event.tv64 < bc->next_event.tv64)
- tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
+ if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) {
+ ret = -EBUSY;
+ } else if (dev->next_event.tv64 < bc->next_event.tv64) {
+ tick_broadcast_set_event(bc, cpu, dev->next_event);
+ /*
+ * In case of hrtimer broadcasts the
+ * programming might have moved the
+ * timer to this cpu. If yes, remove
+ * us from the broadcast mask and
+ * return busy.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret) {
+ cpumask_clear_cpu(cpu,
+ tick_broadcast_oneshot_mask);
+ }
+ }
}
- /*
- * If the current CPU owns the hrtimer broadcast
- * mechanism, it cannot go deep idle and we remove the
- * CPU from the broadcast mask. We don't have to go
- * through the EXIT path as the local timer is not
- * shutdown.
- */
- ret = broadcast_needs_cpu(bc, cpu);
- if (ret)
- cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
/*
* The cpu which was handling the broadcast
* timer marked this cpu in the broadcast
@@ -807,7 +839,6 @@ out:
raw_spin_unlock(&tick_broadcast_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
/*
* Reset the one shot broadcast for a cpu
@@ -842,7 +873,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
- int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
+ int was_periodic = clockevent_state_periodic(bc);
bc->event_handler = tick_handle_oneshot_broadcast;
@@ -858,10 +889,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_oneshot_mask, tmpmask);
if (was_periodic && !cpumask_empty(tmpmask)) {
- clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_init_next_event(tmpmask,
tick_next_period);
- tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
+ tick_broadcast_set_event(bc, cpu, tick_next_period);
} else
bc->next_event.tv64 = KTIME_MAX;
} else {
@@ -949,6 +980,16 @@ bool tick_broadcast_oneshot_available(void)
return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
}
+#else
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ return -EBUSY;
+
+ return 0;
+}
#endif
void __init tick_broadcast_init(void)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 3ae6afa1e..f8bf47571 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -19,6 +19,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
+#include <trace/events/power.h>
#include <asm/irq_regs.h>
@@ -102,7 +103,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
tick_periodic(cpu);
- if (dev->state != CLOCK_EVT_STATE_ONESHOT)
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
+ /*
+ * The cpu might have transitioned to HIGHRES or NOHZ mode via
+ * update_process_times() -> run_local_timers() ->
+ * hrtimer_run_queues().
+ */
+ if (dev->event_handler != tick_handle_periodic)
+ return;
+#endif
+
+ if (!clockevent_state_oneshot(dev))
return;
for (;;) {
/*
@@ -140,7 +151,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
- clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
unsigned long seq;
ktime_t next;
@@ -150,7 +161,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
next = tick_next_period;
} while (read_seqretry(&jiffies_lock, seq));
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
@@ -332,6 +343,28 @@ out_bc:
tick_install_broadcast_device(newdev);
}
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state: The target state (enter/exit)
+ *
+ * The system enters/leaves a state where affected devices might stop.
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+ if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 0;
+
+ return __tick_broadcast_oneshot_control(state);
+}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
+
#ifdef CONFIG_HOTPLUG_CPU
/*
* Transfer the do_timer job away from a dying cpu.
@@ -367,7 +400,7 @@ void tick_shutdown(unsigned int cpu)
* Prevent that the clock events layer tries to call
* the set mode function!
*/
- dev->state = CLOCK_EVT_STATE_DETACHED;
+ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
@@ -440,6 +473,7 @@ void tick_resume(void)
tick_resume_local();
}
+#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
static unsigned int tick_freeze_depth;
@@ -457,10 +491,13 @@ void tick_freeze(void)
raw_spin_lock(&tick_freeze_lock);
tick_freeze_depth++;
- if (tick_freeze_depth == num_online_cpus())
+ if (tick_freeze_depth == num_online_cpus()) {
+ trace_suspend_resume(TPS("timekeeping_freeze"),
+ smp_processor_id(), true);
timekeeping_suspend();
- else
+ } else {
tick_suspend_local();
+ }
raw_spin_unlock(&tick_freeze_lock);
}
@@ -478,15 +515,19 @@ void tick_unfreeze(void)
{
raw_spin_lock(&tick_freeze_lock);
- if (tick_freeze_depth == num_online_cpus())
+ if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
- else
+ trace_suspend_resume(TPS("timekeeping_freeze"),
+ smp_processor_id(), false);
+ } else {
tick_resume_local();
+ }
tick_freeze_depth--;
raw_spin_unlock(&tick_freeze_lock);
}
+#endif /* CONFIG_SUSPEND */
/**
* tick_init - initialize the tick control
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b64fdd805..966a5a6fd 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}
+static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
+{
+ return dev->state_use_accessors;
+}
+
+static inline void clockevent_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ dev->state_use_accessors = state;
+}
+
extern void clockevents_shutdown(struct clock_event_device *dev);
extern void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new);
-extern void clockevents_set_state(struct clock_event_device *dev,
- enum clock_event_state state);
+extern void clockevents_switch_state(struct clock_event_device *dev,
+ enum clock_event_state state);
extern int clockevents_program_event(struct clock_event_device *dev,
ktime_t expires, bool force);
extern void clockevents_handle_noop(struct clock_event_device *dev);
@@ -137,3 +148,19 @@ extern void tick_nohz_init(void);
# else
static inline void tick_nohz_init(void) { }
#endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
+#else
+#define tick_nohz_active (0)
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void timers_update_migration(bool update_nohz);
+#else
+static inline void timers_update_migration(bool update_nohz) { }
+#endif
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 67a64b167..b51344652 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ if (unlikely(expires.tv64 == KTIME_MAX)) {
+ /*
+ * We don't need the clock event device any more, stop it.
+ */
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+ return 0;
+ }
+
+ if (unlikely(clockevent_state_oneshot_stopped(dev))) {
+ /*
+ * We need the clock event again, configure it in ONESHOT mode
+ * before using it.
+ */
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ }
+
return clockevents_program_event(dev, expires, force);
}
@@ -38,7 +54,7 @@ void tick_resume_oneshot(void)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(dev, ktime_get(), true);
}
@@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
ktime_t next_event)
{
newdev->event_handler = handler;
- clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(newdev, next_event, true);
}
@@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
td->mode = TICKDEV_MODE_ONESHOT;
dev->event_handler = handler;
- clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
tick_broadcast_switch_to_oneshot();
return 0;
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 914259128..c792429e9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -399,7 +399,7 @@ void __init tick_nohz_init(void)
* NO HZ enabled ?
*/
static int tick_nohz_enabled __read_mostly = 1;
-int tick_nohz_active __read_mostly;
+unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
@@ -565,156 +565,144 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_cancel(&ts->sched_timer);
+ hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
- unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
- ktime_t last_update, expires, ret = { .tv64 = 0 };
- unsigned long rcu_delta_jiffies;
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- u64 time_delta;
-
- time_delta = timekeeping_max_deferment();
+ u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+ unsigned long seq, basejiff;
+ ktime_t tick;
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqbegin(&jiffies_lock);
- last_update = last_jiffies_update;
- last_jiffies = jiffies;
+ basemono = last_jiffies_update.tv64;
+ basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
+ ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+ if (rcu_needs_cpu(basemono, &next_rcu) ||
arch_needs_cpu() || irq_work_needs_cpu()) {
- next_jiffies = last_jiffies + 1;
- delta_jiffies = 1;
+ next_tick = basemono + TICK_NSEC;
} else {
- /* Get the next timer wheel timer */
- next_jiffies = get_next_timer_interrupt(last_jiffies);
- delta_jiffies = next_jiffies - last_jiffies;
- if (rcu_delta_jiffies < delta_jiffies) {
- next_jiffies = last_jiffies + rcu_delta_jiffies;
- delta_jiffies = rcu_delta_jiffies;
- }
+ /*
+ * Get the next pending timer. If high resolution
+ * timers are enabled this only takes the timer wheel
+ * timers into account. If high resolution timers are
+ * disabled this also looks at the next expiring
+ * hrtimer.
+ */
+ next_tmr = get_next_timer_interrupt(basejiff, basemono);
+ ts->next_timer = next_tmr;
+ /* Take the next rcu event into account */
+ next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
}
/*
- * Do not stop the tick, if we are only one off (or less)
- * or if the cpu is required for RCU:
+ * If the tick is due in the next period, keep it ticking or
+ * restart it properly.
*/
- if (!ts->tick_stopped && delta_jiffies <= 1)
- goto out;
-
- /* Schedule the tick, if we are at least one jiffie off */
- if ((long)delta_jiffies >= 1) {
-
- /*
- * If this cpu is the one which updates jiffies, then
- * give up the assignment and let it be taken by the
- * cpu which runs the tick timer next, which might be
- * this cpu as well. If we don't drop this here the
- * jiffies might be stale and do_timer() never
- * invoked. Keep track of the fact that it was the one
- * which had the do_timer() duty last. If this cpu is
- * the one which had the do_timer() duty last, we
- * limit the sleep time to the timekeeping
- * max_deferement value which we retrieved
- * above. Otherwise we can sleep as long as we want.
- */
- if (cpu == tick_do_timer_cpu) {
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
- ts->do_timer_last = 1;
- } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
- time_delta = KTIME_MAX;
- ts->do_timer_last = 0;
- } else if (!ts->do_timer_last) {
- time_delta = KTIME_MAX;
+ delta = next_tick - basemono;
+ if (delta <= (u64)TICK_NSEC) {
+ tick.tv64 = 0;
+ if (!ts->tick_stopped)
+ goto out;
+ if (delta == 0) {
+ /* Tick is stopped, but required now. Enforce it */
+ tick_nohz_restart(ts, now);
+ goto out;
}
+ }
+
+ /*
+ * If this cpu is the one which updates jiffies, then give up
+ * the assignment and let it be taken by the cpu which runs
+ * the tick timer next, which might be this cpu as well. If we
+ * don't drop this here the jiffies might be stale and
+ * do_timer() never invoked. Keep track of the fact that it
+ * was the one which had the do_timer() duty last. If this cpu
+ * is the one which had the do_timer() duty last, we limit the
+ * sleep time to the timekeeping max_deferment value.
+ * Otherwise we can sleep as long as we want.
+ */
+ delta = timekeeping_max_deferment();
+ if (cpu == tick_do_timer_cpu) {
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ ts->do_timer_last = 1;
+ } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+ delta = KTIME_MAX;
+ ts->do_timer_last = 0;
+ } else if (!ts->do_timer_last) {
+ delta = KTIME_MAX;
+ }
#ifdef CONFIG_NO_HZ_FULL
- if (!ts->inidle) {
- time_delta = min(time_delta,
- scheduler_tick_max_deferment());
- }
+ /* Limit the tick delta to the maximum scheduler deferment */
+ if (!ts->inidle)
+ delta = min(delta, scheduler_tick_max_deferment());
#endif
- /*
- * calculate the expiry time for the next timer wheel
- * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
- * that there is no timer pending or at least extremely
- * far into the future (12 days for HZ=1000). In this
- * case we set the expiry to the end of time.
- */
- if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
- /*
- * Calculate the time delta for the next timer event.
- * If the time delta exceeds the maximum time delta
- * permitted by the current clocksource then adjust
- * the time delta accordingly to ensure the
- * clocksource does not wrap.
- */
- time_delta = min_t(u64, time_delta,
- tick_period.tv64 * delta_jiffies);
- }
-
- if (time_delta < KTIME_MAX)
- expires = ktime_add_ns(last_update, time_delta);
- else
- expires.tv64 = KTIME_MAX;
-
- /* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
- goto out;
+ /* Calculate the next expiry time */
+ if (delta < (KTIME_MAX - basemono))
+ expires = basemono + delta;
+ else
+ expires = KTIME_MAX;
- ret = expires;
+ expires = min_t(u64, expires, next_tick);
+ tick.tv64 = expires;
- /*
- * nohz_stop_sched_tick can be called several times before
- * the nohz_restart_sched_tick is called. This happens when
- * interrupts arrive which do not cause a reschedule. In the
- * first call we save the current tick time, so we can restart
- * the scheduler tick in nohz_restart_sched_tick.
- */
- if (!ts->tick_stopped) {
- nohz_balance_enter_idle(cpu);
- calc_load_enter_idle();
+ /* Skip reprogramming the event if it has not changed */
+ if (ts->tick_stopped && (expires == dev->next_event.tv64))
+ goto out;
- ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
- ts->tick_stopped = 1;
- trace_tick_stop(1, " ");
- }
+ /*
+ * nohz_stop_sched_tick can be called several times before
+ * the nohz_restart_sched_tick is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the
+ * first call we save the current tick time, so we can restart
+ * the scheduler tick in nohz_restart_sched_tick.
+ */
+ if (!ts->tick_stopped) {
+ nohz_balance_enter_idle(cpu);
+ calc_load_enter_idle();
- /*
- * If the expiration time == KTIME_MAX, then
- * in this case we simply stop the tick timer.
- */
- if (unlikely(expires.tv64 == KTIME_MAX)) {
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
- hrtimer_cancel(&ts->sched_timer);
- goto out;
- }
+ ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->tick_stopped = 1;
+ trace_tick_stop(1, " ");
+ }
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, expires,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- goto out;
- } else if (!tick_program_event(expires, 0))
- goto out;
- /*
- * We are past the event already. So we crossed a
- * jiffie boundary. Update jiffies and raise the
- * softirq.
- */
- tick_do_update_jiffies64(ktime_get());
+ /*
+ * If the expiration time == KTIME_MAX, then we simply stop
+ * the tick timer.
+ */
+ if (unlikely(expires == KTIME_MAX)) {
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_cancel(&ts->sched_timer);
+ goto out;
}
- raise_softirq_irqoff(TIMER_SOFTIRQ);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(tick, 1);
out:
- ts->next_jiffies = next_jiffies;
- ts->last_jiffies = last_jiffies;
+ /* Update the estimated sleep length */
ts->sleep_length = ktime_sub(dev->next_event, now);
-
- return ret;
+ return tick;
}
static void tick_nohz_full_stop_tick(struct tick_sched *ts)
@@ -876,32 +864,6 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
-static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
-{
- hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
-
- while (1) {
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, tick_period);
-
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- break;
- } else {
- if (!tick_program_event(
- hrtimer_get_expires(&ts->sched_timer), 0))
- break;
- }
- /* Reread time and update jiffies */
- now = ktime_get();
- tick_do_update_jiffies64(now);
- }
-}
-
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
@@ -972,12 +934,6 @@ void tick_nohz_idle_exit(void)
local_irq_enable();
}
-static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
-{
- hrtimer_forward(&ts->sched_timer, now, tick_period);
- return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
-}
-
/*
* The nohz low res interrupt handler
*/
@@ -996,10 +952,18 @@ static void tick_nohz_handler(struct clock_event_device *dev)
if (unlikely(ts->tick_stopped))
return;
- while (tick_nohz_reprogram(ts, now)) {
- now = ktime_get();
- tick_do_update_jiffies64(now);
- }
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+{
+ if (!tick_nohz_enabled)
+ return;
+ ts->nohz_mode = mode;
+ /* One update is enough */
+ if (!test_and_set_bit(0, &tick_nohz_active))
+ timers_update_migration(true);
}
/**
@@ -1013,13 +977,8 @@ static void tick_nohz_switch_to_nohz(void)
if (!tick_nohz_enabled)
return;
- local_irq_disable();
- if (tick_switch_to_oneshot(tick_nohz_handler)) {
- local_irq_enable();
+ if (tick_switch_to_oneshot(tick_nohz_handler))
return;
- }
- tick_nohz_active = 1;
- ts->nohz_mode = NOHZ_MODE_LOWRES;
/*
* Recycle the hrtimer in ts, so we can share the
@@ -1029,13 +988,10 @@ static void tick_nohz_switch_to_nohz(void)
/* Get the next period */
next = tick_init_jiffy_update();
- for (;;) {
- hrtimer_set_expires(&ts->sched_timer, next);
- if (!tick_program_event(next, 0))
- break;
- next = ktime_add(next, tick_period);
- }
- local_irq_enable();
+ hrtimer_forward_now(&ts->sched_timer, tick_period);
+ hrtimer_set_expires(&ts->sched_timer, next);
+ tick_program_event(next, 1);
+ tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
/*
@@ -1087,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void)
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -1167,22 +1124,9 @@ void tick_setup_sched_timer(void)
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
- for (;;) {
- hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- break;
- now = ktime_get();
- }
-
-#ifdef CONFIG_NO_HZ_COMMON
- if (tick_nohz_enabled) {
- ts->nohz_mode = NOHZ_MODE_HIGHRES;
- tick_nohz_active = 1;
- }
-#endif
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */
@@ -1227,7 +1171,7 @@ void tick_oneshot_notify(void)
* Called cyclic from the hrtimer softirq (driven by the timer
* softirq) allow_nohz signals, that we can switch into low-res nohz
* mode, because high resolution timers are disabled (either compile
- * or runtime).
+ * or runtime). Called with interrupts disabled.
*/
int tick_check_oneshot_change(int allow_nohz)
{
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 28b5da3e1..a4a8d4e9b 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -57,7 +57,7 @@ struct tick_sched {
ktime_t iowait_sleeptime;
ktime_t sleep_length;
unsigned long last_jiffies;
- unsigned long next_jiffies;
+ u64 next_timer;
ktime_t idle_expires;
int do_timer_last;
};
@@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu);
static inline void tick_cancel_sched_timer(int cpu) { }
#endif
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int
+__tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ return -EBUSY;
+}
+#endif
+
#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2c85b7724..85d5bb1d6 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -41,7 +41,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
-#include "timeconst.h"
+#include <generated/timeconst.h>
#include "timekeeping.h"
/*
@@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
return error;
if (tz) {
+ /* Verify we're within the +-15 hrs range */
+ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
+ return -EINVAL;
+
sys_tz = *tz;
update_vsyscall_tz();
if (firsttime) {
@@ -483,9 +487,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timespec64);
#endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m: time in milliseconds
+ *
+ * conversion is done as follows:
*
* - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
*
@@ -493,66 +499,36 @@ EXPORT_SYMBOL(ns_to_timespec64);
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
*
* - all other values are converted to jiffies by either multiplying
- * the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ * the input value by a factor or dividing it with a factor and
+ * handling any 32-bit overflows.
+ * for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * routines found in include/linux/jiffies.h
*/
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
{
/*
* Negative value, means infinite timeout:
*/
if ((int)m < 0)
return MAX_JIFFY_OFFSET;
-
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- /*
- * HZ is equal to or smaller than 1000, and 1000 is a nice
- * round multiple of HZ, divide with the factor between them,
- * but round upwards:
- */
- return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
- /*
- * HZ is larger than 1000, and HZ is a nice round multiple of
- * 1000 - simply multiply with the factor between them.
- *
- * But first make sure the multiplication result cannot
- * overflow:
- */
- if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
-
- return m * (HZ / MSEC_PER_SEC);
-#else
- /*
- * Generic case - multiply, round and divide. But first
- * check that if we are doing a net multiplication, that
- * we wouldn't overflow:
- */
- if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
-
- return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
- >> MSEC_TO_HZ_SHR32;
-#endif
+ return _msecs_to_jiffies(m);
}
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
-unsigned long usecs_to_jiffies(const unsigned int u)
+unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
- return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return u * (HZ / USEC_PER_SEC);
-#else
- return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
- >> USEC_TO_HZ_SHR32;
-#endif
+ return _usecs_to_jiffies(u);
}
-EXPORT_SYMBOL(usecs_to_jiffies);
+EXPORT_SYMBOL(__usecs_to_jiffies);
/*
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2ca..c7388dee8 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -50,7 +50,7 @@ define timeconst(hz) {
print "#include <linux/types.h>\n\n"
print "#if HZ != ", hz, "\n"
- print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n"
print "#endif\n\n"
if (hz < 2) {
@@ -105,4 +105,5 @@ define timeconst(hz) {
halt
}
+hz = read();
timeconst(hz)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 946acb721..bca3667a2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-/*
- * These simple flag variables are managed
- * without locks, which is racy, but ok since
- * we don't really care about being super
- * precise about how many events were seen,
- * just that a problem was observed.
- */
-static int timekeeping_underflow_seen;
-static int timekeeping_overflow_seen;
-
-/* last_warning is only modified under the timekeeping lock */
-static long timekeeping_last_warning;
static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
{
@@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
}
}
- if (timekeeping_underflow_seen) {
- if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ if (tk->underflow_seen) {
+ if (jiffies - tk->last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
- timekeeping_last_warning = jiffies;
+ tk->last_warning = jiffies;
}
- timekeeping_underflow_seen = 0;
+ tk->underflow_seen = 0;
}
- if (timekeeping_overflow_seen) {
- if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ if (tk->overflow_seen) {
+ if (jiffies - tk->last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
- timekeeping_last_warning = jiffies;
+ tk->last_warning = jiffies;
}
- timekeeping_overflow_seen = 0;
+ tk->overflow_seen = 0;
}
}
static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
{
+ struct timekeeper *tk = &tk_core.timekeeper;
cycle_t now, last, mask, max, delta;
unsigned int seq;
@@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
* mask-relative negative values.
*/
if (unlikely((~delta & mask) < (mask >> 3))) {
- timekeeping_underflow_seen = 1;
+ tk->underflow_seen = 1;
delta = 0;
}
/* Cap delta value to the max_cycles values to avoid mult overflows */
if (unlikely(delta > max)) {
- timekeeping_overflow_seen = 1;
+ tk->overflow_seen = 1;
delta = tkr->clock->max_cycles;
}
@@ -330,32 +319,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
*
- * So we handle this differently than the other timekeeping accessor
- * functions which retry when the sequence count has changed. The
- * update side does:
- *
- * smp_wmb(); <- Ensure that the last base[1] update is visible
- * tkf->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[0], tkr);
- * smp_wmb(); <- Ensure that the base[0] update is visible
- * tkf->seq++;
- * smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[1], tkr);
- *
- * The reader side does:
- *
- * do {
- * seq = tkf->seq;
- * smp_rmb();
- * idx = seq & 0x01;
- * now = now(tkf->base[idx]);
- * smp_rmb();
- * } while (seq != tkf->seq)
- *
- * As long as we update base[0] readers are forced off to
- * base[1]. Once base[0] is updated readers are redirected to base[0]
- * and the base[1] update takes place.
+ * Employ the latch technique; see @raw_write_seqcount_latch.
*
* So if a NMI hits the update of base[0] then it will use base[1]
* which is still consistent. In the worst case this can result is a
@@ -418,7 +382,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
u64 now;
do {
- seq = raw_read_seqcount(&tkf->seq);
+ seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
} while (read_seqcount_retry(&tkf->seq, seq));
@@ -551,6 +515,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
/*
+ * tk_update_leap_state - helper to update the next_leap_ktime
+ */
+static inline void tk_update_leap_state(struct timekeeper *tk)
+{
+ tk->next_leap_ktime = ntp_get_next_leap();
+ if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+ /* Convert to monotonic time */
+ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
+}
+
+/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
@@ -591,17 +566,25 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
ntp_clear();
}
+ tk_update_leap_state(tk);
tk_update_ktime_data(tk);
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+
+ if (action & TK_CLOCK_WAS_SET)
+ tk->clock_was_set_seq++;
+ /*
+ * The mirroring of the data to the shadow-timekeeper needs
+ * to happen last here to ensure we don't over-write the
+ * timekeeper structure on the next update with stale data
+ */
if (action & TK_MIRROR)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
-
- update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
- update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
}
/**
@@ -699,6 +682,23 @@ ktime_t ktime_get(void)
}
EXPORT_SYMBOL_GPL(ktime_get);
+u32 ktime_get_resolution_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ u32 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return nsecs;
+}
+EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
+
static ktime_t *offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
[TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
@@ -1179,28 +1179,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64)
}
/**
- * read_boot_clock - Return time of the system start.
+ * read_boot_clock64 - Return time of the system start.
*
* Weak dummy function for arches that do not yet support it.
* Function to read the exact time the system has been started.
- * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-void __weak read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock64(struct timespec64 *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
-void __weak read_boot_clock64(struct timespec64 *ts64)
-{
- struct timespec ts;
-
- read_boot_clock(&ts);
- *ts64 = timespec_to_timespec64(ts);
-}
-
/* Flag for if timekeeping_resume() has injected sleeptime */
static bool sleeptime_injected;
@@ -1836,8 +1828,9 @@ void update_wall_time(void)
* memcpy under the tk_core.seq against one before we start
* updating.
*/
+ timekeeping_update(tk, clock_set);
memcpy(real_tk, tk, sizeof(*tk));
- timekeeping_update(real_tk, clock_set);
+ /* The memcpy must come last. Do not put anything here! */
write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1926,47 +1919,20 @@ void do_timer(unsigned long ticks)
}
/**
- * ktime_get_update_offsets_tick - hrtimer helper
- * @offs_real: pointer to storage for monotonic -> realtime offset
- * @offs_boot: pointer to storage for monotonic -> boottime offset
- * @offs_tai: pointer to storage for monotonic -> clock tai offset
- *
- * Returns monotonic time at last tick and various offsets
- */
-ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
- ktime_t *offs_tai)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- unsigned int seq;
- ktime_t base;
- u64 nsecs;
-
- do {
- seq = read_seqcount_begin(&tk_core.seq);
-
- base = tk->tkr_mono.base;
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
-
- *offs_real = tk->offs_real;
- *offs_boot = tk->offs_boot;
- *offs_tai = tk->offs_tai;
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- return ktime_add_ns(base, nsecs);
-}
-
-#ifdef CONFIG_HIGH_RES_TIMERS
-/**
* ktime_get_update_offsets_now - hrtimer helper
+ * @cwsseq: pointer to check and store the clock was set sequence number
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
* @offs_tai: pointer to storage for monotonic -> clock tai offset
*
- * Returns current monotonic time and updates the offsets
+ * Returns current monotonic time and updates the offsets if the
+ * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
+ * different.
+ *
* Called from hrtimer_interrupt() or retrigger_next_event()
*/
-ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
- ktime_t *offs_tai)
+ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
+ ktime_t *offs_boot, ktime_t *offs_tai)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
@@ -1978,15 +1944,23 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
base = tk->tkr_mono.base;
nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ base = ktime_add_ns(base, nsecs);
+
+ if (*cwsseq != tk->clock_was_set_seq) {
+ *cwsseq = tk->clock_was_set_seq;
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ }
+
+ /* Handle leapsecond insertion adjustments */
+ if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+ *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
- *offs_real = tk->offs_real;
- *offs_boot = tk->offs_boot;
- *offs_tai = tk->offs_tai;
} while (read_seqcount_retry(&tk_core.seq, seq));
- return ktime_add_ns(base, nsecs);
+ return base;
}
-#endif
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
@@ -2027,6 +2001,8 @@ int do_adjtimex(struct timex *txc)
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
}
+ tk_update_leap_state(tk);
+
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index ead8794b9..704f595ce 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,19 +3,16 @@
/*
* Internal interfaces for kernel/time/
*/
-extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
- ktime_t *offs_boot,
- ktime_t *offs_tai);
-extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
- ktime_t *offs_boot,
- ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
+ ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern int timekeeping_inject_offset(struct timespec *ts);
extern s32 timekeeping_get_tai_offset(void);
extern void timekeeping_set_tai_offset(s32 tai_offset);
-extern void timekeeping_clocktai(struct timespec *ts);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2ece3aa50..84190f02b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
#include <asm/timex.h>
#include <asm/io.h>
+#include "tick-internal.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -68,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64);
#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
struct tvec {
- struct list_head vec[TVN_SIZE];
+ struct hlist_head vec[TVN_SIZE];
};
struct tvec_root {
- struct list_head vec[TVR_SIZE];
+ struct hlist_head vec[TVR_SIZE];
};
struct tvec_base {
@@ -83,6 +85,8 @@ struct tvec_base {
unsigned long active_timers;
unsigned long all_timers;
int cpu;
+ bool migration_enabled;
+ bool nohz_active;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -90,43 +94,60 @@ struct tvec_base {
struct tvec tv5;
} ____cacheline_aligned;
-/*
- * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
- * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
- * pointer to per-cpu entries because we don't know where we'll map the section,
- * even for the boot cpu.
- *
- * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
- * rest of them.
- */
-struct tvec_base boot_tvec_bases;
-EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
-/* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
+void timers_update_migration(bool update_nohz)
{
- return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
+ bool on = sysctl_timer_migration && tick_nohz_active;
+ unsigned int cpu;
+
+ /* Avoid the loop, if nothing to update */
+ if (this_cpu_read(tvec_bases.migration_enabled) == on)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(tvec_bases.migration_enabled, cpu) = on;
+ per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+ if (!update_nohz)
+ continue;
+ per_cpu(tvec_bases.nohz_active, cpu) = true;
+ per_cpu(hrtimer_bases.nohz_active, cpu) = true;
+ }
}
-static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
+int timer_migration_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
{
- return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write)
+ timers_update_migration(false);
+ mutex_unlock(&mutex);
+ return ret;
}
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+ int pinned)
{
- return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
+ if (pinned || !base->migration_enabled)
+ return this_cpu_ptr(&tvec_bases);
+ return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
}
-
-static inline void
-timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
+#else
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+ int pinned)
{
- unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
-
- timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
+ return this_cpu_ptr(&tvec_bases);
}
+#endif
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
@@ -349,26 +370,12 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
}
EXPORT_SYMBOL_GPL(set_timer_slack);
-/*
- * If the list is empty, catch up ->timer_jiffies to the current time.
- * The caller must hold the tvec_base lock. Returns true if the list
- * was empty and therefore ->timer_jiffies was updated.
- */
-static bool catchup_timer_jiffies(struct tvec_base *base)
-{
- if (!base->all_timers) {
- base->timer_jiffies = jiffies;
- return true;
- }
- return false;
-}
-
static void
__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
unsigned long idx = expires - base->timer_jiffies;
- struct list_head *vec;
+ struct hlist_head *vec;
if (idx < TVR_SIZE) {
int i = expires & TVR_MASK;
@@ -401,25 +408,25 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
}
- /*
- * Timers are FIFO:
- */
- list_add_tail(&timer->entry, vec);
+
+ hlist_add_head(&timer->entry, vec);
}
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
- (void)catchup_timer_jiffies(base);
+ /* Advance base->jiffies, if the base is empty */
+ if (!base->all_timers++)
+ base->timer_jiffies = jiffies;
+
__internal_add_timer(base, timer);
/*
* Update base->active_timers and base->next_timer
*/
- if (!tbase_get_deferrable(timer->base)) {
+ if (!(timer->flags & TIMER_DEFERRABLE)) {
if (!base->active_timers++ ||
time_before(timer->expires, base->next_timer))
base->next_timer = timer->expires;
}
- base->all_timers++;
/*
* Check whether the other CPU is in dynticks mode and needs
@@ -434,8 +441,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
* require special care against races with idle_cpu(), lets deal
* with that later.
*/
- if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
+ if (base->nohz_active) {
+ if (!(timer->flags & TIMER_DEFERRABLE) ||
+ tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
+ }
}
#ifdef CONFIG_TIMER_STATS
@@ -451,15 +461,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
static void timer_stats_account_timer(struct timer_list *timer)
{
- unsigned int flag = 0;
-
if (likely(!timer->start_site))
return;
- if (unlikely(tbase_get_deferrable(timer->base)))
- flag |= TIMER_STATS_FLAG_DEFERRABLE;
timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
- timer->function, timer->start_comm, flag);
+ timer->function, timer->start_comm,
+ timer->flags);
}
#else
@@ -516,8 +523,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
* statically initialized. We just make sure that it
* is tracked in the object tracker.
*/
- if (timer->entry.next == NULL &&
- timer->entry.prev == TIMER_ENTRY_STATIC) {
+ if (timer->entry.pprev == NULL &&
+ timer->entry.next == TIMER_ENTRY_STATIC) {
debug_object_init(timer, &timer_debug_descr);
debug_object_activate(timer, &timer_debug_descr);
return 0;
@@ -563,7 +570,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
- if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+ if (timer->entry.next == TIMER_ENTRY_STATIC) {
/*
* This is not really a fixup. The timer was
* statically initialized. We just make sure that it
@@ -648,7 +655,7 @@ static inline void
debug_activate(struct timer_list *timer, unsigned long expires)
{
debug_timer_activate(timer);
- trace_timer_start(timer, expires);
+ trace_timer_start(timer, expires, timer->flags);
}
static inline void debug_deactivate(struct timer_list *timer)
@@ -665,10 +672,8 @@ static inline void debug_assert_init(struct timer_list *timer)
static void do_init_timer(struct timer_list *timer, unsigned int flags,
const char *name, struct lock_class_key *key)
{
- struct tvec_base *base = raw_cpu_read(tvec_bases);
-
- timer->entry.next = NULL;
- timer->base = (void *)((unsigned long)base | flags);
+ timer->entry.pprev = NULL;
+ timer->flags = flags | raw_smp_processor_id();
timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
@@ -699,24 +704,23 @@ EXPORT_SYMBOL(init_timer_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
- struct list_head *entry = &timer->entry;
+ struct hlist_node *entry = &timer->entry;
debug_deactivate(timer);
- __list_del(entry->prev, entry->next);
+ __hlist_del(entry);
if (clear_pending)
- entry->next = NULL;
- entry->prev = LIST_POISON2;
+ entry->pprev = NULL;
+ entry->next = LIST_POISON2;
}
static inline void
detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
{
detach_timer(timer, true);
- if (!tbase_get_deferrable(timer->base))
+ if (!(timer->flags & TIMER_DEFERRABLE))
base->active_timers--;
base->all_timers--;
- (void)catchup_timer_jiffies(base);
}
static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -726,13 +730,14 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
return 0;
detach_timer(timer, clear_pending);
- if (!tbase_get_deferrable(timer->base)) {
+ if (!(timer->flags & TIMER_DEFERRABLE)) {
base->active_timers--;
if (timer->expires == base->next_timer)
base->next_timer = base->timer_jiffies;
}
- base->all_timers--;
- (void)catchup_timer_jiffies(base);
+ /* If this was the last timer, advance base->jiffies */
+ if (!--base->all_timers)
+ base->timer_jiffies = jiffies;
return 1;
}
@@ -744,24 +749,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
* So __run_timers/migrate_timers can safely modify all timers which could
* be found on ->tvX lists.
*
- * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * When the timer's base is locked and removed from the list, the
+ * TIMER_MIGRATING flag is set, FIXME
*/
static struct tvec_base *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
__acquires(timer->base->lock)
{
- struct tvec_base *base;
-
for (;;) {
- struct tvec_base *prelock_base = timer->base;
- base = tbase_get_base(prelock_base);
- if (likely(base != NULL)) {
+ u32 tf = timer->flags;
+ struct tvec_base *base;
+
+ if (!(tf & TIMER_MIGRATING)) {
+ base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
spin_lock_irqsave(&base->lock, *flags);
- if (likely(prelock_base == timer->base))
+ if (timer->flags == tf)
return base;
- /* The timer has migrated to another CPU */
spin_unlock_irqrestore(&base->lock, *flags);
}
cpu_relax();
@@ -770,11 +773,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires,
- bool pending_only, int pinned)
+ bool pending_only, int pinned)
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret = 0 , cpu;
+ int ret = 0;
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@@ -787,8 +790,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
debug_activate(timer, expires);
- cpu = get_nohz_timer_target(pinned);
- new_base = per_cpu(tvec_bases, cpu);
+ new_base = get_target_base(base, pinned);
if (base != new_base) {
/*
@@ -800,11 +802,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
- timer_set_base(timer, NULL);
+ timer->flags |= TIMER_MIGRATING;
+
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
- timer_set_base(timer, base);
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | base->cpu);
}
}
@@ -966,13 +970,13 @@ EXPORT_SYMBOL(add_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
- struct tvec_base *base = per_cpu(tvec_bases, cpu);
+ struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
unsigned long flags;
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
- timer_set_base(timer, base);
+ timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
@@ -1037,8 +1041,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
-
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1093,7 +1095,7 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
+ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@@ -1107,17 +1109,17 @@ EXPORT_SYMBOL(del_timer_sync);
static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
/* cascade all the timers from tv up one level */
- struct timer_list *timer, *tmp;
- struct list_head tv_list;
+ struct timer_list *timer;
+ struct hlist_node *tmp;
+ struct hlist_head tv_list;
- list_replace_init(tv->vec + index, &tv_list);
+ hlist_move_list(tv->vec + index, &tv_list);
/*
* We are removing _all_ timers from the list, so we
* don't have to detach them individually.
*/
- list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- BUG_ON(tbase_get_base(timer->base) != base);
+ hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
/* No accounting, while moving them */
__internal_add_timer(base, timer);
}
@@ -1182,14 +1184,18 @@ static inline void __run_timers(struct tvec_base *base)
struct timer_list *timer;
spin_lock_irq(&base->lock);
- if (catchup_timer_jiffies(base)) {
- spin_unlock_irq(&base->lock);
- return;
- }
+
while (time_after_eq(jiffies, base->timer_jiffies)) {
- struct list_head work_list;
- struct list_head *head = &work_list;
- int index = base->timer_jiffies & TVR_MASK;
+ struct hlist_head work_list;
+ struct hlist_head *head = &work_list;
+ int index;
+
+ if (!base->all_timers) {
+ base->timer_jiffies = jiffies;
+ break;
+ }
+
+ index = base->timer_jiffies & TVR_MASK;
/*
* Cascade timers:
@@ -1200,16 +1206,16 @@ static inline void __run_timers(struct tvec_base *base)
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
- list_replace_init(base->tv1.vec + index, head);
- while (!list_empty(head)) {
+ hlist_move_list(base->tv1.vec + index, head);
+ while (!hlist_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
bool irqsafe;
- timer = list_first_entry(head, struct timer_list,entry);
+ timer = hlist_entry(head->first, struct timer_list, entry);
fn = timer->function;
data = timer->data;
- irqsafe = tbase_get_irqsafe(timer->base);
+ irqsafe = timer->flags & TIMER_IRQSAFE;
timer_stats_account_timer(timer);
@@ -1248,8 +1254,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
/* Look for timer events in tv1. */
index = slot = timer_jiffies & TVR_MASK;
do {
- list_for_each_entry(nte, base->tv1.vec + slot, entry) {
- if (tbase_get_deferrable(nte->base))
+ hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (nte->flags & TIMER_DEFERRABLE)
continue;
found = 1;
@@ -1279,8 +1285,8 @@ cascade:
index = slot = timer_jiffies & TVN_MASK;
do {
- list_for_each_entry(nte, varp->vec + slot, entry) {
- if (tbase_get_deferrable(nte->base))
+ hlist_for_each_entry(nte, varp->vec + slot, entry) {
+ if (nte->flags & TIMER_DEFERRABLE)
continue;
found = 1;
@@ -1311,54 +1317,48 @@ cascade:
* Check, if the next hrtimer event is before the next timer wheel
* event:
*/
-static unsigned long cmp_next_hrtimer_event(unsigned long now,
- unsigned long expires)
+static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
- ktime_t hr_delta = hrtimer_get_next_event();
- struct timespec tsdelta;
- unsigned long delta;
-
- if (hr_delta.tv64 == KTIME_MAX)
- return expires;
+ u64 nextevt = hrtimer_get_next_event();
/*
- * Expired timer available, let it expire in the next tick
+ * If high resolution timers are enabled
+ * hrtimer_get_next_event() returns KTIME_MAX.
*/
- if (hr_delta.tv64 <= 0)
- return now + 1;
-
- tsdelta = ktime_to_timespec(hr_delta);
- delta = timespec_to_jiffies(&tsdelta);
+ if (expires <= nextevt)
+ return expires;
/*
- * Limit the delta to the max value, which is checked in
- * tick_nohz_stop_sched_tick():
+ * If the next timer is already expired, return the tick base
+ * time so the tick is fired immediately.
*/
- if (delta > NEXT_TIMER_MAX_DELTA)
- delta = NEXT_TIMER_MAX_DELTA;
+ if (nextevt <= basem)
+ return basem;
/*
- * Take rounding errors in to account and make sure, that it
- * expires in the next tick. Otherwise we go into an endless
- * ping pong due to tick_nohz_stop_sched_tick() retriggering
- * the timer softirq
+ * Round up to the next jiffie. High resolution timers are
+ * off, so the hrtimers are expired in the tick and we need to
+ * make sure that this tick really expires the timer to avoid
+ * a ping pong of the nohz stop code.
+ *
+ * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
*/
- if (delta < 1)
- delta = 1;
- now += delta;
- if (time_before(now, expires))
- return now;
- return expires;
+ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
/**
- * get_next_timer_interrupt - return the jiffy of the next pending timer
- * @now: current time (in jiffies)
+ * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * @basej: base time jiffies
+ * @basem: base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending
+ * timer or KTIME_MAX if no timer is pending.
*/
-unsigned long get_next_timer_interrupt(unsigned long now)
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
- struct tvec_base *base = __this_cpu_read(tvec_bases);
- unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+ struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+ u64 expires = KTIME_MAX;
+ unsigned long nextevt;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1371,14 +1371,15 @@ unsigned long get_next_timer_interrupt(unsigned long now)
if (base->active_timers) {
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
- expires = base->next_timer;
+ nextevt = base->next_timer;
+ if (time_before_eq(nextevt, basej))
+ expires = basem;
+ else
+ expires = basem + (nextevt - basej) * TICK_NSEC;
}
spin_unlock(&base->lock);
- if (time_before_eq(expires, now))
- return now;
-
- return cmp_next_hrtimer_event(now, expires);
+ return cmp_next_hrtimer_event(basem, expires);
}
#endif
@@ -1407,9 +1408,7 @@ void update_process_times(int user_tick)
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = __this_cpu_read(tvec_bases);
-
- hrtimer_run_pending();
+ struct tvec_base *base = this_cpu_ptr(&tvec_bases);
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
@@ -1545,15 +1544,16 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
{
struct timer_list *timer;
+ int cpu = new_base->cpu;
- while (!list_empty(head)) {
- timer = list_first_entry(head, struct timer_list, entry);
+ while (!hlist_empty(head)) {
+ timer = hlist_entry(head->first, struct timer_list, entry);
/* We ignore the accounting on the dying cpu */
detach_timer(timer, false);
- timer_set_base(timer, new_base);
+ timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
@@ -1565,8 +1565,8 @@ static void migrate_timers(int cpu)
int i;
BUG_ON(cpu_online(cpu));
- old_base = per_cpu(tvec_bases, cpu);
- new_base = get_cpu_var(tvec_bases);
+ old_base = per_cpu_ptr(&tvec_bases, cpu);
+ new_base = get_cpu_ptr(&tvec_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
@@ -1590,7 +1590,7 @@ static void migrate_timers(int cpu)
spin_unlock(&old_base->lock);
spin_unlock_irq(&new_base->lock);
- put_cpu_var(tvec_bases);
+ put_cpu_ptr(&tvec_bases);
}
static int timer_cpu_notify(struct notifier_block *self,
@@ -1616,52 +1616,27 @@ static inline void timer_register_cpu_notifier(void)
static inline void timer_register_cpu_notifier(void) { }
#endif /* CONFIG_HOTPLUG_CPU */
-static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+static void __init init_timer_cpu(int cpu)
{
- int j;
-
- BUG_ON(base != tbase_get_base(base));
+ struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
base->cpu = cpu;
- per_cpu(tvec_bases, cpu) = base;
spin_lock_init(&base->lock);
- for (j = 0; j < TVN_SIZE; j++) {
- INIT_LIST_HEAD(base->tv5.vec + j);
- INIT_LIST_HEAD(base->tv4.vec + j);
- INIT_LIST_HEAD(base->tv3.vec + j);
- INIT_LIST_HEAD(base->tv2.vec + j);
- }
- for (j = 0; j < TVR_SIZE; j++)
- INIT_LIST_HEAD(base->tv1.vec + j);
-
base->timer_jiffies = jiffies;
base->next_timer = base->timer_jiffies;
}
static void __init init_timer_cpus(void)
{
- struct tvec_base *base;
- int local_cpu = smp_processor_id();
int cpu;
- for_each_possible_cpu(cpu) {
- if (cpu == local_cpu)
- base = &boot_tvec_bases;
-#ifdef CONFIG_SMP
- else
- base = per_cpu_ptr(&__tvec_bases, cpu);
-#endif
-
- init_timer_cpu(base, cpu);
- }
+ for_each_possible_cpu(cpu)
+ init_timer_cpu(cpu);
}
void __init init_timers(void)
{
- /* ensure there are enough low bits for flags in timer->base pointer */
- BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
-
init_timer_cpus();
init_timer_stats();
timer_register_cpu_notifier();
@@ -1697,14 +1672,14 @@ unsigned long msleep_interruptible(unsigned int msecs)
EXPORT_SYMBOL(msleep_interruptible);
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
+static void __sched do_usleep_range(unsigned long min, unsigned long max)
{
ktime_t kmin;
unsigned long delta;
kmin = ktime_set(0, min * NSEC_PER_USEC);
delta = (max - min) * NSEC_PER_USEC;
- return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+ schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
/**
@@ -1712,7 +1687,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max)
* @min: Minimum time in usecs to sleep
* @max: Maximum time in usecs to sleep
*/
-void usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range(unsigned long min, unsigned long max)
{
__set_current_state(TASK_UNINTERRUPTIBLE);
do_usleep_range(min, max);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index e878c2e0b..a4536e1e3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -29,19 +29,24 @@ struct timer_list_iter {
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
-DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-
/*
* This allows printing both to /proc/timer_list and
* to the console (on SysRq-Q):
*/
-#define SEQ_printf(m, x...) \
- do { \
- if (m) \
- seq_printf(m, x); \
- else \
- printk(x); \
- } while (0)
+__printf(2, 3)
+static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+
+ if (m)
+ seq_vprintf(m, fmt, args);
+ else
+ vprintk(fmt, args);
+
+ va_end(args);
+}
static void print_name_offset(struct seq_file *m, void *sym)
{
@@ -120,10 +125,10 @@ static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
SEQ_printf(m, " .base: %pK\n", base);
- SEQ_printf(m, " .index: %d\n",
- base->index);
- SEQ_printf(m, " .resolution: %Lu nsecs\n",
- (unsigned long long)ktime_to_ns(base->resolution));
+ SEQ_printf(m, " .index: %d\n", base->index);
+
+ SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
SEQ_printf(m, "\n");
@@ -158,7 +163,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P(nr_events);
P(nr_retries);
P(nr_hangs);
- P_ns(max_hang_time);
+ P(max_hang_time);
#endif
#undef P
#undef P_ns
@@ -184,7 +189,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_ns(idle_sleeptime);
P_ns(iowait_sleeptime);
P(last_jiffies);
- P(next_jiffies);
+ P(next_timer);
P_ns(idle_expires);
SEQ_printf(m, "jiffies: %Lu\n",
(unsigned long long)jiffies);
@@ -251,6 +256,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
SEQ_printf(m, "\n");
}
+ if (dev->set_state_oneshot_stopped) {
+ SEQ_printf(m, " oneshot stopped: ");
+ print_name_offset(m, dev->set_state_oneshot_stopped);
+ SEQ_printf(m, "\n");
+ }
+
if (dev->tick_resume) {
SEQ_printf(m, " resume: ");
print_name_offset(m, dev->tick_resume);
@@ -269,11 +280,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
{
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
- SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
- cpumask_bits(tick_get_broadcast_mask())[0]);
+ SEQ_printf(m, "tick_broadcast_mask: %*pb\n",
+ cpumask_pr_args(tick_get_broadcast_mask()));
#ifdef CONFIG_TICK_ONESHOT
- SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
- cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+ SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n",
+ cpumask_pr_args(tick_get_broadcast_oneshot_mask()));
#endif
SEQ_printf(m, "\n");
#endif
@@ -282,7 +293,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
static inline void timer_list_header(struct seq_file *m, u64 now)
{
- SEQ_printf(m, "Timer List Version: v0.7\n");
+ SEQ_printf(m, "Timer List Version: v0.8\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1fb08f213..1adecb4b8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,7 +68,7 @@ struct entry {
* Number of timeout events:
*/
unsigned long count;
- unsigned int timer_flag;
+ u32 flags;
/*
* We save the command-line string to preserve
@@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
* @startf: pointer to the function which did the timer setup
* @timerf: pointer to the timer callback function of the timer
* @comm: name of the process which set up the timer
+ * @tflags: The flags field of the timer
*
* When the timer is already registered, then the event counter is
* incremented. Otherwise the timer is registered in a free slot.
*/
void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
- void *timerf, char *comm,
- unsigned int timer_flag)
+ void *timerf, char *comm, u32 tflags)
{
/*
* It doesn't matter which lock we take:
@@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
input.start_func = startf;
input.expire_func = timerf;
input.pid = pid;
- input.timer_flag = timer_flag;
+ input.flags = tflags;
raw_spin_lock_irqsave(lock, flags);
if (!timer_stats_active)
@@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v)
for (i = 0; i < nr_entries; i++) {
entry = entries + i;
- if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ if (entry->flags & TIMER_DEFERRABLE) {
seq_printf(m, "%4luD, %5d %-16s ",
entry->count, entry->pid, entry->comm);
} else {
diff --git a/kernel/torture.c b/kernel/torture.c
index dd70993c2..3e4840633 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void);
*/
void torture_shutdown_absorb(const char *title)
{
- while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
pr_notice("torture thread %s parking due to system shutdown\n",
title);
schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1,
unsigned long unused2, void *unused3)
{
mutex_lock(&fullstop_mutex);
- if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+ if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
- ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+ WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN);
} else {
pr_warn("Concurrent rmmod and shutdown illegal!\n");
}
@@ -523,13 +523,13 @@ static int stutter;
*/
void stutter_wait(const char *title)
{
- while (ACCESS_ONCE(stutter_pause_test) ||
- (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+ while (READ_ONCE(stutter_pause_test) ||
+ (torture_runnable && !READ_ONCE(*torture_runnable))) {
if (stutter_pause_test)
- if (ACCESS_ONCE(stutter_pause_test) == 1)
+ if (READ_ONCE(stutter_pause_test) == 1)
schedule_timeout_interruptible(1);
else
- while (ACCESS_ONCE(stutter_pause_test))
+ while (READ_ONCE(stutter_pause_test))
cond_resched();
else
schedule_timeout_interruptible(round_jiffies_relative(HZ));
@@ -549,14 +549,14 @@ static int torture_stutter(void *arg)
if (!torture_must_stop()) {
if (stutter > 1) {
schedule_timeout_interruptible(stutter - 1);
- ACCESS_ONCE(stutter_pause_test) = 2;
+ WRITE_ONCE(stutter_pause_test, 2);
}
schedule_timeout_interruptible(1);
- ACCESS_ONCE(stutter_pause_test) = 1;
+ WRITE_ONCE(stutter_pause_test, 1);
}
if (!torture_must_stop())
schedule_timeout_interruptible(stutter);
- ACCESS_ONCE(stutter_pause_test) = 0;
+ WRITE_ONCE(stutter_pause_test, 0);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
bool torture_cleanup_begin(void)
{
mutex_lock(&fullstop_mutex);
- if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
pr_warn("Concurrent rmmod and shutdown illegal!\n");
mutex_unlock(&fullstop_mutex);
schedule_timeout_uninterruptible(10);
return true;
}
- ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+ WRITE_ONCE(fullstop, FULLSTOP_RMMOD);
mutex_unlock(&fullstop_mutex);
torture_shutdown_cleanup();
torture_shuffle_cleanup();
@@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop);
*/
bool torture_must_stop_irq(void)
{
- return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+ return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP;
}
EXPORT_SYMBOL_GPL(torture_must_stop_irq);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 483cecfa5..b3e6b39b6 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -439,7 +439,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
{
struct blk_trace *old_bt, *bt = NULL;
struct dentry *dir = NULL;
- int ret, i;
+ int ret;
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
@@ -451,9 +451,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
* some device names have larger paths - convert the slashes
* to underscores for this to work as expected
*/
- for (i = 0; i < strlen(buts->name); i++)
- if (buts->name[i] == '/')
- buts->name[i] = '_';
+ strreplace(buts->name, '/', '_');
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
@@ -1450,14 +1448,14 @@ static struct trace_event trace_blk_event = {
static int __init init_blk_tracer(void)
{
- if (!register_ftrace_event(&trace_blk_event)) {
+ if (!register_trace_event(&trace_blk_event)) {
pr_warning("Warning: could not register block events\n");
return 1;
}
if (register_tracer(&blk_tracer) != 0) {
pr_warning("Warning: could not register the block tracer\n");
- unregister_ftrace_event(&trace_blk_event);
+ unregister_trace_event(&trace_blk_event);
return 1;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce501..88a041ade 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -79,18 +79,6 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
-static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- /* NMI safe access to clock monotonic */
- return ktime_get_mono_fast_ns();
-}
-
-static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
- .func = bpf_ktime_get_ns,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
-};
-
/*
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
@@ -159,6 +147,17 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.arg2_type = ARG_CONST_STACK_SIZE,
};
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+{
+ /*
+ * this program might be calling bpf_trace_printk,
+ * so allocate per-cpu printk buffers
+ */
+ trace_printk_init_buffers();
+
+ return &bpf_trace_printk_proto;
+}
+
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -172,15 +171,18 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_probe_read_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
-
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
case BPF_FUNC_trace_printk:
- /*
- * this program might be calling bpf_trace_printk,
- * so allocate per-cpu printk buffers
- */
- trace_printk_init_buffers();
-
- return &bpf_trace_printk_proto;
+ return bpf_get_trace_printk_proto();
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_smp_processor_id_proto;
default:
return NULL;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0315d4317..6260717c1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/trace_seq.h>
@@ -115,63 +115,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
*
*/
-/*
- * A fast way to enable or disable all ring buffers is to
- * call tracing_on or tracing_off. Turning off the ring buffers
- * prevents all ring buffers from being recorded to.
- * Turning this switch on, makes it OK to write to the
- * ring buffer, if the ring buffer is enabled itself.
- *
- * There's three layers that must be on in order to write
- * to the ring buffer.
- *
- * 1) This global flag must be set.
- * 2) The ring buffer must be enabled for recording.
- * 3) The per cpu buffer must be enabled for recording.
- *
- * In case of an anomaly, this global flag has a bit set that
- * will permantly disable all ring buffers.
- */
-
-/*
- * Global flag to disable all recording to ring buffers
- * This has two bits: ON, DISABLED
- *
- * ON DISABLED
- * ---- ----------
- * 0 0 : ring buffers are off
- * 1 0 : ring buffers are on
- * X 1 : ring buffers are permanently disabled
- */
-
-enum {
- RB_BUFFERS_ON_BIT = 0,
- RB_BUFFERS_DISABLED_BIT = 1,
-};
-
-enum {
- RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
- RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
-};
-
-static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
-
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-/**
- * tracing_off_permanent - permanently disable ring buffers
- *
- * This function, once called, will disable all ring buffers
- * permanently.
- */
-void tracing_off_permanent(void)
-{
- set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
-}
-
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -452,6 +400,23 @@ struct rb_irq_work {
};
/*
+ * Used for which event context the event is in.
+ * NMI = 0
+ * IRQ = 1
+ * SOFTIRQ = 2
+ * NORMAL = 3
+ *
+ * See trace_recursive_lock() comment below for more details.
+ */
+enum {
+ RB_CTX_NMI,
+ RB_CTX_IRQ,
+ RB_CTX_SOFTIRQ,
+ RB_CTX_NORMAL,
+ RB_CTX_MAX
+};
+
+/*
* head_page == tail_page && head == tail then buffer is empty.
*/
struct ring_buffer_per_cpu {
@@ -462,6 +427,7 @@ struct ring_buffer_per_cpu {
arch_spinlock_t lock;
struct lock_class_key lock_key;
unsigned int nr_pages;
+ unsigned int current_context;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
@@ -2224,7 +2190,7 @@ static unsigned rb_calculate_event_length(unsigned length)
/* zero length can cause confusions */
if (!length)
- length = 1;
+ length++;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
@@ -2636,8 +2602,6 @@ rb_reserve_next_event(struct ring_buffer *buffer,
return NULL;
}
-#ifdef CONFIG_TRACING
-
/*
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
@@ -2675,44 +2639,38 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* just so happens that it is the same bit corresponding to
* the current context.
*/
-static DEFINE_PER_CPU(unsigned int, current_context);
-static __always_inline int trace_recursive_lock(void)
+static __always_inline int
+trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned int val = __this_cpu_read(current_context);
+ unsigned int val = cpu_buffer->current_context;
int bit;
if (in_interrupt()) {
if (in_nmi())
- bit = 0;
+ bit = RB_CTX_NMI;
else if (in_irq())
- bit = 1;
+ bit = RB_CTX_IRQ;
else
- bit = 2;
+ bit = RB_CTX_SOFTIRQ;
} else
- bit = 3;
+ bit = RB_CTX_NORMAL;
if (unlikely(val & (1 << bit)))
return 1;
val |= (1 << bit);
- __this_cpu_write(current_context, val);
+ cpu_buffer->current_context = val;
return 0;
}
-static __always_inline void trace_recursive_unlock(void)
+static __always_inline void
+trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- __this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
+ cpu_buffer->current_context &= cpu_buffer->current_context - 1;
}
-#else
-
-#define trace_recursive_lock() (0)
-#define trace_recursive_unlock() do { } while (0)
-
-#endif
-
/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
@@ -2735,41 +2693,37 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
struct ring_buffer_event *event;
int cpu;
- if (ring_buffer_flags != RB_BUFFERS_ON)
- return NULL;
-
/* If we are tracing schedule, we don't want to recurse */
preempt_disable_notrace();
- if (atomic_read(&buffer->record_disabled))
- goto out_nocheck;
-
- if (trace_recursive_lock())
- goto out_nocheck;
+ if (unlikely(atomic_read(&buffer->record_disabled)))
+ goto out;
cpu = raw_smp_processor_id();
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask)))
goto out;
cpu_buffer = buffer->buffers[cpu];
- if (atomic_read(&cpu_buffer->record_disabled))
+ if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (unlikely(length > BUF_MAX_DATA_SIZE))
+ goto out;
+
+ if (unlikely(trace_recursive_lock(cpu_buffer)))
goto out;
event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
- goto out;
+ goto out_unlock;
return event;
+ out_unlock:
+ trace_recursive_unlock(cpu_buffer);
out:
- trace_recursive_unlock();
-
- out_nocheck:
preempt_enable_notrace();
return NULL;
}
@@ -2859,7 +2813,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
rb_wakeups(buffer, cpu_buffer);
- trace_recursive_unlock();
+ trace_recursive_unlock(cpu_buffer);
preempt_enable_notrace();
@@ -2970,7 +2924,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
out:
rb_end_commit(cpu_buffer);
- trace_recursive_unlock();
+ trace_recursive_unlock(cpu_buffer);
preempt_enable_notrace();
@@ -3000,9 +2954,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
int ret = -EBUSY;
int cpu;
- if (ring_buffer_flags != RB_BUFFERS_ON)
- return -EBUSY;
-
preempt_disable_notrace();
if (atomic_read(&buffer->record_disabled))
@@ -3021,9 +2972,12 @@ int ring_buffer_write(struct ring_buffer *buffer,
if (length > BUF_MAX_DATA_SIZE)
goto out;
+ if (unlikely(trace_recursive_lock(cpu_buffer)))
+ goto out;
+
event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
- goto out;
+ goto out_unlock;
body = rb_event_data(event);
@@ -3034,6 +2988,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
rb_wakeups(buffer, cpu_buffer);
ret = 0;
+
+ out_unlock:
+ trace_recursive_unlock(cpu_buffer);
+
out:
preempt_enable_notrace();
@@ -3860,19 +3818,36 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
-static inline int rb_ok_to_lock(void)
+static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
+ if (likely(!in_nmi())) {
+ raw_spin_lock(&cpu_buffer->reader_lock);
+ return true;
+ }
+
/*
* If an NMI die dumps out the content of the ring buffer
- * do not grab locks. We also permanently disable the ring
- * buffer too. A one time deal is all you get from reading
- * the ring buffer from an NMI.
+ * trylock must be used to prevent a deadlock if the NMI
+ * preempted a task that holds the ring buffer locks. If
+ * we get the lock then all is fine, if not, then continue
+ * to do the read, but this can corrupt the ring buffer,
+ * so it must be permanently disabled from future writes.
+ * Reading from NMI is a oneshot deal.
*/
- if (likely(!in_nmi()))
- return 1;
+ if (raw_spin_trylock(&cpu_buffer->reader_lock))
+ return true;
- tracing_off_permanent();
- return 0;
+ /* Continue without locking, but disable the ring buffer */
+ atomic_inc(&cpu_buffer->record_disabled);
+ return false;
+}
+
+static inline void
+rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
+{
+ if (likely(locked))
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+ return;
}
/**
@@ -3892,21 +3867,18 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
unsigned long flags;
- int dolock;
+ bool dolock;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- dolock = rb_ok_to_lock();
again:
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
rb_advance_reader(cpu_buffer);
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3959,9 +3931,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event = NULL;
unsigned long flags;
- int dolock;
-
- dolock = rb_ok_to_lock();
+ bool dolock;
again:
/* might be called in atomic */
@@ -3972,8 +3942,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event) {
@@ -3981,8 +3950,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
rb_advance_reader(cpu_buffer);
}
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
out:
@@ -4263,21 +4231,17 @@ int ring_buffer_empty(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int dolock;
+ bool dolock;
int cpu;
int ret;
- dolock = rb_ok_to_lock();
-
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
ret = rb_per_cpu_empty(cpu_buffer);
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
if (!ret)
@@ -4297,21 +4261,17 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int dolock;
+ bool dolock;
int ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 1;
- dolock = rb_ok_to_lock();
-
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
- if (dolock)
- raw_spin_lock(&cpu_buffer->reader_lock);
+ dolock = rb_reader_lock(cpu_buffer);
ret = rb_per_cpu_empty(cpu_buffer);
- if (dolock)
- raw_spin_unlock(&cpu_buffer->reader_lock);
+ rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
return ret;
@@ -4349,9 +4309,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
ret = -EAGAIN;
- if (ring_buffer_flags != RB_BUFFERS_ON)
- goto out;
-
if (atomic_read(&buffer_a->record_disabled))
goto out;
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 1b28df2d9..a1503a027 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -32,11 +32,11 @@ static struct task_struct *producer;
static struct task_struct *consumer;
static unsigned long read;
-static int disable_reader;
+static unsigned int disable_reader;
module_param(disable_reader, uint, 0644);
MODULE_PARM_DESC(disable_reader, "only run producer");
-static int write_iteration = 50;
+static unsigned int write_iteration = 50;
module_param(write_iteration, uint, 0644);
MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
@@ -46,16 +46,16 @@ static int consumer_nice = MAX_NICE;
static int producer_fifo = -1;
static int consumer_fifo = -1;
-module_param(producer_nice, uint, 0644);
+module_param(producer_nice, int, 0644);
MODULE_PARM_DESC(producer_nice, "nice prio for producer");
-module_param(consumer_nice, uint, 0644);
+module_param(consumer_nice, int, 0644);
MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
-module_param(producer_fifo, uint, 0644);
+module_param(producer_fifo, int, 0644);
MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
-module_param(consumer_fifo, uint, 0644);
+module_param(consumer_fifo, int, 0644);
MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
static int read_events;
@@ -263,6 +263,8 @@ static void ring_buffer_producer(void)
if (cnt % wakeup_interval)
cond_resched();
#endif
+ if (kthread_should_stop())
+ kill_test = 1;
} while (ktime_before(end_time, timeout) && !kill_test);
trace_printk("End ring buffer hammer\n");
@@ -285,7 +287,7 @@ static void ring_buffer_producer(void)
entries = ring_buffer_entries(buffer);
overruns = ring_buffer_overruns(buffer);
- if (kill_test)
+ if (kill_test && !kthread_should_stop())
trace_printk("ERROR!\n");
if (!disable_reader) {
@@ -379,7 +381,7 @@ static int ring_buffer_consumer_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
- if (kill_test)
+ if (!kthread_should_stop())
wait_to_die();
return 0;
@@ -399,13 +401,16 @@ static int ring_buffer_producer_thread(void *arg)
}
ring_buffer_producer();
+ if (kill_test)
+ goto out_kill;
trace_printk("Sleeping for 10 secs\n");
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ * SLEEP_TIME);
}
- if (kill_test)
+out_kill:
+ if (!kthread_should_stop())
wait_to_die();
return 0;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 05330494a..abcbf7ff8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -297,11 +297,11 @@ void trace_array_put(struct trace_array *this_tr)
mutex_unlock(&trace_types_lock);
}
-int filter_check_discard(struct ftrace_event_file *file, void *rec,
+int filter_check_discard(struct trace_event_file *file, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
+ if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
!filter_match_preds(file->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
@@ -311,7 +311,7 @@ int filter_check_discard(struct ftrace_event_file *file, void *rec,
}
EXPORT_SYMBOL_GPL(filter_check_discard);
-int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+int call_filter_check_discard(struct trace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
@@ -876,6 +876,7 @@ static struct {
{ trace_clock_jiffies, "uptime", 0 },
{ trace_clock, "perf", 1 },
{ ktime_get_mono_fast_ns, "mono", 1 },
+ { ktime_get_raw_fast_ns, "mono_raw", 1 },
ARCH_TRACE_CLOCKS
};
@@ -1693,13 +1694,13 @@ static struct ring_buffer *temp_buffer;
struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
- struct ftrace_event_file *ftrace_file,
+ struct trace_event_file *trace_file,
int type, unsigned long len,
unsigned long flags, int pc)
{
struct ring_buffer_event *entry;
- *current_rb = ftrace_file->tr->trace_buffer.buffer;
+ *current_rb = trace_file->tr->trace_buffer.buffer;
entry = trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
/*
@@ -1708,7 +1709,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
* to store the trace event for the tigger to use. It's recusive
* safe and will not be recorded anywhere.
*/
- if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
*current_rb = temp_buffer;
entry = trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
@@ -1760,7 +1761,7 @@ trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
int pc)
{
- struct ftrace_event_call *call = &event_function;
+ struct trace_event_call *call = &event_function;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -1795,7 +1796,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
unsigned long flags,
int skip, int pc, struct pt_regs *regs)
{
- struct ftrace_event_call *call = &event_kernel_stack;
+ struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
@@ -1923,7 +1924,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
void
ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
{
- struct ftrace_event_call *call = &event_user_stack;
+ struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
struct stack_trace trace;
@@ -2129,7 +2130,7 @@ static void trace_printk_start_stop_comm(int enabled)
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- struct ftrace_event_call *call = &event_bprint;
+ struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct trace_array *tr = &global_trace;
@@ -2187,7 +2188,7 @@ static int
__trace_array_vprintk(struct ring_buffer *buffer,
unsigned long ip, const char *fmt, va_list args)
{
- struct ftrace_event_call *call = &event_print;
+ struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
int len = 0, size, pc;
struct print_entry *entry;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 921691c5c..74bde8160 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -12,7 +12,7 @@
#include <linux/ftrace.h>
#include <linux/hw_breakpoint.h>
#include <linux/trace_seq.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/compiler.h>
#include <linux/trace_seq.h>
@@ -211,8 +211,8 @@ struct trace_array {
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
- struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
- struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
+ struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
+ struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
#endif
int stop_count;
int clock_id;
@@ -859,7 +859,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops);
#define ftrace_destroy_filter_files(ops) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
-int ftrace_event_is_function(struct ftrace_event_call *call);
+int ftrace_event_is_function(struct trace_event_call *call);
/*
* struct trace_parser - servers for reading the user input separated by spaces
@@ -993,7 +993,7 @@ struct event_subsystem {
int ref_count;
};
-struct ftrace_subsystem_dir {
+struct trace_subsystem_dir {
struct list_head list;
struct event_subsystem *subsystem;
struct trace_array *tr;
@@ -1053,30 +1053,30 @@ struct filter_pred {
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_file *file,
+extern void print_event_filter(struct trace_event_file *file,
struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_file *file,
+extern int apply_event_filter(struct trace_event_file *file,
char *filter_string);
-extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
char *filter_string);
extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
-extern int create_event_filter(struct ftrace_event_call *call,
+extern int create_event_filter(struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp);
extern void free_event_filter(struct event_filter *filter);
struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name);
+trace_find_event_field(struct trace_event_call *call, char *name);
extern void trace_event_enable_cmd_record(bool enable);
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
-extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
- const char *system,
- const char *event);
+extern struct trace_event_file *find_event_file(struct trace_array *tr,
+ const char *system,
+ const char *event);
static inline void *event_file_data(struct file *filp)
{
@@ -1181,7 +1181,7 @@ struct event_trigger_ops {
* commands need to do this if they themselves log to the trace
* buffer (see the @post_trigger() member below). @trigger_type
* values are defined by adding new values to the trigger_type
- * enum in include/linux/ftrace_event.h.
+ * enum in include/linux/trace_events.h.
*
* @post_trigger: A flag that says whether or not this command needs
* to have its action delayed until after the current event has
@@ -1243,23 +1243,23 @@ struct event_command {
enum event_trigger_type trigger_type;
bool post_trigger;
int (*func)(struct event_command *cmd_ops,
- struct ftrace_event_file *file,
+ struct trace_event_file *file,
char *glob, char *cmd, char *params);
int (*reg)(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file);
+ struct trace_event_file *file);
void (*unreg)(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file);
+ struct trace_event_file *file);
int (*set_filter)(char *filter_str,
struct event_trigger_data *data,
- struct ftrace_event_file *file);
+ struct trace_event_file *file);
struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
};
-extern int trace_event_enable_disable(struct ftrace_event_file *file,
+extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
extern int tracing_alloc_snapshot(void);
@@ -1287,7 +1287,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
- extern struct ftrace_event_call \
+ extern struct trace_event_call \
__aligned(4) event_##call;
#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
@@ -1296,7 +1296,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
#include "trace_entries.h"
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data);
#else
#define perf_ftrace_event_register NULL
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 1879980f0..e2e12ad31 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -29,7 +29,7 @@ static struct trace_array *branch_tracer;
static void
probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
{
- struct ftrace_event_call *call = &event_branch;
+ struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct trace_array_cpu *data;
struct ring_buffer_event *event;
@@ -194,7 +194,7 @@ __init static int init_branch_tracer(void)
{
int ret;
- ret = register_ftrace_event(&trace_branch_event);
+ ret = register_trace_event(&trace_branch_event);
if (!ret) {
printk(KERN_WARNING "Warning: could not register "
"branch events\n");
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 57b67b1f2..0f06532a7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -56,6 +56,7 @@ u64 notrace trace_clock(void)
{
return local_clock();
}
+EXPORT_SYMBOL_GPL(trace_clock);
/*
* trace_jiffy_clock(): Simply use jiffies as a clock counter.
@@ -68,6 +69,7 @@ u64 notrace trace_clock_jiffies(void)
{
return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}
+EXPORT_SYMBOL_GPL(trace_clock_jiffies);
/*
* trace_clock_global(): special globally coherent trace clock
@@ -123,6 +125,7 @@ u64 notrace trace_clock_global(void)
return now;
}
+EXPORT_SYMBOL_GPL(trace_clock_global);
static atomic64_t trace_counter;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 6fa484de2..abfc903e7 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,7 +21,7 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
-static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+static int perf_trace_event_perm(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
if (tp_event->perf_perm) {
@@ -83,7 +83,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
return 0;
}
-static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
+static int perf_trace_event_reg(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
struct hlist_head __percpu *list;
@@ -143,7 +143,7 @@ fail:
static void perf_trace_event_unreg(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
int i;
if (--tp_event->perf_refcount > 0)
@@ -172,17 +172,17 @@ out:
static int perf_trace_event_open(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}
static void perf_trace_event_close(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}
-static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+static int perf_trace_event_init(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
int ret;
@@ -206,7 +206,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
int perf_trace_init(struct perf_event *p_event)
{
- struct ftrace_event_call *tp_event;
+ struct trace_event_call *tp_event;
u64 event_id = p_event->attr.config;
int ret = -EINVAL;
@@ -236,7 +236,7 @@ void perf_trace_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
struct hlist_head __percpu *pcpu_list;
struct hlist_head *list;
@@ -255,7 +255,7 @@ int perf_trace_add(struct perf_event *p_event, int flags)
void perf_trace_del(struct perf_event *p_event, int flags)
{
- struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct trace_event_call *tp_event = p_event->tp_event;
hlist_del_rcu(&p_event->hlist_entry);
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
@@ -357,7 +357,7 @@ static void perf_ftrace_function_disable(struct perf_event *event)
ftrace_function_local_disable(&event->ftrace_ops);
}
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data)
{
switch (type) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c4de47fc5..404a372ad 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -61,14 +61,14 @@ static int system_refcount_dec(struct event_subsystem *system)
#define do_for_each_event_file_safe(tr, file) \
list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
- struct ftrace_event_file *___n; \
+ struct trace_event_file *___n; \
list_for_each_entry_safe(file, ___n, &tr->events, list)
#define while_for_each_event_file() \
}
static struct list_head *
-trace_get_fields(struct ftrace_event_call *event_call)
+trace_get_fields(struct trace_event_call *event_call)
{
if (!event_call->class->get_fields)
return &event_call->class->fields;
@@ -89,7 +89,7 @@ __find_event_field(struct list_head *head, char *name)
}
struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name)
+trace_find_event_field(struct trace_event_call *call, char *name)
{
struct ftrace_event_field *field;
struct list_head *head;
@@ -129,7 +129,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
return 0;
}
-int trace_define_field(struct ftrace_event_call *call, const char *type,
+int trace_define_field(struct trace_event_call *call, const char *type,
const char *name, int offset, int size, int is_signed,
int filter_type)
{
@@ -166,7 +166,7 @@ static int trace_define_common_fields(void)
return ret;
}
-static void trace_destroy_fields(struct ftrace_event_call *call)
+static void trace_destroy_fields(struct trace_event_call *call)
{
struct ftrace_event_field *field, *next;
struct list_head *head;
@@ -178,11 +178,11 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
}
}
-int trace_event_raw_init(struct ftrace_event_call *call)
+int trace_event_raw_init(struct trace_event_call *call)
{
int id;
- id = register_ftrace_event(&call->event);
+ id = register_trace_event(&call->event);
if (!id)
return -ENODEV;
@@ -190,18 +190,18 @@ int trace_event_raw_init(struct ftrace_event_call *call)
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
-void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
- struct ftrace_event_file *ftrace_file,
- unsigned long len)
+void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
+ struct trace_event_file *trace_file,
+ unsigned long len)
{
- struct ftrace_event_call *event_call = ftrace_file->event_call;
+ struct trace_event_call *event_call = trace_file->event_call;
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
- fbuffer->ftrace_file = ftrace_file;
+ fbuffer->trace_file = trace_file;
fbuffer->event =
- trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
+ trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
event_call->event.type, len,
fbuffer->flags, fbuffer->pc);
if (!fbuffer->event)
@@ -210,13 +210,13 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
fbuffer->entry = ring_buffer_event_data(fbuffer->event);
return fbuffer->entry;
}
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+EXPORT_SYMBOL_GPL(trace_event_buffer_reserve);
static DEFINE_SPINLOCK(tracepoint_iter_lock);
-static void output_printk(struct ftrace_event_buffer *fbuffer)
+static void output_printk(struct trace_event_buffer *fbuffer)
{
- struct ftrace_event_call *event_call;
+ struct trace_event_call *event_call;
struct trace_event *event;
unsigned long flags;
struct trace_iterator *iter = tracepoint_print_iter;
@@ -224,12 +224,12 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
if (!iter)
return;
- event_call = fbuffer->ftrace_file->event_call;
+ event_call = fbuffer->trace_file->event_call;
if (!event_call || !event_call->event.funcs ||
!event_call->event.funcs->trace)
return;
- event = &fbuffer->ftrace_file->event_call->event;
+ event = &fbuffer->trace_file->event_call->event;
spin_lock_irqsave(&tracepoint_iter_lock, flags);
trace_seq_init(&iter->seq);
@@ -241,21 +241,21 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
if (tracepoint_printk)
output_printk(fbuffer);
- event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
+ event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
fbuffer->event, fbuffer->entry,
fbuffer->flags, fbuffer->pc);
}
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
+EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
-int ftrace_event_reg(struct ftrace_event_call *call,
- enum trace_reg type, void *data)
+int trace_event_reg(struct trace_event_call *call,
+ enum trace_reg type, void *data)
{
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
@@ -288,34 +288,34 @@ int ftrace_event_reg(struct ftrace_event_call *call,
}
return 0;
}
-EXPORT_SYMBOL_GPL(ftrace_event_reg);
+EXPORT_SYMBOL_GPL(trace_event_reg);
void trace_event_enable_cmd_record(bool enable)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr;
mutex_lock(&event_mutex);
do_for_each_event_file(tr, file) {
- if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
+ if (!(file->flags & EVENT_FILE_FL_ENABLED))
continue;
if (enable) {
tracing_start_cmdline_record();
- set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
} else {
tracing_stop_cmdline_record();
- clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
} while_for_each_event_file();
mutex_unlock(&event_mutex);
}
-static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int __ftrace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
int ret = 0;
int disable;
@@ -337,24 +337,24 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
if (soft_disable) {
if (atomic_dec_return(&file->sm_ref) > 0)
break;
- disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
- clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
+ clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
} else
- disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+ disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
- if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
- clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
- if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
+ if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
+ clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
+ if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
tracing_stop_cmdline_record();
- clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
call->class->reg(call, TRACE_REG_UNREGISTER, file);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
- if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
break;
case 1:
/*
@@ -366,31 +366,31 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
* it still seems to be disabled.
*/
if (!soft_disable)
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else {
if (atomic_inc_return(&file->sm_ref) > 1)
break;
- set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
}
- if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+ if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
/* Keep the event disabled, when going to SOFT_MODE. */
if (soft_disable)
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
if (trace_flags & TRACE_ITER_RECORD_CMD) {
tracing_start_cmdline_record();
- set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
ret = call->class->reg(call, TRACE_REG_REGISTER, file);
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
- "%s\n", ftrace_event_name(call));
+ "%s\n", trace_event_name(call));
break;
}
- set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
/* WAS_ENABLED gets set but never cleared. */
call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
@@ -401,13 +401,13 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
return ret;
}
-int trace_event_enable_disable(struct ftrace_event_file *file,
+int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
return __ftrace_event_enable_disable(file, enable, soft_disable);
}
-static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int ftrace_event_enable_disable(struct trace_event_file *file,
int enable)
{
return __ftrace_event_enable_disable(file, enable, 0);
@@ -415,7 +415,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
static void ftrace_clear_events(struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
@@ -449,14 +449,14 @@ static void __get_system(struct event_subsystem *system)
system_refcount_inc(system);
}
-static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+static void __get_system_dir(struct trace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
dir->ref_count++;
__get_system(dir->subsystem);
}
-static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+static void __put_system_dir(struct trace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
/* If the subsystem is about to be freed, the dir must be too */
@@ -467,14 +467,14 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
kfree(dir);
}
-static void put_system(struct ftrace_subsystem_dir *dir)
+static void put_system(struct trace_subsystem_dir *dir)
{
mutex_lock(&event_mutex);
__put_system_dir(dir);
mutex_unlock(&event_mutex);
}
-static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+static void remove_subsystem(struct trace_subsystem_dir *dir)
{
if (!dir)
return;
@@ -486,7 +486,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
}
}
-static void remove_event_file_dir(struct ftrace_event_file *file)
+static void remove_event_file_dir(struct trace_event_file *file)
{
struct dentry *dir = file->dir;
struct dentry *child;
@@ -515,15 +515,15 @@ static int
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
const char *sub, const char *event, int set)
{
- struct ftrace_event_file *file;
- struct ftrace_event_call *call;
+ struct trace_event_file *file;
+ struct trace_event_call *call;
const char *name;
int ret = -EINVAL;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- name = ftrace_event_name(call);
+ name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
continue;
@@ -671,8 +671,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_file *file = v;
- struct ftrace_event_call *call;
+ struct trace_event_file *file = v;
+ struct trace_event_call *call;
struct trace_array *tr = m->private;
(*pos)++;
@@ -692,13 +692,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- file = list_entry(&tr->events, struct ftrace_event_file, list);
+ file = list_entry(&tr->events, struct trace_event_file, list);
for (l = 0; l <= *pos; ) {
file = t_next(m, file, &l);
if (!file)
@@ -710,13 +710,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_file *file = v;
+ struct trace_event_file *file = v;
struct trace_array *tr = m->private;
(*pos)++;
list_for_each_entry_continue(file, &tr->events, list) {
- if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ if (file->flags & EVENT_FILE_FL_ENABLED)
return file;
}
@@ -725,13 +725,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- file = list_entry(&tr->events, struct ftrace_event_file, list);
+ file = list_entry(&tr->events, struct trace_event_file, list);
for (l = 0; l <= *pos; ) {
file = s_next(m, file, &l);
if (!file)
@@ -742,12 +742,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
static int t_show(struct seq_file *m, void *v)
{
- struct ftrace_event_file *file = v;
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_file *file = v;
+ struct trace_event_call *call = file->event_call;
if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->class->system);
- seq_printf(m, "%s\n", ftrace_event_name(call));
+ seq_printf(m, "%s\n", trace_event_name(call));
return 0;
}
@@ -761,7 +761,7 @@ static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
unsigned long flags;
char buf[4] = "0";
@@ -774,12 +774,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
if (!file)
return -ENODEV;
- if (flags & FTRACE_EVENT_FL_ENABLED &&
- !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ if (flags & EVENT_FILE_FL_ENABLED &&
+ !(flags & EVENT_FILE_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
- flags & FTRACE_EVENT_FL_SOFT_MODE)
+ if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
+ flags & EVENT_FILE_FL_SOFT_MODE)
strcat(buf, "*");
strcat(buf, "\n");
@@ -791,7 +791,7 @@ static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
unsigned long val;
int ret;
@@ -828,10 +828,10 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
const char set_to_char[4] = { '?', '0', '1', 'X' };
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
struct event_subsystem *system = dir->subsystem;
- struct ftrace_event_call *call;
- struct ftrace_event_file *file;
+ struct trace_event_call *call;
+ struct trace_event_file *file;
struct trace_array *tr = dir->tr;
char buf[2];
int set = 0;
@@ -840,7 +840,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- if (!ftrace_event_name(call) || !call->class || !call->class->reg)
+ if (!trace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
@@ -851,7 +851,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
+ set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -873,7 +873,7 @@ static ssize_t
system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
struct event_subsystem *system = dir->subsystem;
const char *name = NULL;
unsigned long val;
@@ -917,7 +917,7 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = event_file_data(m->private);
+ struct trace_event_call *call = event_file_data(m->private);
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -949,13 +949,13 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = event_file_data(m->private);
+ struct trace_event_call *call = event_file_data(m->private);
struct ftrace_event_field *field;
const char *array_descriptor;
switch ((unsigned long)v) {
case FORMAT_HEADER:
- seq_printf(m, "name: %s\n", ftrace_event_name(call));
+ seq_printf(m, "name: %s\n", trace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
seq_puts(m, "format:\n");
return 0;
@@ -1062,7 +1062,7 @@ static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_seq *s;
int r = -ENODEV;
@@ -1095,7 +1095,7 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
char *buf;
int err = -ENODEV;
@@ -1132,7 +1132,7 @@ static LIST_HEAD(event_subsystems);
static int subsystem_open(struct inode *inode, struct file *filp)
{
struct event_subsystem *system = NULL;
- struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+ struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
struct trace_array *tr;
int ret;
@@ -1181,7 +1181,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
static int system_tr_open(struct inode *inode, struct file *filp)
{
- struct ftrace_subsystem_dir *dir;
+ struct trace_subsystem_dir *dir;
struct trace_array *tr = inode->i_private;
int ret;
@@ -1214,7 +1214,7 @@ static int system_tr_open(struct inode *inode, struct file *filp)
static int subsystem_release(struct inode *inode, struct file *file)
{
- struct ftrace_subsystem_dir *dir = file->private_data;
+ struct trace_subsystem_dir *dir = file->private_data;
trace_array_put(dir->tr);
@@ -1235,7 +1235,7 @@ static ssize_t
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
struct event_subsystem *system = dir->subsystem;
struct trace_seq *s;
int r;
@@ -1262,7 +1262,7 @@ static ssize_t
subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct trace_subsystem_dir *dir = filp->private_data;
char *buf;
int err;
@@ -1497,9 +1497,9 @@ create_new_subsystem(const char *name)
static struct dentry *
event_subsystem_dir(struct trace_array *tr, const char *name,
- struct ftrace_event_file *file, struct dentry *parent)
+ struct trace_event_file *file, struct dentry *parent)
{
- struct ftrace_subsystem_dir *dir;
+ struct trace_subsystem_dir *dir;
struct event_subsystem *system;
struct dentry *entry;
@@ -1571,9 +1571,9 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
}
static int
-event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
+event_create_dir(struct dentry *parent, struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
struct list_head *head;
struct dentry *d_events;
@@ -1591,7 +1591,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
} else
d_events = parent;
- name = ftrace_event_name(call);
+ name = trace_event_name(call);
file->dir = tracefs_create_dir(name, d_events);
if (!file->dir) {
pr_warn("Could not create tracefs '%s' directory\n", name);
@@ -1634,9 +1634,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
return 0;
}
-static void remove_event_from_tracers(struct ftrace_event_call *call)
+static void remove_event_from_tracers(struct trace_event_call *call)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct trace_array *tr;
do_for_each_event_file_safe(tr, file) {
@@ -1654,10 +1654,10 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
} while_for_each_event_file();
}
-static void event_remove(struct ftrace_event_call *call)
+static void event_remove(struct trace_event_call *call)
{
struct trace_array *tr;
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
do_for_each_event_file(tr, file) {
if (file->event_call != call)
@@ -1673,17 +1673,17 @@ static void event_remove(struct ftrace_event_call *call)
} while_for_each_event_file();
if (call->event.funcs)
- __unregister_ftrace_event(&call->event);
+ __unregister_trace_event(&call->event);
remove_event_from_tracers(call);
list_del(&call->list);
}
-static int event_init(struct ftrace_event_call *call)
+static int event_init(struct trace_event_call *call)
{
int ret = 0;
const char *name;
- name = ftrace_event_name(call);
+ name = trace_event_name(call);
if (WARN_ON(!name))
return -EINVAL;
@@ -1697,7 +1697,7 @@ static int event_init(struct ftrace_event_call *call)
}
static int
-__register_event(struct ftrace_event_call *call, struct module *mod)
+__register_event(struct trace_event_call *call, struct module *mod)
{
int ret;
@@ -1733,7 +1733,7 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
return ptr + elen;
}
-static void update_event_printk(struct ftrace_event_call *call,
+static void update_event_printk(struct trace_event_call *call,
struct trace_enum_map *map)
{
char *ptr;
@@ -1811,7 +1811,7 @@ static void update_event_printk(struct ftrace_event_call *call,
void trace_event_enum_update(struct trace_enum_map **map, int len)
{
- struct ftrace_event_call *call, *p;
+ struct trace_event_call *call, *p;
const char *last_system = NULL;
int last_i;
int i;
@@ -1836,11 +1836,11 @@ void trace_event_enum_update(struct trace_enum_map **map, int len)
up_write(&trace_event_sem);
}
-static struct ftrace_event_file *
-trace_create_new_event(struct ftrace_event_call *call,
+static struct trace_event_file *
+trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
@@ -1858,9 +1858,9 @@ trace_create_new_event(struct ftrace_event_call *call,
/* Add an event to a trace directory */
static int
-__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
+__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
file = trace_create_new_event(call, tr);
if (!file)
@@ -1875,10 +1875,10 @@ __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
* the filesystem is initialized.
*/
static __init int
-__trace_early_add_new_event(struct ftrace_event_call *call,
+__trace_early_add_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
file = trace_create_new_event(call, tr);
if (!file)
@@ -1888,10 +1888,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
}
struct ftrace_module_file_ops;
-static void __add_event_to_tracers(struct ftrace_event_call *call);
+static void __add_event_to_tracers(struct trace_event_call *call);
/* Add an additional event_call dynamically */
-int trace_add_event_call(struct ftrace_event_call *call)
+int trace_add_event_call(struct trace_event_call *call)
{
int ret;
mutex_lock(&trace_types_lock);
@@ -1910,7 +1910,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
* Must be called under locking of trace_types_lock, event_mutex and
* trace_event_sem.
*/
-static void __trace_remove_event_call(struct ftrace_event_call *call)
+static void __trace_remove_event_call(struct trace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
@@ -1918,10 +1918,10 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
call->filter = NULL;
}
-static int probe_remove_event_call(struct ftrace_event_call *call)
+static int probe_remove_event_call(struct trace_event_call *call)
{
struct trace_array *tr;
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
#ifdef CONFIG_PERF_EVENTS
if (call->perf_refcount)
@@ -1932,10 +1932,10 @@ static int probe_remove_event_call(struct ftrace_event_call *call)
continue;
/*
* We can't rely on ftrace_event_enable_disable(enable => 0)
- * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+ * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
* TRACE_REG_UNREGISTER.
*/
- if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ if (file->flags & EVENT_FILE_FL_ENABLED)
return -EBUSY;
/*
* The do_for_each_event_file_safe() is
@@ -1952,7 +1952,7 @@ static int probe_remove_event_call(struct ftrace_event_call *call)
}
/* Remove an event_call */
-int trace_remove_event_call(struct ftrace_event_call *call)
+int trace_remove_event_call(struct trace_event_call *call)
{
int ret;
@@ -1976,7 +1976,7 @@ int trace_remove_event_call(struct ftrace_event_call *call)
static void trace_module_add_events(struct module *mod)
{
- struct ftrace_event_call **call, **start, **end;
+ struct trace_event_call **call, **start, **end;
if (!mod->num_trace_events)
return;
@@ -1999,7 +1999,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
- struct ftrace_event_call *call, *p;
+ struct trace_event_call *call, *p;
bool clear_trace = false;
down_write(&trace_event_sem);
@@ -2055,28 +2055,28 @@ static struct notifier_block trace_module_nb = {
static void
__trace_add_event_dirs(struct trace_array *tr)
{
- struct ftrace_event_call *call;
+ struct trace_event_call *call;
int ret;
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
pr_warn("Could not create directory for event %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
}
}
-struct ftrace_event_file *
+struct trace_event_file *
find_event_file(struct trace_array *tr, const char *system, const char *event)
{
- struct ftrace_event_file *file;
- struct ftrace_event_call *call;
+ struct trace_event_file *file;
+ struct trace_event_call *call;
const char *name;
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- name = ftrace_event_name(call);
+ name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
continue;
@@ -2098,7 +2098,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event)
#define DISABLE_EVENT_STR "disable_event"
struct event_probe_data {
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
unsigned long count;
int ref;
bool enable;
@@ -2114,9 +2114,9 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
return;
if (data->enable)
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
else
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
}
static void
@@ -2132,7 +2132,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data
return;
/* Skip if the event is in a state we want to switch to */
- if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
return;
if (data->count != -1)
@@ -2152,7 +2152,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%s:%s:%s",
data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
data->file->event_call->class->system,
- ftrace_event_name(data->file->event_call));
+ trace_event_name(data->file->event_call));
if (data->count == -1)
seq_puts(m, ":unlimited\n");
@@ -2226,7 +2226,7 @@ event_enable_func(struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enabled)
{
struct trace_array *tr = top_trace_array();
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct ftrace_probe_ops *ops;
struct event_probe_data *data;
const char *system;
@@ -2358,7 +2358,7 @@ static inline int register_event_cmds(void) { return 0; }
#endif /* CONFIG_DYNAMIC_FTRACE */
/*
- * The top level array has already had its ftrace_event_file
+ * The top level array has already had its trace_event_file
* descriptors created in order to allow for early events to
* be recorded. This function is called after the tracefs has been
* initialized, and we now have to create the files associated
@@ -2367,7 +2367,7 @@ static inline int register_event_cmds(void) { return 0; }
static __init void
__trace_early_add_event_dirs(struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
int ret;
@@ -2375,7 +2375,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
pr_warn("Could not create directory for event %s\n",
- ftrace_event_name(file->event_call));
+ trace_event_name(file->event_call));
}
}
@@ -2388,7 +2388,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
static __init void
__trace_early_add_events(struct trace_array *tr)
{
- struct ftrace_event_call *call;
+ struct trace_event_call *call;
int ret;
list_for_each_entry(call, &ftrace_events, list) {
@@ -2399,7 +2399,7 @@ __trace_early_add_events(struct trace_array *tr)
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
pr_warn("Could not create early event %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
}
}
@@ -2407,13 +2407,13 @@ __trace_early_add_events(struct trace_array *tr)
static void
__trace_remove_event_dirs(struct trace_array *tr)
{
- struct ftrace_event_file *file, *next;
+ struct trace_event_file *file, *next;
list_for_each_entry_safe(file, next, &tr->events, list)
remove_event_file_dir(file);
}
-static void __add_event_to_tracers(struct ftrace_event_call *call)
+static void __add_event_to_tracers(struct trace_event_call *call)
{
struct trace_array *tr;
@@ -2421,8 +2421,8 @@ static void __add_event_to_tracers(struct ftrace_event_call *call)
__trace_add_new_event(call, tr);
}
-extern struct ftrace_event_call *__start_ftrace_events[];
-extern struct ftrace_event_call *__stop_ftrace_events[];
+extern struct trace_event_call *__start_ftrace_events[];
+extern struct trace_event_call *__stop_ftrace_events[];
static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
@@ -2557,7 +2557,7 @@ int event_trace_del_tracer(struct trace_array *tr)
static __init int event_trace_memsetup(void)
{
field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
- file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+ file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC);
return 0;
}
@@ -2593,7 +2593,7 @@ early_enable_events(struct trace_array *tr, bool disable_first)
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
- struct ftrace_event_call **iter, *call;
+ struct trace_event_call **iter, *call;
int ret;
if (!tr)
@@ -2754,9 +2754,9 @@ static __init void event_test_stuff(void)
*/
static __init void event_trace_self_tests(void)
{
- struct ftrace_subsystem_dir *dir;
- struct ftrace_event_file *file;
- struct ftrace_event_call *call;
+ struct trace_subsystem_dir *dir;
+ struct trace_event_file *file;
+ struct trace_event_call *call;
struct event_subsystem *system;
struct trace_array *tr;
int ret;
@@ -2787,13 +2787,13 @@ static __init void event_trace_self_tests(void)
continue;
#endif
- pr_info("Testing event %s: ", ftrace_event_name(call));
+ pr_info("Testing event %s: ", trace_event_name(call));
/*
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+ if (file->flags & EVENT_FILE_FL_ENABLED) {
pr_warn("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 52adf02d7..d81d6f302 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -643,7 +643,7 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
-static inline struct event_filter *event_filter(struct ftrace_event_file *file)
+static inline struct event_filter *event_filter(struct trace_event_file *file)
{
if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
return file->event_call->filter;
@@ -652,7 +652,7 @@ static inline struct event_filter *event_filter(struct ftrace_event_file *file)
}
/* caller must hold event_mutex */
-void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
+void print_event_filter(struct trace_event_file *file, struct trace_seq *s)
{
struct event_filter *filter = event_filter(file);
@@ -780,14 +780,14 @@ static void __free_preds(struct event_filter *filter)
filter->n_preds = 0;
}
-static void filter_disable(struct ftrace_event_file *file)
+static void filter_disable(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags &= ~TRACE_EVENT_FL_FILTERED;
else
- file->flags &= ~FTRACE_EVENT_FL_FILTERED;
+ file->flags &= ~EVENT_FILE_FL_FILTERED;
}
static void __free_filter(struct event_filter *filter)
@@ -837,9 +837,9 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
return 0;
}
-static inline void __remove_filter(struct ftrace_event_file *file)
+static inline void __remove_filter(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
filter_disable(file);
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
@@ -848,10 +848,10 @@ static inline void __remove_filter(struct ftrace_event_file *file)
remove_filter_string(file->filter);
}
-static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
@@ -860,9 +860,9 @@ static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
}
}
-static inline void __free_subsystem_filter(struct ftrace_event_file *file)
+static inline void __free_subsystem_filter(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
__free_filter(call->filter);
@@ -873,10 +873,10 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
}
}
-static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
@@ -1342,7 +1342,7 @@ parse_operand:
}
static struct filter_pred *create_pred(struct filter_parse_state *ps,
- struct ftrace_event_call *call,
+ struct trace_event_call *call,
int op, char *operand1, char *operand2)
{
struct ftrace_event_field *field;
@@ -1564,7 +1564,7 @@ static int fold_pred_tree(struct event_filter *filter,
filter->preds);
}
-static int replace_preds(struct ftrace_event_call *call,
+static int replace_preds(struct trace_event_call *call,
struct event_filter *filter,
struct filter_parse_state *ps,
bool dry_run)
@@ -1677,20 +1677,20 @@ fail:
return err;
}
-static inline void event_set_filtered_flag(struct ftrace_event_file *file)
+static inline void event_set_filtered_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags |= TRACE_EVENT_FL_FILTERED;
else
- file->flags |= FTRACE_EVENT_FL_FILTERED;
+ file->flags |= EVENT_FILE_FL_FILTERED;
}
-static inline void event_set_filter(struct ftrace_event_file *file,
+static inline void event_set_filter(struct trace_event_file *file,
struct event_filter *filter)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
rcu_assign_pointer(call->filter, filter);
@@ -1698,9 +1698,9 @@ static inline void event_set_filter(struct ftrace_event_file *file,
rcu_assign_pointer(file->filter, filter);
}
-static inline void event_clear_filter(struct ftrace_event_file *file)
+static inline void event_clear_filter(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
RCU_INIT_POINTER(call->filter, NULL);
@@ -1709,33 +1709,33 @@ static inline void event_clear_filter(struct ftrace_event_file *file)
}
static inline void
-event_set_no_set_filter_flag(struct ftrace_event_file *file)
+event_set_no_set_filter_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
else
- file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
+ file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
}
static inline void
-event_clear_no_set_filter_flag(struct ftrace_event_file *file)
+event_clear_no_set_filter_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
else
- file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
+ file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
}
static inline bool
-event_no_set_filter_flag(struct ftrace_event_file *file)
+event_no_set_filter_flag(struct trace_event_file *file)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
- if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
+ if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
return true;
if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
@@ -1750,12 +1750,12 @@ struct filter_list {
struct event_filter *filter;
};
-static int replace_system_preds(struct ftrace_subsystem_dir *dir,
+static int replace_system_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_state *ps,
char *filter_string)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct filter_list *filter_item;
struct filter_list *tmp;
LIST_HEAD(filter_list);
@@ -1899,8 +1899,8 @@ static void create_filter_finish(struct filter_parse_state *ps)
}
/**
- * create_filter - create a filter for a ftrace_event_call
- * @call: ftrace_event_call to create a filter for
+ * create_filter - create a filter for a trace_event_call
+ * @call: trace_event_call to create a filter for
* @filter_str: filter string
* @set_str: remember @filter_str and enable detailed error in filter
* @filterp: out param for created filter (always updated on return)
@@ -1914,7 +1914,7 @@ static void create_filter_finish(struct filter_parse_state *ps)
* information if @set_str is %true and the caller is responsible for
* freeing it.
*/
-static int create_filter(struct ftrace_event_call *call,
+static int create_filter(struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp)
{
@@ -1934,7 +1934,7 @@ static int create_filter(struct ftrace_event_call *call,
return err;
}
-int create_event_filter(struct ftrace_event_call *call,
+int create_event_filter(struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp)
{
@@ -1950,7 +1950,7 @@ int create_event_filter(struct ftrace_event_call *call,
* Identical to create_filter() except that it creates a subsystem filter
* and always remembers @filter_str.
*/
-static int create_system_filter(struct ftrace_subsystem_dir *dir,
+static int create_system_filter(struct trace_subsystem_dir *dir,
struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
@@ -1976,9 +1976,9 @@ static int create_system_filter(struct ftrace_subsystem_dir *dir,
}
/* caller must hold event_mutex */
-int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
+int apply_event_filter(struct trace_event_file *file, char *filter_string)
{
- struct ftrace_event_call *call = file->event_call;
+ struct trace_event_call *call = file->event_call;
struct event_filter *filter;
int err;
@@ -2027,7 +2027,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
return err;
}
-int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
char *filter_string)
{
struct event_subsystem *system = dir->subsystem;
@@ -2090,7 +2090,7 @@ struct function_filter_data {
static char **
ftrace_function_filter_re(char *buf, int len, int *count)
{
- char *str, *sep, **re;
+ char *str, **re;
str = kstrndup(buf, len, GFP_KERNEL);
if (!str)
@@ -2100,8 +2100,7 @@ ftrace_function_filter_re(char *buf, int len, int *count)
* The argv_split function takes white space
* as a separator, so convert ',' into spaces.
*/
- while ((sep = strchr(str, ',')))
- *sep = ' ';
+ strreplace(str, ',', ' ');
re = argv_split(GFP_KERNEL, str, count);
kfree(str);
@@ -2227,7 +2226,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
{
int err;
struct event_filter *filter;
- struct ftrace_event_call *call;
+ struct trace_event_call *call;
mutex_lock(&event_mutex);
@@ -2283,7 +2282,7 @@ out_unlock:
static struct test_filter_data_t {
char *filter;
- struct ftrace_raw_ftrace_test_filter rec;
+ struct trace_event_raw_ftrace_test_filter rec;
int match;
char *not_visited;
} test_filter_data[] = {
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8712df9de..42a4009fd 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -40,7 +40,7 @@ trigger_data_free(struct event_trigger_data *data)
/**
* event_triggers_call - Call triggers associated with a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
* @rec: The trace entry for the event, NULL for unconditional invocation
*
* For each trigger associated with an event, invoke the trigger
@@ -63,7 +63,7 @@ trigger_data_free(struct event_trigger_data *data)
* any trigger that should be deferred, ETT_NONE if nothing to defer.
*/
enum event_trigger_type
-event_triggers_call(struct ftrace_event_file *file, void *rec)
+event_triggers_call(struct trace_event_file *file, void *rec)
{
struct event_trigger_data *data;
enum event_trigger_type tt = ETT_NONE;
@@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
/**
* event_triggers_post_call - Call 'post_triggers' for a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
* @tt: enum event_trigger_type containing a set bit for each trigger to invoke
*
* For each trigger associated with an event, invoke the trigger
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
* Called from tracepoint handlers (with rcu_read_lock_sched() held).
*/
void
-event_triggers_post_call(struct ftrace_event_file *file,
+event_triggers_post_call(struct trace_event_file *file,
enum event_trigger_type tt)
{
struct event_trigger_data *data;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(event_triggers_post_call);
static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
{
- struct ftrace_event_file *event_file = event_file_data(m->private);
+ struct trace_event_file *event_file = event_file_data(m->private);
if (t == SHOW_AVAILABLE_TRIGGERS)
return NULL;
@@ -129,7 +129,7 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
static void *trigger_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_file *event_file;
+ struct trace_event_file *event_file;
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
@@ -201,7 +201,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
return ret;
}
-static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
+static int trigger_process_regex(struct trace_event_file *file, char *buff)
{
char *command, *next = buff;
struct event_command *p;
@@ -227,7 +227,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct ftrace_event_file *event_file;
+ struct trace_event_file *event_file;
ssize_t ret;
char *buf;
@@ -430,7 +430,7 @@ event_trigger_free(struct event_trigger_ops *ops,
trigger_data_free(data);
}
-static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
+static int trace_event_trigger_enable_disable(struct trace_event_file *file,
int trigger_enable)
{
int ret = 0;
@@ -438,12 +438,12 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
if (trigger_enable) {
if (atomic_inc_return(&file->tm_ref) > 1)
return ret;
- set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
ret = trace_event_enable_disable(file, 1, 1);
} else {
if (atomic_dec_return(&file->tm_ref) > 0)
return ret;
- clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
ret = trace_event_enable_disable(file, 0, 1);
}
@@ -466,7 +466,7 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
void
clear_event_triggers(struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list) {
struct event_trigger_data *data;
@@ -480,7 +480,7 @@ clear_event_triggers(struct trace_array *tr)
/**
* update_cond_flag - Set or reset the TRIGGER_COND bit
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* If an event has triggers and any of those triggers has a filter or
* a post_trigger, trigger invocation needs to be deferred until after
@@ -488,7 +488,7 @@ clear_event_triggers(struct trace_array *tr)
* its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
* cleared.
*/
-static void update_cond_flag(struct ftrace_event_file *file)
+static void update_cond_flag(struct trace_event_file *file)
{
struct event_trigger_data *data;
bool set_cond = false;
@@ -501,9 +501,9 @@ static void update_cond_flag(struct ftrace_event_file *file)
}
if (set_cond)
- set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+ set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
else
- clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+ clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
}
/**
@@ -511,7 +511,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
* @glob: The raw string used to register the trigger
* @ops: The trigger ops associated with the trigger
* @data: Trigger-specific data to associate with the trigger
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* Common implementation for event trigger registration.
*
@@ -522,7 +522,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
*/
static int register_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct event_trigger_data *test;
int ret = 0;
@@ -557,7 +557,7 @@ out:
* @glob: The raw string used to register the trigger
* @ops: The trigger ops associated with the trigger
* @test: Trigger-specific data used to find the trigger to remove
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* Common implementation for event trigger unregistration.
*
@@ -566,7 +566,7 @@ out:
*/
static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *test,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct event_trigger_data *data;
bool unregistered = false;
@@ -588,7 +588,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
/**
* event_trigger_callback - Generic event_command @func implementation
* @cmd_ops: The command ops, used for trigger registration
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
* @glob: The raw string used to register the trigger
* @cmd: The cmd portion of the string used to register the trigger
* @param: The params portion of the string used to register the trigger
@@ -603,7 +603,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
*/
static int
event_trigger_callback(struct event_command *cmd_ops,
- struct ftrace_event_file *file,
+ struct trace_event_file *file,
char *glob, char *cmd, char *param)
{
struct event_trigger_data *trigger_data;
@@ -688,7 +688,7 @@ event_trigger_callback(struct event_command *cmd_ops,
* set_trigger_filter - Generic event_command @set_filter implementation
* @filter_str: The filter string for the trigger, NULL to remove filter
* @trigger_data: Trigger-specific data
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
*
* Common implementation for event command filter parsing and filter
* instantiation.
@@ -702,7 +702,7 @@ event_trigger_callback(struct event_command *cmd_ops,
*/
static int set_trigger_filter(char *filter_str,
struct event_trigger_data *trigger_data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct event_trigger_data *data = trigger_data;
struct event_filter *filter = NULL, *tmp;
@@ -900,7 +900,7 @@ snapshot_count_trigger(struct event_trigger_data *data)
static int
register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
int ret = register_trigger(glob, ops, data, file);
@@ -968,7 +968,7 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
* Skip 3:
* stacktrace_trigger()
* event_triggers_post_call()
- * ftrace_raw_event_xxx()
+ * trace_event_raw_event_xxx()
*/
#define STACK_SKIP 3
@@ -1053,7 +1053,7 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
#define DISABLE_EVENT_STR "disable_event"
struct enable_trigger_data {
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
bool enable;
};
@@ -1063,9 +1063,9 @@ event_enable_trigger(struct event_trigger_data *data)
struct enable_trigger_data *enable_data = data->private_data;
if (enable_data->enable)
- clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+ clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
else
- set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+ set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
}
static void
@@ -1077,7 +1077,7 @@ event_enable_count_trigger(struct event_trigger_data *data)
return;
/* Skip if the event is in a state we want to switch to */
- if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
return;
if (data->count != -1)
@@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
seq_printf(m, "%s:%s:%s",
enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
enable_data->file->event_call->class->system,
- ftrace_event_name(enable_data->file->event_call));
+ trace_event_name(enable_data->file->event_call));
if (data->count == -1)
seq_puts(m, ":unlimited");
@@ -1159,10 +1159,10 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
static int
event_enable_trigger_func(struct event_command *cmd_ops,
- struct ftrace_event_file *file,
+ struct trace_event_file *file,
char *glob, char *cmd, char *param)
{
- struct ftrace_event_file *event_enable_file;
+ struct trace_event_file *event_enable_file;
struct enable_trigger_data *enable_data;
struct event_trigger_data *trigger_data;
struct event_trigger_ops *trigger_ops;
@@ -1294,7 +1294,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
static int event_enable_register_trigger(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *data,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct enable_trigger_data *enable_data = data->private_data;
struct enable_trigger_data *test_enable_data;
@@ -1331,7 +1331,7 @@ out:
static void event_enable_unregister_trigger(char *glob,
struct event_trigger_ops *ops,
struct event_trigger_data *test,
- struct ftrace_event_file *file)
+ struct trace_event_file *file)
{
struct enable_trigger_data *test_enable_data = test->private_data;
struct enable_trigger_data *enable_data;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 174a6a711..adabf7da9 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,7 +125,7 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
static int __init \
-ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
+ftrace_define_fields_##name(struct trace_event_call *event_call) \
{ \
struct struct_name field; \
int ret; \
@@ -163,14 +163,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
regfn) \
\
-struct ftrace_event_class __refdata event_class_ftrace_##call = { \
+struct trace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
.define_fields = ftrace_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
.reg = regfn, \
}; \
\
-struct ftrace_event_call __used event_##call = { \
+struct trace_event_call __used event_##call = { \
.class = &event_class_ftrace_##call, \
{ \
.name = #call, \
@@ -179,7 +179,7 @@ struct ftrace_event_call __used event_##call = { \
.print_fmt = print, \
.flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
}; \
-struct ftrace_event_call __used \
+struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
#undef FTRACE_ENTRY
@@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
FTRACE_ENTRY_REG(call, struct_name, etype, \
PARAMS(tstruct), PARAMS(print), filter, NULL)
-int ftrace_event_is_function(struct ftrace_event_call *call)
+int ftrace_event_is_function(struct trace_event_call *call)
{
return call == &event_function;
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a51e79688..8968bf720 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -278,7 +278,7 @@ int __trace_graph_entry(struct trace_array *tr,
unsigned long flags,
int pc)
{
- struct ftrace_event_call *call = &event_funcgraph_entry;
+ struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
@@ -393,7 +393,7 @@ void __trace_graph_return(struct trace_array *tr,
unsigned long flags,
int pc)
{
- struct ftrace_event_call *call = &event_funcgraph_exit;
+ struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
@@ -1454,12 +1454,12 @@ static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
- if (!register_ftrace_event(&graph_trace_entry_event)) {
+ if (!register_trace_event(&graph_trace_entry_event)) {
pr_warning("Warning: could not register graph trace events\n");
return 1;
}
- if (!register_ftrace_event(&graph_trace_ret_event)) {
+ if (!register_trace_event(&graph_trace_ret_event)) {
pr_warning("Warning: could not register graph trace events\n");
return 1;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d0ce590f0..b7d0cdd99 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -348,7 +348,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
struct trace_kprobe *tk;
list_for_each_entry(tk, &probe_list, list)
- if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
+ if (strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
strcmp(tk->tp.call.class->system, group) == 0)
return tk;
return NULL;
@@ -359,7 +359,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
* if the file is NULL, enable "perf" handler, or enable "trace" handler.
*/
static int
-enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
int ret = 0;
@@ -394,7 +394,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
* if the file is NULL, disable "perf" handler, or disable "trace" handler.
*/
static int
-disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
struct event_file_link *link = NULL;
int wait = 0;
@@ -523,7 +523,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
mutex_lock(&probe_lock);
/* Delete old (same name) event if exist */
- old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+ old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call),
tk->tp.call.class->system);
if (old_tk) {
ret = unregister_trace_kprobe(old_tk);
@@ -572,7 +572,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
if (ret)
pr_warning("Failed to re-register probe %s on"
"%s: %d\n",
- ftrace_event_name(&tk->tp.call),
+ trace_event_name(&tk->tp.call),
mod->name, ret);
}
}
@@ -829,7 +829,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
- ftrace_event_name(&tk->tp.call));
+ trace_event_name(&tk->tp.call));
if (!tk->symbol)
seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -888,7 +888,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
struct trace_kprobe *tk = v;
seq_printf(m, " %-44s %15lu %15lu\n",
- ftrace_event_name(&tk->tp.call), tk->nhit,
+ trace_event_name(&tk->tp.call), tk->nhit,
tk->rp.kp.nmissed);
return 0;
@@ -917,18 +917,18 @@ static const struct file_operations kprobe_profile_ops = {
/* Kprobe handler */
static nokprobe_inline void
__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
- struct ftrace_event_file *ftrace_file)
+ struct trace_event_file *trace_file)
{
struct kprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int size, dsize, pc;
unsigned long irq_flags;
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
- WARN_ON(call != ftrace_file->event_call);
+ WARN_ON(call != trace_file->event_call);
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
local_save_flags(irq_flags);
@@ -937,7 +937,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
dsize = __get_data_size(&tk->tp, regs);
size = sizeof(*entry) + tk->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
call->event.type,
size, irq_flags, pc);
if (!event)
@@ -947,7 +947,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
entry->ip = (unsigned long)tk->rp.kp.addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
- event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ event_trigger_unlock_commit_regs(trace_file, buffer, event,
entry, irq_flags, pc, regs);
}
@@ -965,18 +965,18 @@ NOKPROBE_SYMBOL(kprobe_trace_func);
static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
- struct ftrace_event_file *ftrace_file)
+ struct trace_event_file *trace_file)
{
struct kretprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int size, pc, dsize;
unsigned long irq_flags;
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
- WARN_ON(call != ftrace_file->event_call);
+ WARN_ON(call != trace_file->event_call);
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
local_save_flags(irq_flags);
@@ -985,7 +985,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
dsize = __get_data_size(&tk->tp, regs);
size = sizeof(*entry) + tk->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
call->event.type,
size, irq_flags, pc);
if (!event)
@@ -996,7 +996,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
- event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ event_trigger_unlock_commit_regs(trace_file, buffer, event,
entry, irq_flags, pc, regs);
}
@@ -1025,7 +1025,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+ trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
goto out;
@@ -1056,7 +1056,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+ trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
goto out;
@@ -1081,7 +1081,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
}
-static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, i;
struct kprobe_trace_entry_head field;
@@ -1104,7 +1104,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
return 0;
}
-static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kretprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, i;
struct kretprobe_trace_entry_head field;
@@ -1134,7 +1134,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
@@ -1169,7 +1169,7 @@ static void
kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
@@ -1206,11 +1206,11 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
* kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
* lockless, but we can't race with this __init function.
*/
-static int kprobe_register(struct ftrace_event_call *event,
+static int kprobe_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -1276,10 +1276,10 @@ static struct trace_event_functions kprobe_funcs = {
static int register_kprobe_event(struct trace_kprobe *tk)
{
- struct ftrace_event_call *call = &tk->tp.call;
+ struct trace_event_call *call = &tk->tp.call;
int ret;
- /* Initialize ftrace_event_call */
+ /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
if (trace_kprobe_is_return(tk)) {
call->event.funcs = &kretprobe_funcs;
@@ -1290,7 +1290,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
}
if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
return -ENOMEM;
- ret = register_ftrace_event(&call->event);
+ ret = register_trace_event(&call->event);
if (!ret) {
kfree(call->print_fmt);
return -ENODEV;
@@ -1301,9 +1301,9 @@ static int register_kprobe_event(struct trace_kprobe *tk)
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
kfree(call->print_fmt);
- unregister_ftrace_event(&call->event);
+ unregister_trace_event(&call->event);
}
return ret;
}
@@ -1364,10 +1364,10 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
return a1 + a2 + a3 + a4 + a5 + a6;
}
-static struct ftrace_event_file *
+static struct trace_event_file *
find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
{
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
list_for_each_entry(file, &tr->events, list)
if (file->event_call == &tk->tp.call)
@@ -1385,7 +1385,7 @@ static __init int kprobe_trace_self_tests_init(void)
int ret, warn = 0;
int (*target)(int, int, int, int, int, int);
struct trace_kprobe *tk;
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
if (tracing_is_disabled())
return -ENODEV;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 7a9ba62e9..638e110c5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -298,7 +298,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
- struct ftrace_event_call *call = &event_mmiotrace_rw;
+ struct trace_event_call *call = &event_mmiotrace_rw;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
@@ -328,7 +328,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
- struct ftrace_event_call *call = &event_mmiotrace_map;
+ struct trace_event_call *call = &event_mmiotrace_map;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 25a086bcb..dfab25372 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -60,9 +60,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
}
const char *
-ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
- unsigned long flags,
- const struct trace_print_flags *flag_array)
+trace_print_flags_seq(struct trace_seq *p, const char *delim,
+ unsigned long flags,
+ const struct trace_print_flags *flag_array)
{
unsigned long mask;
const char *str;
@@ -95,11 +95,11 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_flags_seq);
+EXPORT_SYMBOL(trace_print_flags_seq);
const char *
-ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
- const struct trace_print_flags *symbol_array)
+trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+ const struct trace_print_flags *symbol_array)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
@@ -120,11 +120,11 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_symbols_seq);
+EXPORT_SYMBOL(trace_print_symbols_seq);
#if BITS_PER_LONG == 32
const char *
-ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
int i;
@@ -146,12 +146,12 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+EXPORT_SYMBOL(trace_print_symbols_seq_u64);
#endif
const char *
-ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
- unsigned int bitmask_size)
+trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+ unsigned int bitmask_size)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -160,10 +160,10 @@ ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
return ret;
}
-EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
+EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
const char *
-ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
@@ -175,11 +175,11 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
return ret;
}
-EXPORT_SYMBOL(ftrace_print_hex_seq);
+EXPORT_SYMBOL(trace_print_hex_seq);
const char *
-ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
- size_t el_size)
+trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
+ size_t el_size)
{
const char *ret = trace_seq_buffer_ptr(p);
const char *prefix = "";
@@ -220,17 +220,17 @@ ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
return ret;
}
-EXPORT_SYMBOL(ftrace_print_array_seq);
+EXPORT_SYMBOL(trace_print_array_seq);
-int ftrace_raw_output_prep(struct trace_iterator *iter,
- struct trace_event *trace_event)
+int trace_raw_output_prep(struct trace_iterator *iter,
+ struct trace_event *trace_event)
{
- struct ftrace_event_call *event;
+ struct trace_event_call *event;
struct trace_seq *s = &iter->seq;
struct trace_seq *p = &iter->tmp_seq;
struct trace_entry *entry;
- event = container_of(trace_event, struct ftrace_event_call, event);
+ event = container_of(trace_event, struct trace_event_call, event);
entry = iter->ent;
if (entry->type != event->event.type) {
@@ -239,14 +239,14 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- trace_seq_printf(s, "%s: ", ftrace_event_name(event));
+ trace_seq_printf(s, "%s: ", trace_event_name(event));
return trace_handle_return(s);
}
-EXPORT_SYMBOL(ftrace_raw_output_prep);
+EXPORT_SYMBOL(trace_raw_output_prep);
-static int ftrace_output_raw(struct trace_iterator *iter, char *name,
- char *fmt, va_list ap)
+static int trace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
@@ -256,18 +256,18 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
return trace_handle_return(s);
}
-int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
+int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
- ret = ftrace_output_raw(iter, name, fmt, ap);
+ ret = trace_output_raw(iter, name, fmt, ap);
va_end(ap);
return ret;
}
-EXPORT_SYMBOL_GPL(ftrace_output_call);
+EXPORT_SYMBOL_GPL(trace_output_call);
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
@@ -675,7 +675,7 @@ static int trace_search_list(struct list_head **list)
}
/* Did we used up all 65 thousand events??? */
- if ((last + 1) > FTRACE_MAX_EVENT)
+ if ((last + 1) > TRACE_EVENT_TYPE_MAX)
return 0;
*list = &e->list;
@@ -693,7 +693,7 @@ void trace_event_read_unlock(void)
}
/**
- * register_ftrace_event - register output for an event type
+ * register_trace_event - register output for an event type
* @event: the event type to register
*
* Event types are stored in a hash and this hash is used to
@@ -707,7 +707,7 @@ void trace_event_read_unlock(void)
*
* Returns the event type number or zero on error.
*/
-int register_ftrace_event(struct trace_event *event)
+int register_trace_event(struct trace_event *event)
{
unsigned key;
int ret = 0;
@@ -725,7 +725,7 @@ int register_ftrace_event(struct trace_event *event)
if (!event->type) {
struct list_head *list = NULL;
- if (next_event_type > FTRACE_MAX_EVENT) {
+ if (next_event_type > TRACE_EVENT_TYPE_MAX) {
event->type = trace_search_list(&list);
if (!event->type)
@@ -771,12 +771,12 @@ int register_ftrace_event(struct trace_event *event)
return ret;
}
-EXPORT_SYMBOL_GPL(register_ftrace_event);
+EXPORT_SYMBOL_GPL(register_trace_event);
/*
* Used by module code with the trace_event_sem held for write.
*/
-int __unregister_ftrace_event(struct trace_event *event)
+int __unregister_trace_event(struct trace_event *event)
{
hlist_del(&event->node);
list_del(&event->list);
@@ -784,18 +784,18 @@ int __unregister_ftrace_event(struct trace_event *event)
}
/**
- * unregister_ftrace_event - remove a no longer used event
+ * unregister_trace_event - remove a no longer used event
* @event: the event to remove
*/
-int unregister_ftrace_event(struct trace_event *event)
+int unregister_trace_event(struct trace_event *event)
{
down_write(&trace_event_sem);
- __unregister_ftrace_event(event);
+ __unregister_trace_event(event);
up_write(&trace_event_sem);
return 0;
}
-EXPORT_SYMBOL_GPL(unregister_ftrace_event);
+EXPORT_SYMBOL_GPL(unregister_trace_event);
/*
* Standard events
@@ -1243,7 +1243,7 @@ __init static int init_events(void)
for (i = 0; events[i]; i++) {
event = events[i];
- ret = register_ftrace_event(event);
+ ret = register_trace_event(event);
if (!ret) {
printk(KERN_WARNING "event %d failed to register\n",
event->type);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 8ef2c40ef..4cbfe85b9 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -32,7 +32,7 @@ extern int
trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
/* used by module unregistering */
-extern int __unregister_ftrace_event(struct trace_event *event);
+extern int __unregister_trace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
#define SEQ_PUT_FIELD(s, x) \
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index ab283e146..b98dee914 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -272,8 +272,8 @@ struct probe_arg {
struct trace_probe {
unsigned int flags; /* For TP_FLAG_* */
- struct ftrace_event_class class;
- struct ftrace_event_call call;
+ struct trace_event_class class;
+ struct trace_event_call call;
struct list_head files;
ssize_t size; /* trace entry size */
unsigned int nr_args;
@@ -281,7 +281,7 @@ struct trace_probe {
};
struct event_file_link {
- struct ftrace_event_file *file;
+ struct trace_event_file *file;
struct list_head list;
};
@@ -314,7 +314,7 @@ static inline int is_good_name(const char *name)
}
static inline struct event_file_link *
-find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
+find_event_file_link(struct trace_probe *tp, struct trace_event_file *file)
{
struct event_file_link *link;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index d6e100372..9b33dd117 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -369,7 +369,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *next,
unsigned long flags, int pc)
{
- struct ftrace_event_call *call = &event_context_switch;
+ struct trace_event_call *call = &event_context_switch;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -397,7 +397,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *curr,
unsigned long flags, int pc)
{
- struct ftrace_event_call *call = &event_wakeup;
+ struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 287cf721c..b0f86ea77 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1039,15 +1039,10 @@ static int trace_wakeup_test_thread(void *data)
{
/* Make this a -deadline thread */
static const struct sched_attr attr = {
-#ifdef CONFIG_SCHED_BFS
- /* No deadline on BFS, use RR */
- .sched_policy = SCHED_RR,
-#else
.sched_policy = SCHED_DEADLINE,
.sched_runtime = 100000ULL,
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
-#endif
};
struct wakeup_test_data *x = data;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f97f6e3a6..7d567a4b9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -13,13 +13,13 @@
static DEFINE_MUTEX(syscall_trace_lock);
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
enum trace_reg type, void *data);
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
enum trace_reg type, void *data);
static struct list_head *
-syscall_get_enter_fields(struct ftrace_event_call *call)
+syscall_get_enter_fields(struct trace_event_call *call)
{
struct syscall_metadata *entry = call->data;
@@ -219,7 +219,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
return pos;
}
-static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
+static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
char *print_fmt;
int len;
@@ -244,7 +244,7 @@ static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
return 0;
}
-static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
+static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
struct syscall_metadata *entry = call->data;
@@ -252,7 +252,7 @@ static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
-static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
+static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -275,7 +275,7 @@ static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
-static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
+static int __init syscall_exit_define_fields(struct trace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -293,7 +293,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
struct trace_array *tr = data;
- struct ftrace_event_file *ftrace_file;
+ struct trace_event_file *trace_file;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -308,11 +308,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
- ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
- if (!ftrace_file)
+ trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+ if (!trace_file)
return;
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -334,14 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
- event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ event_trigger_unlock_commit(trace_file, buffer, event, entry,
irq_flags, pc);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
struct trace_array *tr = data;
- struct ftrace_event_file *ftrace_file;
+ struct trace_event_file *trace_file;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -355,11 +355,11 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
- ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
- if (!ftrace_file)
+ trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+ if (!trace_file)
return;
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -380,12 +380,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
entry->nr = syscall_nr;
entry->ret = syscall_get_return_value(current, regs);
- event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ event_trigger_unlock_commit(trace_file, buffer, event, entry,
irq_flags, pc);
}
-static int reg_event_syscall_enter(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int ret = 0;
@@ -405,8 +405,8 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
return ret;
}
-static void unreg_event_syscall_enter(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int num;
@@ -422,8 +422,8 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
mutex_unlock(&syscall_trace_lock);
}
-static int reg_event_syscall_exit(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int ret = 0;
@@ -443,8 +443,8 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
return ret;
}
-static void unreg_event_syscall_exit(struct ftrace_event_file *file,
- struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct trace_event_file *file,
+ struct trace_event_call *call)
{
struct trace_array *tr = file->tr;
int num;
@@ -460,7 +460,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
mutex_unlock(&syscall_trace_lock);
}
-static int __init init_syscall_trace(struct ftrace_event_call *call)
+static int __init init_syscall_trace(struct trace_event_call *call)
{
int id;
int num;
@@ -493,7 +493,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
.trace = print_syscall_exit,
};
-struct ftrace_event_class __refdata event_class_syscall_enter = {
+struct trace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
.define_fields = syscall_enter_define_fields,
@@ -501,7 +501,7 @@ struct ftrace_event_class __refdata event_class_syscall_enter = {
.raw_init = init_syscall_trace,
};
-struct ftrace_event_class __refdata event_class_syscall_exit = {
+struct trace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
.define_fields = syscall_exit_define_fields,
@@ -584,7 +584,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
-static int perf_sysenter_enable(struct ftrace_event_call *call)
+static int perf_sysenter_enable(struct trace_event_call *call)
{
int ret = 0;
int num;
@@ -605,7 +605,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call)
return ret;
}
-static void perf_sysenter_disable(struct ftrace_event_call *call)
+static void perf_sysenter_disable(struct trace_event_call *call)
{
int num;
@@ -656,7 +656,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
-static int perf_sysexit_enable(struct ftrace_event_call *call)
+static int perf_sysexit_enable(struct trace_event_call *call)
{
int ret = 0;
int num;
@@ -677,7 +677,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call)
return ret;
}
-static void perf_sysexit_disable(struct ftrace_event_call *call)
+static void perf_sysexit_disable(struct trace_event_call *call)
{
int num;
@@ -693,10 +693,10 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
#endif /* CONFIG_PERF_EVENTS */
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -721,10 +721,10 @@ static int syscall_enter_register(struct ftrace_event_call *event,
return 0;
}
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
enum trace_reg type, void *data)
{
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 6dd022c7b..aa1ea7b36 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -293,7 +293,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
struct trace_uprobe *tu;
list_for_each_entry(tu, &uprobe_list, list)
- if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
+ if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
strcmp(tu->tp.call.class->system, group) == 0)
return tu;
@@ -323,7 +323,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
mutex_lock(&uprobe_lock);
/* register as an event */
- old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+ old_tu = find_probe_event(trace_event_name(&tu->tp.call),
tu->tp.call.class->system);
if (old_tu) {
/* delete old event */
@@ -600,7 +600,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
int i;
seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
- ftrace_event_name(&tu->tp.call));
+ trace_event_name(&tu->tp.call));
seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
for (i = 0; i < tu->tp.nr_args; i++)
@@ -651,7 +651,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
struct trace_uprobe *tu = v;
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
- ftrace_event_name(&tu->tp.call), tu->nhit);
+ trace_event_name(&tu->tp.call), tu->nhit);
return 0;
}
@@ -770,26 +770,26 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize,
- struct ftrace_event_file *ftrace_file)
+ struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
void *data;
int size, esize;
- struct ftrace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = &tu->tp.call;
- WARN_ON(call != ftrace_file->event_call);
+ WARN_ON(call != trace_file->event_call);
if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
return;
- if (ftrace_trigger_soft_disabled(ftrace_file))
+ if (trace_trigger_soft_disabled(trace_file))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
- event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ event = trace_event_buffer_lock_reserve(&buffer, trace_file,
call->event.type, size, 0, 0);
if (!event)
return;
@@ -806,7 +806,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
memcpy(data, ucb->buf, tu->tp.size + dsize);
- event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
+ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
}
/* uprobe handler */
@@ -853,12 +853,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
if (is_ret_probe(tu)) {
trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
- ftrace_event_name(&tu->tp.call),
+ trace_event_name(&tu->tp.call),
entry->vaddr[1], entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
trace_seq_printf(s, "%s: (0x%lx)",
- ftrace_event_name(&tu->tp.call),
+ trace_event_name(&tu->tp.call),
entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, false);
}
@@ -881,7 +881,7 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
struct mm_struct *mm);
static int
-probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
+probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
filter_func_t filter)
{
bool enabled = trace_probe_is_enabled(&tu->tp);
@@ -938,7 +938,7 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
}
static void
-probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
+probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
{
if (!trace_probe_is_enabled(&tu->tp))
return;
@@ -967,7 +967,7 @@ probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
uprobe_buffer_disable();
}
-static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int uprobe_event_define_fields(struct trace_event_call *event_call)
{
int ret, i, size;
struct uprobe_trace_entry_head field;
@@ -1093,7 +1093,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
struct uprobe_cpu_buffer *ucb, int dsize)
{
- struct ftrace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
struct hlist_head *head;
void *data;
@@ -1159,11 +1159,11 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
#endif /* CONFIG_PERF_EVENTS */
static int
-trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
+trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
void *data)
{
struct trace_uprobe *tu = event->data;
- struct ftrace_event_file *file = data;
+ struct trace_event_file *file = data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -1272,10 +1272,10 @@ static struct trace_event_functions uprobe_funcs = {
static int register_uprobe_event(struct trace_uprobe *tu)
{
- struct ftrace_event_call *call = &tu->tp.call;
+ struct trace_event_call *call = &tu->tp.call;
int ret;
- /* Initialize ftrace_event_call */
+ /* Initialize trace_event_call */
INIT_LIST_HEAD(&call->class->fields);
call->event.funcs = &uprobe_funcs;
call->class->define_fields = uprobe_event_define_fields;
@@ -1283,7 +1283,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
return -ENOMEM;
- ret = register_ftrace_event(&call->event);
+ ret = register_trace_event(&call->event);
if (!ret) {
kfree(call->print_fmt);
return -ENODEV;
@@ -1295,9 +1295,9 @@ static int register_uprobe_event(struct trace_uprobe *tu)
if (ret) {
pr_info("Failed to register uprobe event: %s\n",
- ftrace_event_name(call));
+ trace_event_name(call));
kfree(call->print_fmt);
- unregister_ftrace_event(&call->event);
+ unregister_trace_event(&call->event);
}
return ret;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 581a68a04..a6ffa43f2 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
+#include <linux/tick.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#endif
+static struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
static int __read_mostly watchdog_running;
static u64 __read_mostly sample_period;
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void)
* do we care if a 0 races with a timestamp?
* all it means is the softlock check starts one cycle later
*/
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
}
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void)
goto unlock;
get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
watchdog_nmi_enable(cpu);
put_online_cpus();
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void)
goto unlock;
get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
watchdog_nmi_disable(cpu);
put_online_cpus();
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void)
int cpu;
get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
update_watchdog(cpu);
put_online_cpus();
}
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void)
err = smpboot_register_percpu_thread(&watchdog_threads);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
- else
+ else {
+ if (smpboot_update_cpumask_percpu_thread(
+ &watchdog_threads, &watchdog_cpumask))
+ pr_err("Failed to set cpumask for watchdog threads\n");
watchdog_running = 1;
+ }
} else {
/*
* Enable/disable the lockup detectors or
@@ -879,12 +890,58 @@ out:
mutex_unlock(&watchdog_proc_mutex);
return err;
}
+
+/*
+ * The cpumask is the mask of possible cpus that the watchdog can run
+ * on, not the mask of cpus it is actually running on. This allows the
+ * user to specify a mask that will include cpus that have not yet
+ * been brought online, if desired.
+ */
+int proc_watchdog_cpumask(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ mutex_lock(&watchdog_proc_mutex);
+ err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+ if (!err && write) {
+ /* Remove impossible cpus to keep sysctl output cleaner. */
+ cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
+ cpu_possible_mask);
+
+ if (watchdog_running) {
+ /*
+ * Failure would be due to being unable to allocate
+ * a temporary cpumask, so we are likely not in a
+ * position to do much else to make things better.
+ */
+ if (smpboot_update_cpumask_percpu_thread(
+ &watchdog_threads, &watchdog_cpumask) != 0)
+ pr_err("cpumask update failed\n");
+ }
+ }
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
+
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
set_sample_period();
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_enabled()) {
+ if (!cpumask_empty(tick_nohz_full_mask))
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
+ tick_nohz_full_mask);
+ } else
+ cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#else
+ cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#endif
+
if (watchdog_enabled)
watchdog_enable_all_cpus();
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 586ad9130..a413acb59 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -127,6 +127,11 @@ enum {
*
* PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
*
+ * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
+ *
+ * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
+ * sched-RCU for reads.
+ *
* WQ: wq->mutex protected.
*
* WR: wq->mutex protected for writes. Sched-RCU protected for reads.
@@ -247,8 +252,8 @@ struct workqueue_struct {
int nr_drainers; /* WQ: drain in progress */
int saved_max_active; /* WQ: saved pwq max_active */
- struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
- struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
+ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
+ struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
@@ -268,7 +273,7 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
- struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
+ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
static struct kmem_cache *pwq_cache;
@@ -280,12 +285,7 @@ static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
/* see the comment above the definition of WQ_POWER_EFFICIENT */
-#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
-static bool wq_power_efficient = true;
-#else
-static bool wq_power_efficient;
-#endif
-
+static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
@@ -299,6 +299,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
cpu_worker_pools);
@@ -330,8 +332,6 @@ struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
-static void copy_workqueue_attrs(struct workqueue_attrs *to,
- const struct workqueue_attrs *from);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#define CREATE_TRACE_POINTS
@@ -347,6 +347,12 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
lockdep_is_held(&wq->mutex), \
"sched RCU or wq->mutex should be held")
+#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&wq->mutex) || \
+ lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU, wq->mutex or wq_pool_mutex should be held")
+
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
(pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -551,7 +557,8 @@ static int worker_pool_assign_id(struct worker_pool *pool)
* @wq: the target workqueue
* @node: the node ID
*
- * This must be called either with pwq_lock held or sched RCU read locked.
+ * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
+ * read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
*
@@ -560,7 +567,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
int node)
{
- assert_rcu_or_wq_mutex(wq);
+ assert_rcu_or_wq_mutex_or_pool_mutex(wq);
return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
@@ -976,7 +983,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
* @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
+ * @nextp: out parameter for nested worklist walking
*
* Schedule linked works starting from @work to @head. Work series to
* be scheduled starts at @work and includes any consecutive work with
@@ -2607,7 +2614,7 @@ void flush_workqueue(struct workqueue_struct *wq)
out_unlock:
mutex_unlock(&wq->mutex);
}
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
/**
* drain_workqueue - drain a workqueue
@@ -2616,7 +2623,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
* Wait until the workqueue becomes empty. While draining is in progress,
* only chain queueing is allowed. IOW, only currently pending or running
* work items on @wq can queue further work items on it. @wq is flushed
- * repeatedly until it becomes empty. The number of flushing is detemined
+ * repeatedly until it becomes empty. The number of flushing is determined
* by the depth of chaining and should be relatively short. Whine if it
* takes too long.
*/
@@ -2947,36 +2954,6 @@ int schedule_on_each_cpu(work_func_t func)
}
/**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * Think twice before calling this function! It's very easy to get into
- * trouble if you don't take great care. Either of the following situations
- * will lead to deadlock:
- *
- * One of the work items currently on the workqueue needs to acquire
- * a lock held by your code or its caller.
- *
- * Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often. It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- */
-void flush_scheduled_work(void)
-{
- flush_workqueue(system_wq);
-}
-EXPORT_SYMBOL(flush_scheduled_work);
-
-/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
* @ew: guaranteed storage for the execute work structure (must
@@ -3081,7 +3058,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
* init_worker_pool - initialize a newly zalloc'd worker_pool
* @pool: worker_pool to initialize
*
- * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs.
+ * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
*
* Return: 0 on success, -errno on failure. Even on failure, all fields
* inside @pool proper are initialized and put_unbound_pool() can be called
@@ -3425,20 +3402,9 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
return pwq;
}
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
- lockdep_assert_held(&wq_pool_mutex);
-
- if (pwq) {
- put_unbound_pool(pwq->pool);
- kmem_cache_free(pwq_cache, pwq);
- }
-}
-
/**
- * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
- * @attrs: the wq_attrs of interest
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of the default pwq of the target workqueue
* @node: the target NUMA node
* @cpu_going_down: if >= 0, the CPU to consider as offline
* @cpumask: outarg, the resulting cpumask
@@ -3488,6 +3454,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
{
struct pool_workqueue *old_pwq;
+ lockdep_assert_held(&wq_pool_mutex);
lockdep_assert_held(&wq->mutex);
/* link_pwq() can handle duplicate calls */
@@ -3498,46 +3465,59 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
return old_pwq;
}
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on. Older pwqs are released as in-flight work
- * items finish. Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
- const struct workqueue_attrs *attrs)
+/* context to store the prepared attrs & pwqs before applying */
+struct apply_wqattrs_ctx {
+ struct workqueue_struct *wq; /* target workqueue */
+ struct workqueue_attrs *attrs; /* attrs to apply */
+ struct list_head list; /* queued for batching commit */
+ struct pool_workqueue *dfl_pwq;
+ struct pool_workqueue *pwq_tbl[];
+};
+
+/* free the resources after success or abort */
+static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
+{
+ if (ctx) {
+ int node;
+
+ for_each_node(node)
+ put_pwq_unlocked(ctx->pwq_tbl[node]);
+ put_pwq_unlocked(ctx->dfl_pwq);
+
+ free_workqueue_attrs(ctx->attrs);
+
+ kfree(ctx);
+ }
+}
+
+/* allocate the attrs and pwqs for later installation */
+static struct apply_wqattrs_ctx *
+apply_wqattrs_prepare(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
{
+ struct apply_wqattrs_ctx *ctx;
struct workqueue_attrs *new_attrs, *tmp_attrs;
- struct pool_workqueue **pwq_tbl, *dfl_pwq;
- int node, ret;
+ int node;
- /* only unbound workqueues can change attributes */
- if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
- return -EINVAL;
+ lockdep_assert_held(&wq_pool_mutex);
- /* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+ GFP_KERNEL);
- pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
- if (!pwq_tbl || !new_attrs || !tmp_attrs)
- goto enomem;
+ if (!ctx || !new_attrs || !tmp_attrs)
+ goto out_free;
- /* make a copy of @attrs and sanitize it */
+ /*
+ * Calculate the attrs of the default pwq.
+ * If the user configured cpumask doesn't overlap with the
+ * wq_unbound_cpumask, we fall back to the wq_unbound_cpumask.
+ */
copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+ if (unlikely(cpumask_empty(new_attrs->cpumask)))
+ cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
@@ -3547,75 +3527,129 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
copy_workqueue_attrs(tmp_attrs, new_attrs);
/*
- * CPUs should stay stable across pwq creations and installations.
- * Pin CPUs, determine the target cpumask for each node and create
- * pwqs accordingly.
- */
- get_online_cpus();
-
- mutex_lock(&wq_pool_mutex);
-
- /*
* If something goes wrong during CPU up/down, we'll fall back to
* the default pwq covering whole @attrs->cpumask. Always create
* it even if we don't use it immediately.
*/
- dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
- if (!dfl_pwq)
- goto enomem_pwq;
+ ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+ if (!ctx->dfl_pwq)
+ goto out_free;
for_each_node(node) {
- if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
- pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
- if (!pwq_tbl[node])
- goto enomem_pwq;
+ if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
+ ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+ if (!ctx->pwq_tbl[node])
+ goto out_free;
} else {
- dfl_pwq->refcnt++;
- pwq_tbl[node] = dfl_pwq;
+ ctx->dfl_pwq->refcnt++;
+ ctx->pwq_tbl[node] = ctx->dfl_pwq;
}
}
- mutex_unlock(&wq_pool_mutex);
+ /* save the user configured attrs and sanitize it. */
+ copy_workqueue_attrs(new_attrs, attrs);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+ ctx->attrs = new_attrs;
+
+ ctx->wq = wq;
+ free_workqueue_attrs(tmp_attrs);
+ return ctx;
+
+out_free:
+ free_workqueue_attrs(tmp_attrs);
+ free_workqueue_attrs(new_attrs);
+ apply_wqattrs_cleanup(ctx);
+ return NULL;
+}
+
+/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
+static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
+{
+ int node;
/* all pwqs have been created successfully, let's install'em */
- mutex_lock(&wq->mutex);
+ mutex_lock(&ctx->wq->mutex);
- copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+ copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
/* save the previous pwq and install the new one */
for_each_node(node)
- pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+ ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+ ctx->pwq_tbl[node]);
/* @dfl_pwq might not have been used, ensure it's linked */
- link_pwq(dfl_pwq);
- swap(wq->dfl_pwq, dfl_pwq);
+ link_pwq(ctx->dfl_pwq);
+ swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
- mutex_unlock(&wq->mutex);
+ mutex_unlock(&ctx->wq->mutex);
+}
- /* put the old pwqs */
- for_each_node(node)
- put_pwq_unlocked(pwq_tbl[node]);
- put_pwq_unlocked(dfl_pwq);
+static void apply_wqattrs_lock(void)
+{
+ /* CPUs should stay stable across pwq creations and installations */
+ get_online_cpus();
+ mutex_lock(&wq_pool_mutex);
+}
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
put_online_cpus();
- ret = 0;
- /* fall through */
-out_free:
- free_workqueue_attrs(tmp_attrs);
- free_workqueue_attrs(new_attrs);
- kfree(pwq_tbl);
+}
+
+static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct apply_wqattrs_ctx *ctx;
+ int ret = -ENOMEM;
+
+ /* only unbound workqueues can change attributes */
+ if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+ return -EINVAL;
+
+ /* creating multiple pwqs breaks ordering guarantee */
+ if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+ return -EINVAL;
+
+ ctx = apply_wqattrs_prepare(wq, attrs);
+
+ /* the ctx has been prepared successfully, let's commit it */
+ if (ctx) {
+ apply_wqattrs_commit(ctx);
+ ret = 0;
+ }
+
+ apply_wqattrs_cleanup(ctx);
+
return ret;
+}
-enomem_pwq:
- free_unbound_pwq(dfl_pwq);
- for_each_node(node)
- if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
- free_unbound_pwq(pwq_tbl[node]);
- mutex_unlock(&wq_pool_mutex);
- put_online_cpus();
-enomem:
- ret = -ENOMEM;
- goto out_free;
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possible CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on. Older pwqs are released as in-flight work
+ * items finish. Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ int ret;
+
+ apply_wqattrs_lock();
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+ apply_wqattrs_unlock();
+
+ return ret;
}
/**
@@ -3651,7 +3685,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
lockdep_assert_held(&wq_pool_mutex);
- if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+ if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
+ wq->unbound_attrs->no_numa)
return;
/*
@@ -3662,48 +3697,37 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
target_attrs = wq_update_unbound_numa_attrs_buf;
cpumask = target_attrs->cpumask;
- mutex_lock(&wq->mutex);
- if (wq->unbound_attrs->no_numa)
- goto out_unlock;
-
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
pwq = unbound_pwq_by_node(wq, node);
/*
* Let's determine what needs to be done. If the target cpumask is
- * different from wq's, we need to compare it to @pwq's and create
- * a new one if they don't match. If the target cpumask equals
- * wq's, the default pwq should be used.
+ * different from the default pwq's, we need to compare it to @pwq's
+ * and create a new one if they don't match. If the target cpumask
+ * equals the default pwq's, the default pwq should be used.
*/
- if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+ if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
- goto out_unlock;
+ return;
} else {
goto use_dfl_pwq;
}
- mutex_unlock(&wq->mutex);
-
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
- mutex_lock(&wq->mutex);
goto use_dfl_pwq;
}
- /*
- * Install the new pwq. As this function is called only from CPU
- * hotplug callbacks and applying a new attrs is wrapped with
- * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
- * inbetween.
- */
+ /* Install the new pwq. */
mutex_lock(&wq->mutex);
old_pwq = numa_pwq_tbl_install(wq, node, pwq);
goto out_unlock;
use_dfl_pwq:
+ mutex_lock(&wq->mutex);
spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
spin_unlock_irq(&wq->dfl_pwq->pool->lock);
@@ -4385,7 +4409,7 @@ static void rebind_workers(struct worker_pool *pool)
/*
* Restore CPU affinity of all workers. As all idle workers should
* be on the run-queue of the associated CPU before any local
- * wake-ups for concurrency management happen, restore CPU affinty
+ * wake-ups for concurrency management happen, restore CPU affinity
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
@@ -4698,6 +4722,82 @@ out_unlock:
}
#endif /* CONFIG_FREEZER */
+static int workqueue_apply_unbound_cpumask(void)
+{
+ LIST_HEAD(ctxs);
+ int ret = 0;
+ struct workqueue_struct *wq;
+ struct apply_wqattrs_ctx *ctx, *n;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ list_for_each_entry(wq, &workqueues, list) {
+ if (!(wq->flags & WQ_UNBOUND))
+ continue;
+ /* creating multiple pwqs breaks ordering guarantee */
+ if (wq->flags & __WQ_ORDERED)
+ continue;
+
+ ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+ if (!ctx) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ list_add_tail(&ctx->list, &ctxs);
+ }
+
+ list_for_each_entry_safe(ctx, n, &ctxs, list) {
+ if (!ret)
+ apply_wqattrs_commit(ctx);
+ apply_wqattrs_cleanup(ctx);
+ }
+
+ return ret;
+}
+
+/**
+ * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ * @cpumask: the cpumask to set
+ *
+ * The low-level workqueues cpumask is a global cpumask that limits
+ * the affinity of all unbound workqueues. This function checks the @cpumask,
+ * applies it to all unbound workqueues and updates all of their pwqs.
+ *
+ * Return: 0 - Success
+ * -EINVAL - Invalid @cpumask
+ * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+ int ret = -EINVAL;
+ cpumask_var_t saved_cpumask;
+
+ if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_and(cpumask, cpumask, cpu_possible_mask);
+ if (!cpumask_empty(cpumask)) {
+ apply_wqattrs_lock();
+
+ /* save the old wq_unbound_cpumask. */
+ cpumask_copy(saved_cpumask, wq_unbound_cpumask);
+
+ /* update wq_unbound_cpumask first, then apply it to wqs. */
+ cpumask_copy(wq_unbound_cpumask, cpumask);
+ ret = workqueue_apply_unbound_cpumask();
+
+ /* restore the wq_unbound_cpumask on failure. */
+ if (ret < 0)
+ cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+
+ apply_wqattrs_unlock();
+ }
+
+ free_cpumask_var(saved_cpumask);
+ return ret;
+}
+
#ifdef CONFIG_SYSFS
/*
* Workqueues with WQ_SYSFS flag set is visible to userland via
@@ -4802,13 +4902,13 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
struct workqueue_attrs *attrs;
+ lockdep_assert_held(&wq_pool_mutex);
+
attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!attrs)
return NULL;
- mutex_lock(&wq->mutex);
copy_workqueue_attrs(attrs, wq->unbound_attrs);
- mutex_unlock(&wq->mutex);
return attrs;
}
@@ -4817,18 +4917,22 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
- int ret;
+ int ret = -ENOMEM;
+
+ apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
- return -ENOMEM;
+ goto out_unlock;
if (sscanf(buf, "%d", &attrs->nice) == 1 &&
attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
- ret = apply_workqueue_attrs(wq, attrs);
+ ret = apply_workqueue_attrs_locked(wq, attrs);
else
ret = -EINVAL;
+out_unlock:
+ apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
@@ -4852,16 +4956,20 @@ static ssize_t wq_cpumask_store(struct device *dev,
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
- int ret;
+ int ret = -ENOMEM;
+
+ apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
- return -ENOMEM;
+ goto out_unlock;
ret = cpumask_parse(buf, attrs->cpumask);
if (!ret)
- ret = apply_workqueue_attrs(wq, attrs);
+ ret = apply_workqueue_attrs_locked(wq, attrs);
+out_unlock:
+ apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
@@ -4885,18 +4993,22 @@ static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
- int v, ret;
+ int v, ret = -ENOMEM;
+
+ apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
- return -ENOMEM;
+ goto out_unlock;
ret = -EINVAL;
if (sscanf(buf, "%d", &v) == 1) {
attrs->no_numa = !v;
- ret = apply_workqueue_attrs(wq, attrs);
+ ret = apply_workqueue_attrs_locked(wq, attrs);
}
+out_unlock:
+ apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
@@ -4914,9 +5026,49 @@ static struct bus_type wq_subsys = {
.dev_groups = wq_sysfs_groups,
};
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int written;
+
+ mutex_lock(&wq_pool_mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+ cpumask_pr_args(wq_unbound_cpumask));
+ mutex_unlock(&wq_pool_mutex);
+
+ return written;
+}
+
+static ssize_t wq_unbound_cpumask_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ cpumask_var_t cpumask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, cpumask);
+ if (!ret)
+ ret = workqueue_set_unbound_cpumask(cpumask);
+
+ free_cpumask_var(cpumask);
+ return ret ? ret : count;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+ __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
+ wq_unbound_cpumask_store);
+
static int __init wq_sysfs_init(void)
{
- return subsys_virtual_register(&wq_subsys, NULL);
+ int err;
+
+ err = subsys_virtual_register(&wq_subsys, NULL);
+ if (err)
+ return err;
+
+ return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
}
core_initcall(wq_sysfs_init);
@@ -4948,7 +5100,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
int ret;
/*
- * Adjusting max_active or creating new pwqs by applyting
+ * Adjusting max_active or creating new pwqs by applying
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
@@ -5064,6 +5216,9 @@ static int __init init_workqueues(void)
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+ BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+ cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);