summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /kernel
Initial import
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore8
-rw-r--r--kernel/Kconfig.freezer2
-rw-r--r--kernel/Kconfig.hz58
-rw-r--r--kernel/Kconfig.locks243
-rw-r--r--kernel/Kconfig.preempt58
-rw-r--r--kernel/Makefile209
-rw-r--r--kernel/acct.c602
-rw-r--r--kernel/async.c328
-rw-r--r--kernel/audit.c2036
-rw-r--r--kernel/audit.h328
-rw-r--r--kernel/audit_tree.c988
-rw-r--r--kernel/audit_watch.c519
-rw-r--r--kernel/auditfilter.c1416
-rw-r--r--kernel/auditsc.c2426
-rw-r--r--kernel/backtracetest.c91
-rw-r--r--kernel/bounds.c25
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c156
-rw-r--r--kernel/bpf/core.c674
-rw-r--r--kernel/bpf/hashtab.c367
-rw-r--r--kernel/bpf/helpers.c113
-rw-r--r--kernel/bpf/syscall.c621
-rw-r--r--kernel/bpf/verifier.c2146
-rw-r--r--kernel/capability.c449
-rw-r--r--kernel/cgroup.c5601
-rw-r--r--kernel/cgroup_freezer.c484
-rw-r--r--kernel/compat.c1176
-rw-r--r--kernel/configs.c99
-rw-r--r--kernel/configs/tiny.config4
-rw-r--r--kernel/context_tracking.c195
-rw-r--r--kernel/cpu.c825
-rw-r--r--kernel/cpu_pm.c233
-rw-r--r--kernel/cpuset.c2700
-rw-r--r--kernel/crash_dump.c46
-rw-r--r--kernel/cred.c810
-rw-r--r--kernel/debug/Makefile6
-rw-r--r--kernel/debug/debug_core.c1088
-rw-r--r--kernel/debug/debug_core.h85
-rw-r--r--kernel/debug/gdbstub.c1145
-rw-r--r--kernel/debug/kdb/.gitignore1
-rw-r--r--kernel/debug/kdb/Makefile25
-rw-r--r--kernel/debug/kdb/kdb_bp.c560
-rw-r--r--kernel/debug/kdb/kdb_bt.c210
-rw-r--r--kernel/debug/kdb/kdb_cmds31
-rw-r--r--kernel/debug/kdb/kdb_debugger.c186
-rw-r--r--kernel/debug/kdb/kdb_io.c874
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c263
-rw-r--r--kernel/debug/kdb/kdb_main.c2954
-rw-r--r--kernel/debug/kdb/kdb_private.h261
-rw-r--r--kernel/debug/kdb/kdb_support.c927
-rw-r--r--kernel/delayacct.c158
-rw-r--r--kernel/dma.c160
-rw-r--r--kernel/elfcore.c24
-rw-r--r--kernel/events/Makefile9
-rw-r--r--kernel/events/callchain.c209
-rw-r--r--kernel/events/core.c9083
-rw-r--r--kernel/events/hw_breakpoint.c655
-rw-r--r--kernel/events/internal.h231
-rw-r--r--kernel/events/ring_buffer.c751
-rw-r--r--kernel/events/uprobes.c1993
-rw-r--r--kernel/exec_domain.c58
-rw-r--r--kernel/exit.c1632
-rw-r--r--kernel/extable.c144
-rw-r--r--kernel/fork.c2075
-rw-r--r--kernel/freezer.c179
-rw-r--r--kernel/futex.c3053
-rw-r--r--kernel/futex_compat.c201
-rw-r--r--kernel/gcov/Kconfig82
-rw-r--r--kernel/gcov/Makefile7
-rw-r--r--kernel/gcov/base.c161
-rw-r--r--kernel/gcov/fs.c791
-rw-r--r--kernel/gcov/gcc_3_4.c562
-rw-r--r--kernel/gcov/gcc_4_7.c565
-rw-r--r--kernel/gcov/gcov.h85
-rw-r--r--kernel/groups.c277
-rw-r--r--kernel/hung_task.c251
-rw-r--r--kernel/irq/Kconfig103
-rw-r--r--kernel/irq/Makefile9
-rw-r--r--kernel/irq/autoprobe.c185
-rw-r--r--kernel/irq/chip.c993
-rw-r--r--kernel/irq/debug.h45
-rw-r--r--kernel/irq/devres.c139
-rw-r--r--kernel/irq/dummychip.c62
-rw-r--r--kernel/irq/generic-chip.c608
-rw-r--r--kernel/irq/handle.c197
-rw-r--r--kernel/irq/internals.h212
-rw-r--r--kernel/irq/irqdesc.c648
-rw-r--r--kernel/irq/irqdomain.c1238
-rw-r--r--kernel/irq/manage.c1891
-rw-r--r--kernel/irq/migration.c72
-rw-r--r--kernel/irq/msi.c337
-rw-r--r--kernel/irq/pm.c206
-rw-r--r--kernel/irq/proc.c501
-rw-r--r--kernel/irq/resend.c91
-rw-r--r--kernel/irq/settings.h156
-rw-r--r--kernel/irq/spurious.c467
-rw-r--r--kernel/irq_work.c196
-rw-r--r--kernel/jump_label.c460
-rw-r--r--kernel/kallsyms.c608
-rw-r--r--kernel/kcmp.c198
-rw-r--r--kernel/kexec.c2769
-rw-r--r--kernel/kmod.c694
-rw-r--r--kernel/kprobes.c2472
-rw-r--r--kernel/ksysfs.c243
-rw-r--r--kernel/kthread.c692
-rw-r--r--kernel/latencytop.c292
-rw-r--r--kernel/livepatch/Kconfig18
-rw-r--r--kernel/livepatch/Makefile3
-rw-r--r--kernel/livepatch/core.c1011
-rw-r--r--kernel/locking/Makefile29
-rw-r--r--kernel/locking/lglock.c89
-rw-r--r--kernel/locking/lockdep.c4304
-rw-r--r--kernel/locking/lockdep_internals.h170
-rw-r--r--kernel/locking/lockdep_proc.c695
-rw-r--r--kernel/locking/lockdep_states.h9
-rw-r--r--kernel/locking/locktorture.c815
-rw-r--r--kernel/locking/mcs_spinlock.h111
-rw-r--r--kernel/locking/mutex-debug.c120
-rw-r--r--kernel/locking/mutex-debug.h55
-rw-r--r--kernel/locking/mutex.c972
-rw-r--r--kernel/locking/mutex.h48
-rw-r--r--kernel/locking/osq_lock.c203
-rw-r--r--kernel/locking/percpu-rwsem.c165
-rw-r--r--kernel/locking/qrwlock.c132
-rw-r--r--kernel/locking/rtmutex-debug.c184
-rw-r--r--kernel/locking/rtmutex-debug.h39
-rw-r--r--kernel/locking/rtmutex-tester.c420
-rw-r--r--kernel/locking/rtmutex.c1648
-rw-r--r--kernel/locking/rtmutex.h36
-rw-r--r--kernel/locking/rtmutex_common.h141
-rw-r--r--kernel/locking/rwsem-spinlock.c303
-rw-r--r--kernel/locking/rwsem-xadd.c531
-rw-r--r--kernel/locking/rwsem.c166
-rw-r--r--kernel/locking/rwsem.h20
-rw-r--r--kernel/locking/semaphore.c263
-rw-r--r--kernel/locking/spinlock.c407
-rw-r--r--kernel/locking/spinlock_debug.c302
-rw-r--r--kernel/module-internal.h12
-rw-r--r--kernel/module.c3917
-rw-r--r--kernel/module_signing.c250
-rw-r--r--kernel/notifier.c562
-rw-r--r--kernel/nsproxy.c259
-rw-r--r--kernel/padata.c1105
-rw-r--r--kernel/panic.c519
-rw-r--r--kernel/params.c954
-rw-r--r--kernel/pid.c609
-rw-r--r--kernel/pid_namespace.c409
-rw-r--r--kernel/power/Kconfig581
-rw-r--r--kernel/power/Makefile47
-rw-r--r--kernel/power/autosleep.c128
-rw-r--r--kernel/power/block_io.c103
-rw-r--r--kernel/power/console.c151
-rw-r--r--kernel/power/hibernate.c1178
-rw-r--r--kernel/power/main.c646
-rw-r--r--kernel/power/power.h335
-rw-r--r--kernel/power/poweroff.c46
-rw-r--r--kernel/power/process.c237
-rw-r--r--kernel/power/qos.c713
-rw-r--r--kernel/power/snapshot.c2722
-rw-r--r--kernel/power/suspend.c536
-rw-r--r--kernel/power/suspend_test.c218
-rw-r--r--kernel/power/swap.c1512
-rw-r--r--kernel/power/tuxonice.h260
-rw-r--r--kernel/power/tuxonice_alloc.c308
-rw-r--r--kernel/power/tuxonice_alloc.h54
-rw-r--r--kernel/power/tuxonice_atomic_copy.c469
-rw-r--r--kernel/power/tuxonice_atomic_copy.h25
-rw-r--r--kernel/power/tuxonice_bio.h78
-rw-r--r--kernel/power/tuxonice_bio_chains.c1126
-rw-r--r--kernel/power/tuxonice_bio_core.c1933
-rw-r--r--kernel/power/tuxonice_bio_internal.h101
-rw-r--r--kernel/power/tuxonice_bio_signature.c403
-rw-r--r--kernel/power/tuxonice_builtin.c498
-rw-r--r--kernel/power/tuxonice_builtin.h41
-rw-r--r--kernel/power/tuxonice_checksum.c392
-rw-r--r--kernel/power/tuxonice_checksum.h31
-rw-r--r--kernel/power/tuxonice_cluster.c1058
-rw-r--r--kernel/power/tuxonice_cluster.h18
-rw-r--r--kernel/power/tuxonice_compress.c452
-rw-r--r--kernel/power/tuxonice_copy_before_write.c240
-rw-r--r--kernel/power/tuxonice_extent.c144
-rw-r--r--kernel/power/tuxonice_extent.h45
-rw-r--r--kernel/power/tuxonice_file.c484
-rw-r--r--kernel/power/tuxonice_highlevel.c1413
-rw-r--r--kernel/power/tuxonice_incremental.c402
-rw-r--r--kernel/power/tuxonice_io.c1932
-rw-r--r--kernel/power/tuxonice_io.h72
-rw-r--r--kernel/power/tuxonice_modules.c520
-rw-r--r--kernel/power/tuxonice_modules.h212
-rw-r--r--kernel/power/tuxonice_netlink.c324
-rw-r--r--kernel/power/tuxonice_netlink.h62
-rw-r--r--kernel/power/tuxonice_pagedir.c345
-rw-r--r--kernel/power/tuxonice_pagedir.h50
-rw-r--r--kernel/power/tuxonice_pageflags.c18
-rw-r--r--kernel/power/tuxonice_pageflags.h106
-rw-r--r--kernel/power/tuxonice_power_off.c286
-rw-r--r--kernel/power/tuxonice_power_off.h24
-rw-r--r--kernel/power/tuxonice_prepare_image.c1080
-rw-r--r--kernel/power/tuxonice_prepare_image.h38
-rw-r--r--kernel/power/tuxonice_prune.c406
-rw-r--r--kernel/power/tuxonice_storage.c282
-rw-r--r--kernel/power/tuxonice_storage.h45
-rw-r--r--kernel/power/tuxonice_swap.c474
-rw-r--r--kernel/power/tuxonice_sysfs.c333
-rw-r--r--kernel/power/tuxonice_sysfs.h137
-rw-r--r--kernel/power/tuxonice_ui.c247
-rw-r--r--kernel/power/tuxonice_ui.h97
-rw-r--r--kernel/power/tuxonice_userui.c658
-rw-r--r--kernel/power/user.c478
-rw-r--r--kernel/power/wakelock.c268
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/braille.c49
-rw-r--r--kernel/printk/braille.h48
-rw-r--r--kernel/printk/console_cmdline.h14
-rw-r--r--kernel/printk/printk.c3080
-rw-r--r--kernel/profile.c613
-rw-r--r--kernel/ptrace.c1219
-rw-r--r--kernel/range.c163
-rw-r--r--kernel/rcu/Makefile7
-rw-r--r--kernel/rcu/rcu.h146
-rw-r--r--kernel/rcu/rcutorture.c1861
-rw-r--r--kernel/rcu/srcu.c676
-rw-r--r--kernel/rcu/tiny.c288
-rw-r--r--kernel/rcu/tiny_plugin.h173
-rw-r--r--kernel/rcu/tree.c4136
-rw-r--r--kernel/rcu/tree.h620
-rw-r--r--kernel/rcu/tree_plugin.h3090
-rw-r--r--kernel/rcu/tree_trace.c505
-rw-r--r--kernel/rcu/update.c831
-rw-r--r--kernel/reboot.c557
-rw-r--r--kernel/relay.c1360
-rw-r--r--kernel/resource.c1534
-rw-r--r--kernel/sched/Makefile26
-rw-r--r--kernel/sched/auto_group.c253
-rw-r--r--kernel/sched/auto_group.h64
-rw-r--r--kernel/sched/bfs.c7429
-rw-r--r--kernel/sched/bfs_sched.h172
-rw-r--r--kernel/sched/clock.c435
-rw-r--r--kernel/sched/completion.c317
-rw-r--r--kernel/sched/core.c8381
-rw-r--r--kernel/sched/cpuacct.c283
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cpudeadline.c246
-rw-r--r--kernel/sched/cpudeadline.h32
-rw-r--r--kernel/sched/cpupri.c248
-rw-r--r--kernel/sched/cpupri.h31
-rw-r--r--kernel/sched/cputime.c852
-rw-r--r--kernel/sched/deadline.c1818
-rw-r--r--kernel/sched/debug.c674
-rw-r--r--kernel/sched/fair.c8310
-rw-r--r--kernel/sched/features.h98
-rw-r--r--kernel/sched/idle.c302
-rw-r--r--kernel/sched/idle_task.c109
-rw-r--r--kernel/sched/proc.c584
-rw-r--r--kernel/sched/rt.c2346
-rw-r--r--kernel/sched/sched.h1736
-rw-r--r--kernel/sched/stats.c142
-rw-r--r--kernel/sched/stats.h267
-rw-r--r--kernel/sched/stop_task.c136
-rw-r--r--kernel/sched/wait.c624
-rw-r--r--kernel/seccomp.c904
-rw-r--r--kernel/signal.c3674
-rw-r--r--kernel/smp.c741
-rw-r--r--kernel/smpboot.c472
-rw-r--r--kernel/smpboot.h20
-rw-r--r--kernel/softirq.c787
-rw-r--r--kernel/stacktrace.c75
-rw-r--r--kernel/stop_machine.c649
-rw-r--r--kernel/sys.c2408
-rw-r--r--kernel/sys_ni.c245
-rw-r--r--kernel/sysctl.c2819
-rw-r--r--kernel/sysctl_binary.c1494
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c106
-rw-r--r--kernel/task_work.c128
-rw-r--r--kernel/taskstats.c710
-rw-r--r--kernel/test_kprobes.c389
-rw-r--r--kernel/time/Kconfig199
-rw-r--r--kernel/time/Makefile31
-rw-r--r--kernel/time/alarmtimer.c873
-rw-r--r--kernel/time/clockevents.c792
-rw-r--r--kernel/time/clocksource.c1020
-rw-r--r--kernel/time/hrtimer.c1852
-rw-r--r--kernel/time/itimer.c301
-rw-r--r--kernel/time/jiffies.c136
-rw-r--r--kernel/time/ntp.c965
-rw-r--r--kernel/time/ntp_internal.h12
-rw-r--r--kernel/time/posix-clock.c446
-rw-r--r--kernel/time/posix-cpu-timers.c1475
-rw-r--r--kernel/time/posix-timers.c1124
-rw-r--r--kernel/time/sched_clock.c303
-rw-r--r--kernel/time/test_udelay.c168
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c113
-rw-r--r--kernel/time/tick-broadcast.c964
-rw-r--r--kernel/time/tick-common.c498
-rw-r--r--kernel/time/tick-internal.h139
-rw-r--r--kernel/time/tick-oneshot.c116
-rw-r--r--kernel/time/tick-sched.c1250
-rw-r--r--kernel/time/tick-sched.h74
-rw-r--r--kernel/time/time.c785
-rw-r--r--kernel/time/timeconst.bc108
-rw-r--r--kernel/time/timeconv.c127
-rw-r--r--kernel/time/timecounter.c112
-rw-r--r--kernel/time/timekeeping.c2072
-rw-r--r--kernel/time/timekeeping.h29
-rw-r--r--kernel/time/timekeeping_debug.c74
-rw-r--r--kernel/time/timekeeping_internal.h29
-rw-r--r--kernel/time/timer.c1720
-rw-r--r--kernel/time/timer_list.c396
-rw-r--r--kernel/time/timer_stats.c425
-rw-r--r--kernel/torture.c741
-rw-r--r--kernel/trace/Kconfig641
-rw-r--r--kernel/trace/Makefile70
-rw-r--r--kernel/trace/blktrace.c1819
-rw-r--r--kernel/trace/bpf_trace.c222
-rw-r--r--kernel/trace/ftrace.c5951
-rw-r--r--kernel/trace/power-traces.c18
-rw-r--r--kernel/trace/ring_buffer.c5014
-rw-r--r--kernel/trace/ring_buffer_benchmark.c483
-rw-r--r--kernel/trace/rpm-traces.c20
-rw-r--r--kernel/trace/trace.c7205
-rw-r--r--kernel/trace/trace.h1321
-rw-r--r--kernel/trace/trace_benchmark.c198
-rw-r--r--kernel/trace/trace_benchmark.h41
-rw-r--r--kernel/trace/trace_branch.c414
-rw-r--r--kernel/trace/trace_clock.c137
-rw-r--r--kernel/trace/trace_entries.h324
-rw-r--r--kernel/trace/trace_event_perf.c384
-rw-r--r--kernel/trace/trace_events.c2943
-rw-r--r--kernel/trace/trace_events_filter.c2440
-rw-r--r--kernel/trace/trace_events_filter_test.h50
-rw-r--r--kernel/trace/trace_events_trigger.c1437
-rw-r--r--kernel/trace/trace_export.c195
-rw-r--r--kernel/trace/trace_functions.c689
-rw-r--r--kernel/trace/trace_functions_graph.c1470
-rw-r--r--kernel/trace/trace_irqsoff.c764
-rw-r--r--kernel/trace/trace_kdb.c140
-rw-r--r--kernel/trace/trace_kprobe.c1495
-rw-r--r--kernel/trace/trace_mmiotrace.c364
-rw-r--r--kernel/trace/trace_nop.c99
-rw-r--r--kernel/trace/trace_output.c1256
-rw-r--r--kernel/trace/trace_output.h45
-rw-r--r--kernel/trace/trace_printk.c366
-rw-r--r--kernel/trace/trace_probe.c723
-rw-r--r--kernel/trace/trace_probe.h394
-rw-r--r--kernel/trace/trace_sched_switch.c101
-rw-r--r--kernel/trace/trace_sched_wakeup.c816
-rw-r--r--kernel/trace/trace_selftest.c1220
-rw-r--r--kernel/trace/trace_selftest_dynamic.c13
-rw-r--r--kernel/trace/trace_seq.c377
-rw-r--r--kernel/trace/trace_stack.c484
-rw-r--r--kernel/trace/trace_stat.c359
-rw-r--r--kernel/trace/trace_stat.h33
-rw-r--r--kernel/trace/trace_syscalls.c750
-rw-r--r--kernel/trace/trace_uprobe.c1336
-rw-r--r--kernel/tracepoint.c520
-rw-r--r--kernel/tsacct.c180
-rw-r--r--kernel/uid16.c217
-rw-r--r--kernel/up.c84
-rw-r--r--kernel/user-return-notifier.c44
-rw-r--r--kernel/user.c228
-rw-r--r--kernel/user_namespace.c1012
-rw-r--r--kernel/utsname.c139
-rw-r--r--kernel/utsname_sysctl.c138
-rw-r--r--kernel/watchdog.c890
-rw-r--r--kernel/workqueue.c5140
-rw-r--r--kernel/workqueue_internal.h74
367 files changed, 269762 insertions, 0 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
new file mode 100644
index 000000000..790d83c7d
--- /dev/null
+++ b/kernel/.gitignore
@@ -0,0 +1,8 @@
+#
+# Generated files
+#
+config_data.h
+config_data.gz
+timeconst.h
+hz.bc
+x509_certificate_list
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 000000000..a3bb4cb52
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
+config FREEZER
+ def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000..2a202a846
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,58 @@
+#
+# Timer Interrupt Frequency Configuration
+#
+
+choice
+ prompt "Timer frequency"
+ default HZ_250
+ help
+ Allows the configuration of the timer frequency. It is customary
+ to have the timer interrupt run at 1000 Hz but 100 Hz may be more
+ beneficial for servers and NUMA systems that do not need to have
+ a fast response for user interaction and that may experience bus
+ contention and cacheline bounces as a result of timer interrupts.
+ Note that the timer interrupt occurs on each processor in an SMP
+ environment leading to NR_CPUS * HZ number of timer interrupts
+ per second.
+
+
+ config HZ_100
+ bool "100 HZ"
+ help
+ 100 Hz is a typical choice for servers, SMP and NUMA systems
+ with lots of processors that may show reduced performance if
+ too many timer interrupts are occurring.
+
+ config HZ_250
+ bool "250 HZ"
+ help
+ 250 Hz is a good compromise choice allowing server performance
+ while also showing good interactive responsiveness even
+ on SMP and NUMA systems. If you are going to be using NTSC video
+ or multimedia, selected 300Hz instead.
+
+ config HZ_300
+ bool "300 HZ"
+ help
+ 300 Hz is a good compromise choice allowing server performance
+ while also showing good interactive responsiveness even
+ on SMP and NUMA systems and exactly dividing by both PAL and
+ NTSC frame rates for video and multimedia work.
+
+ config HZ_1000
+ bool "1000 HZ"
+ help
+ 1000 Hz is the preferred choice for desktop systems and other
+ systems requiring fast interactive responses to events.
+
+endchoice
+
+config HZ
+ int
+ default 100 if HZ_100
+ default 250 if HZ_250
+ default 300 if HZ_300
+ default 1000 if HZ_1000
+
+config SCHED_HRTICK
+ def_bool HIGH_RES_TIMERS
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000..08561f1ac
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,243 @@
+#
+# The ARCH_INLINE foo is necessary because select ignores "depends on"
+#
+config ARCH_INLINE_SPIN_TRYLOCK
+ bool
+
+config ARCH_INLINE_SPIN_TRYLOCK_BH
+ bool
+
+config ARCH_INLINE_SPIN_LOCK
+ bool
+
+config ARCH_INLINE_SPIN_LOCK_BH
+ bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQ
+ bool
+
+config ARCH_INLINE_SPIN_LOCK_IRQSAVE
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK_BH
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQ
+ bool
+
+config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+ bool
+
+
+config ARCH_INLINE_READ_TRYLOCK
+ bool
+
+config ARCH_INLINE_READ_LOCK
+ bool
+
+config ARCH_INLINE_READ_LOCK_BH
+ bool
+
+config ARCH_INLINE_READ_LOCK_IRQ
+ bool
+
+config ARCH_INLINE_READ_LOCK_IRQSAVE
+ bool
+
+config ARCH_INLINE_READ_UNLOCK
+ bool
+
+config ARCH_INLINE_READ_UNLOCK_BH
+ bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQ
+ bool
+
+config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+ bool
+
+
+config ARCH_INLINE_WRITE_TRYLOCK
+ bool
+
+config ARCH_INLINE_WRITE_LOCK
+ bool
+
+config ARCH_INLINE_WRITE_LOCK_BH
+ bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQ
+ bool
+
+config ARCH_INLINE_WRITE_LOCK_IRQSAVE
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK_BH
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQ
+ bool
+
+config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ bool
+
+config UNINLINE_SPIN_UNLOCK
+ bool
+
+#
+# lock_* functions are inlined when:
+# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
+#
+# trylock_* functions are inlined when:
+# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+# unlock and unlock_irq functions are inlined when:
+# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+# or
+# - DEBUG_SPINLOCK=n and PREEMPT=n
+#
+# unlock_bh and unlock_irqrestore functions are inlined when:
+# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+
+if !DEBUG_SPINLOCK
+
+config INLINE_SPIN_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK
+
+config INLINE_SPIN_TRYLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK_BH
+
+config INLINE_SPIN_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+
+config INLINE_SPIN_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
+
+config INLINE_SPIN_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
+
+config INLINE_SPIN_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
+
+config INLINE_SPIN_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_BH
+
+config INLINE_SPIN_UNLOCK_IRQ
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
+
+config INLINE_SPIN_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+
+
+config INLINE_READ_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_READ_TRYLOCK
+
+config INLINE_READ_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+
+config INLINE_READ_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
+
+config INLINE_READ_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
+
+config INLINE_READ_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
+
+config INLINE_READ_UNLOCK
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
+
+config INLINE_READ_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_BH
+
+config INLINE_READ_UNLOCK_IRQ
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
+
+config INLINE_READ_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+
+
+config INLINE_WRITE_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_WRITE_TRYLOCK
+
+config INLINE_WRITE_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+
+config INLINE_WRITE_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
+
+config INLINE_WRITE_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
+
+config INLINE_WRITE_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
+
+config INLINE_WRITE_UNLOCK
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
+
+config INLINE_WRITE_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_BH
+
+config INLINE_WRITE_UNLOCK_IRQ
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
+
+config INLINE_WRITE_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+endif
+
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
+
+config MUTEX_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+
+config LOCK_SPIN_ON_OWNER
+ def_bool y
+ depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000..3f9c97419
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,58 @@
+
+choice
+ prompt "Preemption Model"
+ default PREEMPT_NONE
+
+config PREEMPT_NONE
+ bool "No Forced Preemption (Server)"
+ help
+ This is the traditional Linux preemption model, geared towards
+ throughput. It will still provide good latencies most of the
+ time, but there are no guarantees and occasional longer delays
+ are possible.
+
+ Select this option if you are building a kernel for a server or
+ scientific/computation system, or if you want to maximize the
+ raw processing power of the kernel, irrespective of scheduling
+ latencies.
+
+config PREEMPT_VOLUNTARY
+ bool "Voluntary Kernel Preemption (Desktop)"
+ help
+ This option reduces the latency of the kernel by adding more
+ "explicit preemption points" to the kernel code. These new
+ preemption points have been selected to reduce the maximum
+ latency of rescheduling, providing faster application reactions,
+ at the cost of slightly lower throughput.
+
+ This allows reaction to interactive events by allowing a
+ low priority process to voluntarily preempt itself even if it
+ is in kernel mode executing a system call. This allows
+ applications to run more 'smoothly' even when the system is
+ under load.
+
+ Select this if you are building a kernel for a desktop system.
+
+config PREEMPT
+ bool "Preemptible Kernel (Low-Latency Desktop)"
+ select PREEMPT_COUNT
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+ help
+ This option reduces the latency of the kernel by making
+ all kernel code (that is not executing in a critical section)
+ preemptible. This allows reaction to interactive events by
+ permitting a low priority process to be preempted involuntarily
+ even if it is in kernel mode executing a system call and would
+ otherwise not be about to reach a natural preemption point.
+ This allows applications to run more 'smoothly' even when the
+ system is under load, at the cost of slightly lower throughput
+ and a slight runtime overhead to kernel code.
+
+ Select this if you are building a kernel for a desktop or
+ embedded system with latency requirements in the milliseconds
+ range.
+
+endchoice
+
+config PREEMPT_COUNT
+ bool \ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
new file mode 100644
index 000000000..60c302cfb
--- /dev/null
+++ b/kernel/Makefile
@@ -0,0 +1,209 @@
+#
+# Makefile for the linux kernel.
+#
+
+obj-y = fork.o exec_domain.o panic.o \
+ cpu.o exit.o softirq.o resource.o \
+ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
+ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ extable.o params.o \
+ kthread.o sys_ni.o nsproxy.o \
+ notifier.o ksysfs.o cred.o reboot.o \
+ async.o range.o smpboot.o
+
+obj-$(CONFIG_MULTIUSER) += groups.o
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
+endif
+
+# cond_syscall is currently not LTO compatible
+CFLAGS_sys_ni.o = $(DISABLE_LTO)
+
+obj-y += sched/
+obj-y += locking/
+obj-y += power/
+obj-y += printk/
+obj-y += irq/
+obj-y += rcu/
+obj-y += livepatch/
+
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_FREEZER) += freezer.o
+obj-$(CONFIG_PROFILING) += profile.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += time/
+obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
+obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
+obj-$(CONFIG_SMP) += smp.o
+ifneq ($(CONFIG_SMP),y)
+obj-y += up.o
+endif
+obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
+obj-$(CONFIG_COMPAT) += compat.o
+obj-$(CONFIG_CGROUPS) += cgroup.o
+obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_UTS_NS) += utsname.o
+obj-$(CONFIG_USER_NS) += user_namespace.o
+obj-$(CONFIG_PID_NS) += pid_namespace.o
+obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_SMP) += stop_machine.o
+obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
+obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_KGDB) += debug/
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
+obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
+obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace/
+obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_TRACE_CLOCK) += trace/
+obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_BPF) += bpf/
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
+
+$(obj)/configs.o: $(obj)/config_data.h
+
+# config_data.h contains the same information as ikconfig.h but gzipped.
+# Info from config_data can be extracted from /proc/config*
+targets += config_data.gz
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
+ $(call if_changed,gzip)
+
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
+targets += config_data.h
+$(obj)/config_data.h: $(obj)/config_data.gz FORCE
+ $(call filechk,ikconfiggz)
+
+###############################################################################
+#
+# Roll all the X.509 certificates that we can find together and pull them into
+# the kernel so that they get loaded into the system trusted keyring during
+# boot.
+#
+# We look in the source root and the build root for all files whose name ends
+# in ".x509". Unfortunately, this will generate duplicate filenames, so we
+# have make canonicalise the pathnames and then sort them to discard the
+# duplicates.
+#
+###############################################################################
+ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
+X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
+X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+ $(or $(realpath $(CERT)),$(CERT))))
+X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
+
+ifeq ($(X509_CERTIFICATES),)
+$(warning *** No X.509 certificates found ***)
+endif
+
+ifneq ($(wildcard $(obj)/.x509.list),)
+ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
+$(info X.509 certificate list changed)
+$(shell rm $(obj)/.x509.list)
+endif
+endif
+
+kernel/system_certificates.o: $(obj)/x509_certificate_list
+
+quiet_cmd_x509certs = CERTS $@
+ cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
+
+targets += $(obj)/x509_certificate_list
+$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
+ $(call if_changed,x509certs)
+
+targets += $(obj)/.x509.list
+$(obj)/.x509.list:
+ @echo $(X509_CERTIFICATES) >$@
+endif
+
+clean-files := x509_certificate_list .x509.list
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+ifndef CONFIG_MODULE_SIG_HASH
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+ @echo "###"
+ @echo "### Now generating an X.509 key pair to be used for signing modules."
+ @echo "###"
+ @echo "### If this takes a long time, you might wish to run rngd in the"
+ @echo "### background to keep the supply of entropy topped up. It"
+ @echo "### needs to be run as root, and uses a hardware random"
+ @echo "### number generator if one is available."
+ @echo "###"
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
+ -outform DER -out signing_key.x509 \
+ -keyout signing_key.priv 2>&1
+ @echo "###"
+ @echo "### Key pair generated."
+ @echo "###"
+
+x509.genkey:
+ @echo Generating X.509 key generation config
+ @echo >x509.genkey "[ req ]"
+ @echo >>x509.genkey "default_bits = 4096"
+ @echo >>x509.genkey "distinguished_name = req_distinguished_name"
+ @echo >>x509.genkey "prompt = no"
+ @echo >>x509.genkey "string_mask = utf8only"
+ @echo >>x509.genkey "x509_extensions = myexts"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ req_distinguished_name ]"
+ @echo >>x509.genkey "#O = Unspecified company"
+ @echo >>x509.genkey "CN = Build time autogenerated kernel key"
+ @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
+ @echo >>x509.genkey
+ @echo >>x509.genkey "[ myexts ]"
+ @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+ @echo >>x509.genkey "keyUsage=digitalSignature"
+ @echo >>x509.genkey "subjectKeyIdentifier=hash"
+ @echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
new file mode 100644
index 000000000..74963d192
--- /dev/null
+++ b/kernel/acct.c
@@ -0,0 +1,602 @@
+/*
+ * linux/kernel/acct.c
+ *
+ * BSD Process Accounting for Linux
+ *
+ * Author: Marco van Wieringen <mvw@planets.elm.net>
+ *
+ * Some code based on ideas and code from:
+ * Thomas K. Dyas <tdyas@eden.rutgers.edu>
+ *
+ * This file implements BSD-style process accounting. Whenever any
+ * process exits, an accounting record of type "struct acct" is
+ * written to the file specified with the acct() system call. It is
+ * up to user-level programs to do useful things with the accounting
+ * log. The kernel just provides the raw accounting information.
+ *
+ * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
+ *
+ * Plugged two leaks. 1) It didn't return acct_file into the free_filps if
+ * the file happened to be read-only. 2) If the accounting was suspended
+ * due to the lack of space it happily allowed to reopen it and completely
+ * lost the old acct_file. 3/10/98, Al Viro.
+ *
+ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
+ * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
+ *
+ * Fixed a nasty interaction with with sys_umount(). If the accointing
+ * was suspeneded we failed to stop it on umount(). Messy.
+ * Another one: remount to readonly didn't stop accounting.
+ * Question: what should we do if we have CAP_SYS_ADMIN but not
+ * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
+ * unless we are messing with the root. In that case we are getting a
+ * real mess with do_remount_sb(). 9/11/98, AV.
+ *
+ * Fixed a bunch of races (and pair of leaks). Probably not the best way,
+ * but this one obviously doesn't introduce deadlocks. Later. BTW, found
+ * one race (and leak) in BSD implementation.
+ * OK, that's better. ANOTHER race and leak in BSD variant. There always
+ * is one more bug... 10/11/98, AV.
+ *
+ * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
+ * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * a struct file opened for write. Fixed. 2/6/2000, AV.
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/acct.h>
+#include <linux/capability.h>
+#include <linux/file.h>
+#include <linux/tty.h>
+#include <linux/security.h>
+#include <linux/vfs.h>
+#include <linux/jiffies.h>
+#include <linux/times.h>
+#include <linux/syscalls.h>
+#include <linux/mount.h>
+#include <linux/uaccess.h>
+#include <asm/div64.h>
+#include <linux/blkdev.h> /* sector_div */
+#include <linux/pid_namespace.h>
+#include <linux/fs_pin.h>
+
+/*
+ * These constants control the amount of freespace that suspend and
+ * resume the process accounting system, and the time delay between
+ * each check.
+ * Turned into sysctl-controllable parameters. AV, 12/11/98
+ */
+
+int acct_parm[3] = {4, 2, 30};
+#define RESUME (acct_parm[0]) /* >foo% free space - resume */
+#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
+#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+
+/*
+ * External references and all of the globals.
+ */
+
+struct bsd_acct_struct {
+ struct fs_pin pin;
+ atomic_long_t count;
+ struct rcu_head rcu;
+ struct mutex lock;
+ int active;
+ unsigned long needcheck;
+ struct file *file;
+ struct pid_namespace *ns;
+ struct work_struct work;
+ struct completion done;
+};
+
+static void do_acct_process(struct bsd_acct_struct *acct);
+
+/*
+ * Check the amount of free space and suspend/resume accordingly.
+ */
+static int check_free_space(struct bsd_acct_struct *acct)
+{
+ struct kstatfs sbuf;
+
+ if (time_is_before_jiffies(acct->needcheck))
+ goto out;
+
+ /* May block */
+ if (vfs_statfs(&acct->file->f_path, &sbuf))
+ goto out;
+
+ if (acct->active) {
+ u64 suspend = sbuf.f_blocks * SUSPEND;
+ do_div(suspend, 100);
+ if (sbuf.f_bavail <= suspend) {
+ acct->active = 0;
+ pr_info("Process accounting paused\n");
+ }
+ } else {
+ u64 resume = sbuf.f_blocks * RESUME;
+ do_div(resume, 100);
+ if (sbuf.f_bavail >= resume) {
+ acct->active = 1;
+ pr_info("Process accounting resumed\n");
+ }
+ }
+
+ acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
+out:
+ return acct->active;
+}
+
+static void acct_put(struct bsd_acct_struct *p)
+{
+ if (atomic_long_dec_and_test(&p->count))
+ kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+ return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
+{
+ struct bsd_acct_struct *res;
+again:
+ smp_rmb();
+ rcu_read_lock();
+ res = to_acct(ACCESS_ONCE(ns->bacct));
+ if (!res) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ if (!atomic_long_inc_not_zero(&res->count)) {
+ rcu_read_unlock();
+ cpu_relax();
+ goto again;
+ }
+ rcu_read_unlock();
+ mutex_lock(&res->lock);
+ if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+ mutex_unlock(&res->lock);
+ acct_put(res);
+ goto again;
+ }
+ return res;
+}
+
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct = to_acct(pin);
+ mutex_lock(&acct->lock);
+ do_acct_process(acct);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ cmpxchg(&acct->ns->bacct, pin, NULL);
+ mutex_unlock(&acct->lock);
+ pin_remove(pin);
+ acct_put(acct);
+}
+
+static void close_work(struct work_struct *work)
+{
+ struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+ struct file *file = acct->file;
+ if (file->f_op->flush)
+ file->f_op->flush(file, NULL);
+ __fput_sync(file);
+ complete(&acct->done);
+}
+
+static int acct_on(struct filename *pathname)
+{
+ struct file *file;
+ struct vfsmount *mnt, *internal;
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct bsd_acct_struct *acct;
+ struct fs_pin *old;
+ int err;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
+
+ /* Difference from BSD - they don't do O_APPEND */
+ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+ if (IS_ERR(file)) {
+ kfree(acct);
+ return PTR_ERR(file);
+ }
+
+ if (!S_ISREG(file_inode(file)->i_mode)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EACCES;
+ }
+
+ if (!(file->f_mode & FMODE_CAN_WRITE)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EIO;
+ }
+ internal = mnt_clone_internal(&file->f_path);
+ if (IS_ERR(internal)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return PTR_ERR(internal);
+ }
+ err = mnt_want_write(internal);
+ if (err) {
+ mntput(internal);
+ kfree(acct);
+ filp_close(file, NULL);
+ return err;
+ }
+ mnt = file->f_path.mnt;
+ file->f_path.mnt = internal;
+
+ atomic_long_set(&acct->count, 1);
+ init_fs_pin(&acct->pin, acct_pin_kill);
+ acct->file = file;
+ acct->needcheck = jiffies;
+ acct->ns = ns;
+ mutex_init(&acct->lock);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
+ mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
+ pin_insert(&acct->pin, mnt);
+
+ rcu_read_lock();
+ old = xchg(&ns->bacct, &acct->pin);
+ mutex_unlock(&acct->lock);
+ pin_kill(old);
+ mnt_drop_write(mnt);
+ mntput(mnt);
+ return 0;
+}
+
+static DEFINE_MUTEX(acct_on_mutex);
+
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
+ */
+SYSCALL_DEFINE1(acct, const char __user *, name)
+{
+ int error = 0;
+
+ if (!capable(CAP_SYS_PACCT))
+ return -EPERM;
+
+ if (name) {
+ struct filename *tmp = getname(name);
+
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+ mutex_lock(&acct_on_mutex);
+ error = acct_on(tmp);
+ mutex_unlock(&acct_on_mutex);
+ putname(tmp);
+ } else {
+ rcu_read_lock();
+ pin_kill(task_active_pid_ns(current)->bacct);
+ }
+
+ return error;
+}
+
+void acct_exit_ns(struct pid_namespace *ns)
+{
+ rcu_read_lock();
+ pin_kill(ns->bacct);
+}
+
+/*
+ * encode an unsigned long into a comp_t
+ *
+ * This routine has been adopted from the encode_comp_t() function in
+ * the kern_acct.c file of the FreeBSD operating system. The encoding
+ * is a 13-bit fraction with a 3-bit (base 8) exponent.
+ */
+
+#define MANTSIZE 13 /* 13 bit mantissa. */
+#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+
+static comp_t encode_comp_t(unsigned long value)
+{
+ int exp, rnd;
+
+ exp = rnd = 0;
+ while (value > MAXFRACT) {
+ rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
+ value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ exp++;
+ }
+
+ /*
+ * If we need to round up, do it (and handle overflow correctly).
+ */
+ if (rnd && (++value > MAXFRACT)) {
+ value >>= EXPSIZE;
+ exp++;
+ }
+
+ /*
+ * Clean it up and polish it off.
+ */
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += value; /* and add on the mantissa. */
+ return exp;
+}
+
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
+/*
+ * encode an u64 into a comp2_t (24 bits)
+ *
+ * Format: 5 bit base 2 exponent, 20 bits mantissa.
+ * The leading bit of the mantissa is not stored, but implied for
+ * non-zero exponents.
+ * Largest encodable value is 50 bits.
+ */
+
+#define MANTSIZE2 20 /* 20 bit mantissa. */
+#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
+#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
+#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
+
+static comp2_t encode_comp2_t(u64 value)
+{
+ int exp, rnd;
+
+ exp = (value > (MAXFRACT2>>1));
+ rnd = 0;
+ while (value > MAXFRACT2) {
+ rnd = value & 1;
+ value >>= 1;
+ exp++;
+ }
+
+ /*
+ * If we need to round up, do it (and handle overflow correctly).
+ */
+ if (rnd && (++value > MAXFRACT2)) {
+ value >>= 1;
+ exp++;
+ }
+
+ if (exp > MAXEXP2) {
+ /* Overflow. Return largest representable number instead. */
+ return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
+ } else {
+ return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
+ }
+}
+#endif
+
+#if ACCT_VERSION == 3
+/*
+ * encode an u64 into a 32 bit IEEE float
+ */
+static u32 encode_float(u64 value)
+{
+ unsigned exp = 190;
+ unsigned u;
+
+ if (value == 0)
+ return 0;
+ while ((s64)value > 0) {
+ value <<= 1;
+ exp--;
+ }
+ u = (u32)(value >> 40) & 0x7fffffu;
+ return u | (exp << 23);
+}
+#endif
+
+/*
+ * Write an accounting entry for an exiting process
+ *
+ * The acct_process() call is the workhorse of the process
+ * accounting system. The struct acct is built here and then written
+ * into the accounting file. This function should only be called from
+ * do_exit() or when switching to a different output file.
+ */
+
+static void fill_ac(acct_t *ac)
+{
+ struct pacct_struct *pacct = &current->signal->pacct;
+ u64 elapsed, run_time;
+ struct tty_struct *tty;
+
+ /*
+ * Fill the accounting struct with the needed info as recorded
+ * by the different kernel functions.
+ */
+ memset(ac, 0, sizeof(acct_t));
+
+ ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+ strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
+
+ /* calculate run_time in nsec*/
+ run_time = ktime_get_ns();
+ run_time -= current->group_leader->start_time;
+ /* convert nsec -> AHZ */
+ elapsed = nsec_to_AHZ(run_time);
+#if ACCT_VERSION == 3
+ ac->ac_etime = encode_float(elapsed);
+#else
+ ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ (unsigned long) elapsed : (unsigned long) -1l);
+#endif
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
+ {
+ /* new enlarged etime field */
+ comp2_t etime = encode_comp2_t(elapsed);
+
+ ac->ac_etime_hi = etime >> 16;
+ ac->ac_etime_lo = (u16) etime;
+ }
+#endif
+ do_div(elapsed, AHZ);
+ ac->ac_btime = get_seconds() - elapsed;
+#if ACCT_VERSION==2
+ ac->ac_ahz = AHZ;
+#endif
+
+ spin_lock_irq(&current->sighand->siglock);
+ tty = current->signal->tty; /* Safe as we hold the siglock */
+ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+ ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_flag = pacct->ac_flag;
+ ac->ac_mem = encode_comp_t(pacct->ac_mem);
+ ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac->ac_exitcode = pacct->ac_exitcode;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ acct_t ac;
+ unsigned long flim;
+ const struct cred *orig_cred;
+ struct file *file = acct->file;
+
+ /*
+ * Accounting records are not subject to resource limits.
+ */
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
+
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system.
+ */
+ if (!check_free_space(acct))
+ goto out;
+
+ fill_ac(&ac);
+ /* we really need to bite the bullet and change layout */
+ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+ ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
+ /* backward-compatible 16 bit fields */
+ ac.ac_uid16 = ac.ac_uid;
+ ac.ac_gid16 = ac.ac_gid;
+#endif
+#if ACCT_VERSION == 3
+ {
+ struct pid_namespace *ns = acct->ns;
+
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
+ ns);
+ rcu_read_unlock();
+ }
+#endif
+ /*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
+out:
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
+ revert_creds(orig_cred);
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+ struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
+ unsigned long vsize = 0;
+
+ if (group_dead && current->mm) {
+ struct vm_area_struct *vma;
+
+ down_read(&current->mm->mmap_sem);
+ vma = current->mm->mmap;
+ while (vma) {
+ vsize += vma->vm_end - vma->vm_start;
+ vma = vma->vm_next;
+ }
+ up_read(&current->mm->mmap_sem);
+ }
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (group_dead)
+ pacct->ac_mem = vsize / 1024;
+ if (thread_group_leader(current)) {
+ pacct->ac_exitcode = exitcode;
+ if (current->flags & PF_FORKNOEXEC)
+ pacct->ac_flag |= AFORK;
+ }
+ if (current->flags & PF_SUPERPRIV)
+ pacct->ac_flag |= ASU;
+ if (current->flags & PF_DUMPCORE)
+ pacct->ac_flag |= ACORE;
+ if (current->flags & PF_SIGNALED)
+ pacct->ac_flag |= AXSIG;
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
+ pacct->ac_minflt += current->min_flt;
+ pacct->ac_majflt += current->maj_flt;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void slow_acct_process(struct pid_namespace *ns)
+{
+ for ( ; ns; ns = ns->parent) {
+ struct bsd_acct_struct *acct = acct_get(ns);
+ if (acct) {
+ do_acct_process(acct);
+ mutex_unlock(&acct->lock);
+ acct_put(acct);
+ }
+ }
+}
+
+/**
+ * acct_process
+ *
+ * handles process accounting for an exiting task
+ */
+void acct_process(void)
+{
+ struct pid_namespace *ns;
+
+ /*
+ * This loop is safe lockless, since current is still
+ * alive and holds its namespace, which in turn holds
+ * its parent.
+ */
+ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+ if (ns->bacct)
+ break;
+ }
+ if (unlikely(ns))
+ slow_acct_process(ns);
+}
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000..4c3773c0b
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,328 @@
+/*
+ * async.c: Asynchronous function calls for boot performance
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/*
+
+Goals and Theory of Operation
+
+The primary goal of this feature is to reduce the kernel boot time,
+by doing various independent hardware delays and discovery operations
+decoupled and not strictly serialized.
+
+More specifically, the asynchronous function call concept allows
+certain operations (primarily during system boot) to happen
+asynchronously, out of order, while these operations still
+have their externally visible parts happen sequentially and in-order.
+(not unlike how out-of-order CPUs retire their instructions in order)
+
+Key to the asynchronous function call implementation is the concept of
+a "sequence cookie" (which, although it has an abstracted type, can be
+thought of as a monotonically incrementing number).
+
+The async core will assign each scheduled event such a sequence cookie and
+pass this to the called functions.
+
+The asynchronously called function should before doing a globally visible
+operation, such as registering device numbers, call the
+async_synchronize_cookie() function and pass in its own cookie. The
+async_synchronize_cookie() function will make sure that all asynchronous
+operations that were scheduled prior to the operation corresponding with the
+cookie have completed.
+
+Subsystem/driver initialization code that scheduled asynchronous probe
+functions, but which shares global resources with other drivers/subsystems
+that do not use the asynchronous call feature, need to do a full
+synchronization with the async_synchronize_full() function, before returning
+from their init function. This is to maintain strict ordering between the
+asynchronous and synchronous parts of the kernel.
+
+*/
+
+#include <linux/async.h>
+#include <linux/atomic.h>
+#include <linux/ktime.h>
+#include <linux/export.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "workqueue_internal.h"
+
+static async_cookie_t next_cookie = 1;
+
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
+
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
+static DEFINE_SPINLOCK(async_lock);
+
+struct async_entry {
+ struct list_head domain_list;
+ struct list_head global_list;
+ struct work_struct work;
+ async_cookie_t cookie;
+ async_func_t func;
+ void *data;
+ struct async_domain *domain;
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(async_done);
+
+static atomic_t entry_count;
+
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
+{
+ struct list_head *pending;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
+ unsigned long flags;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ if (domain)
+ pending = &domain->pending;
+ else
+ pending = &async_global_pending;
+
+ if (!list_empty(pending))
+ ret = list_first_entry(pending, struct async_entry,
+ domain_list)->cookie;
+
+ spin_unlock_irqrestore(&async_lock, flags);
+ return ret;
+}
+
+/*
+ * pick the first pending entry and run it
+ */
+static void async_run_entry_fn(struct work_struct *work)
+{
+ struct async_entry *entry =
+ container_of(work, struct async_entry, work);
+ unsigned long flags;
+ ktime_t uninitialized_var(calltime), delta, rettime;
+
+ /* 1) run (and print duration) */
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ pr_debug("calling %lli_%pF @ %i\n",
+ (long long)entry->cookie,
+ entry->func, task_pid_nr(current));
+ calltime = ktime_get();
+ }
+ entry->func(entry->data, entry->cookie);
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ rettime = ktime_get();
+ delta = ktime_sub(rettime, calltime);
+ pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
+ (long long)entry->cookie,
+ entry->func,
+ (long long)ktime_to_ns(delta) >> 10);
+ }
+
+ /* 2) remove self from the pending queues */
+ spin_lock_irqsave(&async_lock, flags);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
+
+ /* 3) free the entry */
+ kfree(entry);
+ atomic_dec(&entry_count);
+
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* 4) wake up any waiters */
+ wake_up(&async_done);
+}
+
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
+{
+ struct async_entry *entry;
+ unsigned long flags;
+ async_cookie_t newcookie;
+
+ /* allow irq-off callers */
+ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
+
+ /*
+ * If we're out of memory or if there's too much work
+ * pending already, we execute synchronously.
+ */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ spin_lock_irqsave(&async_lock, flags);
+ newcookie = next_cookie++;
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* low on memory.. run synchronously */
+ func(data, newcookie);
+ return newcookie;
+ }
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
+ entry->data = data;
+ entry->domain = domain;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
+ newcookie = entry->cookie = next_cookie++;
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* mark that this task has queued an async job, used by module init */
+ current->flags |= PF_USED_ASYNC;
+
+ /* schedule for execution */
+ queue_work(system_unbound_wq, &entry->work);
+
+ return newcookie;
+}
+
+/**
+ * async_schedule - schedule a function for asynchronous execution
+ * @func: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
+async_cookie_t async_schedule(async_func_t func, void *data)
+{
+ return __async_schedule(func, data, &async_dfl_domain);
+}
+EXPORT_SYMBOL_GPL(async_schedule);
+
+/**
+ * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
+ * @func: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @domain: the domain
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
+ */
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
+ struct async_domain *domain)
+{
+ return __async_schedule(func, data, domain);
+}
+EXPORT_SYMBOL_GPL(async_schedule_domain);
+
+/**
+ * async_synchronize_full - synchronize all asynchronous function calls
+ *
+ * This function waits until all asynchronous function calls have been done.
+ */
+void async_synchronize_full(void)
+{
+ async_synchronize_full_domain(NULL);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full);
+
+/**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+ spin_lock_irq(&async_lock);
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
+ domain->registered = 0;
+ spin_unlock_irq(&async_lock);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
+ * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
+ * @domain: the domain to synchronize
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by @domain have been done.
+ */
+void async_synchronize_full_domain(struct async_domain *domain)
+{
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
+
+/**
+ * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ * @domain: the domain to synchronize (%NULL for all registered domains)
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
+ */
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
+{
+ ktime_t uninitialized_var(starttime), delta, endtime;
+
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
+ starttime = ktime_get();
+ }
+
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
+
+ if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ endtime = ktime_get();
+ delta = ktime_sub(endtime, starttime);
+
+ pr_debug("async_continuing @ %i after %lli usec\n",
+ task_pid_nr(current),
+ (long long)ktime_to_ns(delta) >> 10);
+ }
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
+
+/**
+ * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ *
+ * This function waits until all asynchronous function calls prior to @cookie
+ * have been done.
+ */
+void async_synchronize_cookie(async_cookie_t cookie)
+{
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+ struct worker *worker = current_wq_worker();
+
+ return worker && worker->current_func == async_run_entry_fn;
+}
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000..1c13e4267
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,2036 @@
+/* audit.c -- Auditing support
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with Security Modules.
+ * 2) Minimal run-time overhead:
+ * a) Minimal when syscall auditing is disabled (audit_enable=0).
+ * b) Small when syscall auditing is enabled and no audit record
+ * is generated (defer as much work as possible to record
+ * generation time):
+ * i) context is allocated,
+ * ii) names from getname are stored without a copy, and
+ * iii) inode information stored from path_lookup.
+ * 3) Ability to disable syscall auditing at boot time (audit=0).
+ * 4) Usable by other parts of the kernel (if audit_log* is called,
+ * then a syscall record will be generated automatically for the
+ * current syscall).
+ * 5) Netlink interface to user-space.
+ * 6) Support low-overhead kernel-based filtering to minimize the
+ * information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <linux/skbuff.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
+#include <linux/freezer.h>
+#include <linux/tty.h>
+#include <linux/pid_namespace.h>
+#include <net/netns/generic.h>
+
+#include "audit.h"
+
+/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
+ * (Initialization happens after skb_init is called.) */
+#define AUDIT_DISABLED -1
+#define AUDIT_UNINITIALIZED 0
+#define AUDIT_INITIALIZED 1
+static int audit_initialized;
+
+#define AUDIT_OFF 0
+#define AUDIT_ON 1
+#define AUDIT_LOCKED 2
+u32 audit_enabled;
+u32 audit_ever_enabled;
+
+EXPORT_SYMBOL_GPL(audit_enabled);
+
+/* Default state when kernel boots without any parameters. */
+static u32 audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static u32 audit_failure = AUDIT_FAIL_PRINTK;
+
+/*
+ * If audit records are to be written to the netlink socket, audit_pid
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
+ */
+int audit_pid;
+static __u32 audit_nlk_portid;
+
+/* If audit_rate_limit is non-zero, limit the rate of sending audit records
+ * to that number per second. This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static u32 audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed.
+ * When set to zero, this means unlimited. */
+static u32 audit_backlog_limit = 64;
+#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
+static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
+static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+static u32 audit_backlog_wait_overflow = 0;
+
+/* The identity of the user shutting down the audit system. */
+kuid_t audit_sig_uid = INVALID_UID;
+pid_t audit_sig_pid = -1;
+u32 audit_sig_sid = 0;
+
+/* Records can be lost in several ways:
+ 0) [suppressed in audit_alloc]
+ 1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+ 2) out of memory in audit_log_move [alloc_skb]
+ 3) suppressed due to audit_rate_limit
+ 4) suppressed due to audit_backlog_limit
+*/
+static atomic_t audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+static int audit_net_id;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+/* The audit_freelist is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static DEFINE_SPINLOCK(audit_freelist_lock);
+static int audit_freelist_count;
+static LIST_HEAD(audit_freelist);
+
+static struct sk_buff_head audit_skb_queue;
+/* queue of skbs to send to auditd when/if it comes back */
+static struct sk_buff_head audit_skb_hold_queue;
+static struct task_struct *kauditd_task;
+static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
+
+static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
+ .mask = -1,
+ .features = 0,
+ .lock = 0,};
+
+static char *audit_feature_names[2] = {
+ "only_unset_loginuid",
+ "loginuid_immutable",
+};
+
+
+/* Serialize requests from userspace. */
+DEFINE_MUTEX(audit_cmd_mutex);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records. Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record. The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue. Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+ struct list_head list;
+ struct sk_buff *skb; /* formatted skb ready to send */
+ struct audit_context *ctx; /* NULL or associated context */
+ gfp_t gfp_mask;
+};
+
+struct audit_reply {
+ __u32 portid;
+ struct net *net;
+ struct sk_buff *skb;
+};
+
+static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
+{
+ if (ab) {
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+ nlh->nlmsg_pid = portid;
+ }
+}
+
+void audit_panic(const char *message)
+{
+ switch (audit_failure) {
+ case AUDIT_FAIL_SILENT:
+ break;
+ case AUDIT_FAIL_PRINTK:
+ if (printk_ratelimit())
+ pr_err("%s\n", message);
+ break;
+ case AUDIT_FAIL_PANIC:
+ /* test audit_pid since printk is always losey, why bother? */
+ if (audit_pid)
+ panic("audit: %s\n", message);
+ break;
+ }
+}
+
+static inline int audit_rate_check(void)
+{
+ static unsigned long last_check = 0;
+ static int messages = 0;
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+ unsigned long now;
+ unsigned long elapsed;
+ int retval = 0;
+
+ if (!audit_rate_limit) return 1;
+
+ spin_lock_irqsave(&lock, flags);
+ if (++messages < audit_rate_limit) {
+ retval = 1;
+ } else {
+ now = jiffies;
+ elapsed = now - last_check;
+ if (elapsed > HZ) {
+ last_check = now;
+ messages = 0;
+ retval = 1;
+ }
+ }
+ spin_unlock_irqrestore(&lock, flags);
+
+ return retval;
+}
+
+/**
+ * audit_log_lost - conditionally log lost audit message event
+ * @message: the message stating reason for lost audit message
+ *
+ * Emit at least 1 message per second, even if audit_rate_check is
+ * throttling.
+ * Always increment the lost messages counter.
+*/
+void audit_log_lost(const char *message)
+{
+ static unsigned long last_msg = 0;
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+ unsigned long now;
+ int print;
+
+ atomic_inc(&audit_lost);
+
+ print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+ if (!print) {
+ spin_lock_irqsave(&lock, flags);
+ now = jiffies;
+ if (now - last_msg > HZ) {
+ print = 1;
+ last_msg = now;
+ }
+ spin_unlock_irqrestore(&lock, flags);
+ }
+
+ if (print) {
+ if (printk_ratelimit())
+ pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
+ atomic_read(&audit_lost),
+ audit_rate_limit,
+ audit_backlog_limit);
+ audit_panic(message);
+ }
+}
+
+static int audit_log_config_change(char *function_name, u32 new, u32 old,
+ int allow_changes)
+{
+ struct audit_buffer *ab;
+ int rc = 0;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return rc;
+ audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_session_info(ab);
+ rc = audit_log_task_context(ab);
+ if (rc)
+ allow_changes = 0; /* Something weird, deny request */
+ audit_log_format(ab, " res=%d", allow_changes);
+ audit_log_end(ab);
+ return rc;
+}
+
+static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
+{
+ int allow_changes, rc = 0;
+ u32 old = *to_change;
+
+ /* check if we are locked */
+ if (audit_enabled == AUDIT_LOCKED)
+ allow_changes = 0;
+ else
+ allow_changes = 1;
+
+ if (audit_enabled != AUDIT_OFF) {
+ rc = audit_log_config_change(function_name, new, old, allow_changes);
+ if (rc)
+ allow_changes = 0;
+ }
+
+ /* If we are allowed, make the change */
+ if (allow_changes == 1)
+ *to_change = new;
+ /* Not allowed, update reason */
+ else if (rc == 0)
+ rc = -EPERM;
+ return rc;
+}
+
+static int audit_set_rate_limit(u32 limit)
+{
+ return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
+}
+
+static int audit_set_backlog_limit(u32 limit)
+{
+ return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
+}
+
+static int audit_set_backlog_wait_time(u32 timeout)
+{
+ return audit_do_config_change("audit_backlog_wait_time",
+ &audit_backlog_wait_time_master, timeout);
+}
+
+static int audit_set_enabled(u32 state)
+{
+ int rc;
+ if (state > AUDIT_LOCKED)
+ return -EINVAL;
+
+ rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
+ if (!rc)
+ audit_ever_enabled |= !!state;
+
+ return rc;
+}
+
+static int audit_set_failure(u32 state)
+{
+ if (state != AUDIT_FAIL_SILENT
+ && state != AUDIT_FAIL_PRINTK
+ && state != AUDIT_FAIL_PANIC)
+ return -EINVAL;
+
+ return audit_do_config_change("audit_failure", &audit_failure, state);
+}
+
+/*
+ * Queue skbs to be sent to auditd when/if it comes back. These skbs should
+ * already have been sent via prink/syslog and so if these messages are dropped
+ * it is not a huge concern since we already passed the audit_log_lost()
+ * notification and stuff. This is just nice to get audit messages during
+ * boot before auditd is running or messages generated while auditd is stopped.
+ * This only holds messages is audit_default is set, aka booting with audit=1
+ * or building your kernel that way.
+ */
+static void audit_hold_skb(struct sk_buff *skb)
+{
+ if (audit_default &&
+ (!audit_backlog_limit ||
+ skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
+ skb_queue_tail(&audit_skb_hold_queue, skb);
+ else
+ kfree_skb(skb);
+}
+
+/*
+ * For one reason or another this nlh isn't getting delivered to the userspace
+ * audit daemon, just send it to printk.
+ */
+static void audit_printk_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ char *data = nlmsg_data(nlh);
+
+ if (nlh->nlmsg_type != AUDIT_EOE) {
+ if (printk_ratelimit())
+ pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
+ else
+ audit_log_lost("printk limit exceeded");
+ }
+
+ audit_hold_skb(skb);
+}
+
+static void kauditd_send_skb(struct sk_buff *skb)
+{
+ int err;
+ /* take a reference in case we can't send it and we want to hold it */
+ skb_get(skb);
+ err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
+ if (err < 0) {
+ BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+ if (audit_pid) {
+ pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd disappeared");
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
+ /* we might get lucky and get this in the next auditd */
+ audit_hold_skb(skb);
+ } else
+ /* drop the extra reference if sent ok */
+ consume_skb(skb);
+}
+
+/*
+ * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ *
+ * This function doesn't consume an skb as might be expected since it has to
+ * copy it anyways.
+ */
+static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
+{
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+
+ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
+ return;
+
+ /*
+ * The seemingly wasteful skb_copy() rather than bumping the refcount
+ * using skb_get() is necessary because non-standard mods are made to
+ * the skb by the original kaudit unicast socket send routine. The
+ * existing auditd daemon assumes this breakage. Fixing this would
+ * require co-ordinating a change in the established protocol between
+ * the kaudit kernel subsystem and the auditd userspace code. There is
+ * no reason for new multicast clients to continue with this
+ * non-compliance.
+ */
+ copy = skb_copy(skb, gfp_mask);
+ if (!copy)
+ return;
+
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
+}
+
+/*
+ * flush_hold_queue - empty the hold queue if auditd appears
+ *
+ * If auditd just started, drain the queue of messages already
+ * sent to syslog/printk. Remember loss here is ok. We already
+ * called audit_log_lost() if it didn't go out normally. so the
+ * race between the skb_dequeue and the next check for audit_pid
+ * doesn't matter.
+ *
+ * If you ever find kauditd to be too slow we can get a perf win
+ * by doing our own locking and keeping better track if there
+ * are messages in this queue. I don't see the need now, but
+ * in 5 years when I want to play with this again I'll see this
+ * note and still have no friggin idea what i'm thinking today.
+ */
+static void flush_hold_queue(void)
+{
+ struct sk_buff *skb;
+
+ if (!audit_default || !audit_pid)
+ return;
+
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ if (likely(!skb))
+ return;
+
+ while (skb && audit_pid) {
+ kauditd_send_skb(skb);
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ }
+
+ /*
+ * if auditd just disappeared but we
+ * dequeued an skb we need to drop ref
+ */
+ if (skb)
+ consume_skb(skb);
+}
+
+static int kauditd_thread(void *dummy)
+{
+ set_freezable();
+ while (!kthread_should_stop()) {
+ struct sk_buff *skb;
+
+ flush_hold_queue();
+
+ skb = skb_dequeue(&audit_skb_queue);
+
+ if (skb) {
+ if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
+ wake_up(&audit_backlog_wait);
+ if (audit_pid)
+ kauditd_send_skb(skb);
+ else
+ audit_printk_skb(skb);
+ continue;
+ }
+
+ wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
+ }
+ return 0;
+}
+
+int audit_send_list(void *_dest)
+{
+ struct audit_netlink_list *dest = _dest;
+ struct sk_buff *skb;
+ struct net *net = dest->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+
+ /* wait for parent to finish and send an ACK */
+ mutex_lock(&audit_cmd_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+
+ while ((skb = __skb_dequeue(&dest->q)) != NULL)
+ netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+
+ put_net(net);
+ kfree(dest);
+
+ return 0;
+}
+
+struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+ int multi, const void *payload, int size)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ void *data;
+ int flags = multi ? NLM_F_MULTI : 0;
+ int t = done ? NLMSG_DONE : type;
+
+ skb = nlmsg_new(size, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+ if (!nlh)
+ goto out_kfree_skb;
+ data = nlmsg_data(nlh);
+ memcpy(data, payload, size);
+ return skb;
+
+out_kfree_skb:
+ kfree_skb(skb);
+ return NULL;
+}
+
+static int audit_send_reply_thread(void *arg)
+{
+ struct audit_reply *reply = (struct audit_reply *)arg;
+ struct net *net = reply->net;
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+
+ mutex_lock(&audit_cmd_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+
+ /* Ignore failure. It'll only happen if the sender goes away,
+ because our timeout is set to infinite. */
+ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
+ kfree(reply);
+ return 0;
+}
+/**
+ * audit_send_reply - send an audit reply message via netlink
+ * @request_skb: skb of request we are replying to (used to target the reply)
+ * @seq: sequence number
+ * @type: audit message type
+ * @done: done (last) flag
+ * @multi: multi-part message flag
+ * @payload: payload data
+ * @size: payload size
+ *
+ * Allocates an skb, builds the netlink message, and sends it to the port id.
+ * No failure notifications.
+ */
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
+ int multi, const void *payload, int size)
+{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+ struct sk_buff *skb;
+ struct task_struct *tsk;
+ struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
+ GFP_KERNEL);
+
+ if (!reply)
+ return;
+
+ skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
+ if (!skb)
+ goto out;
+
+ reply->net = get_net(net);
+ reply->portid = portid;
+ reply->skb = skb;
+
+ tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
+ if (!IS_ERR(tsk))
+ return;
+ kfree_skb(skb);
+out:
+ kfree(reply);
+}
+
+/*
+ * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
+ * control messages.
+ */
+static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
+{
+ int err = 0;
+
+ /* Only support initial user namespace for now. */
+ /*
+ * We return ECONNREFUSED because it tricks userspace into thinking
+ * that audit was not configured into the kernel. Lots of users
+ * configure their PAM stack (because that's what the distro does)
+ * to reject login if unable to send messages to audit. If we return
+ * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+ * configured in and will let login proceed. If we return EPERM
+ * userspace will reject all logins. This should be removed when we
+ * support non init namespaces!!
+ */
+ if (current_user_ns() != &init_user_ns)
+ return -ECONNREFUSED;
+
+ switch (msg_type) {
+ case AUDIT_LIST:
+ case AUDIT_ADD:
+ case AUDIT_DEL:
+ return -EOPNOTSUPP;
+ case AUDIT_GET:
+ case AUDIT_SET:
+ case AUDIT_GET_FEATURE:
+ case AUDIT_SET_FEATURE:
+ case AUDIT_LIST_RULES:
+ case AUDIT_ADD_RULE:
+ case AUDIT_DEL_RULE:
+ case AUDIT_SIGNAL_INFO:
+ case AUDIT_TTY_GET:
+ case AUDIT_TTY_SET:
+ case AUDIT_TRIM:
+ case AUDIT_MAKE_EQUIV:
+ /* Only support auditd and auditctl in initial pid namespace
+ * for now. */
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EPERM;
+
+ if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
+ err = -EPERM;
+ break;
+ case AUDIT_USER:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
+ if (!netlink_capable(skb, CAP_AUDIT_WRITE))
+ err = -EPERM;
+ break;
+ default: /* bad msg */
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+{
+ int rc = 0;
+ uid_t uid = from_kuid(&init_user_ns, current_uid());
+ pid_t pid = task_tgid_nr(current);
+
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
+ *ab = NULL;
+ return rc;
+ }
+
+ *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ if (unlikely(!*ab))
+ return rc;
+ audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ audit_log_session_info(*ab);
+ audit_log_task_context(*ab);
+
+ return rc;
+}
+
+int is_audit_feature_set(int i)
+{
+ return af.features & AUDIT_FEATURE_TO_MASK(i);
+}
+
+
+static int audit_get_feature(struct sk_buff *skb)
+{
+ u32 seq;
+
+ seq = nlmsg_hdr(skb)->nlmsg_seq;
+
+ audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));
+
+ return 0;
+}
+
+static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
+ u32 old_lock, u32 new_lock, int res)
+{
+ struct audit_buffer *ab;
+
+ if (audit_enabled == AUDIT_OFF)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+ audit_feature_names[which], !!old_feature, !!new_feature,
+ !!old_lock, !!new_lock, res);
+ audit_log_end(ab);
+}
+
+static int audit_set_feature(struct sk_buff *skb)
+{
+ struct audit_features *uaf;
+ int i;
+
+ BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
+ uaf = nlmsg_data(nlmsg_hdr(skb));
+
+ /* if there is ever a version 2 we should handle that here */
+
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
+
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+ old_lock = af.lock & feature;
+
+ /* are we changing a locked feature? */
+ if (old_lock && (new_feature != old_feature)) {
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 0);
+ return -EPERM;
+ }
+ }
+ /* nothing invalid, do the changes */
+ for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+ u32 feature = AUDIT_FEATURE_TO_MASK(i);
+ u32 old_feature, new_feature, old_lock, new_lock;
+
+ /* if we are not changing this feature, move along */
+ if (!(feature & uaf->mask))
+ continue;
+
+ old_feature = af.features & feature;
+ new_feature = uaf->features & feature;
+ old_lock = af.lock & feature;
+ new_lock = (uaf->lock | af.lock) & feature;
+
+ if (new_feature != old_feature)
+ audit_log_feature_change(i, old_feature, new_feature,
+ old_lock, new_lock, 1);
+
+ if (new_feature)
+ af.features |= feature;
+ else
+ af.features &= ~feature;
+ af.lock |= new_lock;
+ }
+
+ return 0;
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ u32 seq;
+ void *data;
+ int err;
+ struct audit_buffer *ab;
+ u16 msg_type = nlh->nlmsg_type;
+ struct audit_sig_info *sig_data;
+ char *ctx = NULL;
+ u32 len;
+
+ err = audit_netlink_ok(skb, msg_type);
+ if (err)
+ return err;
+
+ /* As soon as there's any sign of userspace auditd,
+ * start kauditd to talk to it */
+ if (!kauditd_task) {
+ kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
+ }
+ seq = nlh->nlmsg_seq;
+ data = nlmsg_data(nlh);
+
+ switch (msg_type) {
+ case AUDIT_GET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
+ s.pid = audit_pid;
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_skb_queue);
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
+ s.backlog_wait_time = audit_backlog_wait_time_master;
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
+ break;
+ }
+ case AUDIT_SET: {
+ struct audit_status s;
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ if (s.mask & AUDIT_STATUS_ENABLED) {
+ err = audit_set_enabled(s.enabled);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_FAILURE) {
+ err = audit_set_failure(s.failure);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_PID) {
+ int new_pid = s.pid;
+
+ if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
+ return -EACCES;
+ if (audit_enabled != AUDIT_OFF)
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
+ audit_pid = new_pid;
+ audit_nlk_portid = NETLINK_CB(skb).portid;
+ audit_sock = skb->sk;
+ }
+ if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
+ err = audit_set_rate_limit(s.rate_limit);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
+ err = audit_set_backlog_limit(s.backlog_limit);
+ if (err < 0)
+ return err;
+ }
+ if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
+ if (sizeof(s) > (size_t)nlh->nlmsg_len)
+ return -EINVAL;
+ if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
+ return -EINVAL;
+ err = audit_set_backlog_wait_time(s.backlog_wait_time);
+ if (err < 0)
+ return err;
+ }
+ break;
+ }
+ case AUDIT_GET_FEATURE:
+ err = audit_get_feature(skb);
+ if (err)
+ return err;
+ break;
+ case AUDIT_SET_FEATURE:
+ err = audit_set_feature(skb);
+ if (err)
+ return err;
+ break;
+ case AUDIT_USER:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+ return 0;
+
+ err = audit_filter_user(msg_type);
+ if (err == 1) { /* match or error */
+ err = 0;
+ if (msg_type == AUDIT_USER_TTY) {
+ err = tty_audit_push_current();
+ if (err)
+ break;
+ }
+ mutex_unlock(&audit_cmd_mutex);
+ audit_log_common_recv_msg(&ab, msg_type);
+ if (msg_type != AUDIT_USER_TTY)
+ audit_log_format(ab, " msg='%.*s'",
+ AUDIT_MESSAGE_TEXT_MAX,
+ (char *)data);
+ else {
+ int size;
+
+ audit_log_format(ab, " data=");
+ size = nlmsg_len(nlh);
+ if (size > 0 &&
+ ((unsigned char *)data)[size - 1] == '\0')
+ size--;
+ audit_log_n_untrustedstring(ab, data, size);
+ }
+ audit_set_portid(ab, NETLINK_CB(skb).portid);
+ audit_log_end(ab);
+ mutex_lock(&audit_cmd_mutex);
+ }
+ break;
+ case AUDIT_ADD_RULE:
+ case AUDIT_DEL_RULE:
+ if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+ return -EINVAL;
+ if (audit_enabled == AUDIT_LOCKED) {
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
+ audit_log_end(ab);
+ return -EPERM;
+ }
+ err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
+ seq, data, nlmsg_len(nlh));
+ break;
+ case AUDIT_LIST_RULES:
+ err = audit_list_rules_send(skb, seq);
+ break;
+ case AUDIT_TRIM:
+ audit_trim_trees();
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=trim res=1");
+ audit_log_end(ab);
+ break;
+ case AUDIT_MAKE_EQUIV: {
+ void *bufp = data;
+ u32 sizes[2];
+ size_t msglen = nlmsg_len(nlh);
+ char *old, *new;
+
+ err = -EINVAL;
+ if (msglen < 2 * sizeof(u32))
+ break;
+ memcpy(sizes, bufp, 2 * sizeof(u32));
+ bufp += 2 * sizeof(u32);
+ msglen -= 2 * sizeof(u32);
+ old = audit_unpack_string(&bufp, &msglen, sizes[0]);
+ if (IS_ERR(old)) {
+ err = PTR_ERR(old);
+ break;
+ }
+ new = audit_unpack_string(&bufp, &msglen, sizes[1]);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ kfree(old);
+ break;
+ }
+ /* OK, here comes... */
+ err = audit_tag_tree(old, new);
+
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+
+ audit_log_format(ab, " op=make_equiv old=");
+ audit_log_untrustedstring(ab, old);
+ audit_log_format(ab, " new=");
+ audit_log_untrustedstring(ab, new);
+ audit_log_format(ab, " res=%d", !err);
+ audit_log_end(ab);
+ kfree(old);
+ kfree(new);
+ break;
+ }
+ case AUDIT_SIGNAL_INFO:
+ len = 0;
+ if (audit_sig_sid) {
+ err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+ if (err)
+ return err;
+ }
+ sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ if (!sig_data) {
+ if (audit_sig_sid)
+ security_release_secctx(ctx, len);
+ return -ENOMEM;
+ }
+ sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
+ sig_data->pid = audit_sig_pid;
+ if (audit_sig_sid) {
+ memcpy(sig_data->ctx, ctx, len);
+ security_release_secctx(ctx, len);
+ }
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
+ kfree(sig_data);
+ break;
+ case AUDIT_TTY_GET: {
+ struct audit_tty_status s;
+ struct task_struct *tsk = current;
+
+ spin_lock(&tsk->sighand->siglock);
+ s.enabled = tsk->signal->audit_tty;
+ s.log_passwd = tsk->signal->audit_tty_log_passwd;
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ break;
+ }
+ case AUDIT_TTY_SET: {
+ struct audit_tty_status s, old;
+ struct task_struct *tsk = current;
+ struct audit_buffer *ab;
+
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ /* check if new data is valid */
+ if ((s.enabled != 0 && s.enabled != 1) ||
+ (s.log_passwd != 0 && s.log_passwd != 1))
+ err = -EINVAL;
+
+ spin_lock(&tsk->sighand->siglock);
+ old.enabled = tsk->signal->audit_tty;
+ old.log_passwd = tsk->signal->audit_tty_log_passwd;
+ if (!err) {
+ tsk->signal->audit_tty = s.enabled;
+ tsk->signal->audit_tty_log_passwd = s.log_passwd;
+ }
+ spin_unlock(&tsk->sighand->siglock);
+
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
+ " old-log_passwd=%d new-log_passwd=%d res=%d",
+ old.enabled, s.enabled, old.log_passwd,
+ s.log_passwd, !err);
+ audit_log_end(ab);
+ break;
+ }
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err < 0 ? err : 0;
+}
+
+/*
+ * Get message from skb. Each message is processed by audit_receive_msg.
+ * Malformed skbs with wrong length are discarded silently.
+ */
+static void audit_receive_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+ /*
+ * len MUST be signed for nlmsg_next to be able to dec it below 0
+ * if the nlmsg_len was not aligned
+ */
+ int len;
+ int err;
+
+ nlh = nlmsg_hdr(skb);
+ len = skb->len;
+
+ while (nlmsg_ok(nlh, len)) {
+ err = audit_receive_msg(skb, nlh);
+ /* if err or if this message says it wants a response */
+ if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+ netlink_ack(skb, nlh, err);
+
+ nlh = nlmsg_next(nlh, &len);
+ }
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sk_buff *skb)
+{
+ mutex_lock(&audit_cmd_mutex);
+ audit_receive_skb(skb);
+ mutex_unlock(&audit_cmd_mutex);
+}
+
+/* Run custom bind function on netlink socket group connect or bind requests. */
+static int audit_bind(struct net *net, int group)
+{
+ if (!capable(CAP_AUDIT_READ))
+ return -EPERM;
+
+ return 0;
+}
+
+static int __net_init audit_net_init(struct net *net)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = audit_receive,
+ .bind = audit_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .groups = AUDIT_NLGRP_MAX,
+ };
+
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+
+ aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
+ if (aunet->nlsk == NULL) {
+ audit_panic("cannot initialize netlink socket in namespace");
+ return -ENOMEM;
+ }
+ aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ return 0;
+}
+
+static void __net_exit audit_net_exit(struct net *net)
+{
+ struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+ if (sock == audit_sock) {
+ audit_pid = 0;
+ audit_sock = NULL;
+ }
+
+ RCU_INIT_POINTER(aunet->nlsk, NULL);
+ synchronize_net();
+ netlink_kernel_release(sock);
+}
+
+static struct pernet_operations audit_net_ops __net_initdata = {
+ .init = audit_net_init,
+ .exit = audit_net_exit,
+ .id = &audit_net_id,
+ .size = sizeof(struct audit_net),
+};
+
+/* Initialize audit support at boot time. */
+static int __init audit_init(void)
+{
+ int i;
+
+ if (audit_initialized == AUDIT_DISABLED)
+ return 0;
+
+ pr_info("initializing netlink subsys (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ register_pernet_subsys(&audit_net_ops);
+
+ skb_queue_head_init(&audit_skb_queue);
+ skb_queue_head_init(&audit_skb_hold_queue);
+ audit_initialized = AUDIT_INITIALIZED;
+ audit_enabled = audit_default;
+ audit_ever_enabled |= !!audit_default;
+
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+ for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+ INIT_LIST_HEAD(&audit_inode_hash[i]);
+
+ return 0;
+}
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+ audit_default = !!simple_strtol(str, NULL, 0);
+ if (!audit_default)
+ audit_initialized = AUDIT_DISABLED;
+
+ pr_info("%s\n", audit_default ?
+ "enabled (after initialization)" : "disabled (until reboot)");
+
+ return 1;
+}
+__setup("audit=", audit_enable);
+
+/* Process kernel command-line parameter at boot time.
+ * audit_backlog_limit=<n> */
+static int __init audit_backlog_limit_set(char *str)
+{
+ u32 audit_backlog_limit_arg;
+
+ pr_info("audit_backlog_limit: ");
+ if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
+ pr_cont("using default of %u, unable to parse %s\n",
+ audit_backlog_limit, str);
+ return 1;
+ }
+
+ audit_backlog_limit = audit_backlog_limit_arg;
+ pr_cont("%d\n", audit_backlog_limit);
+
+ return 1;
+}
+__setup("audit_backlog_limit=", audit_backlog_limit_set);
+
+static void audit_buffer_free(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ if (!ab)
+ return;
+
+ if (ab->skb)
+ kfree_skb(ab->skb);
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (audit_freelist_count > AUDIT_MAXFREE)
+ kfree(ab);
+ else {
+ audit_freelist_count++;
+ list_add(&ab->list, &audit_freelist);
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+ gfp_t gfp_mask, int type)
+{
+ unsigned long flags;
+ struct audit_buffer *ab = NULL;
+ struct nlmsghdr *nlh;
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (!list_empty(&audit_freelist)) {
+ ab = list_entry(audit_freelist.next,
+ struct audit_buffer, list);
+ list_del(&ab->list);
+ --audit_freelist_count;
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+ if (!ab) {
+ ab = kmalloc(sizeof(*ab), gfp_mask);
+ if (!ab)
+ goto err;
+ }
+
+ ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
+
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
+ if (!ab->skb)
+ goto err;
+
+ nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+ if (!nlh)
+ goto out_kfree_skb;
+
+ return ab;
+
+out_kfree_skb:
+ kfree_skb(ab->skb);
+ ab->skb = NULL;
+err:
+ audit_buffer_free(ab);
+ return NULL;
+}
+
+/**
+ * audit_serial - compute a serial number for the audit record
+ *
+ * Compute a serial number for the audit record. Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces. The timestamp of the
+ * record and this serial number are used by the user-space tools to
+ * determine which pieces belong to the same audit record. The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit. However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts).
+ */
+unsigned int audit_serial(void)
+{
+ static atomic_t serial = ATOMIC_INIT(0);
+
+ return atomic_add_return(1, &serial);
+}
+
+static inline void audit_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
+{
+ if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
+ *t = CURRENT_TIME;
+ *serial = audit_serial();
+ }
+}
+
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static long wait_for_auditd(long sleep_time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ sleep_time = schedule_timeout(sleep_time);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+
+ return sleep_time;
+}
+
+/**
+ * audit_log_start - obtain an audit buffer
+ * @ctx: audit_context (may be NULL)
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ *
+ * Returns audit_buffer pointer on success or NULL on error.
+ *
+ * Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the task (ctx) is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, then
+ * task context (ctx) should be NULL.
+ */
+struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
+ int type)
+{
+ struct audit_buffer *ab = NULL;
+ struct timespec t;
+ unsigned int uninitialized_var(serial);
+ int reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
+ unsigned long timeout_start = jiffies;
+
+ if (audit_initialized != AUDIT_INITIALIZED)
+ return NULL;
+
+ if (unlikely(audit_filter_type(type)))
+ return NULL;
+
+ if (gfp_mask & __GFP_WAIT) {
+ if (audit_pid && audit_pid == current->pid)
+ gfp_mask &= ~__GFP_WAIT;
+ else
+ reserve = 0;
+ }
+
+ while (audit_backlog_limit
+ && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ long sleep_time;
+
+ sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
+ if (sleep_time > 0) {
+ sleep_time = wait_for_auditd(sleep_time);
+ if (sleep_time > 0)
+ continue;
+ }
+ }
+ if (audit_rate_check() && printk_ratelimit())
+ pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_skb_queue),
+ audit_backlog_limit);
+ audit_log_lost("backlog limit exceeded");
+ audit_backlog_wait_time = audit_backlog_wait_overflow;
+ wake_up(&audit_backlog_wait);
+ return NULL;
+ }
+
+ if (!reserve)
+ audit_backlog_wait_time = audit_backlog_wait_time_master;
+
+ ab = audit_buffer_alloc(ctx, gfp_mask, type);
+ if (!ab) {
+ audit_log_lost("out of memory in audit_log_start");
+ return NULL;
+ }
+
+ audit_get_stamp(ab->ctx, &t, &serial);
+
+ audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+ t.tv_sec, t.tv_nsec/1000000, serial);
+ return ab;
+}
+
+/**
+ * audit_expand - expand skb in the audit buffer
+ * @ab: audit_buffer
+ * @extra: space to add at tail of the skb
+ *
+ * Returns 0 (no space) on failed expansion, or available space if
+ * successful.
+ */
+static inline int audit_expand(struct audit_buffer *ab, int extra)
+{
+ struct sk_buff *skb = ab->skb;
+ int oldtail = skb_tailroom(skb);
+ int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask);
+ int newtail = skb_tailroom(skb);
+
+ if (ret < 0) {
+ audit_log_lost("out of memory in audit_expand");
+ return 0;
+ }
+
+ skb->truesize += newtail - oldtail;
+ return newtail;
+}
+
+/*
+ * Format an audit message into the audit buffer. If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprint
+ * will be called a second time. Currently, we assume that a printk
+ * can't format message larger than 1024 bytes, so we don't either.
+ */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+ va_list args)
+{
+ int len, avail;
+ struct sk_buff *skb;
+ va_list args2;
+
+ if (!ab)
+ return;
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ if (avail == 0) {
+ avail = audit_expand(ab, AUDIT_BUFSIZ);
+ if (!avail)
+ goto out;
+ }
+ va_copy(args2, args);
+ len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
+ if (len >= avail) {
+ /* The printk buffer is 1024 bytes long, so if we get
+ * here and AUDIT_BUFSIZ is at least 1024, then we can
+ * log everything that printk could have logged. */
+ avail = audit_expand(ab,
+ max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+ if (!avail)
+ goto out_va_end;
+ len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
+ }
+ if (len > 0)
+ skb_put(skb, len);
+out_va_end:
+ va_end(args2);
+out:
+ return;
+}
+
+/**
+ * audit_log_format - format a message into the audit buffer.
+ * @ab: audit_buffer
+ * @fmt: format string
+ * @...: optional parameters matching @fmt string
+ *
+ * All the work is done in audit_log_vformat.
+ */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!ab)
+ return;
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+}
+
+/**
+ * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * @ab: the audit_buffer
+ * @buf: buffer to convert to hex
+ * @len: length of @buf to be converted
+ *
+ * No return value; failure to expand is silently ignored.
+ *
+ * This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb.
+ */
+void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
+ size_t len)
+{
+ int i, avail, new_len;
+ unsigned char *ptr;
+ struct sk_buff *skb;
+
+ if (!ab)
+ return;
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ new_len = len<<1;
+ if (new_len >= avail) {
+ /* Round the buffer request up to the next multiple */
+ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
+ avail = audit_expand(ab, new_len);
+ if (!avail)
+ return;
+ }
+
+ ptr = skb_tail_pointer(skb);
+ for (i = 0; i < len; i++)
+ ptr = hex_byte_pack_upper(ptr, buf[i]);
+ *ptr = 0;
+ skb_put(skb, len << 1); /* new string is twice the old string */
+}
+
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+void audit_log_n_string(struct audit_buffer *ab, const char *string,
+ size_t slen)
+{
+ int avail, new_len;
+ unsigned char *ptr;
+ struct sk_buff *skb;
+
+ if (!ab)
+ return;
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ new_len = slen + 3; /* enclosing quotes + null terminator */
+ if (new_len > avail) {
+ avail = audit_expand(ab, new_len);
+ if (!avail)
+ return;
+ }
+ ptr = skb_tail_pointer(skb);
+ *ptr++ = '"';
+ memcpy(ptr, string, slen);
+ ptr += slen;
+ *ptr++ = '"';
+ *ptr = 0;
+ skb_put(skb, slen + 2); /* don't include null terminator */
+}
+
+/**
+ * audit_string_contains_control - does a string need to be logged in hex
+ * @string: string to be checked
+ * @len: max length of the string to check
+ */
+int audit_string_contains_control(const char *string, size_t len)
+{
+ const unsigned char *p;
+ for (p = string; p < (const unsigned char *)string + len; p++) {
+ if (*p == '"' || *p < 0x21 || *p > 0x7e)
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * audit_log_n_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @len: length of string (not including trailing null)
+ * @string: string to be logged
+ *
+ * This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
+ * or a space. Unescaped strings will start and end with a double quote mark.
+ * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
+ */
+void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
+ size_t len)
+{
+ if (audit_string_contains_control(string, len))
+ audit_log_n_hex(ab, string, len);
+ else
+ audit_log_n_string(ab, string, len);
+}
+
+/**
+ * audit_log_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_untrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+ audit_log_n_untrustedstring(ab, string, strlen(string));
+}
+
+/* This is a helper-function to print the escaped d_path */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+ const struct path *path)
+{
+ char *p, *pathname;
+
+ if (prefix)
+ audit_log_format(ab, "%s", prefix);
+
+ /* We will allow 11 spaces for ' (deleted)' to be appended */
+ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+ if (!pathname) {
+ audit_log_string(ab, "<no_memory>");
+ return;
+ }
+ p = d_path(path, pathname, PATH_MAX+11);
+ if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
+ /* FIXME: can we save some information here? */
+ audit_log_string(ab, "<too_long>");
+ } else
+ audit_log_untrustedstring(ab, p);
+ kfree(pathname);
+}
+
+void audit_log_session_info(struct audit_buffer *ab)
+{
+ unsigned int sessionid = audit_get_sessionid(current);
+ uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+
+ audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+}
+
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+ audit_log_format(ab, " key=");
+ if (key)
+ audit_log_untrustedstring(ab, key);
+ else
+ audit_log_format(ab, "(null)");
+}
+
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i) {
+ audit_log_format(ab, "%08x",
+ cap->cap[CAP_LAST_U32 - i]);
+ }
+}
+
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ kernel_cap_t *perm = &name->fcap.permitted;
+ kernel_cap_t *inh = &name->fcap.inheritable;
+ int log = 0;
+
+ if (!cap_isclear(*perm)) {
+ audit_log_cap(ab, "cap_fp", perm);
+ log = 1;
+ }
+ if (!cap_isclear(*inh)) {
+ audit_log_cap(ab, "cap_fi", inh);
+ log = 1;
+ }
+
+ if (log)
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
+}
+
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ const struct inode *inode)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ audit_copy_fcaps(name, dentry);
+}
+
+/**
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+void audit_log_name(struct audit_context *context, struct audit_names *n,
+ struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1)
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ audit_log_format(ab, " nametype=");
+ switch(n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, "NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, "PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, "DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, "CREATE");
+ break;
+ default:
+ audit_log_format(ab, "UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ char *ctx = NULL;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
+ if (!sid)
+ return 0;
+
+ error = security_secid_to_secctx(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ return 0;
+
+error_path:
+ audit_panic("error in audit_log_task_context");
+ return error;
+}
+EXPORT_SYMBOL(audit_log_task_context);
+
+void audit_log_d_path_exe(struct audit_buffer *ab,
+ struct mm_struct *mm)
+{
+ struct file *exe_file;
+
+ if (!mm)
+ goto out_null;
+
+ exe_file = get_mm_exe_file(mm);
+ if (!exe_file)
+ goto out_null;
+
+ audit_log_d_path(ab, " exe=", &exe_file->f_path);
+ fput(exe_file);
+ return;
+out_null:
+ audit_log_format(ab, " exe=(null)");
+}
+
+void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+{
+ const struct cred *cred;
+ char comm[sizeof(tsk->comm)];
+ char *tty;
+
+ if (!ab)
+ return;
+
+ /* tsk == current */
+ cred = current_cred();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_log_format(ab,
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
+ task_ppid_nr(tsk),
+ task_pid_nr(tsk),
+ from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid),
+ tty, audit_get_sessionid(tsk));
+
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
+
+ audit_log_d_path_exe(ab, tsk->mm);
+ audit_log_task_context(ab);
+}
+EXPORT_SYMBOL(audit_log_task_info);
+
+/**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+ struct audit_buffer *ab;
+ struct audit_names *name;
+
+ name = kzalloc(sizeof(*name), GFP_NOFS);
+ if (!name)
+ return;
+
+ /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
+ ab = audit_log_start(current->audit_context, GFP_KERNEL,
+ AUDIT_ANOM_LINK);
+ if (!ab)
+ goto out;
+ audit_log_format(ab, "op=%s", operation);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, " res=0");
+ audit_log_end(ab);
+
+ /* Generate AUDIT_PATH record with object. */
+ name->type = AUDIT_TYPE_NORMAL;
+ audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry));
+ audit_log_name(current->audit_context, name, link, 0, NULL);
+out:
+ kfree(name);
+}
+
+/**
+ * audit_log_end - end one audit record
+ * @ab: the audit_buffer
+ *
+ * netlink_unicast() cannot be called inside an irq context because it blocks
+ * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * the irq context. May be called in any context.
+ */
+void audit_log_end(struct audit_buffer *ab)
+{
+ if (!ab)
+ return;
+ if (!audit_rate_check()) {
+ audit_log_lost("rate limit exceeded");
+ } else {
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+
+ nlh->nlmsg_len = ab->skb->len;
+ kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
+
+ /*
+ * The original kaudit unicast socket sends up messages with
+ * nlmsg_len set to the payload length rather than the entire
+ * message length. This breaks the standard set by netlink.
+ * The existing auditd daemon assumes this breakage. Fixing
+ * this would require co-ordinating a change in the established
+ * protocol between the kaudit kernel subsystem and the auditd
+ * userspace code.
+ */
+ nlh->nlmsg_len -= NLMSG_HDRLEN;
+
+ if (audit_pid) {
+ skb_queue_tail(&audit_skb_queue, ab->skb);
+ wake_up_interruptible(&kauditd_wait);
+ } else {
+ audit_printk_skb(ab->skb);
+ }
+ ab->skb = NULL;
+ }
+ audit_buffer_free(ab);
+}
+
+/**
+ * audit_log - Log an audit record
+ * @ctx: audit context
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ * @fmt: format string to use
+ * @...: variable parameters matching the format string
+ *
+ * This is a convenience function that calls audit_log_start,
+ * audit_log_vformat, and audit_log_end. It may be called
+ * in any context.
+ */
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
+ const char *fmt, ...)
+{
+ struct audit_buffer *ab;
+ va_list args;
+
+ ab = audit_log_start(ctx, gfp_mask, type);
+ if (ab) {
+ va_start(args, fmt);
+ audit_log_vformat(ab, fmt, args);
+ va_end(args);
+ audit_log_end(ab);
+ }
+}
+
+#ifdef CONFIG_SECURITY
+/**
+ * audit_log_secctx - Converts and logs SELinux context
+ * @ab: audit_buffer
+ * @secid: security number
+ *
+ * This is a helper function that calls security_secid_to_secctx to convert
+ * secid to secctx and then adds the (converted) SELinux context to the audit
+ * log by calling audit_log_format, thus also preventing leak of internal secid
+ * to userspace. If secid cannot be converted audit_panic is called.
+ */
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+ u32 len;
+ char *secctx;
+
+ if (security_secid_to_secctx(secid, &secctx, &len)) {
+ audit_panic("Cannot convert secid to context");
+ } else {
+ audit_log_format(ab, " obj=%s", secctx);
+ security_release_secctx(secctx, len);
+ }
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
+EXPORT_SYMBOL(audit_log_start);
+EXPORT_SYMBOL(audit_log_end);
+EXPORT_SYMBOL(audit_log_format);
+EXPORT_SYMBOL(audit_log);
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 000000000..d641f9bb3
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,328 @@
+/* audit -- definition of audit_context structure and supporting types
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include <linux/audit.h>
+#include <linux/skbuff.h>
+#include <uapi/linux/mqueue.h>
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
+
+/* At task start time, the audit_state is set in the audit_context using
+ a per-task filter. At syscall entry, the audit_state is augmented by
+ the syscall filter. */
+enum audit_state {
+ AUDIT_DISABLED, /* Do not create per-task audit_context.
+ * No syscall-specific audit records can
+ * be generated. */
+ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ * and fill it in at syscall
+ * entry time. This makes a full
+ * syscall record available if some
+ * other part of the kernel decides it
+ * should be recorded. */
+ AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ * always fill it in at syscall entry
+ * time, and always write out the audit
+ * record at syscall exit time. */
+};
+
+/* Rule lists */
+struct audit_watch;
+struct audit_tree;
+struct audit_chunk;
+
+struct audit_entry {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct audit_krule rule;
+};
+
+struct audit_cap_data {
+ kernel_cap_t permitted;
+ kernel_cap_t inheritable;
+ union {
+ unsigned int fE; /* effective bit of file cap */
+ kernel_cap_t effective; /* effective set of process */
+ };
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
+struct audit_names {
+ struct list_head list; /* audit_context->names_list */
+
+ struct filename *name;
+ int name_len; /* number of chars to log */
+ bool hidden; /* don't log this record */
+
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ unsigned char type; /* record type */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit.
+ */
+ bool should_free;
+};
+
+struct audit_proctitle {
+ int len; /* length of the cmdline field. */
+ char *value; /* the cmdline field */
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int dummy; /* must be the first element */
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state, current_state;
+ unsigned int serial; /* serial number for record */
+ int major; /* syscall number */
+ struct timespec ctime; /* time of syscall entry */
+ unsigned long argv[4]; /* syscall arguments */
+ long return_code;/* syscall return code */
+ u64 prio;
+ int return_valid; /* return code is valid */
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* struct audit_names->list anchor */
+ char *filterkey; /* key for rule that triggered record */
+ struct path pwd;
+ struct audit_aux_data *aux;
+ struct audit_aux_data *aux_pids;
+ struct sockaddr_storage *sockaddr;
+ size_t sockaddr_len;
+ /* Save things to print about task_struct */
+ pid_t pid, ppid;
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+ int arch;
+
+ pid_t target_pid;
+ kuid_t target_auid;
+ kuid_t target_uid;
+ unsigned int target_sessionid;
+ u32 target_sid;
+ char target_comm[TASK_COMM_LEN];
+
+ struct audit_tree_refs *trees, *first_trees;
+ struct list_head killed_trees;
+ int tree_count;
+
+ int type;
+ union {
+ struct {
+ int nargs;
+ long args[6];
+ } socketcall;
+ struct {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ u32 osid;
+ int has_perm;
+ uid_t perm_uid;
+ gid_t perm_gid;
+ umode_t perm_mode;
+ unsigned long qbytes;
+ } ipc;
+ struct {
+ mqd_t mqdes;
+ struct mq_attr mqstat;
+ } mq_getsetattr;
+ struct {
+ mqd_t mqdes;
+ int sigev_signo;
+ } mq_notify;
+ struct {
+ mqd_t mqdes;
+ size_t msg_len;
+ unsigned int msg_prio;
+ struct timespec abs_timeout;
+ } mq_sendrecv;
+ struct {
+ int oflag;
+ umode_t mode;
+ struct mq_attr attr;
+ } mq_open;
+ struct {
+ pid_t pid;
+ struct audit_cap_data cap;
+ } capset;
+ struct {
+ int fd;
+ int flags;
+ } mmap;
+ struct {
+ int argc;
+ } execve;
+ };
+ int fds[2];
+ struct audit_proctitle proctitle;
+};
+
+extern u32 audit_ever_enabled;
+
+extern void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ const struct inode *inode);
+extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap);
+extern void audit_log_name(struct audit_context *context,
+ struct audit_names *n, struct path *path,
+ int record_num, int *call_panic);
+
+extern int audit_pid;
+
+#define AUDIT_INODE_BUCKETS 32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+ return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+/* Indicates that audit should log the full pathname. */
+#define AUDIT_NAME_FULL -1
+
+extern int audit_match_class(int class, unsigned syscall);
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
+extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
+extern int parent_len(const char *path);
+extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
+extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
+ int done, int multi,
+ const void *payload, int size);
+extern void audit_panic(const char *message);
+
+struct audit_netlink_list {
+ __u32 portid;
+ struct net *net;
+ struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct audit_net {
+ struct sock *nlsk;
+};
+
+extern int selinux_audit_rule_update(void);
+
+extern struct mutex audit_filter_mutex;
+extern void audit_free_rule_rcu(struct rcu_head *);
+extern struct list_head audit_filter_list[];
+
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+
+extern void audit_log_d_path_exe(struct audit_buffer *ab,
+ struct mm_struct *mm);
+
+/* audit watch functions */
+#ifdef CONFIG_AUDIT_WATCH
+extern void audit_put_watch(struct audit_watch *watch);
+extern void audit_get_watch(struct audit_watch *watch);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
+extern void audit_remove_watch_rule(struct audit_krule *krule);
+extern char *audit_watch_path(struct audit_watch *watch);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+#else
+#define audit_put_watch(w) {}
+#define audit_get_watch(w) {}
+#define audit_to_watch(k, p, l, o) (-EINVAL)
+#define audit_add_watch(k, l) (-EINVAL)
+#define audit_remove_watch_rule(k) BUG()
+#define audit_watch_path(w) ""
+#define audit_watch_compare(w, i, d) 0
+
+#endif /* CONFIG_AUDIT_WATCH */
+
+#ifdef CONFIG_AUDIT_TREE
+extern struct audit_chunk *audit_tree_lookup(const struct inode *);
+extern void audit_put_chunk(struct audit_chunk *);
+extern int audit_tree_match(struct audit_chunk *, struct audit_tree *);
+extern int audit_make_tree(struct audit_krule *, char *, u32);
+extern int audit_add_tree_rule(struct audit_krule *);
+extern int audit_remove_tree_rule(struct audit_krule *);
+extern void audit_trim_trees(void);
+extern int audit_tag_tree(char *old, char *new);
+extern const char *audit_tree_path(struct audit_tree *);
+extern void audit_put_tree(struct audit_tree *);
+extern void audit_kill_trees(struct list_head *);
+#else
+#define audit_remove_tree_rule(rule) BUG()
+#define audit_add_tree_rule(rule) -EINVAL
+#define audit_make_tree(rule, str, op) -EINVAL
+#define audit_trim_trees() (void)0
+#define audit_put_tree(tree) (void)0
+#define audit_tag_tree(old, new) -EINVAL
+#define audit_tree_path(rule) "" /* never called */
+#define audit_kill_trees(list) BUG()
+#endif
+
+extern char *audit_unpack_string(void **, size_t *, size_t);
+
+extern pid_t audit_sig_pid;
+extern kuid_t audit_sig_uid;
+extern u32 audit_sig_sid;
+
+#ifdef CONFIG_AUDITSYSCALL
+extern int __audit_signal_info(int sig, struct task_struct *t);
+static inline int audit_signal_info(int sig, struct task_struct *t)
+{
+ if (unlikely((audit_pid && t->tgid == audit_pid) ||
+ (audit_signals && !audit_dummy_context())))
+ return __audit_signal_info(sig, t);
+ return 0;
+}
+extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern struct list_head *audit_killed_trees(void);
+#else
+#define audit_signal_info(s,t) AUDIT_DISABLED
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#endif
+
+extern struct mutex audit_cmd_mutex;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
new file mode 100644
index 000000000..b0f987727
--- /dev/null
+++ b/kernel/audit_tree.c
@@ -0,0 +1,988 @@
+#include "audit.h"
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+
+struct audit_tree;
+struct audit_chunk;
+
+struct audit_tree {
+ atomic_t count;
+ int goner;
+ struct audit_chunk *root;
+ struct list_head chunks;
+ struct list_head rules;
+ struct list_head list;
+ struct list_head same_root;
+ struct rcu_head head;
+ char pathname[];
+};
+
+struct audit_chunk {
+ struct list_head hash;
+ struct fsnotify_mark mark;
+ struct list_head trees; /* with root here */
+ int dead;
+ int count;
+ atomic_long_t refs;
+ struct rcu_head head;
+ struct node {
+ struct list_head list;
+ struct audit_tree *owner;
+ unsigned index; /* index; upper bit indicates 'will prune' */
+ } owners[];
+};
+
+static LIST_HEAD(tree_list);
+static LIST_HEAD(prune_list);
+static struct task_struct *prune_thread;
+
+/*
+ * One struct chunk is attached to each inode of interest.
+ * We replace struct chunk on tagging/untagging.
+ * Rules have pointer to struct audit_tree.
+ * Rules have struct list_head rlist forming a list of rules over
+ * the same tree.
+ * References to struct chunk are collected at audit_inode{,_child}()
+ * time and used in AUDIT_TREE rule matching.
+ * These references are dropped at the same time we are calling
+ * audit_free_names(), etc.
+ *
+ * Cyclic lists galore:
+ * tree.chunks anchors chunk.owners[].list hash_lock
+ * tree.rules anchors rule.rlist audit_filter_mutex
+ * chunk.trees anchors tree.same_root hash_lock
+ * chunk.hash is a hash with middle bits of watch.inode as
+ * a hash function. RCU, hash_lock
+ *
+ * tree is refcounted; one reference for "some rules on rules_list refer to
+ * it", one for each chunk with pointer to it.
+ *
+ * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
+ *
+ * node.index allows to get from node.list to containing chunk.
+ * MSB of that sucker is stolen to mark taggings that we might have to
+ * revert - several operations have very unpleasant cleanup logics and
+ * that makes a difference. Some.
+ */
+
+static struct fsnotify_group *audit_tree_group;
+
+static struct audit_tree *alloc_tree(const char *s)
+{
+ struct audit_tree *tree;
+
+ tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ if (tree) {
+ atomic_set(&tree->count, 1);
+ tree->goner = 0;
+ INIT_LIST_HEAD(&tree->chunks);
+ INIT_LIST_HEAD(&tree->rules);
+ INIT_LIST_HEAD(&tree->list);
+ INIT_LIST_HEAD(&tree->same_root);
+ tree->root = NULL;
+ strcpy(tree->pathname, s);
+ }
+ return tree;
+}
+
+static inline void get_tree(struct audit_tree *tree)
+{
+ atomic_inc(&tree->count);
+}
+
+static inline void put_tree(struct audit_tree *tree)
+{
+ if (atomic_dec_and_test(&tree->count))
+ kfree_rcu(tree, head);
+}
+
+/* to avoid bringing the entire thing in audit.h */
+const char *audit_tree_path(struct audit_tree *tree)
+{
+ return tree->pathname;
+}
+
+static void free_chunk(struct audit_chunk *chunk)
+{
+ int i;
+
+ for (i = 0; i < chunk->count; i++) {
+ if (chunk->owners[i].owner)
+ put_tree(chunk->owners[i].owner);
+ }
+ kfree(chunk);
+}
+
+void audit_put_chunk(struct audit_chunk *chunk)
+{
+ if (atomic_long_dec_and_test(&chunk->refs))
+ free_chunk(chunk);
+}
+
+static void __put_chunk(struct rcu_head *rcu)
+{
+ struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+ audit_put_chunk(chunk);
+}
+
+static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+{
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ call_rcu(&chunk->head, __put_chunk);
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+ struct audit_chunk *chunk;
+ size_t size;
+ int i;
+
+ size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+ chunk = kzalloc(size, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->hash);
+ INIT_LIST_HEAD(&chunk->trees);
+ chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
+ for (i = 0; i < count; i++) {
+ INIT_LIST_HEAD(&chunk->owners[i].list);
+ chunk->owners[i].index = i;
+ }
+ fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ chunk->mark.mask = FS_IN_IGNORED;
+ return chunk;
+}
+
+enum {HASH_SIZE = 128};
+static struct list_head chunk_hash_heads[HASH_SIZE];
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
+
+static inline struct list_head *chunk_hash(const struct inode *inode)
+{
+ unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
+ return chunk_hash_heads + n % HASH_SIZE;
+}
+
+/* hash_lock & entry->lock is held by caller */
+static void insert_hash(struct audit_chunk *chunk)
+{
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct list_head *list;
+
+ if (!entry->inode)
+ return;
+ list = chunk_hash(entry->inode);
+ list_add_rcu(&chunk->hash, list);
+}
+
+/* called under rcu_read_lock */
+struct audit_chunk *audit_tree_lookup(const struct inode *inode)
+{
+ struct list_head *list = chunk_hash(inode);
+ struct audit_chunk *p;
+
+ list_for_each_entry_rcu(p, list, hash) {
+ /* mark.inode may have gone NULL, but who cares? */
+ if (p->mark.inode == inode) {
+ atomic_long_inc(&p->refs);
+ return p;
+ }
+ }
+ return NULL;
+}
+
+int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
+{
+ int n;
+ for (n = 0; n < chunk->count; n++)
+ if (chunk->owners[n].owner == tree)
+ return 1;
+ return 0;
+}
+
+/* tagging and untagging inodes with trees */
+
+static struct audit_chunk *find_chunk(struct node *p)
+{
+ int index = p->index & ~(1U<<31);
+ p -= index;
+ return container_of(p, struct audit_chunk, owners[0]);
+}
+
+static void untag_chunk(struct node *p)
+{
+ struct audit_chunk *chunk = find_chunk(p);
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct audit_chunk *new = NULL;
+ struct audit_tree *owner;
+ int size = chunk->count - 1;
+ int i, j;
+
+ fsnotify_get_mark(entry);
+
+ spin_unlock(&hash_lock);
+
+ if (size)
+ new = alloc_chunk(size);
+
+ spin_lock(&entry->lock);
+ if (chunk->dead || !entry->inode) {
+ spin_unlock(&entry->lock);
+ if (new)
+ free_chunk(new);
+ goto out;
+ }
+
+ owner = p->owner;
+
+ if (!size) {
+ chunk->dead = 1;
+ spin_lock(&hash_lock);
+ list_del_init(&chunk->trees);
+ if (owner->root == chunk)
+ owner->root = NULL;
+ list_del_init(&p->list);
+ list_del_rcu(&chunk->hash);
+ spin_unlock(&hash_lock);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ goto out;
+ }
+
+ if (!new)
+ goto Fallback;
+
+ fsnotify_duplicate_mark(&new->mark, entry);
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
+ fsnotify_put_mark(&new->mark);
+ goto Fallback;
+ }
+
+ chunk->dead = 1;
+ spin_lock(&hash_lock);
+ list_replace_init(&chunk->trees, &new->trees);
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+
+ for (i = j = 0; j <= size; i++, j++) {
+ struct audit_tree *s;
+ if (&chunk->owners[j] == p) {
+ list_del_init(&p->list);
+ i--;
+ continue;
+ }
+ s = chunk->owners[j].owner;
+ new->owners[i].owner = s;
+ new->owners[i].index = chunk->owners[j].index - j + i;
+ if (!s) /* result of earlier fallback */
+ continue;
+ get_tree(s);
+ list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
+ }
+
+ list_replace_rcu(&chunk->hash, &new->hash);
+ list_for_each_entry(owner, &new->trees, same_root)
+ owner->root = new;
+ spin_unlock(&hash_lock);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(&new->mark); /* drop initial reference */
+ goto out;
+
+Fallback:
+ // do the best we can
+ spin_lock(&hash_lock);
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+ spin_unlock(&hash_lock);
+ spin_unlock(&entry->lock);
+out:
+ fsnotify_put_mark(entry);
+ spin_lock(&hash_lock);
+}
+
+static int create_chunk(struct inode *inode, struct audit_tree *tree)
+{
+ struct fsnotify_mark *entry;
+ struct audit_chunk *chunk = alloc_chunk(1);
+ if (!chunk)
+ return -ENOMEM;
+
+ entry = &chunk->mark;
+ if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
+ fsnotify_put_mark(entry);
+ return -ENOSPC;
+ }
+
+ spin_lock(&entry->lock);
+ spin_lock(&hash_lock);
+ if (tree->goner) {
+ spin_unlock(&hash_lock);
+ chunk->dead = 1;
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry, audit_tree_group);
+ fsnotify_put_mark(entry);
+ return 0;
+ }
+ chunk->owners[0].index = (1U << 31);
+ chunk->owners[0].owner = tree;
+ get_tree(tree);
+ list_add(&chunk->owners[0].list, &tree->chunks);
+ if (!tree->root) {
+ tree->root = chunk;
+ list_add(&tree->same_root, &chunk->trees);
+ }
+ insert_hash(chunk);
+ spin_unlock(&hash_lock);
+ spin_unlock(&entry->lock);
+ fsnotify_put_mark(entry); /* drop initial reference */
+ return 0;
+}
+
+/* the first tagged inode becomes root of tree */
+static int tag_chunk(struct inode *inode, struct audit_tree *tree)
+{
+ struct fsnotify_mark *old_entry, *chunk_entry;
+ struct audit_tree *owner;
+ struct audit_chunk *chunk, *old;
+ struct node *p;
+ int n;
+
+ old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+ if (!old_entry)
+ return create_chunk(inode, tree);
+
+ old = container_of(old_entry, struct audit_chunk, mark);
+
+ /* are we already there? */
+ spin_lock(&hash_lock);
+ for (n = 0; n < old->count; n++) {
+ if (old->owners[n].owner == tree) {
+ spin_unlock(&hash_lock);
+ fsnotify_put_mark(old_entry);
+ return 0;
+ }
+ }
+ spin_unlock(&hash_lock);
+
+ chunk = alloc_chunk(old->count + 1);
+ if (!chunk) {
+ fsnotify_put_mark(old_entry);
+ return -ENOMEM;
+ }
+
+ chunk_entry = &chunk->mark;
+
+ spin_lock(&old_entry->lock);
+ if (!old_entry->inode) {
+ /* old_entry is being shot, lets just lie */
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(old_entry);
+ free_chunk(chunk);
+ return -ENOENT;
+ }
+
+ fsnotify_duplicate_mark(chunk_entry, old_entry);
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
+ return -ENOSPC;
+ }
+
+ /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
+ spin_lock(&chunk_entry->lock);
+ spin_lock(&hash_lock);
+
+ /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
+ if (tree->goner) {
+ spin_unlock(&hash_lock);
+ chunk->dead = 1;
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+
+ fsnotify_destroy_mark(chunk_entry, audit_tree_group);
+
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
+ return 0;
+ }
+ list_replace_init(&old->trees, &chunk->trees);
+ for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
+ struct audit_tree *s = old->owners[n].owner;
+ p->owner = s;
+ p->index = old->owners[n].index;
+ if (!s) /* result of fallback in untag */
+ continue;
+ get_tree(s);
+ list_replace_init(&old->owners[n].list, &p->list);
+ }
+ p->index = (chunk->count - 1) | (1U<<31);
+ p->owner = tree;
+ get_tree(tree);
+ list_add(&p->list, &tree->chunks);
+ list_replace_rcu(&old->hash, &chunk->hash);
+ list_for_each_entry(owner, &chunk->trees, same_root)
+ owner->root = chunk;
+ old->dead = 1;
+ if (!tree->root) {
+ tree->root = chunk;
+ list_add(&tree->same_root, &chunk->trees);
+ }
+ spin_unlock(&hash_lock);
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+ fsnotify_destroy_mark(old_entry, audit_tree_group);
+ fsnotify_put_mark(chunk_entry); /* drop initial reference */
+ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+ return 0;
+}
+
+static void audit_tree_log_remove_rule(struct audit_krule *rule)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove_rule");
+ audit_log_format(ab, " dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
+static void kill_rules(struct audit_tree *tree)
+{
+ struct audit_krule *rule, *next;
+ struct audit_entry *entry;
+
+ list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
+ entry = container_of(rule, struct audit_entry, rule);
+
+ list_del_init(&rule->rlist);
+ if (rule->tree) {
+ /* not a half-baked one */
+ audit_tree_log_remove_rule(rule);
+ rule->tree = NULL;
+ list_del_rcu(&entry->list);
+ list_del(&entry->rule.list);
+ call_rcu(&entry->rcu, audit_free_rule_rcu);
+ }
+ }
+}
+
+/*
+ * finish killing struct audit_tree
+ */
+static void prune_one(struct audit_tree *victim)
+{
+ spin_lock(&hash_lock);
+ while (!list_empty(&victim->chunks)) {
+ struct node *p;
+
+ p = list_entry(victim->chunks.next, struct node, list);
+
+ untag_chunk(p);
+ }
+ spin_unlock(&hash_lock);
+ put_tree(victim);
+}
+
+/* trim the uncommitted chunks from tree */
+
+static void trim_marked(struct audit_tree *tree)
+{
+ struct list_head *p, *q;
+ spin_lock(&hash_lock);
+ if (tree->goner) {
+ spin_unlock(&hash_lock);
+ return;
+ }
+ /* reorder */
+ for (p = tree->chunks.next; p != &tree->chunks; p = q) {
+ struct node *node = list_entry(p, struct node, list);
+ q = p->next;
+ if (node->index & (1U<<31)) {
+ list_del_init(p);
+ list_add(p, &tree->chunks);
+ }
+ }
+
+ while (!list_empty(&tree->chunks)) {
+ struct node *node;
+
+ node = list_entry(tree->chunks.next, struct node, list);
+
+ /* have we run out of marked? */
+ if (!(node->index & (1U<<31)))
+ break;
+
+ untag_chunk(node);
+ }
+ if (!tree->root && !tree->goner) {
+ tree->goner = 1;
+ spin_unlock(&hash_lock);
+ mutex_lock(&audit_filter_mutex);
+ kill_rules(tree);
+ list_del_init(&tree->list);
+ mutex_unlock(&audit_filter_mutex);
+ prune_one(tree);
+ } else {
+ spin_unlock(&hash_lock);
+ }
+}
+
+static void audit_schedule_prune(void);
+
+/* called with audit_filter_mutex */
+int audit_remove_tree_rule(struct audit_krule *rule)
+{
+ struct audit_tree *tree;
+ tree = rule->tree;
+ if (tree) {
+ spin_lock(&hash_lock);
+ list_del_init(&rule->rlist);
+ if (list_empty(&tree->rules) && !tree->goner) {
+ tree->root = NULL;
+ list_del_init(&tree->same_root);
+ tree->goner = 1;
+ list_move(&tree->list, &prune_list);
+ rule->tree = NULL;
+ spin_unlock(&hash_lock);
+ audit_schedule_prune();
+ return 1;
+ }
+ rule->tree = NULL;
+ spin_unlock(&hash_lock);
+ return 1;
+ }
+ return 0;
+}
+
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+ return d_backing_inode(mnt->mnt_root) == arg;
+}
+
+void audit_trim_trees(void)
+{
+ struct list_head cursor;
+
+ mutex_lock(&audit_filter_mutex);
+ list_add(&cursor, &tree_list);
+ while (cursor.next != &tree_list) {
+ struct audit_tree *tree;
+ struct path path;
+ struct vfsmount *root_mnt;
+ struct node *node;
+ int err;
+
+ tree = container_of(cursor.next, struct audit_tree, list);
+ get_tree(tree);
+ list_del(&cursor);
+ list_add(&cursor, &tree->list);
+ mutex_unlock(&audit_filter_mutex);
+
+ err = kern_path(tree->pathname, 0, &path);
+ if (err)
+ goto skip_it;
+
+ root_mnt = collect_mounts(&path);
+ path_put(&path);
+ if (IS_ERR(root_mnt))
+ goto skip_it;
+
+ spin_lock(&hash_lock);
+ list_for_each_entry(node, &tree->chunks, list) {
+ struct audit_chunk *chunk = find_chunk(node);
+ /* this could be NULL if the watch is dying else where... */
+ struct inode *inode = chunk->mark.inode;
+ node->index |= 1U<<31;
+ if (iterate_mounts(compare_root, inode, root_mnt))
+ node->index &= ~(1U<<31);
+ }
+ spin_unlock(&hash_lock);
+ trim_marked(tree);
+ drop_collected_mounts(root_mnt);
+skip_it:
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ }
+ list_del(&cursor);
+ mutex_unlock(&audit_filter_mutex);
+}
+
+int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
+{
+
+ if (pathname[0] != '/' ||
+ rule->listnr != AUDIT_FILTER_EXIT ||
+ op != Audit_equal ||
+ rule->inode_f || rule->watch || rule->tree)
+ return -EINVAL;
+ rule->tree = alloc_tree(pathname);
+ if (!rule->tree)
+ return -ENOMEM;
+ return 0;
+}
+
+void audit_put_tree(struct audit_tree *tree)
+{
+ put_tree(tree);
+}
+
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+ return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
+}
+
+/*
+ * That gets run when evict_chunk() ends up needing to kill audit_tree.
+ * Runs from a separate thread.
+ */
+static int prune_tree_thread(void *unused)
+{
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&prune_list))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ mutex_lock(&audit_cmd_mutex);
+ mutex_lock(&audit_filter_mutex);
+
+ while (!list_empty(&prune_list)) {
+ struct audit_tree *victim;
+
+ victim = list_entry(prune_list.next,
+ struct audit_tree, list);
+ list_del_init(&victim->list);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ prune_one(victim);
+
+ mutex_lock(&audit_filter_mutex);
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+ }
+ return 0;
+}
+
+static int audit_launch_prune(void)
+{
+ if (prune_thread)
+ return 0;
+ prune_thread = kthread_create(prune_tree_thread, NULL,
+ "audit_prune_tree");
+ if (IS_ERR(prune_thread)) {
+ pr_err("cannot start thread audit_prune_tree");
+ prune_thread = NULL;
+ return -ENOMEM;
+ } else {
+ wake_up_process(prune_thread);
+ return 0;
+ }
+}
+
+/* called with audit_filter_mutex */
+int audit_add_tree_rule(struct audit_krule *rule)
+{
+ struct audit_tree *seed = rule->tree, *tree;
+ struct path path;
+ struct vfsmount *mnt;
+ int err;
+
+ rule->tree = NULL;
+ list_for_each_entry(tree, &tree_list, list) {
+ if (!strcmp(seed->pathname, tree->pathname)) {
+ put_tree(seed);
+ rule->tree = tree;
+ list_add(&rule->rlist, &tree->rules);
+ return 0;
+ }
+ }
+ tree = seed;
+ list_add(&tree->list, &tree_list);
+ list_add(&rule->rlist, &tree->rules);
+ /* do not set rule->tree yet */
+ mutex_unlock(&audit_filter_mutex);
+
+ if (unlikely(!prune_thread)) {
+ err = audit_launch_prune();
+ if (err)
+ goto Err;
+ }
+
+ err = kern_path(tree->pathname, 0, &path);
+ if (err)
+ goto Err;
+ mnt = collect_mounts(&path);
+ path_put(&path);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
+ goto Err;
+ }
+
+ get_tree(tree);
+ err = iterate_mounts(tag_mount, tree, mnt);
+ drop_collected_mounts(mnt);
+
+ if (!err) {
+ struct node *node;
+ spin_lock(&hash_lock);
+ list_for_each_entry(node, &tree->chunks, list)
+ node->index &= ~(1U<<31);
+ spin_unlock(&hash_lock);
+ } else {
+ trim_marked(tree);
+ goto Err;
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ if (list_empty(&rule->rlist)) {
+ put_tree(tree);
+ return -ENOENT;
+ }
+ rule->tree = tree;
+ put_tree(tree);
+
+ return 0;
+Err:
+ mutex_lock(&audit_filter_mutex);
+ list_del_init(&tree->list);
+ list_del_init(&tree->rules);
+ put_tree(tree);
+ return err;
+}
+
+int audit_tag_tree(char *old, char *new)
+{
+ struct list_head cursor, barrier;
+ int failed = 0;
+ struct path path1, path2;
+ struct vfsmount *tagged;
+ int err;
+
+ err = kern_path(new, 0, &path2);
+ if (err)
+ return err;
+ tagged = collect_mounts(&path2);
+ path_put(&path2);
+ if (IS_ERR(tagged))
+ return PTR_ERR(tagged);
+
+ err = kern_path(old, 0, &path1);
+ if (err) {
+ drop_collected_mounts(tagged);
+ return err;
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ list_add(&barrier, &tree_list);
+ list_add(&cursor, &barrier);
+
+ while (cursor.next != &tree_list) {
+ struct audit_tree *tree;
+ int good_one = 0;
+
+ tree = container_of(cursor.next, struct audit_tree, list);
+ get_tree(tree);
+ list_del(&cursor);
+ list_add(&cursor, &tree->list);
+ mutex_unlock(&audit_filter_mutex);
+
+ err = kern_path(tree->pathname, 0, &path2);
+ if (!err) {
+ good_one = path_is_under(&path1, &path2);
+ path_put(&path2);
+ }
+
+ if (!good_one) {
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ continue;
+ }
+
+ failed = iterate_mounts(tag_mount, tree, tagged);
+ if (failed) {
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ break;
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ spin_lock(&hash_lock);
+ if (!tree->goner) {
+ list_del(&tree->list);
+ list_add(&tree->list, &tree_list);
+ }
+ spin_unlock(&hash_lock);
+ put_tree(tree);
+ }
+
+ while (barrier.prev != &tree_list) {
+ struct audit_tree *tree;
+
+ tree = container_of(barrier.prev, struct audit_tree, list);
+ get_tree(tree);
+ list_del(&tree->list);
+ list_add(&tree->list, &barrier);
+ mutex_unlock(&audit_filter_mutex);
+
+ if (!failed) {
+ struct node *node;
+ spin_lock(&hash_lock);
+ list_for_each_entry(node, &tree->chunks, list)
+ node->index &= ~(1U<<31);
+ spin_unlock(&hash_lock);
+ } else {
+ trim_marked(tree);
+ }
+
+ put_tree(tree);
+ mutex_lock(&audit_filter_mutex);
+ }
+ list_del(&barrier);
+ list_del(&cursor);
+ mutex_unlock(&audit_filter_mutex);
+ path_put(&path1);
+ drop_collected_mounts(tagged);
+ return failed;
+}
+
+
+static void audit_schedule_prune(void)
+{
+ wake_up_process(prune_thread);
+}
+
+/*
+ * ... and that one is done if evict_chunk() decides to delay until the end
+ * of syscall. Runs synchronously.
+ */
+void audit_kill_trees(struct list_head *list)
+{
+ mutex_lock(&audit_cmd_mutex);
+ mutex_lock(&audit_filter_mutex);
+
+ while (!list_empty(list)) {
+ struct audit_tree *victim;
+
+ victim = list_entry(list->next, struct audit_tree, list);
+ kill_rules(victim);
+ list_del_init(&victim->list);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ prune_one(victim);
+
+ mutex_lock(&audit_filter_mutex);
+ }
+
+ mutex_unlock(&audit_filter_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+}
+
+/*
+ * Here comes the stuff asynchronous to auditctl operations
+ */
+
+static void evict_chunk(struct audit_chunk *chunk)
+{
+ struct audit_tree *owner;
+ struct list_head *postponed = audit_killed_trees();
+ int need_prune = 0;
+ int n;
+
+ if (chunk->dead)
+ return;
+
+ chunk->dead = 1;
+ mutex_lock(&audit_filter_mutex);
+ spin_lock(&hash_lock);
+ while (!list_empty(&chunk->trees)) {
+ owner = list_entry(chunk->trees.next,
+ struct audit_tree, same_root);
+ owner->goner = 1;
+ owner->root = NULL;
+ list_del_init(&owner->same_root);
+ spin_unlock(&hash_lock);
+ if (!postponed) {
+ kill_rules(owner);
+ list_move(&owner->list, &prune_list);
+ need_prune = 1;
+ } else {
+ list_move(&owner->list, postponed);
+ }
+ spin_lock(&hash_lock);
+ }
+ list_del_rcu(&chunk->hash);
+ for (n = 0; n < chunk->count; n++)
+ list_del_init(&chunk->owners[n].list);
+ spin_unlock(&hash_lock);
+ mutex_unlock(&audit_filter_mutex);
+ if (need_prune)
+ audit_schedule_prune();
+}
+
+static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
+{
+ return 0;
+}
+
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
+{
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+
+ evict_chunk(chunk);
+
+ /*
+ * We are guaranteed to have at least one reference to the mark from
+ * either the inode or the caller of fsnotify_destroy_mark().
+ */
+ BUG_ON(atomic_read(&entry->refcnt) < 1);
+}
+
+static const struct fsnotify_ops audit_tree_ops = {
+ .handle_event = audit_tree_handle_event,
+ .freeing_mark = audit_tree_freeing_mark,
+};
+
+static int __init audit_tree_init(void)
+{
+ int i;
+
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ if (IS_ERR(audit_tree_group))
+ audit_panic("cannot initialize fsnotify group for rectree watches");
+
+ for (i = 0; i < HASH_SIZE; i++)
+ INIT_LIST_HEAD(&chunk_hash_heads[i]);
+
+ return 0;
+}
+__initcall(audit_tree_init);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 000000000..6e30024d9
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,519 @@
+/* audit_watch.c -- watching inodes
+ *
+ * Copyright 2003-2009 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
+ * event. Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * audit_remove_watch(). Additionally, an audit_watch may exist
+ * temporarily to assist in searching existing filter data. Each
+ * audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_watch {
+ atomic_t count; /* reference count */
+ dev_t dev; /* associated superblock device */
+ char *path; /* insertion path */
+ unsigned long ino; /* associated inode number */
+ struct audit_parent *parent; /* associated parent */
+ struct list_head wlist; /* entry in parent->watches list */
+ struct list_head rules; /* anchor for krule->rlist */
+};
+
+struct audit_parent {
+ struct list_head watches; /* anchor for audit_watch->wlist */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
+};
+
+/* fsnotify handle. */
+static struct fsnotify_group *audit_watch_group;
+
+/* fsnotify events we care about. */
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+
+static void audit_free_parent(struct audit_parent *parent)
+{
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
+
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(entry, struct audit_parent, mark);
+ audit_free_parent(parent);
+}
+
+static void audit_get_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_get_mark(&parent->mark);
+}
+
+static void audit_put_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode. If found a reference
+ * is taken on this parent.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+ struct audit_parent *parent = NULL;
+ struct fsnotify_mark *entry;
+
+ entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+ if (entry)
+ parent = container_of(entry, struct audit_parent, mark);
+
+ return parent;
+}
+
+void audit_get_watch(struct audit_watch *watch)
+{
+ atomic_inc(&watch->count);
+}
+
+void audit_put_watch(struct audit_watch *watch)
+{
+ if (atomic_dec_and_test(&watch->count)) {
+ WARN_ON(watch->parent);
+ WARN_ON(!list_empty(&watch->rules));
+ kfree(watch->path);
+ kfree(watch);
+ }
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+ list_del(&watch->wlist);
+ audit_put_parent(watch->parent);
+ watch->parent = NULL;
+ audit_put_watch(watch); /* match initial get */
+}
+
+char *audit_watch_path(struct audit_watch *watch)
+{
+ return watch->path;
+}
+
+int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
+{
+ return (watch->ino != (unsigned long)-1) &&
+ (watch->ino == ino) &&
+ (watch->dev == dev);
+}
+
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct path *path)
+{
+ struct inode *inode = d_backing_inode(path->dentry);
+ struct audit_parent *parent;
+ int ret;
+
+ parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+ if (unlikely(!parent))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&parent->watches);
+
+ fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+ parent->mark.mask = AUDIT_FS_WATCH;
+ ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+ if (ret < 0) {
+ audit_free_parent(parent);
+ return ERR_PTR(ret);
+ }
+
+ return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+ struct audit_watch *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (unlikely(!watch))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&watch->rules);
+ atomic_set(&watch->count, 1);
+ watch->path = path;
+ watch->dev = (dev_t)-1;
+ watch->ino = (unsigned long)-1;
+
+ return watch;
+}
+
+/* Translate a watch string to kernel respresentation. */
+int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
+{
+ struct audit_watch *watch;
+
+ if (!audit_watch_group)
+ return -EOPNOTSUPP;
+
+ if (path[0] != '/' || path[len-1] == '/' ||
+ krule->listnr != AUDIT_FILTER_EXIT ||
+ op != Audit_equal ||
+ krule->inode_f || krule->watch || krule->tree)
+ return -EINVAL;
+
+ watch = audit_init_watch(path);
+ if (IS_ERR(watch))
+ return PTR_ERR(watch);
+
+ audit_get_watch(watch);
+ krule->watch = watch;
+
+ return 0;
+}
+
+/* Duplicate the given audit watch. The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+ char *path;
+ struct audit_watch *new;
+
+ path = kstrdup(old->path, GFP_KERNEL);
+ if (unlikely(!path))
+ return ERR_PTR(-ENOMEM);
+
+ new = audit_init_watch(path);
+ if (IS_ERR(new)) {
+ kfree(path);
+ goto out;
+ }
+
+ new->dev = old->dev;
+ new->ino = old->ino;
+ audit_get_parent(old->parent);
+ new->parent = old->parent;
+
+out:
+ return new;
+}
+
+static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
+{
+ if (audit_enabled) {
+ struct audit_buffer *ab;
+ ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "auid=%u ses=%u op=",
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ audit_get_sessionid(current));
+ audit_log_string(ab, op);
+ audit_log_format(ab, " path=");
+ audit_log_untrustedstring(ab, w->path);
+ audit_log_key(ab, r->filterkey);
+ audit_log_format(ab, " list=%d res=1", r->listnr);
+ audit_log_end(ab);
+ }
+}
+
+/* Update inode info in audit rules based on filesystem event. */
+static void audit_update_watch(struct audit_parent *parent,
+ const char *dname, dev_t dev,
+ unsigned long ino, unsigned invalidating)
+{
+ struct audit_watch *owatch, *nwatch, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *oentry, *nentry;
+
+ mutex_lock(&audit_filter_mutex);
+ /* Run all of the watches on this parent looking for the one that
+ * matches the given dname */
+ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+ if (audit_compare_dname_path(dname, owatch->path,
+ AUDIT_NAME_FULL))
+ continue;
+
+ /* If the update involves invalidating rules, do the inode-based
+ * filtering now, so we don't omit records. */
+ if (invalidating && !audit_dummy_context())
+ audit_filter_inodes(current, current->audit_context);
+
+ /* updating ino will likely change which audit_hash_list we
+ * are on so we need a new watch for the new list */
+ nwatch = audit_dupe_watch(owatch);
+ if (IS_ERR(nwatch)) {
+ mutex_unlock(&audit_filter_mutex);
+ audit_panic("error updating watch, skipping");
+ return;
+ }
+ nwatch->dev = dev;
+ nwatch->ino = ino;
+
+ list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+ oentry = container_of(r, struct audit_entry, rule);
+ list_del(&oentry->rule.rlist);
+ list_del_rcu(&oentry->list);
+
+ nentry = audit_dupe_rule(&oentry->rule);
+ if (IS_ERR(nentry)) {
+ list_del(&oentry->rule.list);
+ audit_panic("error updating watch, removing");
+ } else {
+ int h = audit_hash_ino((u32)ino);
+
+ /*
+ * nentry->rule.watch == oentry->rule.watch so
+ * we must drop that reference and set it to our
+ * new watch.
+ */
+ audit_put_watch(nentry->rule.watch);
+ audit_get_watch(nwatch);
+ nentry->rule.watch = nwatch;
+ list_add(&nentry->rule.rlist, &nwatch->rules);
+ list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+ list_replace(&oentry->rule.list,
+ &nentry->rule.list);
+ }
+
+ audit_watch_log_rule_change(r, owatch, "updated_rules");
+
+ call_rcu(&oentry->rcu, audit_free_rule_rcu);
+ }
+
+ audit_remove_watch(owatch);
+ goto add_watch_to_parent; /* event applies to a single watch */
+ }
+ mutex_unlock(&audit_filter_mutex);
+ return;
+
+add_watch_to_parent:
+ list_add(&nwatch->wlist, &parent->watches);
+ mutex_unlock(&audit_filter_mutex);
+ return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+ struct audit_watch *w, *nextw;
+ struct audit_krule *r, *nextr;
+ struct audit_entry *e;
+
+ mutex_lock(&audit_filter_mutex);
+ list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+ list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+ e = container_of(r, struct audit_entry, rule);
+ audit_watch_log_rule_change(r, w, "remove_rule");
+ list_del(&r->rlist);
+ list_del(&r->list);
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+ }
+ audit_remove_watch(w);
+ }
+ mutex_unlock(&audit_filter_mutex);
+
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
+}
+
+/* Get path information necessary for adding watches. */
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
+{
+ struct dentry *d = kern_path_locked(watch->path, parent);
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+ mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+ if (d_is_positive(d)) {
+ /* update watch filter fields */
+ watch->dev = d_backing_inode(d)->i_sb->s_dev;
+ watch->ino = d_backing_inode(d)->i_ino;
+ }
+ dput(d);
+ return 0;
+}
+
+/* Associate the given rule with an existing parent.
+ * Caller must hold audit_filter_mutex. */
+static void audit_add_to_parent(struct audit_krule *krule,
+ struct audit_parent *parent)
+{
+ struct audit_watch *w, *watch = krule->watch;
+ int watch_found = 0;
+
+ BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
+ list_for_each_entry(w, &parent->watches, wlist) {
+ if (strcmp(watch->path, w->path))
+ continue;
+
+ watch_found = 1;
+
+ /* put krule's and initial refs to temporary watch */
+ audit_put_watch(watch);
+ audit_put_watch(watch);
+
+ audit_get_watch(w);
+ krule->watch = watch = w;
+ break;
+ }
+
+ if (!watch_found) {
+ audit_get_parent(parent);
+ watch->parent = parent;
+
+ list_add(&watch->wlist, &parent->watches);
+ }
+ list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex. */
+int audit_add_watch(struct audit_krule *krule, struct list_head **list)
+{
+ struct audit_watch *watch = krule->watch;
+ struct audit_parent *parent;
+ struct path parent_path;
+ int h, ret = 0;
+
+ mutex_unlock(&audit_filter_mutex);
+
+ /* Avoid calling path_lookup under audit_filter_mutex. */
+ ret = audit_get_nd(watch, &parent_path);
+
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+
+ if (ret)
+ return ret;
+
+ /* either find an old parent or attach a new one */
+ parent = audit_find_parent(d_backing_inode(parent_path.dentry));
+ if (!parent) {
+ parent = audit_init_parent(&parent_path);
+ if (IS_ERR(parent)) {
+ ret = PTR_ERR(parent);
+ goto error;
+ }
+ }
+
+ audit_add_to_parent(krule, parent);
+
+ /* match get in audit_find_parent or audit_init_parent */
+ audit_put_parent(parent);
+
+ h = audit_hash_ino((u32)watch->ino);
+ *list = &audit_inode_hash[h];
+error:
+ path_put(&parent_path);
+ return ret;
+}
+
+void audit_remove_watch_rule(struct audit_krule *krule)
+{
+ struct audit_watch *watch = krule->watch;
+ struct audit_parent *parent = watch->parent;
+
+ list_del(&krule->rlist);
+
+ if (list_empty(&watch->rules)) {
+ audit_remove_watch(watch);
+
+ if (list_empty(&parent->watches)) {
+ audit_get_parent(parent);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
+ audit_put_parent(parent);
+ }
+ }
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct inode *to_tell,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *dname, u32 cookie)
+{
+ struct inode *inode;
+ struct audit_parent *parent;
+
+ parent = container_of(inode_mark, struct audit_parent, mark);
+
+ BUG_ON(group != audit_watch_group);
+
+ switch (data_type) {
+ case (FSNOTIFY_EVENT_PATH):
+ inode = d_backing_inode(((struct path *)data)->dentry);
+ break;
+ case (FSNOTIFY_EVENT_INODE):
+ inode = (struct inode *)data;
+ break;
+ default:
+ BUG();
+ inode = NULL;
+ break;
+ };
+
+ if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+ else if (mask & (FS_DELETE|FS_MOVED_FROM))
+ audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+ audit_remove_parent_watches(parent);
+
+ return 0;
+}
+
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+ .handle_event = audit_watch_handle_event,
+};
+
+static int __init audit_watch_init(void)
+{
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ if (IS_ERR(audit_watch_group)) {
+ audit_watch_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
+ return 0;
+}
+device_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 000000000..72e1660a7
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,1416 @@
+/* auditfilter.c -- filtering of audit events
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include "audit.h"
+
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * Synchronizes writes and blocking reads of audit's filterlist
+ * data. Rcu is used to traverse the filterlist and access
+ * contents of structs audit_entry, audit_watch and opaque
+ * LSM rules during filtering. If modified, these structures
+ * must be copied and replace their counterparts in the filterlist.
+ * An audit_parent struct is not accessed during filtering, so may
+ * be written directly provided audit_filter_mutex is held.
+ */
+
+/* Audit filter lists, defined in <linux/audit.h> */
+struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+ LIST_HEAD_INIT(audit_filter_list[0]),
+ LIST_HEAD_INIT(audit_filter_list[1]),
+ LIST_HEAD_INIT(audit_filter_list[2]),
+ LIST_HEAD_INIT(audit_filter_list[3]),
+ LIST_HEAD_INIT(audit_filter_list[4]),
+ LIST_HEAD_INIT(audit_filter_list[5]),
+#if AUDIT_NR_FILTERS != 6
+#error Fix audit_filter_list initialiser
+#endif
+};
+static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
+ LIST_HEAD_INIT(audit_rules_list[0]),
+ LIST_HEAD_INIT(audit_rules_list[1]),
+ LIST_HEAD_INIT(audit_rules_list[2]),
+ LIST_HEAD_INIT(audit_rules_list[3]),
+ LIST_HEAD_INIT(audit_rules_list[4]),
+ LIST_HEAD_INIT(audit_rules_list[5]),
+};
+
+DEFINE_MUTEX(audit_filter_mutex);
+
+static void audit_free_lsm_field(struct audit_field *f)
+{
+ switch (f->type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ kfree(f->lsm_str);
+ security_audit_rule_free(f->lsm_rule);
+ }
+}
+
+static inline void audit_free_rule(struct audit_entry *e)
+{
+ int i;
+ struct audit_krule *erule = &e->rule;
+
+ /* some rules don't have associated watches */
+ if (erule->watch)
+ audit_put_watch(erule->watch);
+ if (erule->fields)
+ for (i = 0; i < erule->field_count; i++)
+ audit_free_lsm_field(&erule->fields[i]);
+ kfree(erule->fields);
+ kfree(erule->filterkey);
+ kfree(e);
+}
+
+void audit_free_rule_rcu(struct rcu_head *head)
+{
+ struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+ audit_free_rule(e);
+}
+
+/* Initialize an audit filterlist entry. */
+static inline struct audit_entry *audit_init_entry(u32 field_count)
+{
+ struct audit_entry *entry;
+ struct audit_field *fields;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (unlikely(!entry))
+ return NULL;
+
+ fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
+ if (unlikely(!fields)) {
+ kfree(entry);
+ return NULL;
+ }
+ entry->rule.fields = fields;
+
+ return entry;
+}
+
+/* Unpack a filter field's string representation from user-space
+ * buffer. */
+char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
+{
+ char *str;
+
+ if (!*bufp || (len == 0) || (len > *remain))
+ return ERR_PTR(-EINVAL);
+
+ /* Of the currently implemented string fields, PATH_MAX
+ * defines the longest valid length.
+ */
+ if (len > PATH_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ str = kmalloc(len + 1, GFP_KERNEL);
+ if (unlikely(!str))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(str, *bufp, len);
+ str[len] = 0;
+ *bufp += len;
+ *remain -= len;
+
+ return str;
+}
+
+/* Translate an inode field to kernel respresentation. */
+static inline int audit_to_inode(struct audit_krule *krule,
+ struct audit_field *f)
+{
+ if (krule->listnr != AUDIT_FILTER_EXIT ||
+ krule->inode_f || krule->watch || krule->tree ||
+ (f->op != Audit_equal && f->op != Audit_not_equal))
+ return -EINVAL;
+
+ krule->inode_f = f;
+ return 0;
+}
+
+static __u32 *classes[AUDIT_SYSCALL_CLASSES];
+
+int __init audit_register_class(int class, unsigned *list)
+{
+ __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ while (*list != ~0U) {
+ unsigned n = *list++;
+ if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
+ kfree(p);
+ return -EINVAL;
+ }
+ p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
+ }
+ if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
+ kfree(p);
+ return -EINVAL;
+ }
+ classes[class] = p;
+ return 0;
+}
+
+int audit_match_class(int class, unsigned syscall)
+{
+ if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
+ return 0;
+ if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
+ return 0;
+ return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
+}
+
+#ifdef CONFIG_AUDITSYSCALL
+static inline int audit_match_class_bits(int class, u32 *mask)
+{
+ int i;
+
+ if (classes[class]) {
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ if (mask[i] & classes[class][i])
+ return 0;
+ }
+ return 1;
+}
+
+static int audit_match_signal(struct audit_entry *entry)
+{
+ struct audit_field *arch = entry->rule.arch_f;
+
+ if (!arch) {
+ /* When arch is unspecified, we must check both masks on biarch
+ * as syscall number alone is ambiguous. */
+ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
+ entry->rule.mask) &&
+ audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
+ entry->rule.mask));
+ }
+
+ switch(audit_classify_arch(arch->val)) {
+ case 0: /* native */
+ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
+ entry->rule.mask));
+ case 1: /* 32bit on biarch */
+ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
+ entry->rule.mask));
+ default:
+ return 1;
+ }
+}
+#endif
+
+/* Common user-space to kernel rule translation. */
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
+{
+ unsigned listnr;
+ struct audit_entry *entry;
+ int i, err;
+
+ err = -EINVAL;
+ listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
+ switch(listnr) {
+ default:
+ goto exit_err;
+#ifdef CONFIG_AUDITSYSCALL
+ case AUDIT_FILTER_ENTRY:
+ if (rule->action == AUDIT_ALWAYS)
+ goto exit_err;
+ case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_TASK:
+#endif
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
+ ;
+ }
+ if (unlikely(rule->action == AUDIT_POSSIBLE)) {
+ pr_err("AUDIT_POSSIBLE is deprecated\n");
+ goto exit_err;
+ }
+ if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
+ goto exit_err;
+ if (rule->field_count > AUDIT_MAX_FIELDS)
+ goto exit_err;
+
+ err = -ENOMEM;
+ entry = audit_init_entry(rule->field_count);
+ if (!entry)
+ goto exit_err;
+
+ entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
+ entry->rule.listnr = listnr;
+ entry->rule.action = rule->action;
+ entry->rule.field_count = rule->field_count;
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ entry->rule.mask[i] = rule->mask[i];
+
+ for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
+ int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
+ __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
+ __u32 *class;
+
+ if (!(*p & AUDIT_BIT(bit)))
+ continue;
+ *p &= ~AUDIT_BIT(bit);
+ class = classes[i];
+ if (class) {
+ int j;
+ for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
+ entry->rule.mask[j] |= class[j];
+ }
+ }
+
+ return entry;
+
+exit_err:
+ return ERR_PTR(err);
+}
+
+static u32 audit_ops[] =
+{
+ [Audit_equal] = AUDIT_EQUAL,
+ [Audit_not_equal] = AUDIT_NOT_EQUAL,
+ [Audit_bitmask] = AUDIT_BIT_MASK,
+ [Audit_bittest] = AUDIT_BIT_TEST,
+ [Audit_lt] = AUDIT_LESS_THAN,
+ [Audit_gt] = AUDIT_GREATER_THAN,
+ [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
+ [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
+};
+
+static u32 audit_to_op(u32 op)
+{
+ u32 n;
+ for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
+ ;
+ return n;
+}
+
+/* check if an audit field is valid */
+static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
+{
+ switch(f->type) {
+ case AUDIT_MSGTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ entry->rule.listnr != AUDIT_FILTER_USER)
+ return -EINVAL;
+ break;
+ };
+
+ switch(f->type) {
+ default:
+ return -EINVAL;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ case AUDIT_PID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_INODE:
+ /* bit ops are only useful on syscall args */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ return -EINVAL;
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_WATCH:
+ case AUDIT_DIR:
+ case AUDIT_FILTERKEY:
+ break;
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ /* FALL THROUGH */
+ case AUDIT_ARCH:
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
+ return -EINVAL;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ return -EINVAL;
+ break;
+ case AUDIT_FILETYPE:
+ if (f->val & ~S_IFMT)
+ return -EINVAL;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
+ return -EINVAL;
+ break;
+ };
+ return 0;
+}
+
+/* Translate struct audit_rule_data to kernel's rule respresentation. */
+static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
+ size_t datasz)
+{
+ int err = 0;
+ struct audit_entry *entry;
+ void *bufp;
+ size_t remain = datasz - sizeof(struct audit_rule_data);
+ int i;
+ char *str;
+
+ entry = audit_to_entry_common(data);
+ if (IS_ERR(entry))
+ goto exit_nofree;
+
+ bufp = data->buf;
+ for (i = 0; i < data->field_count; i++) {
+ struct audit_field *f = &entry->rule.fields[i];
+
+ err = -EINVAL;
+
+ f->op = audit_to_op(data->fieldflags[i]);
+ if (f->op == Audit_bad)
+ goto exit_free;
+
+ f->type = data->fields[i];
+ f->val = data->values[i];
+
+ /* Support legacy tests for a valid loginuid */
+ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ f->type = AUDIT_LOGINUID_SET;
+ f->val = 0;
+ entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
+ }
+
+ err = audit_field_valid(entry, f);
+ if (err)
+ goto exit_free;
+
+ err = -EINVAL;
+ switch (f->type) {
+ case AUDIT_LOGINUID:
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_OBJ_UID:
+ f->uid = make_kuid(current_user_ns(), f->val);
+ if (!uid_valid(f->uid))
+ goto exit_free;
+ break;
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ f->gid = make_kgid(current_user_ns(), f->val);
+ if (!gid_valid(f->gid))
+ goto exit_free;
+ break;
+ case AUDIT_ARCH:
+ entry->rule.arch_f = f;
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+
+ err = security_audit_rule_init(f->type, f->op, str,
+ (void **)&f->lsm_rule);
+ /* Keep currently invalid fields around in case they
+ * become valid after a policy reload. */
+ if (err == -EINVAL) {
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ str);
+ err = 0;
+ }
+ if (err) {
+ kfree(str);
+ goto exit_free;
+ } else
+ f->lsm_str = str;
+ break;
+ case AUDIT_WATCH:
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+
+ err = audit_to_watch(&entry->rule, str, f->val, f->op);
+ if (err) {
+ kfree(str);
+ goto exit_free;
+ }
+ break;
+ case AUDIT_DIR:
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+
+ err = audit_make_tree(&entry->rule, str, f->op);
+ kfree(str);
+ if (err)
+ goto exit_free;
+ break;
+ case AUDIT_INODE:
+ err = audit_to_inode(&entry->rule, f);
+ if (err)
+ goto exit_free;
+ break;
+ case AUDIT_FILTERKEY:
+ if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+ entry->rule.filterkey = str;
+ break;
+ }
+ }
+
+ if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
+ entry->rule.inode_f = NULL;
+
+exit_nofree:
+ return entry;
+
+exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
+ audit_free_rule(entry);
+ return ERR_PTR(err);
+}
+
+/* Pack a filter field's string representation into data block. */
+static inline size_t audit_pack_string(void **bufp, const char *str)
+{
+ size_t len = strlen(str);
+
+ memcpy(*bufp, str, len);
+ *bufp += len;
+
+ return len;
+}
+
+/* Translate kernel rule respresentation to struct audit_rule_data. */
+static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
+{
+ struct audit_rule_data *data;
+ void *bufp;
+ int i;
+
+ data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ if (unlikely(!data))
+ return NULL;
+ memset(data, 0, sizeof(*data));
+
+ data->flags = krule->flags | krule->listnr;
+ data->action = krule->action;
+ data->field_count = krule->field_count;
+ bufp = data->buf;
+ for (i = 0; i < data->field_count; i++) {
+ struct audit_field *f = &krule->fields[i];
+
+ data->fields[i] = f->type;
+ data->fieldflags[i] = audit_ops[f->op];
+ switch(f->type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, f->lsm_str);
+ break;
+ case AUDIT_WATCH:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp,
+ audit_watch_path(krule->watch));
+ break;
+ case AUDIT_DIR:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp,
+ audit_tree_path(krule->tree));
+ break;
+ case AUDIT_FILTERKEY:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, krule->filterkey);
+ break;
+ case AUDIT_LOGINUID_SET:
+ if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
+ data->fields[i] = AUDIT_LOGINUID;
+ data->values[i] = AUDIT_UID_UNSET;
+ break;
+ }
+ /* fallthrough if set */
+ default:
+ data->values[i] = f->val;
+ }
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+
+ return data;
+}
+
+/* Compare two rules in kernel format. Considered success if rules
+ * don't match. */
+static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
+{
+ int i;
+
+ if (a->flags != b->flags ||
+ a->pflags != b->pflags ||
+ a->listnr != b->listnr ||
+ a->action != b->action ||
+ a->field_count != b->field_count)
+ return 1;
+
+ for (i = 0; i < a->field_count; i++) {
+ if (a->fields[i].type != b->fields[i].type ||
+ a->fields[i].op != b->fields[i].op)
+ return 1;
+
+ switch(a->fields[i].type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str))
+ return 1;
+ break;
+ case AUDIT_WATCH:
+ if (strcmp(audit_watch_path(a->watch),
+ audit_watch_path(b->watch)))
+ return 1;
+ break;
+ case AUDIT_DIR:
+ if (strcmp(audit_tree_path(a->tree),
+ audit_tree_path(b->tree)))
+ return 1;
+ break;
+ case AUDIT_FILTERKEY:
+ /* both filterkeys exist based on above type compare */
+ if (strcmp(a->filterkey, b->filterkey))
+ return 1;
+ break;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
+ return 1;
+ break;
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
+ return 1;
+ break;
+ default:
+ if (a->fields[i].val != b->fields[i].val)
+ return 1;
+ }
+ }
+
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ if (a->mask[i] != b->mask[i])
+ return 1;
+
+ return 0;
+}
+
+/* Duplicate LSM field information. The lsm_rule is opaque, so must be
+ * re-initialized. */
+static inline int audit_dupe_lsm_field(struct audit_field *df,
+ struct audit_field *sf)
+{
+ int ret = 0;
+ char *lsm_str;
+
+ /* our own copy of lsm_str */
+ lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
+ if (unlikely(!lsm_str))
+ return -ENOMEM;
+ df->lsm_str = lsm_str;
+
+ /* our own (refreshed) copy of lsm_rule */
+ ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
+ (void **)&df->lsm_rule);
+ /* Keep currently invalid fields around in case they
+ * become valid after a policy reload. */
+ if (ret == -EINVAL) {
+ pr_warn("audit rule for LSM \'%s\' is invalid\n",
+ df->lsm_str);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/* Duplicate an audit rule. This will be a deep copy with the exception
+ * of the watch - that pointer is carried over. The LSM specific fields
+ * will be updated in the copy. The point is to be able to replace the old
+ * rule with the new rule in the filterlist, then free the old rule.
+ * The rlist element is undefined; list manipulations are handled apart from
+ * the initial copy. */
+struct audit_entry *audit_dupe_rule(struct audit_krule *old)
+{
+ u32 fcount = old->field_count;
+ struct audit_entry *entry;
+ struct audit_krule *new;
+ char *fk;
+ int i, err = 0;
+
+ entry = audit_init_entry(fcount);
+ if (unlikely(!entry))
+ return ERR_PTR(-ENOMEM);
+
+ new = &entry->rule;
+ new->flags = old->flags;
+ new->pflags = old->pflags;
+ new->listnr = old->listnr;
+ new->action = old->action;
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ new->mask[i] = old->mask[i];
+ new->prio = old->prio;
+ new->buflen = old->buflen;
+ new->inode_f = old->inode_f;
+ new->field_count = old->field_count;
+
+ /*
+ * note that we are OK with not refcounting here; audit_match_tree()
+ * never dereferences tree and we can't get false positives there
+ * since we'd have to have rule gone from the list *and* removed
+ * before the chunks found by lookup had been allocated, i.e. before
+ * the beginning of list scan.
+ */
+ new->tree = old->tree;
+ memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
+
+ /* deep copy this information, updating the lsm_rule fields, because
+ * the originals will all be freed when the old rule is freed. */
+ for (i = 0; i < fcount; i++) {
+ switch (new->fields[i].type) {
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ err = audit_dupe_lsm_field(&new->fields[i],
+ &old->fields[i]);
+ break;
+ case AUDIT_FILTERKEY:
+ fk = kstrdup(old->filterkey, GFP_KERNEL);
+ if (unlikely(!fk))
+ err = -ENOMEM;
+ else
+ new->filterkey = fk;
+ }
+ if (err) {
+ audit_free_rule(entry);
+ return ERR_PTR(err);
+ }
+ }
+
+ if (old->watch) {
+ audit_get_watch(old->watch);
+ new->watch = old->watch;
+ }
+
+ return entry;
+}
+
+/* Find an existing audit rule.
+ * Caller must hold audit_filter_mutex to prevent stale rule data. */
+static struct audit_entry *audit_find_rule(struct audit_entry *entry,
+ struct list_head **p)
+{
+ struct audit_entry *e, *found = NULL;
+ struct list_head *list;
+ int h;
+
+ if (entry->rule.inode_f) {
+ h = audit_hash_ino(entry->rule.inode_f->val);
+ *p = list = &audit_inode_hash[h];
+ } else if (entry->rule.watch) {
+ /* we don't know the inode number, so must walk entire hash */
+ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
+ list = &audit_inode_hash[h];
+ list_for_each_entry(e, list, list)
+ if (!audit_compare_rule(&entry->rule, &e->rule)) {
+ found = e;
+ goto out;
+ }
+ }
+ goto out;
+ } else {
+ *p = list = &audit_filter_list[entry->rule.listnr];
+ }
+
+ list_for_each_entry(e, list, list)
+ if (!audit_compare_rule(&entry->rule, &e->rule)) {
+ found = e;
+ goto out;
+ }
+
+out:
+ return found;
+}
+
+static u64 prio_low = ~0ULL/2;
+static u64 prio_high = ~0ULL/2 - 1;
+
+/* Add rule to given filterlist if not a duplicate. */
+static inline int audit_add_rule(struct audit_entry *entry)
+{
+ struct audit_entry *e;
+ struct audit_watch *watch = entry->rule.watch;
+ struct audit_tree *tree = entry->rule.tree;
+ struct list_head *list;
+ int err;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
+
+ mutex_lock(&audit_filter_mutex);
+ e = audit_find_rule(entry, &list);
+ if (e) {
+ mutex_unlock(&audit_filter_mutex);
+ err = -EEXIST;
+ /* normally audit_add_tree_rule() will free it on failure */
+ if (tree)
+ audit_put_tree(tree);
+ goto error;
+ }
+
+ if (watch) {
+ /* audit_filter_mutex is dropped and re-taken during this call */
+ err = audit_add_watch(&entry->rule, &list);
+ if (err) {
+ mutex_unlock(&audit_filter_mutex);
+ /*
+ * normally audit_add_tree_rule() will free it
+ * on failure
+ */
+ if (tree)
+ audit_put_tree(tree);
+ goto error;
+ }
+ }
+ if (tree) {
+ err = audit_add_tree_rule(&entry->rule);
+ if (err) {
+ mutex_unlock(&audit_filter_mutex);
+ goto error;
+ }
+ }
+
+ entry->rule.prio = ~0ULL;
+ if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+ if (entry->rule.flags & AUDIT_FILTER_PREPEND)
+ entry->rule.prio = ++prio_high;
+ else
+ entry->rule.prio = --prio_low;
+ }
+
+ if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+ list_add(&entry->rule.list,
+ &audit_rules_list[entry->rule.listnr]);
+ list_add_rcu(&entry->list, list);
+ entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
+ } else {
+ list_add_tail(&entry->rule.list,
+ &audit_rules_list[entry->rule.listnr]);
+ list_add_tail_rcu(&entry->list, list);
+ }
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules++;
+
+ if (!audit_match_signal(entry))
+ audit_signals++;
+#endif
+ mutex_unlock(&audit_filter_mutex);
+
+ return 0;
+
+error:
+ if (watch)
+ audit_put_watch(watch); /* tmp watch, matches initial get */
+ return err;
+}
+
+/* Remove an existing rule from filterlist. */
+static inline int audit_del_rule(struct audit_entry *entry)
+{
+ struct audit_entry *e;
+ struct audit_watch *watch = entry->rule.watch;
+ struct audit_tree *tree = entry->rule.tree;
+ struct list_head *list;
+ int ret = 0;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
+
+ mutex_lock(&audit_filter_mutex);
+ e = audit_find_rule(entry, &list);
+ if (!e) {
+ mutex_unlock(&audit_filter_mutex);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (e->rule.watch)
+ audit_remove_watch_rule(&e->rule);
+
+ if (e->rule.tree)
+ audit_remove_tree_rule(&e->rule);
+
+ list_del_rcu(&e->list);
+ list_del(&e->rule.list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules--;
+
+ if (!audit_match_signal(entry))
+ audit_signals--;
+#endif
+ mutex_unlock(&audit_filter_mutex);
+
+out:
+ if (watch)
+ audit_put_watch(watch); /* match initial get */
+ if (tree)
+ audit_put_tree(tree); /* that's the temporary one */
+
+ return ret;
+}
+
+/* List rules using struct audit_rule_data. */
+static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
+{
+ struct sk_buff *skb;
+ struct audit_krule *r;
+ int i;
+
+ /* This is a blocking read, so use audit_filter_mutex instead of rcu
+ * iterator to sync with list writers. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(r, &audit_rules_list[i], list) {
+ struct audit_rule_data *data;
+
+ data = audit_krule_to_data(r);
+ if (unlikely(!data))
+ break;
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
+ 0, 1, data,
+ sizeof(*data) + data->buflen);
+ if (skb)
+ skb_queue_tail(q, skb);
+ kfree(data);
+ }
+ }
+ skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ if (skb)
+ skb_queue_tail(q, skb);
+}
+
+/* Log rule additions and removals */
+static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
+{
+ struct audit_buffer *ab;
+ uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+ unsigned int sessionid = audit_get_sessionid(current);
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (!ab)
+ return;
+ audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " op=");
+ audit_log_string(ab, action);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_rule_change - apply all rules to the specified message type
+ * @type: audit message type
+ * @portid: target port id for netlink audit messages
+ * @seq: netlink audit message sequence (serial) number
+ * @data: payload data
+ * @datasz: size of payload data
+ */
+int audit_rule_change(int type, __u32 portid, int seq, void *data,
+ size_t datasz)
+{
+ int err = 0;
+ struct audit_entry *entry;
+
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ switch (type) {
+ case AUDIT_ADD_RULE:
+ err = audit_add_rule(entry);
+ audit_log_rule_change("add_rule", &entry->rule, !err);
+ break;
+ case AUDIT_DEL_RULE:
+ err = audit_del_rule(entry);
+ audit_log_rule_change("remove_rule", &entry->rule, !err);
+ break;
+ default:
+ err = -EINVAL;
+ WARN_ON(1);
+ }
+
+ if (err || type == AUDIT_DEL_RULE)
+ audit_free_rule(entry);
+
+ return err;
+}
+
+/**
+ * audit_list_rules_send - list the audit rules
+ * @request_skb: skb of request we are replying to (used to target the reply)
+ * @seq: netlink audit message sequence (serial) number
+ */
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
+{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+ struct task_struct *tsk;
+ struct audit_netlink_list *dest;
+ int err = 0;
+
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest->net = get_net(net);
+ dest->portid = portid;
+ skb_queue_head_init(&dest->q);
+
+ mutex_lock(&audit_filter_mutex);
+ audit_list_rules(portid, seq, &dest->q);
+ mutex_unlock(&audit_filter_mutex);
+
+ tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ if (IS_ERR(tsk)) {
+ skb_queue_purge(&dest->q);
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
+
+ return err;
+}
+
+int audit_comparator(u32 left, u32 op, u32 right)
+{
+ switch (op) {
+ case Audit_equal:
+ return (left == right);
+ case Audit_not_equal:
+ return (left != right);
+ case Audit_lt:
+ return (left < right);
+ case Audit_le:
+ return (left <= right);
+ case Audit_gt:
+ return (left > right);
+ case Audit_ge:
+ return (left >= right);
+ case Audit_bitmask:
+ return (left & right);
+ case Audit_bittest:
+ return ((left & right) == right);
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
+{
+ switch (op) {
+ case Audit_equal:
+ return uid_eq(left, right);
+ case Audit_not_equal:
+ return !uid_eq(left, right);
+ case Audit_lt:
+ return uid_lt(left, right);
+ case Audit_le:
+ return uid_lte(left, right);
+ case Audit_gt:
+ return uid_gt(left, right);
+ case Audit_ge:
+ return uid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
+{
+ switch (op) {
+ case Audit_equal:
+ return gid_eq(left, right);
+ case Audit_not_equal:
+ return !gid_eq(left, right);
+ case Audit_lt:
+ return gid_lt(left, right);
+ case Audit_le:
+ return gid_lte(left, right);
+ case Audit_gt:
+ return gid_gt(left, right);
+ case Audit_ge:
+ return gid_gte(left, right);
+ case Audit_bitmask:
+ case Audit_bittest:
+ default:
+ BUG();
+ return 0;
+ }
+}
+
+/**
+ * parent_len - find the length of the parent portion of a pathname
+ * @path: pathname of which to determine length
+ */
+int parent_len(const char *path)
+{
+ int plen;
+ const char *p;
+
+ plen = strlen(path);
+
+ if (plen == 0)
+ return plen;
+
+ /* disregard trailing slashes */
+ p = path + plen - 1;
+ while ((*p == '/') && (p > path))
+ p--;
+
+ /* walk backward until we find the next slash or hit beginning */
+ while ((*p != '/') && (p > path))
+ p--;
+
+ /* did we find a slash? Then increment to include it in path */
+ if (*p == '/')
+ p++;
+
+ return p - path;
+}
+
+/**
+ * audit_compare_dname_path - compare given dentry name with last component in
+ * given path. Return of 0 indicates a match.
+ * @dname: dentry name that we're comparing
+ * @path: full pathname that we're comparing
+ * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
+ * here indicates that we must compute this value.
+ */
+int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+{
+ int dlen, pathlen;
+ const char *p;
+
+ dlen = strlen(dname);
+ pathlen = strlen(path);
+ if (pathlen < dlen)
+ return 1;
+
+ parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
+ if (pathlen - parentlen != dlen)
+ return 1;
+
+ p = path + parentlen;
+
+ return strncmp(p, dname, dlen);
+}
+
+static int audit_filter_user_rules(struct audit_krule *rule, int type,
+ enum audit_state *state)
+{
+ int i;
+
+ for (i = 0; i < rule->field_count; i++) {
+ struct audit_field *f = &rule->fields[i];
+ pid_t pid;
+ int result = 0;
+ u32 sid;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ pid = task_pid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
+ break;
+ case AUDIT_UID:
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
+ break;
+ case AUDIT_GID:
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
+ break;
+ case AUDIT_LOGINUID:
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
+ f->op, f->val);
+ break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(type, f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ if (f->lsm_rule) {
+ security_task_getsecid(current, &sid);
+ result = security_audit_rule_match(sid,
+ f->type,
+ f->op,
+ f->lsm_rule,
+ NULL);
+ }
+ break;
+ }
+
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+int audit_filter_user(int type)
+{
+ enum audit_state state = AUDIT_DISABLED;
+ struct audit_entry *e;
+ int rc, ret;
+
+ ret = 1; /* Audit by default */
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+ rc = audit_filter_user_rules(&e->rule, type, &state);
+ if (rc) {
+ if (rc > 0 && state == AUDIT_DISABLED)
+ ret = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int audit_filter_type(int type)
+{
+ struct audit_entry *e;
+ int result = 0;
+
+ rcu_read_lock();
+ if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+ goto unlock_and_return;
+
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
+ list) {
+ int i;
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+ if (f->type == AUDIT_MSGTYPE) {
+ result = audit_comparator(type, f->op, f->val);
+ if (!result)
+ break;
+ }
+ }
+ if (result)
+ goto unlock_and_return;
+ }
+unlock_and_return:
+ rcu_read_unlock();
+ return result;
+}
+
+static int update_lsm_rule(struct audit_krule *r)
+{
+ struct audit_entry *entry = container_of(r, struct audit_entry, rule);
+ struct audit_entry *nentry;
+ int err = 0;
+
+ if (!security_audit_rule_known(r))
+ return 0;
+
+ nentry = audit_dupe_rule(r);
+ if (IS_ERR(nentry)) {
+ /* save the first error encountered for the
+ * return value */
+ err = PTR_ERR(nentry);
+ audit_panic("error updating LSM filters");
+ if (r->watch)
+ list_del(&r->rlist);
+ list_del_rcu(&entry->list);
+ list_del(&r->list);
+ } else {
+ if (r->watch || r->tree)
+ list_replace_init(&r->rlist, &nentry->rule.rlist);
+ list_replace_rcu(&entry->list, &nentry->list);
+ list_replace(&r->list, &nentry->rule.list);
+ }
+ call_rcu(&entry->rcu, audit_free_rule_rcu);
+
+ return err;
+}
+
+/* This function will re-initialize the lsm_rule field of all applicable rules.
+ * It will traverse the filter lists serarching for rules that contain LSM
+ * specific filter fields. When such a rule is found, it is copied, the
+ * LSM field is re-initialized, and the old rule is replaced with the
+ * updated rule. */
+int audit_update_lsm_rules(void)
+{
+ struct audit_krule *r, *n;
+ int i, err = 0;
+
+ /* audit_filter_mutex synchronizes the writers */
+ mutex_lock(&audit_filter_mutex);
+
+ for (i = 0; i < AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
+ int res = update_lsm_rule(r);
+ if (!err)
+ err = res;
+ }
+ }
+ mutex_unlock(&audit_filter_mutex);
+
+ return err;
+}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000..9fb9d1cb8
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,2426 @@
+/* auditsc.c -- System-call auditing support
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright (C) 2005, 2006 IBM Corporation
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
+ * 2006.
+ *
+ * The support of additional filter rules compares (>, <, >=, <=) was
+ * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
+ *
+ * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
+ * filesystem information.
+ *
+ * Subject and object context labeling support added by <danjones@us.ibm.com>
+ * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <asm/types.h>
+#include <linux/atomic.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/socket.h>
+#include <linux/mqueue.h>
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <linux/netlink.h>
+#include <linux/compiler.h>
+#include <asm/unistd.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/tty.h>
+#include <linux/binfmts.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <asm/syscall.h>
+#include <linux/capability.h>
+#include <linux/fs_struct.h>
+#include <linux/compat.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <uapi/linux/limits.h>
+
+#include "audit.h"
+
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
+/* no execve audit message should be longer than this (userspace limits) */
+#define MAX_EXECVE_AUDIT_LEN 7500
+
+/* max length to print of cmdline/proctitle value during audit */
+#define MAX_PROCTITLE_AUDIT_LEN 128
+
+/* number of audit rules */
+int audit_n_rules;
+
+/* determines whether we collect data for signals sent */
+int audit_signals;
+
+struct audit_aux_data {
+ struct audit_aux_data *next;
+ int type;
+};
+
+#define AUDIT_AUX_IPCPERM 0
+
+/* Number of target pids per aux struct. */
+#define AUDIT_AUX_PIDS 16
+
+struct audit_aux_data_pids {
+ struct audit_aux_data d;
+ pid_t target_pid[AUDIT_AUX_PIDS];
+ kuid_t target_auid[AUDIT_AUX_PIDS];
+ kuid_t target_uid[AUDIT_AUX_PIDS];
+ unsigned int target_sessionid[AUDIT_AUX_PIDS];
+ u32 target_sid[AUDIT_AUX_PIDS];
+ char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
+ int pid_count;
+};
+
+struct audit_aux_data_bprm_fcaps {
+ struct audit_aux_data d;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ struct audit_cap_data old_pcap;
+ struct audit_cap_data new_pcap;
+};
+
+struct audit_tree_refs {
+ struct audit_tree_refs *next;
+ struct audit_chunk *c[31];
+};
+
+static int audit_match_perm(struct audit_context *ctx, int mask)
+{
+ unsigned n;
+ if (unlikely(!ctx))
+ return 0;
+ n = ctx->major;
+
+ switch (audit_classify_syscall(ctx->arch, n)) {
+ case 0: /* native */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR, n))
+ return 1;
+ return 0;
+ case 1: /* 32bit on biarch */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR_32, n))
+ return 1;
+ return 0;
+ case 2: /* open */
+ return mask & ACC_MODE(ctx->argv[1]);
+ case 3: /* openat */
+ return mask & ACC_MODE(ctx->argv[2]);
+ case 4: /* socketcall */
+ return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
+ case 5: /* execve */
+ return mask & AUDIT_PERM_EXEC;
+ default:
+ return 0;
+ }
+}
+
+static int audit_match_filetype(struct audit_context *ctx, int val)
+{
+ struct audit_names *n;
+ umode_t mode = (umode_t)val;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if ((n->ino != -1) &&
+ ((n->mode & S_IFMT) == mode))
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
+ * ->first_trees points to its beginning, ->trees - to the current end of data.
+ * ->tree_count is the number of free entries in array pointed to by ->trees.
+ * Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL,
+ * "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously,
+ * it's going to remain 1-element for almost any setup) until we free context itself.
+ * References in it _are_ dropped - at the same time we free/drop aux stuff.
+ */
+
+#ifdef CONFIG_AUDIT_TREE
+static void audit_set_auditable(struct audit_context *ctx)
+{
+ if (!ctx->prio) {
+ ctx->prio = 1;
+ ctx->current_state = AUDIT_RECORD_CONTEXT;
+ }
+}
+
+static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
+{
+ struct audit_tree_refs *p = ctx->trees;
+ int left = ctx->tree_count;
+ if (likely(left)) {
+ p->c[--left] = chunk;
+ ctx->tree_count = left;
+ return 1;
+ }
+ if (!p)
+ return 0;
+ p = p->next;
+ if (p) {
+ p->c[30] = chunk;
+ ctx->trees = p;
+ ctx->tree_count = 30;
+ return 1;
+ }
+ return 0;
+}
+
+static int grow_tree_refs(struct audit_context *ctx)
+{
+ struct audit_tree_refs *p = ctx->trees;
+ ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL);
+ if (!ctx->trees) {
+ ctx->trees = p;
+ return 0;
+ }
+ if (p)
+ p->next = ctx->trees;
+ else
+ ctx->first_trees = ctx->trees;
+ ctx->tree_count = 31;
+ return 1;
+}
+#endif
+
+static void unroll_tree_refs(struct audit_context *ctx,
+ struct audit_tree_refs *p, int count)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_tree_refs *q;
+ int n;
+ if (!p) {
+ /* we started with empty chain */
+ p = ctx->first_trees;
+ count = 31;
+ /* if the very first allocation has failed, nothing to do */
+ if (!p)
+ return;
+ }
+ n = count;
+ for (q = p; q != ctx->trees; q = q->next, n = 31) {
+ while (n--) {
+ audit_put_chunk(q->c[n]);
+ q->c[n] = NULL;
+ }
+ }
+ while (n-- > ctx->tree_count) {
+ audit_put_chunk(q->c[n]);
+ q->c[n] = NULL;
+ }
+ ctx->trees = p;
+ ctx->tree_count = count;
+#endif
+}
+
+static void free_tree_refs(struct audit_context *ctx)
+{
+ struct audit_tree_refs *p, *q;
+ for (p = ctx->first_trees; p; p = q) {
+ q = p->next;
+ kfree(p);
+ }
+}
+
+static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_tree_refs *p;
+ int n;
+ if (!tree)
+ return 0;
+ /* full ones */
+ for (p = ctx->first_trees; p != ctx->trees; p = p->next) {
+ for (n = 0; n < 31; n++)
+ if (audit_tree_match(p->c[n], tree))
+ return 1;
+ }
+ /* partial */
+ if (p) {
+ for (n = ctx->tree_count; n < 31; n++)
+ if (audit_tree_match(p->c[n], tree))
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+static int audit_compare_uid(kuid_t uid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_uid_comparator(uid, f->op, name->uid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_uid_comparator(uid, f->op, n->uid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_compare_gid(kgid_t gid,
+ struct audit_names *name,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ int rc;
+
+ if (name) {
+ rc = audit_gid_comparator(gid, f->op, name->gid);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ rc = audit_gid_comparator(gid, f->op, n->gid);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_field_compare(struct task_struct *tsk,
+ const struct cred *cred,
+ struct audit_field *f,
+ struct audit_context *ctx,
+ struct audit_names *name)
+{
+ switch (f->val) {
+ /* process to file object comparisons */
+ case AUDIT_COMPARE_UID_TO_OBJ_UID:
+ return audit_compare_uid(cred->uid, name, f, ctx);
+ case AUDIT_COMPARE_GID_TO_OBJ_GID:
+ return audit_compare_gid(cred->gid, name, f, ctx);
+ case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->euid, name, f, ctx);
+ case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->egid, name, f, ctx);
+ case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+ return audit_compare_uid(tsk->loginuid, name, f, ctx);
+ case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->suid, name, f, ctx);
+ case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->sgid, name, f, ctx);
+ case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+ return audit_compare_uid(cred->fsuid, name, f, ctx);
+ case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+ return audit_compare_gid(cred->fsgid, name, f, ctx);
+ /* uid comparisons */
+ case AUDIT_COMPARE_UID_TO_AUID:
+ return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+ case AUDIT_COMPARE_UID_TO_EUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->euid);
+ case AUDIT_COMPARE_UID_TO_SUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->suid);
+ case AUDIT_COMPARE_UID_TO_FSUID:
+ return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
+ /* auid comparisons */
+ case AUDIT_COMPARE_AUID_TO_EUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+ case AUDIT_COMPARE_AUID_TO_SUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+ case AUDIT_COMPARE_AUID_TO_FSUID:
+ return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+ /* euid comparisons */
+ case AUDIT_COMPARE_EUID_TO_SUID:
+ return audit_uid_comparator(cred->euid, f->op, cred->suid);
+ case AUDIT_COMPARE_EUID_TO_FSUID:
+ return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
+ /* suid comparisons */
+ case AUDIT_COMPARE_SUID_TO_FSUID:
+ return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
+ /* gid comparisons */
+ case AUDIT_COMPARE_GID_TO_EGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->egid);
+ case AUDIT_COMPARE_GID_TO_SGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->sgid);
+ case AUDIT_COMPARE_GID_TO_FSGID:
+ return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
+ /* egid comparisons */
+ case AUDIT_COMPARE_EGID_TO_SGID:
+ return audit_gid_comparator(cred->egid, f->op, cred->sgid);
+ case AUDIT_COMPARE_EGID_TO_FSGID:
+ return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
+ /* sgid comparison */
+ case AUDIT_COMPARE_SGID_TO_FSGID:
+ return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
+ default:
+ WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
+ return 0;
+ }
+ return 0;
+}
+
+/* Determine if any context name data matches a rule's watch data */
+/* Compare a task_struct with an audit_rule. Return 1 on match, 0
+ * otherwise.
+ *
+ * If task_creation is true, this is an explicit indication that we are
+ * filtering a task rule at task creation time. This and tsk == current are
+ * the only situations where tsk->cred may be accessed without an rcu read lock.
+ */
+static int audit_filter_rules(struct task_struct *tsk,
+ struct audit_krule *rule,
+ struct audit_context *ctx,
+ struct audit_names *name,
+ enum audit_state *state,
+ bool task_creation)
+{
+ const struct cred *cred;
+ int i, need_sid = 1;
+ u32 sid;
+
+ cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
+ for (i = 0; i < rule->field_count; i++) {
+ struct audit_field *f = &rule->fields[i];
+ struct audit_names *n;
+ int result = 0;
+ pid_t pid;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ pid = task_pid_nr(tsk);
+ result = audit_comparator(pid, f->op, f->val);
+ break;
+ case AUDIT_PPID:
+ if (ctx) {
+ if (!ctx->ppid)
+ ctx->ppid = task_ppid_nr(tsk);
+ result = audit_comparator(ctx->ppid, f->op, f->val);
+ }
+ break;
+ case AUDIT_UID:
+ result = audit_uid_comparator(cred->uid, f->op, f->uid);
+ break;
+ case AUDIT_EUID:
+ result = audit_uid_comparator(cred->euid, f->op, f->uid);
+ break;
+ case AUDIT_SUID:
+ result = audit_uid_comparator(cred->suid, f->op, f->uid);
+ break;
+ case AUDIT_FSUID:
+ result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
+ break;
+ case AUDIT_GID:
+ result = audit_gid_comparator(cred->gid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_group_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_group_p(f->gid);
+ }
+ break;
+ case AUDIT_EGID:
+ result = audit_gid_comparator(cred->egid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_egroup_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_egroup_p(f->gid);
+ }
+ break;
+ case AUDIT_SGID:
+ result = audit_gid_comparator(cred->sgid, f->op, f->gid);
+ break;
+ case AUDIT_FSGID:
+ result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
+ break;
+ case AUDIT_PERS:
+ result = audit_comparator(tsk->personality, f->op, f->val);
+ break;
+ case AUDIT_ARCH:
+ if (ctx)
+ result = audit_comparator(ctx->arch, f->op, f->val);
+ break;
+
+ case AUDIT_EXIT:
+ if (ctx && ctx->return_valid)
+ result = audit_comparator(ctx->return_code, f->op, f->val);
+ break;
+ case AUDIT_SUCCESS:
+ if (ctx && ctx->return_valid) {
+ if (f->val)
+ result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
+ else
+ result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
+ }
+ break;
+ case AUDIT_DEVMAJOR:
+ if (name) {
+ if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_DEVMINOR:
+ if (name) {
+ if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+ audit_comparator(MINOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+ audit_comparator(MINOR(n->rdev), f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_INODE:
+ if (name)
+ result = audit_comparator(name->ino, f->op, f->val);
+ else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->ino, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_UID:
+ if (name) {
+ result = audit_uid_comparator(name->uid, f->op, f->uid);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_uid_comparator(n->uid, f->op, f->uid)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_GID:
+ if (name) {
+ result = audit_gid_comparator(name->gid, f->op, f->gid);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_gid_comparator(n->gid, f->op, f->gid)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_WATCH:
+ if (name)
+ result = audit_watch_compare(rule->watch, name->ino, name->dev);
+ break;
+ case AUDIT_DIR:
+ if (ctx)
+ result = match_tree_refs(ctx, rule->tree);
+ break;
+ case AUDIT_LOGINUID:
+ result = 0;
+ if (ctx)
+ result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ /* NOTE: this may return negative values indicating
+ a temporary error. We simply treat this as a
+ match for now to avoid losing information that
+ may be wanted. An error message will also be
+ logged upon error */
+ if (f->lsm_rule) {
+ if (need_sid) {
+ security_task_getsecid(tsk, &sid);
+ need_sid = 0;
+ }
+ result = security_audit_rule_match(sid, f->type,
+ f->op,
+ f->lsm_rule,
+ ctx);
+ }
+ break;
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
+ also applies here */
+ if (f->lsm_rule) {
+ /* Find files that match */
+ if (name) {
+ result = security_audit_rule_match(
+ name->osid, f->type, f->op,
+ f->lsm_rule, ctx);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (security_audit_rule_match(n->osid, f->type,
+ f->op, f->lsm_rule,
+ ctx)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ /* Find ipc objects that match */
+ if (!ctx || ctx->type != AUDIT_IPC)
+ break;
+ if (security_audit_rule_match(ctx->ipc.osid,
+ f->type, f->op,
+ f->lsm_rule, ctx))
+ ++result;
+ }
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ if (ctx)
+ result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
+ break;
+ case AUDIT_FILTERKEY:
+ /* ignore this field for filtering */
+ result = 1;
+ break;
+ case AUDIT_PERM:
+ result = audit_match_perm(ctx, f->val);
+ break;
+ case AUDIT_FILETYPE:
+ result = audit_match_filetype(ctx, f->val);
+ break;
+ case AUDIT_FIELD_COMPARE:
+ result = audit_field_compare(tsk, cred, f, ctx, name);
+ break;
+ }
+ if (!result)
+ return 0;
+ }
+
+ if (ctx) {
+ if (rule->prio <= ctx->prio)
+ return 0;
+ if (rule->filterkey) {
+ kfree(ctx->filterkey);
+ ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
+ }
+ ctx->prio = rule->prio;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+/* At process creation time, we can determine if system-call auditing is
+ * completely disabled for this task. Since we only have the task
+ * structure at this point, we can only check uid and gid.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+ &state, true)) {
+ if (state == AUDIT_RECORD_CONTEXT)
+ *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
+{
+ int word, bit;
+
+ if (val > 0xffffffff)
+ return false;
+
+ word = AUDIT_WORD(val);
+ if (word >= AUDIT_BITMASK_SIZE)
+ return false;
+
+ bit = AUDIT_BIT(val);
+
+ return rule->mask[word] & bit;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write an audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return AUDIT_DISABLED;
+
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, ctx->major) &&
+ audit_filter_rules(tsk, &e->rule, ctx, NULL,
+ &state, false)) {
+ rcu_read_unlock();
+ ctx->current_state = state;
+ return state;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/*
+ * Given an audit_name check the inode hash table to see if they match.
+ * Called holding the rcu read lock to protect the use of audit_inode_hash
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+ struct audit_names *n,
+ struct audit_context *ctx) {
+ int h = audit_hash_ino((u32)n->ino);
+ struct list_head *list = &audit_inode_hash[h];
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (list_empty(list))
+ return 0;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, ctx->major) &&
+ audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
+ * collected during syscall processing. We only check rules in sublists at hash
+ * buckets applicable to the inode numbers in audit_names.
+ * Regarding audit_state, same rules apply as for audit_filter_syscall().
+ */
+void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
+{
+ struct audit_names *n;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return;
+
+ rcu_read_lock();
+
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_filter_inode_name(tsk, n, ctx))
+ break;
+ }
+ rcu_read_unlock();
+}
+
+/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
+static inline struct audit_context *audit_take_context(struct task_struct *tsk,
+ int return_valid,
+ long return_code)
+{
+ struct audit_context *context = tsk->audit_context;
+
+ if (!context)
+ return NULL;
+ context->return_valid = return_valid;
+
+ /*
+ * we need to fix up the return code in the audit logs if the actual
+ * return codes are later going to be fixed up by the arch specific
+ * signal handlers
+ *
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(return_code <= -ERESTARTSYS) &&
+ (return_code >= -ERESTART_RESTARTBLOCK) &&
+ (return_code != -ENOIOCTLCMD))
+ context->return_code = -EINTR;
+ else
+ context->return_code = return_code;
+
+ if (context->in_syscall && !context->dummy) {
+ audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
+ audit_filter_inodes(tsk, context);
+ }
+
+ tsk->audit_context = NULL;
+ return context;
+}
+
+static inline void audit_proctitle_free(struct audit_context *context)
+{
+ kfree(context->proctitle.value);
+ context->proctitle.value = NULL;
+ context->proctitle.len = 0;
+}
+
+static inline void audit_free_names(struct audit_context *context)
+{
+ struct audit_names *n, *next;
+
+ list_for_each_entry_safe(n, next, &context->names_list, list) {
+ list_del(&n->list);
+ if (n->name)
+ putname(n->name);
+ if (n->should_free)
+ kfree(n);
+ }
+ context->name_count = 0;
+ path_put(&context->pwd);
+ context->pwd.dentry = NULL;
+ context->pwd.mnt = NULL;
+}
+
+static inline void audit_free_aux(struct audit_context *context)
+{
+ struct audit_aux_data *aux;
+
+ while ((aux = context->aux)) {
+ context->aux = aux->next;
+ kfree(aux);
+ }
+ while ((aux = context->aux_pids)) {
+ context->aux_pids = aux->next;
+ kfree(aux);
+ }
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+ struct audit_context *context;
+
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
+ return NULL;
+ context->state = state;
+ context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ INIT_LIST_HEAD(&context->killed_trees);
+ INIT_LIST_HEAD(&context->names_list);
+ return context;
+}
+
+/**
+ * audit_alloc - allocate an audit context block for a task
+ * @tsk: task
+ *
+ * Filter on the task information and allocate a per-task audit context
+ * if necessary. Doing so turns on system call auditing for the
+ * specified task. This is called from copy_process, so no lock is
+ * needed.
+ */
+int audit_alloc(struct task_struct *tsk)
+{
+ struct audit_context *context;
+ enum audit_state state;
+ char *key = NULL;
+
+ if (likely(!audit_ever_enabled))
+ return 0; /* Return if not auditing. */
+
+ state = audit_filter_task(tsk, &key);
+ if (state == AUDIT_DISABLED) {
+ clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ return 0;
+ }
+
+ if (!(context = audit_alloc_context(state))) {
+ kfree(key);
+ audit_log_lost("out of memory in audit_alloc");
+ return -ENOMEM;
+ }
+ context->filterkey = key;
+
+ tsk->audit_context = context;
+ set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ return 0;
+}
+
+static inline void audit_free_context(struct audit_context *context)
+{
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ free_tree_refs(context);
+ audit_free_aux(context);
+ kfree(context->filterkey);
+ kfree(context->sockaddr);
+ audit_proctitle_free(context);
+ kfree(context);
+}
+
+static int audit_log_pid_context(struct audit_context *context, pid_t pid,
+ kuid_t auid, kuid_t uid, unsigned int sessionid,
+ u32 sid, char *comm)
+{
+ struct audit_buffer *ab;
+ char *ctx = NULL;
+ u32 len;
+ int rc = 0;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
+ if (!ab)
+ return rc;
+
+ audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid), sessionid);
+ if (sid) {
+ if (security_secid_to_secctx(sid, &ctx, &len)) {
+ audit_log_format(ab, " obj=(none)");
+ rc = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+ audit_log_format(ab, " ocomm=");
+ audit_log_untrustedstring(ab, comm);
+ audit_log_end(ab);
+
+ return rc;
+}
+
+/*
+ * to_send and len_sent accounting are very loose estimates. We aren't
+ * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
+ * within about 500 bytes (next page boundary)
+ *
+ * why snprintf? an int is up to 12 digits long. if we just assumed when
+ * logging that a[%d]= was going to be 16 characters long we would be wasting
+ * space in every audit message. In one 7500 byte message we can log up to
+ * about 1000 min size arguments. That comes down to about 50% waste of space
+ * if we didn't do the snprintf to find out how long arg_num_len was.
+ */
+static int audit_log_single_execve_arg(struct audit_context *context,
+ struct audit_buffer **ab,
+ int arg_num,
+ size_t *len_sent,
+ const char __user *p,
+ char *buf)
+{
+ char arg_num_len_buf[12];
+ const char __user *tmp_p = p;
+ /* how many digits are in arg_num? 5 is the length of ' a=""' */
+ size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
+ size_t len, len_left, to_send;
+ size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
+ unsigned int i, has_cntl = 0, too_long = 0;
+ int ret;
+
+ /* strnlen_user includes the null we don't want to send */
+ len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /*
+ * We just created this mm, if we can't find the strings
+ * we just copied into it something is _very_ wrong. Similar
+ * for strings that are too long, we should not have created
+ * any.
+ */
+ if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) {
+ WARN_ON(1);
+ send_sig(SIGKILL, current, 0);
+ return -1;
+ }
+
+ /* walk the whole argument looking for non-ascii chars */
+ do {
+ if (len_left > MAX_EXECVE_AUDIT_LEN)
+ to_send = MAX_EXECVE_AUDIT_LEN;
+ else
+ to_send = len_left;
+ ret = copy_from_user(buf, tmp_p, to_send);
+ /*
+ * There is no reason for this copy to be short. We just
+ * copied them here, and the mm hasn't been exposed to user-
+ * space yet.
+ */
+ if (ret) {
+ WARN_ON(1);
+ send_sig(SIGKILL, current, 0);
+ return -1;
+ }
+ buf[to_send] = '\0';
+ has_cntl = audit_string_contains_control(buf, to_send);
+ if (has_cntl) {
+ /*
+ * hex messages get logged as 2 bytes, so we can only
+ * send half as much in each message
+ */
+ max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
+ break;
+ }
+ len_left -= to_send;
+ tmp_p += to_send;
+ } while (len_left > 0);
+
+ len_left = len;
+
+ if (len > max_execve_audit_len)
+ too_long = 1;
+
+ /* rewalk the argument actually logging the message */
+ for (i = 0; len_left > 0; i++) {
+ int room_left;
+
+ if (len_left > max_execve_audit_len)
+ to_send = max_execve_audit_len;
+ else
+ to_send = len_left;
+
+ /* do we have space left to send this argument in this ab? */
+ room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
+ if (has_cntl)
+ room_left -= (to_send * 2);
+ else
+ room_left -= to_send;
+ if (room_left < 0) {
+ *len_sent = 0;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ return 0;
+ }
+
+ /*
+ * first record needs to say how long the original string was
+ * so we can be sure nothing was lost.
+ */
+ if ((i == 0) && (too_long))
+ audit_log_format(*ab, " a%d_len=%zu", arg_num,
+ has_cntl ? 2*len : len);
+
+ /*
+ * normally arguments are small enough to fit and we already
+ * filled buf above when we checked for control characters
+ * so don't bother with another copy_from_user
+ */
+ if (len >= max_execve_audit_len)
+ ret = copy_from_user(buf, p, to_send);
+ else
+ ret = 0;
+ if (ret) {
+ WARN_ON(1);
+ send_sig(SIGKILL, current, 0);
+ return -1;
+ }
+ buf[to_send] = '\0';
+
+ /* actually log it */
+ audit_log_format(*ab, " a%d", arg_num);
+ if (too_long)
+ audit_log_format(*ab, "[%d]", i);
+ audit_log_format(*ab, "=");
+ if (has_cntl)
+ audit_log_n_hex(*ab, buf, to_send);
+ else
+ audit_log_string(*ab, buf);
+
+ p += to_send;
+ len_left -= to_send;
+ *len_sent += arg_num_len;
+ if (has_cntl)
+ *len_sent += to_send * 2;
+ else
+ *len_sent += to_send;
+ }
+ /* include the null we didn't log */
+ return len + 1;
+}
+
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab)
+{
+ int i, len;
+ size_t len_sent = 0;
+ const char __user *p;
+ char *buf;
+
+ p = (const char __user *)current->mm->arg_start;
+
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+ /*
+ * we need some kernel buffer to hold the userspace args. Just
+ * allocate one big one rather than allocating one of the right size
+ * for every single argument inside audit_log_single_execve_arg()
+ * should be <8k allocation so should be pretty safe.
+ */
+ buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf) {
+ audit_panic("out of memory for argv string");
+ return;
+ }
+
+ for (i = 0; i < context->execve.argc; i++) {
+ len = audit_log_single_execve_arg(context, ab, i,
+ &len_sent, p, buf);
+ if (len <= 0)
+ break;
+ p += len;
+ }
+ kfree(buf);
+}
+
+static void show_special(struct audit_context *context, int *call_panic)
+{
+ struct audit_buffer *ab;
+ int i;
+
+ ab = audit_log_start(context, GFP_KERNEL, context->type);
+ if (!ab)
+ return;
+
+ switch (context->type) {
+ case AUDIT_SOCKETCALL: {
+ int nargs = context->socketcall.nargs;
+ audit_log_format(ab, "nargs=%d", nargs);
+ for (i = 0; i < nargs; i++)
+ audit_log_format(ab, " a%d=%lx", i,
+ context->socketcall.args[i]);
+ break; }
+ case AUDIT_IPC: {
+ u32 osid = context->ipc.osid;
+
+ audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
+ from_kuid(&init_user_ns, context->ipc.uid),
+ from_kgid(&init_user_ns, context->ipc.gid),
+ context->ipc.mode);
+ if (osid) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", osid);
+ *call_panic = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+ if (context->ipc.has_perm) {
+ audit_log_end(ab);
+ ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_IPC_SET_PERM);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab,
+ "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
+ context->ipc.qbytes,
+ context->ipc.perm_uid,
+ context->ipc.perm_gid,
+ context->ipc.perm_mode);
+ }
+ break; }
+ case AUDIT_MQ_OPEN: {
+ audit_log_format(ab,
+ "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
+ "mq_msgsize=%ld mq_curmsgs=%ld",
+ context->mq_open.oflag, context->mq_open.mode,
+ context->mq_open.attr.mq_flags,
+ context->mq_open.attr.mq_maxmsg,
+ context->mq_open.attr.mq_msgsize,
+ context->mq_open.attr.mq_curmsgs);
+ break; }
+ case AUDIT_MQ_SENDRECV: {
+ audit_log_format(ab,
+ "mqdes=%d msg_len=%zd msg_prio=%u "
+ "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+ context->mq_sendrecv.mqdes,
+ context->mq_sendrecv.msg_len,
+ context->mq_sendrecv.msg_prio,
+ context->mq_sendrecv.abs_timeout.tv_sec,
+ context->mq_sendrecv.abs_timeout.tv_nsec);
+ break; }
+ case AUDIT_MQ_NOTIFY: {
+ audit_log_format(ab, "mqdes=%d sigev_signo=%d",
+ context->mq_notify.mqdes,
+ context->mq_notify.sigev_signo);
+ break; }
+ case AUDIT_MQ_GETSETATTR: {
+ struct mq_attr *attr = &context->mq_getsetattr.mqstat;
+ audit_log_format(ab,
+ "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
+ "mq_curmsgs=%ld ",
+ context->mq_getsetattr.mqdes,
+ attr->mq_flags, attr->mq_maxmsg,
+ attr->mq_msgsize, attr->mq_curmsgs);
+ break; }
+ case AUDIT_CAPSET: {
+ audit_log_format(ab, "pid=%d", context->capset.pid);
+ audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
+ audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
+ audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
+ break; }
+ case AUDIT_MMAP: {
+ audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
+ context->mmap.flags);
+ break; }
+ case AUDIT_EXECVE: {
+ audit_log_execve_info(context, &ab);
+ break; }
+ }
+ audit_log_end(ab);
+}
+
+static inline int audit_proctitle_rtrim(char *proctitle, int len)
+{
+ char *end = proctitle + len - 1;
+ while (end > proctitle && !isprint(*end))
+ end--;
+
+ /* catch the case where proctitle is only 1 non-print character */
+ len = end - proctitle + 1;
+ len -= isprint(proctitle[len-1]) == 0;
+ return len;
+}
+
+static void audit_log_proctitle(struct task_struct *tsk,
+ struct audit_context *context)
+{
+ int res;
+ char *buf;
+ char *msg = "(null)";
+ int len = strlen(msg);
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
+ if (!ab)
+ return; /* audit_panic or being filtered */
+
+ audit_log_format(ab, "proctitle=");
+
+ /* Not cached */
+ if (!context->proctitle.value) {
+ buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ /* Historically called this from procfs naming */
+ res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ res = audit_proctitle_rtrim(buf, res);
+ if (res == 0) {
+ kfree(buf);
+ goto out;
+ }
+ context->proctitle.value = buf;
+ context->proctitle.len = res;
+ }
+ msg = context->proctitle.value;
+ len = context->proctitle.len;
+out:
+ audit_log_n_untrustedstring(ab, msg, len);
+ audit_log_end(ab);
+}
+
+static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
+{
+ int i, call_panic = 0;
+ struct audit_buffer *ab;
+ struct audit_aux_data *aux;
+ struct audit_names *n;
+
+ /* tsk == current */
+ context->personality = tsk->personality;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ if (!ab)
+ return; /* audit_panic has been called */
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
+ context->return_code);
+
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
+
+ audit_log_task_info(ab, tsk);
+ audit_log_key(ab, context->filterkey);
+ audit_log_end(ab);
+
+ for (aux = context->aux; aux; aux = aux->next) {
+
+ ab = audit_log_start(context, GFP_KERNEL, aux->type);
+ if (!ab)
+ continue; /* audit_panic has been called */
+
+ switch (aux->type) {
+
+ case AUDIT_BPRM_FCAPS: {
+ struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
+ audit_log_format(ab, "fver=%x", axs->fcap_ver);
+ audit_log_cap(ab, "fp", &axs->fcap.permitted);
+ audit_log_cap(ab, "fi", &axs->fcap.inheritable);
+ audit_log_format(ab, " fe=%d", axs->fcap.fE);
+ audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
+ audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
+ audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
+ audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
+ audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
+ audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+ break; }
+
+ }
+ audit_log_end(ab);
+ }
+
+ if (context->type)
+ show_special(context, &call_panic);
+
+ if (context->fds[0] >= 0) {
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
+ if (ab) {
+ audit_log_format(ab, "fd0=%d fd1=%d",
+ context->fds[0], context->fds[1]);
+ audit_log_end(ab);
+ }
+ }
+
+ if (context->sockaddr_len) {
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
+ if (ab) {
+ audit_log_format(ab, "saddr=");
+ audit_log_n_hex(ab, (void *)context->sockaddr,
+ context->sockaddr_len);
+ audit_log_end(ab);
+ }
+ }
+
+ for (aux = context->aux_pids; aux; aux = aux->next) {
+ struct audit_aux_data_pids *axs = (void *)aux;
+
+ for (i = 0; i < axs->pid_count; i++)
+ if (audit_log_pid_context(context, axs->target_pid[i],
+ axs->target_auid[i],
+ axs->target_uid[i],
+ axs->target_sessionid[i],
+ axs->target_sid[i],
+ axs->target_comm[i]))
+ call_panic = 1;
+ }
+
+ if (context->target_pid &&
+ audit_log_pid_context(context, context->target_pid,
+ context->target_auid, context->target_uid,
+ context->target_sessionid,
+ context->target_sid, context->target_comm))
+ call_panic = 1;
+
+ if (context->pwd.dentry && context->pwd.mnt) {
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
+ if (ab) {
+ audit_log_d_path(ab, " cwd=", &context->pwd);
+ audit_log_end(ab);
+ }
+ }
+
+ i = 0;
+ list_for_each_entry(n, &context->names_list, list) {
+ if (n->hidden)
+ continue;
+ audit_log_name(context, n, NULL, i++, &call_panic);
+ }
+
+ audit_log_proctitle(tsk, context);
+
+ /* Send end of event record to help user space know we are finished */
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
+ if (ab)
+ audit_log_end(ab);
+ if (call_panic)
+ audit_panic("error converting sid to string");
+}
+
+/**
+ * audit_free - free a per-task audit context
+ * @tsk: task whose audit context block to free
+ *
+ * Called from copy_process and do_exit
+ */
+void __audit_free(struct task_struct *tsk)
+{
+ struct audit_context *context;
+
+ context = audit_take_context(tsk, 0, 0);
+ if (!context)
+ return;
+
+ /* Check for system calls that do not go through the exit
+ * function (e.g., exit_group), then free context block.
+ * We use GFP_ATOMIC here because we might be doing this
+ * in the context of the idle thread */
+ /* that can happen only if we are called from do_exit() */
+ if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
+ audit_log_exit(context, tsk);
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
+
+ audit_free_context(context);
+}
+
+/**
+ * audit_syscall_entry - fill in an audit record at syscall entry
+ * @major: major syscall type (function)
+ * @a1: additional syscall register 1
+ * @a2: additional syscall register 2
+ * @a3: additional syscall register 3
+ * @a4: additional syscall register 4
+ *
+ * Fill in audit context at syscall entry. This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built. If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written).
+ */
+void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4)
+{
+ struct task_struct *tsk = current;
+ struct audit_context *context = tsk->audit_context;
+ enum audit_state state;
+
+ if (!context)
+ return;
+
+ BUG_ON(context->in_syscall || context->name_count);
+
+ if (!audit_enabled)
+ return;
+
+ context->arch = syscall_get_arch();
+ context->major = major;
+ context->argv[0] = a1;
+ context->argv[1] = a2;
+ context->argv[2] = a3;
+ context->argv[3] = a4;
+
+ state = context->state;
+ context->dummy = !audit_n_rules;
+ if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+ context->prio = 0;
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
+ }
+ if (state == AUDIT_DISABLED)
+ return;
+
+ context->serial = 0;
+ context->ctime = CURRENT_TIME;
+ context->in_syscall = 1;
+ context->current_state = state;
+ context->ppid = 0;
+}
+
+/**
+ * audit_syscall_exit - deallocate audit context after a system call
+ * @success: success value of the syscall
+ * @return_code: return value of the syscall
+ *
+ * Tear down after system call. If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel wrote an audit
+ * message), then write out the syscall information. In call cases,
+ * free the names stored from getname().
+ */
+void __audit_syscall_exit(int success, long return_code)
+{
+ struct task_struct *tsk = current;
+ struct audit_context *context;
+
+ if (success)
+ success = AUDITSC_SUCCESS;
+ else
+ success = AUDITSC_FAILURE;
+
+ context = audit_take_context(tsk, success, return_code);
+ if (!context)
+ return;
+
+ if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
+ audit_log_exit(context, tsk);
+
+ context->in_syscall = 0;
+ context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(&context->killed_trees);
+
+ audit_free_names(context);
+ unroll_tree_refs(context, NULL, 0);
+ audit_free_aux(context);
+ context->aux = NULL;
+ context->aux_pids = NULL;
+ context->target_pid = 0;
+ context->target_sid = 0;
+ context->sockaddr_len = 0;
+ context->type = 0;
+ context->fds[0] = -1;
+ if (context->state != AUDIT_RECORD_CONTEXT) {
+ kfree(context->filterkey);
+ context->filterkey = NULL;
+ }
+ tsk->audit_context = context;
+}
+
+static inline void handle_one(const struct inode *inode)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_context *context;
+ struct audit_tree_refs *p;
+ struct audit_chunk *chunk;
+ int count;
+ if (likely(hlist_empty(&inode->i_fsnotify_marks)))
+ return;
+ context = current->audit_context;
+ p = context->trees;
+ count = context->tree_count;
+ rcu_read_lock();
+ chunk = audit_tree_lookup(inode);
+ rcu_read_unlock();
+ if (!chunk)
+ return;
+ if (likely(put_tree_ref(context, chunk)))
+ return;
+ if (unlikely(!grow_tree_refs(context))) {
+ pr_warn("out of memory, audit has lost a tree reference\n");
+ audit_set_auditable(context);
+ audit_put_chunk(chunk);
+ unroll_tree_refs(context, p, count);
+ return;
+ }
+ put_tree_ref(context, chunk);
+#endif
+}
+
+static void handle_path(const struct dentry *dentry)
+{
+#ifdef CONFIG_AUDIT_TREE
+ struct audit_context *context;
+ struct audit_tree_refs *p;
+ const struct dentry *d, *parent;
+ struct audit_chunk *drop;
+ unsigned long seq;
+ int count;
+
+ context = current->audit_context;
+ p = context->trees;
+ count = context->tree_count;
+retry:
+ drop = NULL;
+ d = dentry;
+ rcu_read_lock();
+ seq = read_seqbegin(&rename_lock);
+ for(;;) {
+ struct inode *inode = d_backing_inode(d);
+ if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
+ struct audit_chunk *chunk;
+ chunk = audit_tree_lookup(inode);
+ if (chunk) {
+ if (unlikely(!put_tree_ref(context, chunk))) {
+ drop = chunk;
+ break;
+ }
+ }
+ }
+ parent = d->d_parent;
+ if (parent == d)
+ break;
+ d = parent;
+ }
+ if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */
+ rcu_read_unlock();
+ if (!drop) {
+ /* just a race with rename */
+ unroll_tree_refs(context, p, count);
+ goto retry;
+ }
+ audit_put_chunk(drop);
+ if (grow_tree_refs(context)) {
+ /* OK, got more space */
+ unroll_tree_refs(context, p, count);
+ goto retry;
+ }
+ /* too bad */
+ pr_warn("out of memory, audit has lost a tree reference\n");
+ unroll_tree_refs(context, p, count);
+ audit_set_auditable(context);
+ return;
+ }
+ rcu_read_unlock();
+#endif
+}
+
+static struct audit_names *audit_alloc_name(struct audit_context *context,
+ unsigned char type)
+{
+ struct audit_names *aname;
+
+ if (context->name_count < AUDIT_NAMES) {
+ aname = &context->preallocated_names[context->name_count];
+ memset(aname, 0, sizeof(*aname));
+ } else {
+ aname = kzalloc(sizeof(*aname), GFP_NOFS);
+ if (!aname)
+ return NULL;
+ aname->should_free = true;
+ }
+
+ aname->ino = (unsigned long)-1;
+ aname->type = type;
+ list_add_tail(&aname->list, &context->names_list);
+
+ context->name_count++;
+ return aname;
+}
+
+/**
+ * audit_reusename - fill out filename with info from existing entry
+ * @uptr: userland ptr to pathname
+ *
+ * Search the audit_names list for the current audit context. If there is an
+ * existing entry with a matching "uptr" then return the filename
+ * associated with that audit_name. If not, return NULL.
+ */
+struct filename *
+__audit_reusename(const __user char *uptr)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name)
+ continue;
+ if (n->name->uptr == uptr) {
+ n->name->refcnt++;
+ return n->name;
+ }
+ }
+ return NULL;
+}
+
+/**
+ * audit_getname - add a name to the list
+ * @name: name to add
+ *
+ * Add a name to the list of audit names for this context.
+ * Called from fs/namei.c:getname().
+ */
+void __audit_getname(struct filename *name)
+{
+ struct audit_context *context = current->audit_context;
+ struct audit_names *n;
+
+ if (!context->in_syscall)
+ return;
+
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
+ if (!n)
+ return;
+
+ n->name = name;
+ n->name_len = AUDIT_NAME_FULL;
+ name->aname = n;
+ name->refcnt++;
+
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
+}
+
+/**
+ * __audit_inode - store the inode and device from a lookup
+ * @name: name being audited
+ * @dentry: dentry being audited
+ * @flags: attributes for this particular entry
+ */
+void __audit_inode(struct filename *name, const struct dentry *dentry,
+ unsigned int flags)
+{
+ struct audit_context *context = current->audit_context;
+ const struct inode *inode = d_backing_inode(dentry);
+ struct audit_names *n;
+ bool parent = flags & AUDIT_INODE_PARENT;
+
+ if (!context->in_syscall)
+ return;
+
+ if (!name)
+ goto out_alloc;
+
+ /*
+ * If we have a pointer to an audit_names entry already, then we can
+ * just use it directly if the type is correct.
+ */
+ n = name->aname;
+ if (n) {
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
+ list_for_each_entry_reverse(n, &context->names_list, list) {
+ if (n->ino) {
+ /* valid inode number, use that for the comparison */
+ if (n->ino != inode->i_ino ||
+ n->dev != inode->i_sb->s_dev)
+ continue;
+ } else if (n->name) {
+ /* inode number has not been set, check the name */
+ if (strcmp(n->name->name, name->name))
+ continue;
+ } else
+ /* no inode and no name (?!) ... this is odd ... */
+ continue;
+
+ /* match the correct record type */
+ if (parent) {
+ if (n->type == AUDIT_TYPE_PARENT ||
+ n->type == AUDIT_TYPE_UNKNOWN)
+ goto out;
+ } else {
+ if (n->type != AUDIT_TYPE_PARENT)
+ goto out;
+ }
+ }
+
+out_alloc:
+ /* unable to find an entry with both a matching name and type */
+ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
+ if (!n)
+ return;
+ if (name) {
+ n->name = name;
+ name->refcnt++;
+ }
+
+out:
+ if (parent) {
+ n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_PARENT;
+ if (flags & AUDIT_INODE_HIDDEN)
+ n->hidden = true;
+ } else {
+ n->name_len = AUDIT_NAME_FULL;
+ n->type = AUDIT_TYPE_NORMAL;
+ }
+ handle_path(dentry);
+ audit_copy_inode(n, dentry, inode);
+}
+
+void __audit_file(const struct file *file)
+{
+ __audit_inode(NULL, file->f_path.dentry, 0);
+}
+
+/**
+ * __audit_inode_child - collect inode info for created/removed objects
+ * @parent: inode of dentry parent
+ * @dentry: dentry being audited
+ * @type: AUDIT_TYPE_* value that we're looking for
+ *
+ * For syscalls that create or remove filesystem objects, audit_inode
+ * can only collect information for the filesystem object's parent.
+ * This call updates the audit context with the child's information.
+ * Syscalls that create a new filesystem object must be hooked after
+ * the object is created. Syscalls that remove a filesystem object
+ * must be hooked prior, in order to capture the target inode during
+ * unsuccessful attempts.
+ */
+void __audit_inode_child(const struct inode *parent,
+ const struct dentry *dentry,
+ const unsigned char type)
+{
+ struct audit_context *context = current->audit_context;
+ const struct inode *inode = d_backing_inode(dentry);
+ const char *dname = dentry->d_name.name;
+ struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+
+ if (!context->in_syscall)
+ return;
+
+ if (inode)
+ handle_one(inode);
+
+ /* look for a parent entry first */
+ list_for_each_entry(n, &context->names_list, list) {
+ if (!n->name ||
+ (n->type != AUDIT_TYPE_PARENT &&
+ n->type != AUDIT_TYPE_UNKNOWN))
+ continue;
+
+ if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname,
+ n->name->name, n->name_len)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = AUDIT_TYPE_PARENT;
+ found_parent = n;
+ break;
+ }
+ }
+
+ /* is there a matching child entry? */
+ list_for_each_entry(n, &context->names_list, list) {
+ /* can only match entries that have a name */
+ if (!n->name ||
+ (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
+ continue;
+
+ if (!strcmp(dname, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
+ found_parent ?
+ found_parent->name_len :
+ AUDIT_NAME_FULL)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = type;
+ found_child = n;
+ break;
+ }
+ }
+
+ if (!found_parent) {
+ /* create a new, "anonymous" parent record */
+ n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
+ if (!n)
+ return;
+ audit_copy_inode(n, NULL, parent);
+ }
+
+ if (!found_child) {
+ found_child = audit_alloc_name(context, type);
+ if (!found_child)
+ return;
+
+ /* Re-use the name belonging to the slot for a matching parent
+ * directory. All names for this context are relinquished in
+ * audit_free_names() */
+ if (found_parent) {
+ found_child->name = found_parent->name;
+ found_child->name_len = AUDIT_NAME_FULL;
+ found_child->name->refcnt++;
+ }
+ }
+
+ if (inode)
+ audit_copy_inode(found_child, dentry, inode);
+ else
+ found_child->ino = (unsigned long)-1;
+}
+EXPORT_SYMBOL_GPL(__audit_inode_child);
+
+/**
+ * auditsc_get_stamp - get local copies of audit_context values
+ * @ctx: audit_context for the task
+ * @t: timespec to store time recorded in the audit_context
+ * @serial: serial value that is recorded in the audit_context
+ *
+ * Also sets the context as auditable.
+ */
+int auditsc_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
+{
+ if (!ctx->in_syscall)
+ return 0;
+ if (!ctx->serial)
+ ctx->serial = audit_serial();
+ t->tv_sec = ctx->ctime.tv_sec;
+ t->tv_nsec = ctx->ctime.tv_nsec;
+ *serial = ctx->serial;
+ if (!ctx->prio) {
+ ctx->prio = 1;
+ ctx->current_state = AUDIT_RECORD_CONTEXT;
+ }
+ return 1;
+}
+
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid, unsigned int sessionid,
+ int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+
+ if (!audit_enabled)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+ audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_set_loginuid - set current task's audit_context loginuid
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(kuid_t loginuid)
+{
+ struct task_struct *task = current;
+ unsigned int oldsessionid, sessionid = (unsigned int)-1;
+ kuid_t oldloginuid;
+ int rc;
+
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
+
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+
+ task->sessionid = sessionid;
+ task->loginuid = loginuid;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
+}
+
+/**
+ * __audit_mq_open - record audit data for a POSIX MQ open
+ * @oflag: open flag
+ * @mode: mode bits
+ * @attr: queue attributes
+ *
+ */
+void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
+{
+ struct audit_context *context = current->audit_context;
+
+ if (attr)
+ memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
+ else
+ memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
+
+ context->mq_open.oflag = oflag;
+ context->mq_open.mode = mode;
+
+ context->type = AUDIT_MQ_OPEN;
+}
+
+/**
+ * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @msg_prio: Message priority
+ * @abs_timeout: Message timeout in absolute time
+ *
+ */
+void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
+ const struct timespec *abs_timeout)
+{
+ struct audit_context *context = current->audit_context;
+ struct timespec *p = &context->mq_sendrecv.abs_timeout;
+
+ if (abs_timeout)
+ memcpy(p, abs_timeout, sizeof(struct timespec));
+ else
+ memset(p, 0, sizeof(struct timespec));
+
+ context->mq_sendrecv.mqdes = mqdes;
+ context->mq_sendrecv.msg_len = msg_len;
+ context->mq_sendrecv.msg_prio = msg_prio;
+
+ context->type = AUDIT_MQ_SENDRECV;
+}
+
+/**
+ * __audit_mq_notify - record audit data for a POSIX MQ notify
+ * @mqdes: MQ descriptor
+ * @notification: Notification event
+ *
+ */
+
+void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
+{
+ struct audit_context *context = current->audit_context;
+
+ if (notification)
+ context->mq_notify.sigev_signo = notification->sigev_signo;
+ else
+ context->mq_notify.sigev_signo = 0;
+
+ context->mq_notify.mqdes = mqdes;
+ context->type = AUDIT_MQ_NOTIFY;
+}
+
+/**
+ * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
+ * @mqdes: MQ descriptor
+ * @mqstat: MQ flags
+ *
+ */
+void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+{
+ struct audit_context *context = current->audit_context;
+ context->mq_getsetattr.mqdes = mqdes;
+ context->mq_getsetattr.mqstat = *mqstat;
+ context->type = AUDIT_MQ_GETSETATTR;
+}
+
+/**
+ * audit_ipc_obj - record audit data for ipc object
+ * @ipcp: ipc permissions
+ *
+ */
+void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
+{
+ struct audit_context *context = current->audit_context;
+ context->ipc.uid = ipcp->uid;
+ context->ipc.gid = ipcp->gid;
+ context->ipc.mode = ipcp->mode;
+ context->ipc.has_perm = 0;
+ security_ipc_getsecid(ipcp, &context->ipc.osid);
+ context->type = AUDIT_IPC;
+}
+
+/**
+ * audit_ipc_set_perm - record audit data for new ipc permissions
+ * @qbytes: msgq bytes
+ * @uid: msgq user id
+ * @gid: msgq group id
+ * @mode: msgq mode (permissions)
+ *
+ * Called only after audit_ipc_obj().
+ */
+void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
+{
+ struct audit_context *context = current->audit_context;
+
+ context->ipc.qbytes = qbytes;
+ context->ipc.perm_uid = uid;
+ context->ipc.perm_gid = gid;
+ context->ipc.perm_mode = mode;
+ context->ipc.has_perm = 1;
+}
+
+void __audit_bprm(struct linux_binprm *bprm)
+{
+ struct audit_context *context = current->audit_context;
+
+ context->type = AUDIT_EXECVE;
+ context->execve.argc = bprm->argc;
+}
+
+
+/**
+ * audit_socketcall - record audit data for sys_socketcall
+ * @nargs: number of args, which should not be more than AUDITSC_ARGS.
+ * @args: args array
+ *
+ */
+int __audit_socketcall(int nargs, unsigned long *args)
+{
+ struct audit_context *context = current->audit_context;
+
+ if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
+ return -EINVAL;
+ context->type = AUDIT_SOCKETCALL;
+ context->socketcall.nargs = nargs;
+ memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
+ return 0;
+}
+
+/**
+ * __audit_fd_pair - record audit data for pipe and socketpair
+ * @fd1: the first file descriptor
+ * @fd2: the second file descriptor
+ *
+ */
+void __audit_fd_pair(int fd1, int fd2)
+{
+ struct audit_context *context = current->audit_context;
+ context->fds[0] = fd1;
+ context->fds[1] = fd2;
+}
+
+/**
+ * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * @len: data length in user space
+ * @a: data address in kernel space
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_sockaddr(int len, void *a)
+{
+ struct audit_context *context = current->audit_context;
+
+ if (!context->sockaddr) {
+ void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ context->sockaddr = p;
+ }
+
+ context->sockaddr_len = len;
+ memcpy(context->sockaddr, a, len);
+ return 0;
+}
+
+void __audit_ptrace(struct task_struct *t)
+{
+ struct audit_context *context = current->audit_context;
+
+ context->target_pid = task_pid_nr(t);
+ context->target_auid = audit_get_loginuid(t);
+ context->target_uid = task_uid(t);
+ context->target_sessionid = audit_get_sessionid(t);
+ security_task_getsecid(t, &context->target_sid);
+ memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
+}
+
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int __audit_signal_info(int sig, struct task_struct *t)
+{
+ struct audit_aux_data_pids *axp;
+ struct task_struct *tsk = current;
+ struct audit_context *ctx = tsk->audit_context;
+ kuid_t uid = current_uid(), t_uid = task_uid(t);
+
+ if (audit_pid && t->tgid == audit_pid) {
+ if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
+ audit_sig_pid = task_pid_nr(tsk);
+ if (uid_valid(tsk->loginuid))
+ audit_sig_uid = tsk->loginuid;
+ else
+ audit_sig_uid = uid;
+ security_task_getsecid(tsk, &audit_sig_sid);
+ }
+ if (!audit_signals || audit_dummy_context())
+ return 0;
+ }
+
+ /* optimize the common case by putting first signal recipient directly
+ * in audit_context */
+ if (!ctx->target_pid) {
+ ctx->target_pid = task_tgid_nr(t);
+ ctx->target_auid = audit_get_loginuid(t);
+ ctx->target_uid = t_uid;
+ ctx->target_sessionid = audit_get_sessionid(t);
+ security_task_getsecid(t, &ctx->target_sid);
+ memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
+ return 0;
+ }
+
+ axp = (void *)ctx->aux_pids;
+ if (!axp || axp->pid_count == AUDIT_AUX_PIDS) {
+ axp = kzalloc(sizeof(*axp), GFP_ATOMIC);
+ if (!axp)
+ return -ENOMEM;
+
+ axp->d.type = AUDIT_OBJ_PID;
+ axp->d.next = ctx->aux_pids;
+ ctx->aux_pids = (void *)axp;
+ }
+ BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
+
+ axp->target_pid[axp->pid_count] = task_tgid_nr(t);
+ axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
+ axp->target_uid[axp->pid_count] = t_uid;
+ axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
+ security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
+ axp->pid_count++;
+
+ return 0;
+}
+
+/**
+ * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
+ * @bprm: pointer to the bprm being processed
+ * @new: the proposed new credentials
+ * @old: the old credentials
+ *
+ * Simply check if the proc already has the caps given by the file and if not
+ * store the priv escalation info for later auditing at the end of the syscall
+ *
+ * -Eric
+ */
+int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
+ const struct cred *new, const struct cred *old)
+{
+ struct audit_aux_data_bprm_fcaps *ax;
+ struct audit_context *context = current->audit_context;
+ struct cpu_vfs_cap_data vcaps;
+
+ ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->d.type = AUDIT_BPRM_FCAPS;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+
+ get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+
+ ax->fcap.permitted = vcaps.permitted;
+ ax->fcap.inheritable = vcaps.inheritable;
+ ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
+
+ ax->old_pcap.permitted = old->cap_permitted;
+ ax->old_pcap.inheritable = old->cap_inheritable;
+ ax->old_pcap.effective = old->cap_effective;
+
+ ax->new_pcap.permitted = new->cap_permitted;
+ ax->new_pcap.inheritable = new->cap_inheritable;
+ ax->new_pcap.effective = new->cap_effective;
+ return 0;
+}
+
+/**
+ * __audit_log_capset - store information about the arguments to the capset syscall
+ * @new: the new credentials
+ * @old: the old (current) credentials
+ *
+ * Record the arguments userspace sent to sys_capset for later printing by the
+ * audit system if applicable
+ */
+void __audit_log_capset(const struct cred *new, const struct cred *old)
+{
+ struct audit_context *context = current->audit_context;
+ context->capset.pid = task_pid_nr(current);
+ context->capset.cap.effective = new->cap_effective;
+ context->capset.cap.inheritable = new->cap_effective;
+ context->capset.cap.permitted = new->cap_permitted;
+ context->type = AUDIT_CAPSET;
+}
+
+void __audit_mmap_fd(int fd, int flags)
+{
+ struct audit_context *context = current->audit_context;
+ context->mmap.fd = fd;
+ context->mmap.flags = flags;
+ context->type = AUDIT_MMAP;
+}
+
+static void audit_log_task(struct audit_buffer *ab)
+{
+ kuid_t auid, uid;
+ kgid_t gid;
+ unsigned int sessionid;
+ char comm[sizeof(current->comm)];
+
+ auid = audit_get_loginuid(current);
+ sessionid = audit_get_sessionid(current);
+ current_uid_gid(&uid, &gid);
+
+ audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+ from_kuid(&init_user_ns, auid),
+ from_kuid(&init_user_ns, uid),
+ from_kgid(&init_user_ns, gid),
+ sessionid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_d_path_exe(ab, current->mm);
+}
+
+/**
+ * audit_core_dumps - record information about processes that end abnormally
+ * @signr: signal value
+ *
+ * If a process ends with a core dump, something fishy is going on and we
+ * should record the event for investigation.
+ */
+void audit_core_dumps(long signr)
+{
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ if (signr == SIGQUIT) /* don't care for those */
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
+ audit_log_end(ab);
+}
+
+void __audit_seccomp(unsigned long syscall, long signr, int code)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
+ signr, syscall_get_arch(), syscall, is_compat_task(),
+ KSTK_EIP(current), code);
+ audit_log_end(ab);
+}
+
+struct list_head *audit_killed_trees(void)
+{
+ struct audit_context *ctx = current->audit_context;
+ if (likely(!ctx || !ctx->in_syscall))
+ return NULL;
+ return &ctx->killed_trees;
+}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644
index 000000000..1323360d9
--- /dev/null
+++ b/kernel/backtracetest.c
@@ -0,0 +1,91 @@
+/*
+ * Simple stack backtrace regression test module
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+
+static void backtrace_test_normal(void)
+{
+ pr_info("Testing a backtrace from process context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
+
+ dump_stack();
+}
+
+static DECLARE_COMPLETION(backtrace_work);
+
+static void backtrace_test_irq_callback(unsigned long data)
+{
+ dump_stack();
+ complete(&backtrace_work);
+}
+
+static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+
+static void backtrace_test_irq(void)
+{
+ pr_info("Testing a backtrace from irq context.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
+
+ init_completion(&backtrace_work);
+ tasklet_schedule(&backtrace_tasklet);
+ wait_for_completion(&backtrace_work);
+}
+
+#ifdef CONFIG_STACKTRACE
+static void backtrace_test_saved(void)
+{
+ struct stack_trace trace;
+ unsigned long entries[8];
+
+ pr_info("Testing a saved backtrace.\n");
+ pr_info("The following trace is a kernel self test and not a bug!\n");
+
+ trace.nr_entries = 0;
+ trace.max_entries = ARRAY_SIZE(entries);
+ trace.entries = entries;
+ trace.skip = 0;
+
+ save_stack_trace(&trace);
+ print_stack_trace(&trace, 0);
+}
+#else
+static void backtrace_test_saved(void)
+{
+ pr_info("Saved backtrace test skipped.\n");
+}
+#endif
+
+static int backtrace_regression_test(void)
+{
+ pr_info("====[ backtrace testing ]===========\n");
+
+ backtrace_test_normal();
+ backtrace_test_irq();
+ backtrace_test_saved();
+
+ pr_info("====[ end of backtrace testing ]====\n");
+ return 0;
+}
+
+static void exitf(void)
+{
+}
+
+module_init(backtrace_regression_test);
+module_exit(exitf);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/bounds.c b/kernel/bounds.c
new file mode 100644
index 000000000..e1d1d1952
--- /dev/null
+++ b/kernel/bounds.c
@@ -0,0 +1,25 @@
+/*
+ * Generate definitions needed by the preprocessor.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ */
+
+#define __GENERATING_BOUNDS_H
+/* Include headers that define the enum constants of interest */
+#include <linux/page-flags.h>
+#include <linux/mmzone.h>
+#include <linux/kbuild.h>
+#include <linux/log2.h>
+#include <linux/spinlock_types.h>
+
+void foo(void)
+{
+ /* The enum constants to put into include/generated/bounds.h */
+ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
+ DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+#ifdef CONFIG_SMP
+ DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
+ DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+ /* End of constants */
+}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000..e6983be12
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1,2 @@
+obj-y := core.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000..8a6616583
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+ struct bpf_map map;
+ u32 elem_size;
+ char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array *array;
+ u32 elem_size, array_size;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ elem_size = round_up(attr->value_size, 8);
+
+ /* check round_up into zero and u32 overflow */
+ if (elem_size == 0 ||
+ attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
+ return ERR_PTR(-ENOMEM);
+
+ array_size = sizeof(*array) + attr->max_entries * elem_size;
+
+ /* allocate all map elements and zero-initialize them */
+ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+ if (!array) {
+ array = vzalloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* copy mandatory map attributes */
+ array->map.key_size = attr->key_size;
+ array->map.value_size = attr->value_size;
+ array->map.max_entries = attr->max_entries;
+
+ array->elem_size = elem_size;
+
+ return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return NULL;
+
+ return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (map_flags == BPF_NOEXIST)
+ /* all elements already exist */
+ return -EEXIST;
+
+ memcpy(array->value + array->elem_size * index, value, array->elem_size);
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding programs to complete
+ * and free the array
+ */
+ synchronize_rcu();
+
+ kvfree(array);
+}
+
+static const struct bpf_map_ops array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list array_type __read_mostly = {
+ .ops = &array_ops,
+ .type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+ bpf_register_map_type(&array_type);
+ return 0;
+}
+late_initcall(register_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000..54f0e7fcd
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,674 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
+ *
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
+ */
+
+#include <linux/filter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/moduleloader.h>
+#include <asm/unaligned.h>
+#include <linux/bpf.h>
+
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
+
+/* No hurry in this branch
+ *
+ * Exported for the bpf jit load helper.
+ */
+void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
+{
+ u8 *ptr = NULL;
+
+ if (k >= SKF_NET_OFF)
+ ptr = skb_network_header(skb) + k - SKF_NET_OFF;
+ else if (k >= SKF_LL_OFF)
+ ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+ if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
+ return ptr;
+
+ return NULL;
+}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *fp;
+
+ size = round_up(size, PAGE_SIZE);
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp == NULL)
+ return NULL;
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ if (aux == NULL) {
+ vfree(fp);
+ return NULL;
+ }
+
+ fp->pages = size / PAGE_SIZE;
+ fp->aux = aux;
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+
+struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ BUG_ON(fp_old == NULL);
+
+ size = round_up(size, PAGE_SIZE);
+ if (size <= fp_old->pages * PAGE_SIZE)
+ return fp_old;
+
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp != NULL) {
+ memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
+ fp->pages = size / PAGE_SIZE;
+
+ /* We keep fp->aux from fp_old around in the new
+ * reallocated structure.
+ */
+ fp_old->aux = NULL;
+ __bpf_prog_free(fp_old);
+ }
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_realloc);
+
+void __bpf_prog_free(struct bpf_prog *fp)
+{
+ kfree(fp->aux);
+ vfree(fp);
+}
+EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
+#ifdef CONFIG_BPF_JIT
+struct bpf_binary_header *
+bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *hdr;
+ unsigned int size, hole, start;
+
+ /* Most of BPF filters are really small, but if some of them
+ * fill a page, allow at least 128 extra bytes to insert a
+ * random section of illegal instructions.
+ */
+ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+ hdr = module_alloc(size);
+ if (hdr == NULL)
+ return NULL;
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(hdr, size);
+
+ hdr->pages = size / PAGE_SIZE;
+ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+ PAGE_SIZE - sizeof(*hdr));
+ start = (prandom_u32() % hole) & ~(alignment - 1);
+
+ /* Leave a random number of instructions before BPF code. */
+ *image_ptr = &hdr->image[start];
+
+ return hdr;
+}
+
+void bpf_jit_binary_free(struct bpf_binary_header *hdr)
+{
+ module_memfree(hdr);
+}
+#endif /* CONFIG_BPF_JIT */
+
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+/**
+ * __bpf_prog_run - run eBPF program on a given context
+ * @ctx: is the data we are operating on
+ * @insn: is the array of eBPF instructions
+ *
+ * Decode and execute eBPF instructions.
+ */
+static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
+ };
+ void *ptr;
+ int off;
+
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
+
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ LD_IMM_DW:
+ DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
+ insn++;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ div64_u64_rem(DST, SRC, &tmp);
+ DST = tmp;
+ CONT;
+ ALU_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ div64_u64_rem(DST, IMM, &tmp);
+ DST = tmp;
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ DST = div64_u64(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ DST = div64_u64(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
+ break;
+ }
+ CONT;
+
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
+ */
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
+
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX and ST and LDX*/
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
+ * only appearing in the programs where ctx ==
+ * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
+ * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
+ * internal BPF verifier will check that BPF_R6 ==
+ * ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
+
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
+
+/**
+ * bpf_prog_select_runtime - select execution runtime for BPF program
+ * @fp: bpf_prog populated with internal BPF program
+ *
+ * try to JIT internal BPF program, if JIT is not available select interpreter
+ * BPF program will be executed via BPF_PROG_RUN() macro
+ */
+void bpf_prog_select_runtime(struct bpf_prog *fp)
+{
+ fp->bpf_func = (void *) __bpf_prog_run;
+
+ /* Probe if internal BPF can be JITed */
+ bpf_int_jit_compile(fp);
+ /* Lock whole bpf_prog as read-only */
+ bpf_prog_lock_ro(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+
+static void bpf_prog_free_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ bpf_jit_free(aux->prog);
+}
+
+/* Free internal BPF program */
+void bpf_prog_free(struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+
+ INIT_WORK(&aux->work, bpf_prog_free_deferred);
+ aux->prog = fp;
+ schedule_work(&aux->work);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_free);
+
+/* Weak definitions of helper functions in case we don't have bpf syscall. */
+const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
+const struct bpf_func_proto bpf_map_update_elem_proto __weak;
+const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
+const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+
+/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
+ * skb_copy_bits(), so provide a weak definition of it for NET-less config.
+ */
+int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
+ int len)
+{
+ return -EFAULT;
+}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000..83c209d9b
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct hlist_head *buckets;
+ spinlock_t lock;
+ u32 count; /* number of elements in this hashtable */
+ u32 n_buckets; /* number of hash buckets */
+ u32 elem_size; /* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+ struct hlist_node hash_node;
+ struct rcu_head rcu;
+ u32 hash;
+ char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int err, i;
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ htab->map.key_size = attr->key_size;
+ htab->map.value_size = attr->value_size;
+ htab->map.max_entries = attr->max_entries;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ err = -EINVAL;
+ if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+ htab->map.value_size == 0)
+ goto free_htab;
+
+ /* hash table size must be power of 2 */
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+ err = -E2BIG;
+ if (htab->map.key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ goto free_htab;
+
+ err = -ENOMEM;
+ /* prevent zero size kmalloc and check for u32 overflow */
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ goto free_htab;
+
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ GFP_USER | __GFP_NOWARN);
+
+ if (!htab->buckets) {
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ if (!htab->buckets)
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->n_buckets; i++)
+ INIT_HLIST_HEAD(&htab->buckets[i]);
+
+ spin_lock_init(&htab->lock);
+ htab->count = 0;
+
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8) +
+ htab->map.value_size;
+ return &htab->map;
+
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+ void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 hash, key_size;
+
+ /* Must be called with rcu_read_lock. */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l)
+ return l->key + round_up(map->key_size, 8);
+
+ return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l, *next_l;
+ u32 hash, key_size;
+ int i;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ /* lookup the key */
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (!l) {
+ i = 0;
+ goto find_first_elem;
+ }
+
+ /* key was found, get next key in the same bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+
+ if (next_l) {
+ /* if next elem in this hash list is non-zero, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* itereated over all buckets and all elements */
+ return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 key_size;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+ if (!l_new)
+ return -ENOMEM;
+
+ key_size = map->key_size;
+
+ memcpy(l_new->key, key, key_size);
+ memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+ l_new->hash = htab_map_hash(l_new->key, key_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, l_new->hash);
+
+ l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+ if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* add new element to the head of the list, so that concurrent
+ * search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_del_rcu(&l_old->hash_node);
+ kfree_rcu(l_old, rcu);
+ } else {
+ htab->count++;
+ }
+ spin_unlock_irqrestore(&htab->lock, flags);
+
+ return 0;
+err:
+ spin_unlock_irqrestore(&htab->lock, flags);
+ kfree(l_new);
+ return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree_rcu(l, rcu);
+ ret = 0;
+ }
+
+ spin_unlock_irqrestore(&htab->lock, flags);
+ return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree(l);
+ }
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of kfree_rcu() callbacks for elements of this map may not have
+ * executed. It's ok. Proceed to free residual elements and map itself
+ */
+ delete_all_elements(htab);
+ kvfree(htab->buckets);
+ kfree(htab);
+}
+
+static const struct bpf_map_ops htab_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_map_lookup_elem,
+ .map_update_elem = htab_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_type __read_mostly = {
+ .ops = &htab_ops,
+ .type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+ bpf_register_map_type(&htab_type);
+ return 0;
+}
+late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000..bd7f5988e
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,113 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* verifier checked that R1 contains a valid pointer to bpf_map
+ * and R2 points to a program stack and map->key_size bytes were
+ * initialized
+ */
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ value = map->ops->map_lookup_elem(map, key);
+
+ /* lookup() returns either pointer to element value or NULL
+ * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+ */
+ return (unsigned long) value;
+}
+
+const struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value = (void *) (unsigned long) r3;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_update_elem(map, key, value, r4);
+}
+
+const struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_delete_elem(map, key);
+}
+
+const struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return prandom_u32();
+}
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto = {
+ .func = bpf_get_prandom_u32,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
+ .func = bpf_get_smp_processor_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000..3bae6c591
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,621 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>
+#include <linux/version.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+
+ bpf_map_put(map);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+ sizeof(attr->CMD##_LAST_FIELD)) != NULL
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+
+ err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ map->ops->map_free(map);
+ return err;
+}
+
+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+ struct bpf_map *map;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = f.file->private_data;
+
+ return map;
+}
+
+/* helper to convert user pointers passed inside __aligned_u64 fields */
+static void __user *u64_to_ptr(__u64 val)
+{
+ return (void __user *) (unsigned long) val;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value, *ptr;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, map->value_size);
+ rcu_read_unlock();
+
+ err = -ENOENT;
+ if (!ptr)
+ goto free_value;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto free_value;
+
+ err = 0;
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+ /* eBPF program that use maps are running under rcu_read_lock(),
+ * therefore all map accessors rely on this fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *unext_key = u64_to_ptr(attr->next_key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *next_key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ struct bpf_prog_type_list *tl;
+
+ list_for_each_entry(tl, &bpf_prog_types, list_node) {
+ if (tl->type == type) {
+ prog->aux->ops = tl->ops;
+ prog->type = type;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ * insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+ int i;
+
+ for (i = 0; i < prog->len; i++) {
+ struct bpf_insn *insn = &prog->insnsi[i];
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ /* we reach here when program has bpf_call instructions
+ * and it passed bpf_check(), means that
+ * ops->get_func_proto must have been supplied, check it
+ */
+ BUG_ON(!prog->aux->ops->get_func_proto);
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ BUG_ON(!fn->func);
+ insn->imm = fn->func - __bpf_call_base;
+ }
+ }
+}
+
+/* drop refcnt on maps used by eBPF program and free auxilary data */
+static void free_used_maps(struct bpf_prog_aux *aux)
+{
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++)
+ bpf_map_put(aux->used_maps[i]);
+
+ kfree(aux->used_maps);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ free_used_maps(prog->aux);
+ bpf_prog_free(prog);
+ }
+}
+EXPORT_SYMBOL_GPL(bpf_prog_put);
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+ .release = bpf_prog_release,
+};
+
+static struct bpf_prog *get_prog(struct fd f)
+{
+ struct bpf_prog *prog;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_prog_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ prog = f.file->private_data;
+
+ return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching program to an event
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_prog *prog;
+
+ prog = get_prog(f);
+
+ if (IS_ERR(prog))
+ return prog;
+
+ atomic_inc(&prog->aux->refcnt);
+ fdput(f);
+ return prog;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_PROG_LOAD_LAST_FIELD kern_version
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+ enum bpf_prog_type type = attr->prog_type;
+ struct bpf_prog *prog;
+ int err;
+ char license[128];
+ bool is_gpl;
+
+ if (CHECK_ATTR(BPF_PROG_LOAD))
+ return -EINVAL;
+
+ /* copy eBPF program license from user space */
+ if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (type == BPF_PROG_TYPE_KPROBE &&
+ attr->kern_version != LINUX_VERSION_CODE)
+ return -EINVAL;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = false;
+
+ atomic_set(&prog->aux->refcnt, 1);
+ prog->gpl_compatible = is_gpl;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog;
+
+ /* run eBPF verifier */
+ err = bpf_check(&prog, attr);
+ if (err < 0)
+ goto free_used_maps;
+
+ /* fixup BPF_CALL->imm field */
+ fixup_bpf_calls(prog);
+
+ /* eBPF program is ready to be JITed */
+ bpf_prog_select_runtime(prog);
+
+ err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_used_maps;
+
+ return err;
+
+free_used_maps:
+ free_used_maps(prog->aux);
+free_prog:
+ bpf_prog_free(prog);
+ return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr attr = {};
+ int err;
+
+ /* the syscall is limited to root temporarily. This restriction will be
+ * lifted when security audit is clean. Note that eBPF+tracing must have
+ * this restriction, since it may pass kernel data to user space
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!access_ok(VERIFY_READ, uattr, 1))
+ return -EFAULT;
+
+ if (size > PAGE_SIZE) /* silly large */
+ return -E2BIG;
+
+ /* If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+ size = sizeof(attr);
+ }
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ if (copy_from_user(&attr, uattr, size) != 0)
+ return -EFAULT;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(&attr);
+ break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(&attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(&attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(&attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(&attr);
+ break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(&attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000..47dcd3aa6
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,2146 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is depth-first-search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Since it's analyzing all pathes through the program, the length of the
+ * analysis is limited to 32k insn, which may be hit even if total number of
+ * insn is less then 4K, but there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 1k
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ * .arg1_type = ARG_CONST_MAP_PTR,
+ * .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null'
+ * function expects 1st argument to be a const pointer to 'struct bpf_map' and
+ * 2nd argument should be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ * void *key = (void *) (unsigned long) r2;
+ * void *value;
+ *
+ * here kernel can access 'key' and 'map' pointers safely, knowing that
+ * [key, key + map->key_size) bytes are valid and were initialized on
+ * the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
+ * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns ether pointer to map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+ NOT_INIT = 0, /* nothing was written into register */
+ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
+ PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+ FRAME_PTR, /* reg == frame_pointer */
+ PTR_TO_STACK, /* reg == frame_pointer + imm */
+ CONST_IMM, /* constant integer value */
+};
+
+struct reg_state {
+ enum bpf_reg_type type;
+ union {
+ /* valid when type == CONST_IMM | PTR_TO_STACK */
+ int imm;
+
+ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+ * PTR_TO_MAP_VALUE_OR_NULL
+ */
+ struct bpf_map *map_ptr;
+ };
+};
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* register spilled into stack */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_BPF_REG];
+ u8 stack_slot_type[MAX_BPF_STACK];
+ struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+ /* verifer state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct verifier_stack_elem *next;
+};
+
+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
+ struct verifier_state_list **explored_states; /* search pruning optimization */
+ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+ verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+ }
+ verbose("\n");
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM) {
+ verbose("(%02x) r%d = 0x%x\n",
+ insn->code, insn->dst_reg, insn->imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ int insn_idx;
+
+ if (env->head == NULL)
+ return -1;
+
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ if (prev_insn_idx)
+ *prev_insn_idx = env->head->prev_insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+ int prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 1024) {
+ verbose("BPF program is too complex\n");
+ goto err;
+ }
+ return &elem->st;
+err:
+ /* pop all elements and return */
+ while (pop_stack(env, NULL) >= 0);
+ return NULL;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ regs[i].type = NOT_INIT;
+ regs[i].imm = 0;
+ regs[i].map_ptr = NULL;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = FRAME_PTR;
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].imm = 0;
+ regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+ SRC_OP, /* register is used as source operand */
+ DST_OP, /* register is used as destination operand */
+ DST_OP_NO_MARK /* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ if (regno >= MAX_BPF_REG) {
+ verbose("R%d is invalid\n", regno);
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+ * so it's aligned access and [off, off + size) are within stack limits
+ */
+
+ if (value_regno >= 0 &&
+ (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
+ state->regs[value_regno].type == PTR_TO_STACK ||
+ state->regs[value_regno].type == PTR_TO_CTX)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ /* save register state */
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ state->regs[value_regno];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ } else {
+ /* regular write of data into stack */
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+ (struct reg_state) {};
+
+ for (i = 0; i < size; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ u8 *slot_type;
+ int i;
+
+ slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+
+ if (slot_type[0] == STACK_SPILL) {
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < BPF_REG_SIZE; i++) {
+ if (slot_type[i] != STACK_SPILL) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] =
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (slot_type[i] != STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->aux->ops->is_valid_access &&
+ env->prog->aux->ops->is_valid_access(off, size, t))
+ return 0;
+
+ verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register which value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size, err = 0;
+
+ size = bpf_size_to_bytes(bpf_size);
+ if (size < 0)
+ return size;
+
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ err = check_map_access(env, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == PTR_TO_CTX) {
+ err = check_ctx_access(env, off, size, t);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == FRAME_PTR) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE)
+ err = check_stack_write(state, off, size, value_regno);
+ else
+ err = check_stack_read(state, off, size, value_regno);
+ } else {
+ verbose("R%d invalid mem access '%s'\n",
+ regno, reg_type_str[state->regs[regno].type]);
+ return -EACCES;
+ }
+ return err;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can read the memory */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can write into the same memory */
+ return check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1);
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+ int err = 0;
+
+ if (arg_type == ARG_DONTCARE)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else if (arg_type == ARG_PTR_TO_CTX) {
+ expected_type = PTR_TO_CTX;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ /* bpf_map_xxx(..., map_ptr, ..., key) call:
+ * check that [key, key + map->key_size) are within
+ * stack limits and initialized
+ */
+ if (!*mapp) {
+ /* in function declaration map_ptr must come before
+ * map_key, so that it's verified and known before
+ * we have to check map_key here. Otherwise it means
+ * that kernel subsystem misconfigured verifier
+ */
+ verbose("invalid map_ptr to access map->key\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->key_size);
+
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+ /* bpf_map_xxx(..., map_ptr, ..., value) call:
+ * check [value, value + map->value_size) validity
+ */
+ if (!*mapp) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("invalid map_ptr to access map->value\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->value_size);
+
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ * note: regno == len, regno - 1 == buf
+ */
+ if (regno == 0) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno - 1, reg->imm);
+ }
+
+ return err;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i, err;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->ops->get_func_proto)
+ fn = env->prog->aux->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ /* check args */
+ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+ if (err)
+ return err;
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* update return register */
+ if (fn->ret_type == RET_INTEGER) {
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ } else if (fn->ret_type == RET_VOID) {
+ regs[BPF_REG_0].type = NOT_INIT;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ /* remember map_ptr, so that check_map_access()
+ * can check 'value_size' boundary of memory access
+ * to map element returned from bpf_map_lookup_elem()
+ */
+ if (map == NULL) {
+ verbose("kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = true;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ /* branch targer cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
+
+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
+static bool may_access_skb(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ * preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ * ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ u8 mode = BPF_MODE(insn->code);
+ struct reg_state *reg;
+ int i, err;
+
+ if (!may_access_skb(env->prog->type)) {
+ verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+ return -EINVAL;
+ }
+
+ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+ verbose("BPF_LD_ABS uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check whether implicit source operand (register R6) is readable */
+ err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ if (err)
+ return err;
+
+ if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ return -EINVAL;
+ }
+
+ if (mode == BPF_IND) {
+ /* check explicit source operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ }
+
+ /* reset caller saved regs to unreadable */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* mark destination R0 register as readable, since it contains
+ * the value fetched from the packet
+ */
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ return 0;
+}
+
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+
+static int *insn_stack; /* stack of insns to process */
+static int cur_stack; /* current stack index */
+static int *insn_state;
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct verifier_env *env)
+{
+ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+ return 0;
+
+ if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+ return 0;
+
+ if (w < 0 || w >= env->prog->len) {
+ verbose("jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ if (e == BRANCH)
+ /* mark branch target for state pruning */
+ env->explored_states[w] = STATE_LIST_MARK;
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+ insn_state[t] = DISCOVERED | e;
+ insn_state[w] = DISCOVERED;
+ if (cur_stack >= env->prog->len)
+ return -E2BIG;
+ insn_stack[cur_stack++] = w;
+ return 1;
+ } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ verbose("back-edge from insn %d to %d\n", t, w);
+ return -EINVAL;
+ } else if (insn_state[w] == EXPLORED) {
+ /* forward- or cross-edge */
+ insn_state[t] = DISCOVERED | e;
+ } else {
+ verbose("insn state internal bug\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int ret = 0;
+ int i, t;
+
+ insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_state)
+ return -ENOMEM;
+
+ insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_stack) {
+ kfree(insn_state);
+ return -ENOMEM;
+ }
+
+ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+ insn_stack[0] = 0; /* 0 is the first instruction */
+ cur_stack = 1;
+
+peek_stack:
+ if (cur_stack == 0)
+ goto check_state;
+ t = insn_stack[cur_stack - 1];
+
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + insns[t].off + 1,
+ FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ /* tell verifier to check for equivalent states
+ * after every call and jump
+ */
+ if (t + 1 < insn_cnt)
+ env->explored_states[t + 1] = STATE_LIST_MARK;
+ } else {
+ /* conditional jump with two edges */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+
+mark_explored:
+ insn_state[t] = EXPLORED;
+ if (cur_stack-- <= 0) {
+ verbose("pop stack internal bug\n");
+ ret = -EFAULT;
+ goto err_free;
+ }
+ goto peek_stack;
+
+check_state:
+ for (i = 0; i < insn_cnt; i++) {
+ if (insn_state[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ }
+ ret = 0; /* cfg looks good */
+
+err_free:
+ kfree(insn_state);
+ kfree(insn_stack);
+ return ret;
+}
+
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ * explored current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words if current stack state (one being explored) has more
+ * valid slots than old one that already passed validation, it means
+ * the verifier can stop exploring and conclude that current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (memcmp(&old->regs[i], &cur->regs[i],
+ sizeof(old->regs[0])) != 0) {
+ if (old->regs[i].type == NOT_INIT ||
+ (old->regs[i].type == UNKNOWN_VALUE &&
+ cur->regs[i].type != NOT_INIT))
+ continue;
+ return false;
+ }
+ }
+
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (old->stack_slot_type[i] == STACK_INVALID)
+ continue;
+ if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ sizeof(old->spilled_regs[0])))
+ /* when explored and current stack slot types are
+ * the same, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * but current path has stored:
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ else
+ continue;
+ }
+ return true;
+}
+
+static int is_state_visited(struct verifier_env *env, int insn_idx)
+{
+ struct verifier_state_list *new_sl;
+ struct verifier_state_list *sl;
+
+ sl = env->explored_states[insn_idx];
+ if (!sl)
+ /* this 'insn_idx' instruction wasn't marked, so we will not
+ * be doing state search here
+ */
+ return 0;
+
+ while (sl != STATE_LIST_MARK) {
+ if (states_equal(&sl->state, &env->cur_state))
+ /* reached equivalent register/stack state,
+ * prune the search
+ */
+ return 1;
+ sl = sl->next;
+ }
+
+ /* there were no equivalent states, remember current one.
+ * technically the current state is not proven to be safe yet,
+ * but it will either reach bpf_exit (which means it's safe) or
+ * it will be rejected. Since there are no loops, we won't be
+ * seeing this 'insn_idx' instruction again on the way to bpf_exit
+ */
+ new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ if (!new_sl)
+ return -ENOMEM;
+
+ /* add new state to the head of linked list */
+ memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ new_sl->next = env->explored_states[insn_idx];
+ env->explored_states[insn_idx] = new_sl;
+ return 0;
+}
+
+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+ int err;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ err = is_state_visited(env, insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (log_level) {
+ if (do_print_state)
+ verbose("\nfrom %d to %d: safe\n",
+ prev_insn_idx, insn_idx);
+ else
+ verbose("%d: safe\n", insn_idx);
+ }
+ goto process_bpf_exit;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(insn);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(regs, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ enum bpf_reg_type src_reg_type;
+
+ /* check for reserved fields is already done */
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ src_reg_type = regs[insn->src_reg].type;
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ err = check_mem_access(env, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg);
+ if (err)
+ return err;
+
+ if (BPF_SIZE(insn->code) != BPF_W) {
+ insn_idx++;
+ continue;
+ }
+
+ if (insn->imm == 0) {
+ /* saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * use reserved 'imm' field to mark this insn
+ */
+ insn->imm = src_reg_type;
+
+ } else if (src_reg_type != insn->imm &&
+ (src_reg_type == PTR_TO_CTX ||
+ insn->imm == PTR_TO_CTX)) {
+ /* ABuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ err = check_xadd(env, insn);
+ if (err)
+ return err;
+ insn_idx++;
+ continue;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg);
+ if (err)
+ return err;
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1);
+ if (err)
+ return err;
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_call(env, insn->imm);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+process_bpf_exit:
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ err = check_cond_jmp_op(env, insn, &insn_idx);
+ if (err)
+ return err;
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
+ if (err)
+ return err;
+
+ insn_idx++;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0)) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ fdput(f);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* remember this map */
+ env->used_maps[env->used_map_cnt++] = map;
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_bpf_prog_info()
+ */
+ atomic_inc(&map->refcnt);
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
+static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ int insn_cnt = prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) != BPF_JMP ||
+ BPF_OP(insn->code) == BPF_CALL ||
+ BPF_OP(insn->code) == BPF_EXIT)
+ continue;
+
+ /* adjust offset of jmps if necessary */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos && i + insn->off + 1 < pos)
+ insn->off -= delta;
+ }
+}
+
+/* convert load instructions that access fields of 'struct __sk_buff'
+ * into sequence of instructions that access fields of 'struct sk_buff'
+ */
+static int convert_ctx_accesses(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ u32 cnt;
+ int i;
+
+ if (!env->prog->aux->ops->convert_ctx_access)
+ return 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+ continue;
+
+ if (insn->imm != PTR_TO_CTX) {
+ /* clear internal mark */
+ insn->imm = 0;
+ continue;
+ }
+
+ cnt = env->prog->aux->ops->
+ convert_ctx_access(insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ if (cnt == 1) {
+ memcpy(insn, insn_buf, sizeof(*insn));
+ continue;
+ }
+
+ /* several new insns need to be inserted. Make room for them */
+ insn_cnt += cnt - 1;
+ new_prog = bpf_prog_realloc(env->prog,
+ bpf_prog_size(insn_cnt),
+ GFP_USER);
+ if (!new_prog)
+ return -ENOMEM;
+
+ new_prog->len = insn_cnt;
+
+ memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
+ sizeof(*insn) * (insn_cnt - i - cnt));
+
+ /* copy substitute insns in place of load instruction */
+ memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+
+ /* adjust branches in the whole program */
+ adjust_branches(new_prog, i, cnt - 1);
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + cnt - 1;
+ i += cnt - 1;
+ }
+
+ return 0;
+}
+
+static void free_states(struct verifier_env *env)
+{
+ struct verifier_state_list *sl, *sln;
+ int i;
+
+ if (!env->explored_states)
+ return;
+
+ for (i = 0; i < env->prog->len; i++) {
+ sl = env->explored_states[i];
+
+ if (sl)
+ while (sl != STATE_LIST_MARK) {
+ sln = sl->next;
+ kfree(sl);
+ sl = sln;
+ }
+ }
+
+ kfree(env->explored_states);
+}
+
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+{
+ char __user *log_ubuf = NULL;
+ struct verifier_env *env;
+ int ret = -EINVAL;
+
+ if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->prog = *prog;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto free_env;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto free_env;
+ } else {
+ log_level = 0;
+ }
+
+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->explored_states = kcalloc(env->prog->len,
+ sizeof(struct verifier_state_list *),
+ GFP_USER);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
+ if (ret == 0)
+ /* program is valid, convert *(u32*)(ctx + off) accesses */
+ ret = convert_ctx_accesses(env);
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+ if (ret == 0 && env->used_map_cnt) {
+ /* if program passed verifier, update used_maps in bpf_prog_info */
+ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
+
+ if (!env->prog->aux->used_maps) {
+ ret = -ENOMEM;
+ goto free_log_buf;
+ }
+
+ memcpy(env->prog->aux->used_maps, env->used_maps,
+ sizeof(env->used_maps[0]) * env->used_map_cnt);
+ env->prog->aux->used_map_cnt = env->used_map_cnt;
+
+ /* program is valid. Convert pseudo bpf_ld_imm64 into generic
+ * bpf_ld_imm64 instructions
+ */
+ convert_pseudo_ld_imm64(env);
+ }
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+free_env:
+ if (!env->prog->aux->used_maps)
+ /* if we didn't copy map pointers into bpf_prog_info, release
+ * them now. Otherwise free_bpf_prog_info() will release them.
+ */
+ release_maps(env);
+ *prog = env->prog;
+ kfree(env);
+ mutex_unlock(&bpf_verifier_lock);
+ return ret;
+}
diff --git a/kernel/capability.c b/kernel/capability.c
new file mode 100644
index 000000000..45432b54d
--- /dev/null
+++ b/kernel/capability.c
@@ -0,0 +1,449 @@
+/*
+ * linux/kernel/capability.c
+ *
+ * Copyright (C) 1997 Andrew Main <zefram@fysh.org>
+ *
+ * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
+ * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <asm/uaccess.h>
+
+/*
+ * Leveraged for setting/resetting capabilities
+ */
+
+const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
+EXPORT_SYMBOL(__cap_empty_set);
+
+int file_caps_enabled = 1;
+
+static int __init file_caps_disable(char *str)
+{
+ file_caps_enabled = 0;
+ return 1;
+}
+__setup("no_file_caps", file_caps_disable);
+
+#ifdef CONFIG_MULTIUSER
+/*
+ * More recent versions of libcap are available from:
+ *
+ * http://www.kernel.org/pub/linux/libs/security/linux-privs/
+ */
+
+static void warn_legacy_capability_use(void)
+{
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
+ get_task_comm(name, current));
+}
+
+/*
+ * Version 2 capabilities worked fine, but the linux/capability.h file
+ * that accompanied their introduction encouraged their use without
+ * the necessary user-space source code changes. As such, we have
+ * created a version 3 with equivalent functionality to version 2, but
+ * with a header change to protect legacy source code from using
+ * version 2 when it wanted to use version 1. If your system has code
+ * that trips the following warning, it is using version 2 specific
+ * capabilities and may be doing so insecurely.
+ *
+ * The remedy is to either upgrade your version of libcap (to 2.10+,
+ * if the application is linked against it), or recompile your
+ * application with modern kernel headers and this warning will go
+ * away.
+ */
+
+static void warn_deprecated_v2(void)
+{
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
+ get_task_comm(name, current));
+}
+
+/*
+ * Version check. Return the number of u32s in each capability flag
+ * array, or a negative value on error.
+ */
+static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
+{
+ __u32 version;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ switch (version) {
+ case _LINUX_CAPABILITY_VERSION_1:
+ warn_legacy_capability_use();
+ *tocopy = _LINUX_CAPABILITY_U32S_1;
+ break;
+ case _LINUX_CAPABILITY_VERSION_2:
+ warn_deprecated_v2();
+ /*
+ * fall through - v3 is otherwise equivalent to v2.
+ */
+ case _LINUX_CAPABILITY_VERSION_3:
+ *tocopy = _LINUX_CAPABILITY_U32S_3;
+ break;
+ default:
+ if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * The only thing that can change the capabilities of the current
+ * process is the current process. As such, we can't be in this code
+ * at the same time as we are in the process of setting capabilities
+ * in this process. The net result is that we can limit our use of
+ * locks to when we are reading the caps of another process.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+ kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+ int ret;
+
+ if (pid && (pid != task_pid_vnr(current))) {
+ struct task_struct *target;
+
+ rcu_read_lock();
+
+ target = find_task_by_vpid(pid);
+ if (!target)
+ ret = -ESRCH;
+ else
+ ret = security_capget(target, pEp, pIp, pPp);
+
+ rcu_read_unlock();
+ } else
+ ret = security_capget(current, pEp, pIp, pPp);
+
+ return ret;
+}
+
+/**
+ * sys_capget - get the capabilities of a given process.
+ * @header: pointer to struct that contains capability version and
+ * target pid data
+ * @dataptr: pointer to struct that contains the effective, permitted,
+ * and inheritable capabilities that are returned
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
+{
+ int ret = 0;
+ pid_t pid;
+ unsigned tocopy;
+ kernel_cap_t pE, pI, pP;
+
+ ret = cap_validate_magic(header, &tocopy);
+ if ((dataptr == NULL) || (ret != 0))
+ return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ ret = cap_get_target_pid(pid, &pE, &pI, &pP);
+ if (!ret) {
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
+ unsigned i;
+
+ for (i = 0; i < tocopy; i++) {
+ kdata[i].effective = pE.cap[i];
+ kdata[i].permitted = pP.cap[i];
+ kdata[i].inheritable = pI.cap[i];
+ }
+
+ /*
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy
+ * sizeof(struct __user_cap_data_struct))) {
+ return -EFAULT;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * sys_capset - set capabilities for a process or (*) a group of processes
+ * @header: pointer to struct that contains capability version and
+ * target pid data
+ * @data: pointer to struct that contains the effective, permitted,
+ * and inheritable capabilities
+ *
+ * Set capabilities for the current process only. The ability to any other
+ * process(es) has been deprecated and removed.
+ *
+ * The restrictions on setting capabilities are specified as:
+ *
+ * I: any raised capabilities must be a subset of the old permitted
+ * P: any raised capabilities must be a subset of the old permitted
+ * E: must be set to a subset of new permitted
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
+{
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
+ unsigned i, tocopy, copybytes;
+ kernel_cap_t inheritable, permitted, effective;
+ struct cred *new;
+ int ret;
+ pid_t pid;
+
+ ret = cap_validate_magic(header, &tocopy);
+ if (ret != 0)
+ return ret;
+
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+ /* may only affect current now */
+ if (pid != 0 && pid != task_pid_vnr(current))
+ return -EPERM;
+
+ copybytes = tocopy * sizeof(struct __user_cap_data_struct);
+ if (copybytes > sizeof(kdata))
+ return -EFAULT;
+
+ if (copy_from_user(&kdata, data, copybytes))
+ return -EFAULT;
+
+ for (i = 0; i < tocopy; i++) {
+ effective.cap[i] = kdata[i].effective;
+ permitted.cap[i] = kdata[i].permitted;
+ inheritable.cap[i] = kdata[i].inheritable;
+ }
+ while (i < _KERNEL_CAPABILITY_U32S) {
+ effective.cap[i] = 0;
+ permitted.cap[i] = 0;
+ inheritable.cap[i] = 0;
+ i++;
+ }
+
+ effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ ret = security_capset(new, current_cred(),
+ &effective, &inheritable, &permitted);
+ if (ret < 0)
+ goto error;
+
+ audit_log_capset(new, current_cred());
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return ret;
+}
+
+/**
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable(__task_cred(t), ns, cap);
+ rcu_read_unlock();
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+ return has_ns_capability(t, &init_user_ns, cap);
+}
+
+/**
+ * has_ns_capability_noaudit - Does a task have a capability (unaudited)
+ * in a specific user ns.
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ * Do not write an audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability_noaudit(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ rcu_read_unlock();
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited) in the
+ * initial user ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+ return has_ns_capability_noaudit(t, &init_user_ns, cap);
+}
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
+{
+ if (unlikely(!cap_valid(cap))) {
+ pr_crit("capable() called with invalid cap=%u\n", cap);
+ BUG();
+ }
+
+ if (security_capable(current_cred(), ns, cap) == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(ns_capable);
+
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+#endif /* CONFIG_MULTIUSER */
+
+/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns,
+ int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
+ * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @inode: The inode in question
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given capability targeted at
+ * its own user namespace and that the given inode's uid and gid are
+ * mapped into the current user namespace.
+ */
+bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+{
+ struct user_namespace *ns = current_user_ns();
+
+ return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+ kgid_has_mapping(ns, inode->i_gid);
+}
+EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
new file mode 100644
index 000000000..e8a5491be
--- /dev/null
+++ b/kernel/cgroup.c
@@ -0,0 +1,5601 @@
+/*
+ * Generic process-grouping system.
+ *
+ * Based originally on the cpuset system, extracted by Paul Menage
+ * Copyright (C) 2006 Google, Inc
+ *
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
+ * Copyright notices from the original cpuset code:
+ * --------------------------------------------------
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * ---------------------------------------------------
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/string.h>
+#include <linux/sort.h>
+#include <linux/kmod.h>
+#include <linux/delayacct.h>
+#include <linux/cgroupstats.h>
+#include <linux/hashtable.h>
+#include <linux/pid_namespace.h>
+#include <linux/idr.h>
+#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+#include <linux/atomic.h>
+
+/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
+ *
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
+ */
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+DECLARE_RWSEM(css_set_rwsem);
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_rwsem);
+#else
+static DEFINE_MUTEX(cgroup_mutex);
+static DECLARE_RWSEM(css_set_rwsem);
+#endif
+
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+#define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
+
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+static struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/*
+ * The default hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
+ */
+struct cgroup_root cgrp_dfl_root;
+
+/*
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
+ */
+static bool cgrp_dfl_root_visible;
+
+/*
+ * Set by the boot param of the same name and makes subsystems with NULL
+ * ->dfl_files to use ->legacy_files on the default hierarchy.
+ */
+static bool cgroup_legacy_files_on_dfl;
+
+/* some controllers are not supported in the default hierarchy */
+static unsigned int cgrp_dfl_root_inhibit_ss_mask;
+
+/* The list of hierarchy roots */
+
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
+
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
+
+/*
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
+
+/* This flag indicates whether tasks in the fork and exit paths should
+ * check for fork/exit handlers to call. This avoids us having to do
+ * extra work in the fork/exit path if none of the subsystems need to
+ * be called.
+ */
+static int need_forkexit_callback __read_mostly;
+
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
+
+static int rebind_subsystems(struct cgroup_root *dst_root,
+ unsigned int ss_mask);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
+{
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+ if (parent_css)
+ return container_of(parent_css, struct cgroup, self);
+ return NULL;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (ss)
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ /*
+ * This function is used while updating css associations and thus
+ * can't test the csses directly. Use ->child_subsys_mask.
+ */
+ while (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgroup_parent(cgrp);
+
+ return cgroup_css(cgrp, ss);
+}
+
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css && css_tryget_online(css))
+ goto out_unlock;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ css = init_css_set.subsys[ss->id];
+ css_get(css);
+out_unlock:
+ rcu_read_unlock();
+ return css;
+}
+
+/* convenient tests for these bits */
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+ while (cgrp) {
+ if (cgrp == ancestor)
+ return true;
+ cgrp = cgroup_parent(cgrp);
+ }
+ return false;
+}
+
+static int notify_on_release(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
+
+/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
+
+static void cgroup_release_agent(struct work_struct *work);
+static void check_for_release(struct cgroup *cgrp);
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
+};
+
+/*
+ * The default css_set - used by init and its children prior to any
+ * hierarchies being mounted. It contains a pointer to the root state
+ * for each subsystem. Also used to anchor the list of css_sets. Not
+ * reference-counted, to improve performance when child cgroups
+ * haven't been created.
+ */
+struct css_set init_css_set = {
+ .refcount = ATOMIC_INIT(1),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+};
+
+static int css_set_count = 1; /* 1 for init_css_set */
+
+/**
+ * cgroup_update_populated - updated populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * @cgrp is either getting the first task (css_set) or losing the last.
+ * Update @cgrp->populated_cnt accordingly. The count is propagated
+ * towards root so that a given cgroup's populated_cnt is zero iff the
+ * cgroup and all its descendants are empty.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if
+ * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
+ * changes from or to zero, userland is notified that the content of the
+ * interface file has changed. This can be used to detect when @cgrp and
+ * its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ lockdep_assert_held(&css_set_rwsem);
+
+ do {
+ bool trigger;
+
+ if (populated)
+ trigger = !cgrp->populated_cnt++;
+ else
+ trigger = !--cgrp->populated_cnt;
+
+ if (!trigger)
+ break;
+
+ if (cgrp->populated_kn)
+ kernfs_notify(cgrp->populated_kn);
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
+
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
+#define CSS_SET_HASH_BITS 7
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
+
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
+{
+ unsigned long key = 0UL;
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
+
+ return key;
+}
+
+static void put_css_set_locked(struct css_set *cset)
+{
+ struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (!atomic_dec_and_test(&cset->refcount))
+ return;
+
+ /* This css_set is dead. unlink it and release cgroup refcounts */
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
+ hash_del(&cset->hlist);
+ css_set_count--;
+
+ list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *cgrp = link->cgrp;
+
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+
+ /* @cgrp can't go away while we're holding css_set_rwsem */
+ if (list_empty(&cgrp->cset_links)) {
+ cgroup_update_populated(cgrp, false);
+ check_for_release(cgrp);
+ }
+
+ kfree(link);
+ }
+
+ kfree_rcu(cset, rcu_head);
+}
+
+static void put_css_set(struct css_set *cset)
+{
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ down_write(&css_set_rwsem);
+ put_css_set_locked(cset);
+ up_write(&css_set_rwsem);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+ atomic_inc(&cset->refcount);
+}
+
+/**
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cset,
+ struct css_set *old_cset,
+ struct cgroup *new_cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ struct list_head *l1, *l2;
+
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
+ return false;
+
+ /*
+ * Compare cgroup pointers in order to distinguish between
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
+ */
+ l1 = &cset->cgrp_links;
+ l2 = &old_cset->cgrp_links;
+ while (1) {
+ struct cgrp_cset_link *link1, *link2;
+ struct cgroup *cgrp1, *cgrp2;
+
+ l1 = l1->next;
+ l2 = l2->next;
+ /* See if we reached the end - both lists are equal length. */
+ if (l1 == &cset->cgrp_links) {
+ BUG_ON(l2 != &old_cset->cgrp_links);
+ break;
+ } else {
+ BUG_ON(l2 == &old_cset->cgrp_links);
+ }
+ /* Locate the cgroups associated with these links. */
+ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+ link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+ cgrp1 = link1->cgrp;
+ cgrp2 = link2->cgrp;
+ /* Hierarchies should be linked in the same order. */
+ BUG_ON(cgrp1->root != cgrp2->root);
+
+ /*
+ * If this hierarchy is the hierarchy of the cgroup
+ * that's changing, then we need to check that this
+ * css_set points to the new cgroup; if it's any other
+ * hierarchy, then this css_set should point to the
+ * same cgroup as the old css_set.
+ */
+ if (cgrp1->root == new_cgrp->root) {
+ if (cgrp1 != new_cgrp)
+ return false;
+ } else {
+ if (cgrp1 != cgrp2)
+ return false;
+ }
+ }
+ return true;
+}
+
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
+ */
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ struct cgroup_root *root = cgrp->root;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ unsigned long key;
+ int i;
+
+ /*
+ * Build the set of subsystem state objects that we want to see in the
+ * new css_set. while subsystems can change globally, the entries here
+ * won't change, so no need for locking.
+ */
+ for_each_subsys(ss, i) {
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
+ } else {
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
+ template[i] = old_cset->subsys[i];
+ }
+ }
+
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cset, hlist, key) {
+ if (!compare_css_sets(cset, old_cset, cgrp, template))
+ continue;
+
+ /* This css_set matches what we need */
+ return cset;
+ }
+
+ /* No existing cgroup group matched */
+ return NULL;
+}
+
+static void free_cgrp_cset_links(struct list_head *links_to_free)
+{
+ struct cgrp_cset_link *link, *tmp_link;
+
+ list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+ list_del(&link->cset_link);
+ kfree(link);
+ }
+}
+
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
+ */
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
+{
+ struct cgrp_cset_link *link;
+ int i;
+
+ INIT_LIST_HEAD(tmp_links);
+
+ for (i = 0; i < count; i++) {
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ free_cgrp_cset_links(tmp_links);
+ return -ENOMEM;
+ }
+ list_add(&link->cset_link, tmp_links);
+ }
+ return 0;
+}
+
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+ struct cgroup *cgrp)
+{
+ struct cgrp_cset_link *link;
+
+ BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
+ link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+ link->cset = cset;
+ link->cgrp = cgrp;
+
+ if (list_empty(&cgrp->cset_links))
+ cgroup_update_populated(cgrp, true);
+ list_move(&link->cset_link, &cgrp->cset_links);
+
+ /*
+ * Always add links to the tail of the list so that the list
+ * is sorted by order of hierarchy creation
+ */
+ list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+}
+
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
+ */
+static struct css_set *find_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+ struct css_set *cset;
+ struct list_head tmp_links;
+ struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* First see if we already have a cgroup group that matches
+ * the desired set */
+ down_read(&css_set_rwsem);
+ cset = find_existing_css_set(old_cset, cgrp, template);
+ if (cset)
+ get_css_set(cset);
+ up_read(&css_set_rwsem);
+
+ if (cset)
+ return cset;
+
+ cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+ if (!cset)
+ return NULL;
+
+ /* Allocate all the cgrp_cset_link objects that we'll need */
+ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+ kfree(cset);
+ return NULL;
+ }
+
+ atomic_set(&cset->refcount, 1);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->tasks);
+ INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
+ INIT_HLIST_NODE(&cset->hlist);
+
+ /* Copy the set of subsystem state objects generated in
+ * find_existing_css_set() */
+ memcpy(cset->subsys, template, sizeof(cset->subsys));
+
+ down_write(&css_set_rwsem);
+ /* Add reference counts and links from the new css_set. */
+ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == cgrp->root)
+ c = cgrp;
+ link_css_set(&tmp_links, cset, c);
+ }
+
+ BUG_ON(!list_empty(&tmp_links));
+
+ css_set_count++;
+
+ /* Add @cset to the hash table */
+ key = css_set_hash(cset->subsys);
+ hash_add(css_set_table, &cset->hlist, key);
+
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
+
+ up_write(&css_set_rwsem);
+
+ return cset;
+}
+
+static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+{
+ struct cgroup *root_cgrp = kf_root->kn->priv;
+
+ return root_cgrp->root;
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (root->hierarchy_id) {
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+ root->hierarchy_id = 0;
+ }
+}
+
+static void cgroup_free_root(struct cgroup_root *root)
+{
+ if (root) {
+ /* hierarhcy ID shoulid already have been released */
+ WARN_ON_ONCE(root->hierarchy_id);
+
+ idr_destroy(&root->cgroup_idr);
+ kfree(root);
+ }
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+ struct cgroup *cgrp = &root->cgrp;
+ struct cgrp_cset_link *link, *tmp_link;
+
+ mutex_lock(&cgroup_mutex);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->self.children));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+
+ /*
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
+ */
+ down_write(&css_set_rwsem);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+ up_write(&css_set_rwsem);
+
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ cgroup_root_count--;
+ }
+
+ cgroup_exit_root_id(root);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ if (cset == &init_css_set) {
+ res = &root->cgrp;
+ } else {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == root) {
+ res = c;
+ break;
+ }
+ }
+ }
+
+ BUG_ON(!res);
+ return res;
+}
+
+/*
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex and css_set_rwsem held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
+{
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
+ * A task must hold cgroup_mutex to modify cgroups.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding cgroup_mutex can't rely on the count
+ * field not changing. However, if the count goes to zero, then only
+ * cgroup_attach_task() can increment it again. Because a count of zero
+ * means that no tasks are currently attached, therefore there is no
+ * way a task attached to that cgroup can fork (the other way to
+ * increment the count). So code holding cgroup_mutex can safely
+ * assume that if the count is zero, it will stay zero. Similarly, if
+ * a task holds cgroup_mutex on a cgroup with zero count, it
+ * knows that the cgroup won't be removed, as cgroup_rmdir()
+ * needs that mutex.
+ *
+ * A cgroup can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cgroups is empty. Since all
+ * tasks in the system use _some_ cgroup, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, root cgroup
+ * always has either children cgroups and/or using tasks. So we don't
+ * need a special hack to ensure that root cgroup cannot be deleted.
+ *
+ * P.S. One more locking exception. RCU is used to guard the
+ * update of a tasks cgroup pointer by cgroup_attach_task()
+ */
+
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
+static const struct file_operations proc_cgroupstats_operations;
+
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
+{
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
+ cft->ss->name, cft->name);
+ else
+ strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ return buf;
+}
+
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static umode_t cgroup_file_mode(const struct cftype *cft)
+{
+ umode_t mode = 0;
+
+ if (cft->mode)
+ return cft->mode;
+
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
+
+ if (cft->write_u64 || cft->write_s64 || cft->write)
+ mode |= S_IWUSR;
+
+ return mode;
+}
+
+static void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
+
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
+static void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
+}
+
+/**
+ * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
+ * @cgrp: the target cgroup
+ * @subtree_control: the new subtree_control mask to consider
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied to @cgrp. The returned mask is always
+ * a superset of @subtree_control and follows the usual hierarchy rules.
+ */
+static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+ unsigned int subtree_control)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ unsigned int cur_ss_mask = subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cgroup_on_dfl(cgrp))
+ return cur_ss_mask;
+
+ while (true) {
+ unsigned int new_ss_mask = cur_ss_mask;
+
+ for_each_subsys(ss, ssid)
+ if (cur_ss_mask & (1 << ssid))
+ new_ss_mask |= ss->depends_on;
+
+ /*
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
+ */
+ if (parent)
+ new_ss_mask &= parent->child_subsys_mask;
+ else
+ new_ss_mask &= cgrp->root->subsys_mask;
+
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
+ }
+
+ return cur_ss_mask;
+}
+
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * Update @cgrp->child_subsys_mask according to the current
+ * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ cgrp->child_subsys_mask =
+ cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
+}
+
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded. Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time. If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
+ */
+static void cgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn. It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive. Returns the cgroup if
+ * alive; otherwise, %NULL. A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper. It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn->parent->priv;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. cgroup liveliness check alone provides enough
+ * protection against removal. Ensure @cgrp stays accessible and
+ * break the active_ref protection.
+ */
+ if (!cgroup_tryget(cgrp))
+ return NULL;
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ if (!cgroup_is_dead(cgrp))
+ return cgrp;
+
+ cgroup_kn_unlock(kn);
+ return NULL;
+}
+
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+ char name[CGROUP_FILE_NAME_MAX];
+
+ lockdep_assert_held(&cgroup_mutex);
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
+}
+
+/**
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be removed
+ */
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
+
+ if (!(subsys_mask & (1 << i)))
+ continue;
+ list_for_each_entry(cfts, &ss->cfts, node)
+ cgroup_addrm_files(cgrp, cfts, false);
+ }
+}
+
+static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ unsigned int tmp_ss_mask;
+ int ssid, i, ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for_each_subsys(ss, ssid) {
+ if (!(ss_mask & (1 << ssid)))
+ continue;
+
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ return -EBUSY;
+
+ /* can't move between two non-dummy roots either */
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
+ return -EBUSY;
+ }
+
+ /* skip creating root files on dfl_root for inhibited subsystems */
+ tmp_ss_mask = ss_mask;
+ if (dst_root == &cgrp_dfl_root)
+ tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
+
+ ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
+ if (ret) {
+ if (dst_root != &cgrp_dfl_root)
+ return ret;
+
+ /*
+ * Rebinding back to the default root is not allowed to
+ * fail. Using both default and non-default roots should
+ * be rare. Moving subsystems back and forth even more so.
+ * Just warn about it and continue.
+ */
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+ }
+ }
+
+ /*
+ * Nothing can fail from this point on. Remove files for the
+ * removed subsystems and rebind each subsystem.
+ */
+ for_each_subsys(ss, ssid)
+ if (ss_mask & (1 << ssid))
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+
+ for_each_subsys(ss, ssid) {
+ struct cgroup_root *src_root;
+ struct cgroup_subsys_state *css;
+ struct css_set *cset;
+
+ if (!(ss_mask & (1 << ssid)))
+ continue;
+
+ src_root = ss->root;
+ css = cgroup_css(&src_root->cgrp, ss);
+
+ WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+
+ RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
+ rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ ss->root = dst_root;
+ css->cgroup = &dst_root->cgrp;
+
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(&src_root->cgrp);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root) {
+ dst_root->cgrp.subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ }
+
+ if (ss->bind)
+ ss->bind(css);
+ }
+
+ kernfs_activate(dst_root->cgrp.kn);
+ return 0;
+}
+
+static int cgroup_show_options(struct seq_file *seq,
+ struct kernfs_root *kf_root)
+{
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(seq, ",%s", ss->name);
+ if (root->flags & CGRP_ROOT_NOPREFIX)
+ seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
+ if (strlen(root->release_agent_path))
+ seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+ seq_puts(seq, ",clone_children");
+ if (strlen(root->name))
+ seq_printf(seq, ",name=%s", root->name);
+ return 0;
+}
+
+struct cgroup_sb_opts {
+ unsigned int subsys_mask;
+ unsigned int flags;
+ char *release_agent;
+ bool cpuset_clone_children;
+ char *name;
+ /* User explicitly requested empty subsystem */
+ bool none;
+};
+
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
+ unsigned int mask = -1U;
+ struct cgroup_subsys *ss;
+ int nr_opts = 0;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~(1U << cpuset_cgrp_id);
+#endif
+
+ memset(opts, 0, sizeof(*opts));
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
+ if (!*token)
+ return -EINVAL;
+ if (!strcmp(token, "none")) {
+ /* Explicitly have no subsystems */
+ opts->none = true;
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "__DEVEL__sane_behavior")) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
+ opts->flags |= CGRP_ROOT_NOPREFIX;
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
+ /* Specifying two release agents is forbidden */
+ if (opts->release_agent)
+ return -EINVAL;
+ opts->release_agent =
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+ if (!opts->release_agent)
+ return -ENOMEM;
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
+ const char *name = token + 5;
+ /* Can't specify an empty name */
+ if (!strlen(name))
+ return -EINVAL;
+ /* Must match [\w.-]+ */
+ for (i = 0; i < strlen(name); i++) {
+ char c = name[i];
+ if (isalnum(c))
+ continue;
+ if ((c == '.') || (c == '-') || (c == '_'))
+ continue;
+ return -EINVAL;
+ }
+ /* Specifying two names is forbidden */
+ if (opts->name)
+ return -EINVAL;
+ opts->name = kstrndup(name,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
+ GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
+
+ continue;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name))
+ continue;
+ if (ss->disabled)
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ opts->subsys_mask |= (1 << i);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+ if (nr_opts != 1) {
+ pr_err("sane_behavior: no other mount options allowed\n");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+ return -EINVAL;
+
+ /* Can't specify "none" and some subsystems */
+ if (opts->subsys_mask && opts->none)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
+{
+ int ret = 0;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_sb_opts opts;
+ unsigned int added_mask, removed_mask;
+
+ if (root == &cgrp_dfl_root) {
+ pr_err("remount is not allowed\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&cgroup_mutex);
+
+ /* See what subsystems are wanted */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+ /* Don't allow flags or name to change at remount */
+ if ((opts.flags ^ root->flags) ||
+ (opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags, opts.name ?: "", root->flags, root->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
+
+ rebind_subsystems(&cgrp_dfl_root, removed_mask);
+
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
+ strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
+ out_unlock:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/*
+ * To reduce the fork() overhead for systems that are not actually using
+ * their cgroups capability, we don't maintain the lists running through
+ * each css_set to its tasks until we see the list actually used - in other
+ * words after the first mount.
+ */
+static bool use_task_css_set_links __read_mostly;
+
+static void cgroup_enable_task_cg_lists(void)
+{
+ struct task_struct *p, *g;
+
+ down_write(&css_set_rwsem);
+
+ if (use_task_css_set_links)
+ goto out_unlock;
+
+ use_task_css_set_links = true;
+
+ /*
+ * We need tasklist_lock because RCU is not safe against
+ * while_each_thread(). Besides, a forking task that has passed
+ * cgroup_post_fork() without seeing use_task_css_set_links = 1
+ * is not guaranteed to have its child immediately visible in the
+ * tasklist if we walk through it with RCU.
+ */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ WARN_ON_ONCE(!list_empty(&p->cg_list) ||
+ task_css_set(p) != &init_css_set);
+
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
+ */
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_EXITING)) {
+ struct css_set *cset = task_css_set(p);
+
+ list_add(&p->cg_list, &cset->tasks);
+ get_css_set(cset);
+ }
+ spin_unlock_irq(&p->sighand->siglock);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+out_unlock:
+ up_write(&css_set_rwsem);
+}
+
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ INIT_LIST_HEAD(&cgrp->self.sibling);
+ INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->cset_links);
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
+ cgrp->self.cgroup = cgrp;
+ cgrp->self.flags |= CSS_ONLINE;
+
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+
+ init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
+}
+
+static void init_cgroup_root(struct cgroup_root *root,
+ struct cgroup_sb_opts *opts)
+{
+ struct cgroup *cgrp = &root->cgrp;
+
+ INIT_LIST_HEAD(&root->root_list);
+ atomic_set(&root->nr_cgrps, 1);
+ cgrp->root = root;
+ init_cgroup_housekeeping(cgrp);
+ idr_init(&root->cgroup_idr);
+
+ root->flags = opts->flags;
+ if (opts->release_agent)
+ strcpy(root->release_agent_path, opts->release_agent);
+ if (opts->name)
+ strcpy(root->name, opts->name);
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
+}
+
+static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
+{
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->cgrp;
+ struct cftype *base_files;
+ struct css_set *cset;
+ int i, ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ if (ret < 0)
+ goto out;
+ root_cgrp->id = ret;
+
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+ GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ /*
+ * We're accessing css_set_count without locking css_set_rwsem here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. The worst that can happen is that we
+ * have some link structures left over
+ */
+ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ if (ret)
+ goto cancel_ref;
+
+ ret = cgroup_init_root_id(root);
+ if (ret)
+ goto cancel_ref;
+
+ root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = root->kf_root->kn;
+
+ if (root == &cgrp_dfl_root)
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(root_cgrp, base_files, true);
+ if (ret)
+ goto destroy_root;
+
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto destroy_root;
+
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
+
+ /*
+ * Link the root cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ link_css_set(&tmp_links, cset, root_cgrp);
+ up_write(&css_set_rwsem);
+
+ BUG_ON(!list_empty(&root_cgrp->self.children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
+
+ kernfs_activate(root_cgrp->kn);
+ ret = 0;
+ goto out;
+
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+cancel_ref:
+ percpu_ref_exit(&root_cgrp->self.refcnt);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
+}
+
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_subsys *ss;
+ struct cgroup_root *root;
+ struct cgroup_sb_opts opts;
+ struct dentry *dentry;
+ int ret;
+ int i;
+ bool new_sb;
+
+ /*
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
+ */
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
+ mutex_lock(&cgroup_mutex);
+
+ /* First find the desired set of subsystems */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ /* look for a matching existing root */
+ if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ cgrp_dfl_root_visible = true;
+ root = &cgrp_dfl_root;
+ cgroup_get(&root->cgrp);
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
+
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
+
+ for_each_root(root) {
+ bool name_match = false;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ /*
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
+ */
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
+
+ /*
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
+ */
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+ /*
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
+ */
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ init_cgroup_root(root, &opts);
+
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
+
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ dentry = kernfs_mount(fs_type, flags, root->kf_root,
+ CGROUP_SUPER_MAGIC, &new_sb);
+ if (IS_ERR(dentry) || !new_sb)
+ cgroup_put(&root->cgrp);
+
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
+ }
+
+ return dentry;
+}
+
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+
+ /*
+ * If @root doesn't have any mounts or children, start killing it.
+ * This prevents new mounts by disabling percpu_ref_tryget_live().
+ * cgroup_mount() may wait for @root's release.
+ *
+ * And don't kill the default root.
+ */
+ if (!list_empty(&root->cgrp.self.children) ||
+ root == &cgrp_dfl_root)
+ cgroup_put(&root->cgrp);
+ else
+ percpu_ref_kill(&root->cgrp.self.refcnt);
+
+ kernfs_kill_sb(sb);
+}
+
+static struct file_system_type cgroup_fs_type = {
+ .name = "cgroup",
+ .mount = cgroup_mount,
+ .kill_sb = cgroup_kill_sb,
+};
+
+/**
+ * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
+ * @task: target task
+ * @buf: the buffer to write the path into
+ * @buflen: the length of the buffer
+ *
+ * Determine @task's cgroup on the first (the one with the lowest non-zero
+ * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
+ * function grabs cgroup_mutex and shouldn't be used inside locks used by
+ * cgroup controller callbacks.
+ *
+ * Return value is the same as kernfs_path().
+ */
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+{
+ struct cgroup_root *root;
+ struct cgroup *cgrp;
+ int hierarchy_id = 1;
+ char *path = NULL;
+
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
+
+ root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
+
+ if (root) {
+ cgrp = task_cgroup_from_root(task, root);
+ path = cgroup_path(cgrp, buf, buflen);
+ } else {
+ /* if no hierarchy exists, everyone is in "/" */
+ if (strlcpy(buf, "/", buflen) < buflen)
+ path = buf;
+ }
+
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return path;
+}
+EXPORT_SYMBOL_GPL(task_cgroup_path);
+
+/* used to track tasks and other necessary states during migration */
+struct cgroup_taskset {
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
+};
+
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
+{
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset);
+}
+
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
+{
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
+
+ while (&cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
+
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+ return task;
+ }
+
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
+
+ return NULL;
+}
+
+/**
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ * @old_cgrp: the cgroup @tsk is being migrated from
+ * @tsk: the task being migrated
+ * @new_cset: the new css_set @tsk is being attached to
+ *
+ * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
+ */
+static void cgroup_task_migrate(struct cgroup *old_cgrp,
+ struct task_struct *tsk,
+ struct css_set *new_cset)
+{
+ struct css_set *old_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ /*
+ * We are synchronized through threadgroup_lock() against PF_EXITING
+ * setting such that we can't race against cgroup_exit() changing the
+ * css_set to init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(tsk->flags & PF_EXITING);
+ old_cset = task_css_set(tsk);
+
+ get_css_set(new_cset);
+ rcu_assign_pointer(tsk->cgroups, new_cset);
+
+ /*
+ * Use move_tail so that cgroup_taskset_first() still returns the
+ * leader after migration. This works because cgroup_migrate()
+ * ensures that the dst_cset of the leader is the first on the
+ * tset's dst_csets list.
+ */
+ list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
+
+ /*
+ * We just gained a reference on old_cset by taking it from the
+ * task. As trading it for new_cset is protected by cgroup_mutex,
+ * we're safe to drop it here; it will be freed under RCU.
+ */
+ put_css_set_locked(old_cset);
+}
+
+/**
+ * cgroup_migrate_finish - cleanup after attach
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
+ * those functions for details.
+ */
+static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+{
+ struct css_set *cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ down_write(&css_set_rwsem);
+ list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_preload_node);
+ put_css_set_locked(cset);
+ }
+ up_write(&css_set_rwsem);
+}
+
+/**
+ * cgroup_migrate_add_src - add a migration source css_set
+ * @src_cset: the source css_set to add
+ * @dst_cgrp: the destination cgroup
+ * @preloaded_csets: list of preloaded css_sets
+ *
+ * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process. Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
+ */
+static void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
+{
+ struct cgroup *src_cgrp;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_rwsem);
+
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
+ if (!list_empty(&src_cset->mg_preload_node))
+ return;
+
+ WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(!list_empty(&src_cset->mg_tasks));
+ WARN_ON(!list_empty(&src_cset->mg_node));
+
+ src_cset->mg_src_cgrp = src_cgrp;
+ get_css_set(src_cset);
+ list_add(&src_cset->mg_preload_node, preloaded_csets);
+}
+
+/**
+ * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
+ * @dst_cgrp: the destination cgroup (may be %NULL)
+ * @preloaded_csets: list of preloaded source css_sets
+ *
+ * Tasks are about to be moved to @dst_cgrp and all the source css_sets
+ * have been preloaded to @preloaded_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
+ *
+ * This function must be called after cgroup_migrate_add_src() has been
+ * called on each migration source css_set. After migration is performed
+ * using cgroup_migrate(), cgroup_migrate_finish() must be called on
+ * @preloaded_csets.
+ */
+static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
+ struct list_head *preloaded_csets)
+{
+ LIST_HEAD(csets);
+ struct css_set *src_cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
+
+ /* look up the dst cset for each src cset and link it to src */
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ struct css_set *dst_cset;
+
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
+ if (!dst_cset)
+ goto err;
+
+ WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset);
+ put_css_set(dst_cset);
+ continue;
+ }
+
+ src_cset->mg_dst_cset = dst_cset;
+
+ if (list_empty(&dst_cset->mg_preload_node))
+ list_add(&dst_cset->mg_preload_node, &csets);
+ else
+ put_css_set(dst_cset);
+ }
+
+ list_splice_tail(&csets, preloaded_csets);
+ return 0;
+err:
+ cgroup_migrate_finish(&csets);
+ return -ENOMEM;
+}
+
+/**
+ * cgroup_migrate - migrate a process or task to a cgroup
+ * @cgrp: the destination cgroup
+ * @leader: the leader of the process or the task to migrate
+ * @threadgroup: whether @leader points to the whole process or a single task
+ *
+ * Migrate a process or task denoted by @leader to @cgrp. If migrating a
+ * process, the caller must be holding threadgroup_lock of @leader. The
+ * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
+ *
+ * As long as a controller's ->can_attach() doesn't fail, this function is
+ * guaranteed to succeed. This means that, excluding ->can_attach()
+ * failure, when migrating multiple targets, the success or failure can be
+ * decided for all targets by invoking group_migrate_prepare_dst() before
+ * actually starting migrating.
+ */
+static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
+ bool threadgroup)
+{
+ struct cgroup_taskset tset = {
+ .src_csets = LIST_HEAD_INIT(tset.src_csets),
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
+ .csets = &tset.src_csets,
+ };
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct css_set *cset, *tmp_cset;
+ struct task_struct *task, *tmp_task;
+ int i, ret;
+
+ /*
+ * Prevent freeing of tasks while we take a snapshot. Tasks that are
+ * already PF_EXITING could be freed from underneath us unless we
+ * take an rcu_read_lock.
+ */
+ down_write(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ goto next;
+
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
+ goto next;
+
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ goto next;
+
+ /*
+ * cgroup_taskset_first() must always return the leader.
+ * Take care to avoid disturbing the ordering.
+ */
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset.src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset.dst_csets);
+ next:
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_write(&css_set_rwsem);
+
+ /* methods shouldn't be called if no task is actually migrating */
+ if (list_empty(&tset.src_csets))
+ return 0;
+
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, cgrp) {
+ if (css->ss->can_attach) {
+ ret = css->ss->can_attach(css, &tset);
+ if (ret) {
+ failed_css = css;
+ goto out_cancel_attach;
+ }
+ }
+ }
+
+ /*
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
+ */
+ down_write(&css_set_rwsem);
+ list_for_each_entry(cset, &tset.src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+ cgroup_task_migrate(cset->mg_src_cgrp, task,
+ cset->mg_dst_cset);
+ }
+ up_write(&css_set_rwsem);
+
+ /*
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
+ */
+ tset.csets = &tset.dst_csets;
+
+ for_each_e_css(css, i, cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, &tset);
+
+ ret = 0;
+ goto out_release_tset;
+
+out_cancel_attach:
+ for_each_e_css(css, i, cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
+ }
+out_release_tset:
+ down_write(&css_set_rwsem);
+ list_splice_init(&tset.dst_csets, &tset.src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ up_write(&css_set_rwsem);
+ return ret;
+}
+
+/**
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
+ * @dst_cgrp: the cgroup to attach to
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ */
+static int cgroup_attach_task(struct cgroup *dst_cgrp,
+ struct task_struct *leader, bool threadgroup)
+{
+ LIST_HEAD(preloaded_csets);
+ struct task_struct *task;
+ int ret;
+
+ /* look up all src csets */
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ task = leader;
+ do {
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
+ &preloaded_csets);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+
+ /* prepare dst csets and commit */
+ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ if (!ret)
+ ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup.
+ */
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
+{
+ struct task_struct *tsk;
+ const struct cred *cred = current_cred(), *tcred;
+ struct cgroup *cgrp;
+ pid_t pid;
+ int ret;
+
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
+ return -ENODEV;
+
+retry_find_task:
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out_unlock_cgroup;
+ }
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
+ tcred = __task_cred(tsk);
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
+ rcu_read_unlock();
+ ret = -EACCES;
+ goto out_unlock_cgroup;
+ }
+ } else
+ tsk = current;
+
+ if (threadgroup)
+ tsk = tsk->group_leader;
+
+ /*
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
+ * trapped in a cpuset, or RT worker may be born in a cgroup
+ * with no rt_runtime allocated. Just say no.
+ */
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ rcu_read_unlock();
+ goto out_unlock_cgroup;
+ }
+
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * a race with de_thread from another thread's exec()
+ * may strip us of our leadership, if this happens,
+ * there is no choice but to throw this task away and
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
+ threadgroup_unlock(tsk);
+
+ put_task_struct(tsk);
+out_unlock_cgroup:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroup_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ down_read(&css_set_rwsem);
+ from_cgrp = task_cgroup_from_root(from, root);
+ up_read(&css_set_rwsem);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
+ }
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
+ return -ENODEV;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
+ seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, "0\n");
+ return 0;
+}
+
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
+ }
+ if (printed)
+ seq_putc(seq, '\n');
+}
+
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
+ ~cgrp_dfl_root_inhibit_ss_mask);
+ return 0;
+}
+
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+ return 0;
+}
+
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
+ return 0;
+}
+
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
+
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
+ }
+ up_read(&css_set_rwsem);
+
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
+
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
+
+ /*
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
+ */
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
+ }
+
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
+}
+
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ unsigned int enable = 0, disable = 0;
+ unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
+
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
+ */
+ buf = strstrip(buf);
+ while ((tok = strsep(&buf, " "))) {
+ if (tok[0] == '\0')
+ continue;
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name) ||
+ ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ continue;
+
+ if (*tok == '+') {
+ enable |= 1 << ssid;
+ disable &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable |= 1 << ssid;
+ enable &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
+
+ cgrp = cgroup_kn_lock_live(of->kn);
+ if (!cgrp)
+ return -ENODEV;
+
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->subtree_control & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Except for the root, subtree_control must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Update subsys masks and calculate what needs to be done. More
+ * subsystems than specified may need to be enabled or disabled
+ * depending on subsystem dependencies.
+ */
+ old_sc = cgrp->subtree_control;
+ old_ss = cgrp->child_subsys_mask;
+ new_sc = (old_sc | enable) & ~disable;
+ new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
+
+ css_enable = ~old_ss & new_ss;
+ css_disable = old_ss & ~new_ss;
+ enable |= css_enable;
+ disable |= css_disable;
+
+ /*
+ * Because css offlining is asynchronous, userland might try to
+ * re-enable the same controller while the previous instance is
+ * still around. In such cases, wait till it's gone using
+ * offline_waitq.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(css_enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ DEFINE_WAIT(wait);
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ cgroup_get(child);
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ cgroup_kn_unlock(of->kn);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ cgroup_put(child);
+
+ return restart_syscall();
+ }
+ }
+
+ cgrp->subtree_control = new_sc;
+ cgrp->child_subsys_mask = new_ss;
+
+ /*
+ * Create new csses or make the existing ones visible. A css is
+ * created invisible if it's being implicitly enabled through
+ * dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ if (css_enable & (1 << ssid))
+ ret = create_css(child, ss,
+ cgrp->subtree_control & (1 << ssid));
+ else
+ ret = cgroup_populate_dir(child, 1 << ssid);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
+
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
+
+ /*
+ * All tasks are migrated out of disabled csses. Kill or hide
+ * them. A css is hidden when the userland requests it to be
+ * disabled while other subsystems are still depending on it. The
+ * css must not actively control resources and be in the vanilla
+ * state if it's made visible again later. Controllers which may
+ * be depended upon should provide ->css_reset() for this purpose.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (css_disable & (1 << ssid)) {
+ kill_css(css);
+ } else {
+ cgroup_clear_dir(child, 1 << ssid);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
+ }
+
+ /*
+ * The effective csses of all the descendants (excluding @cgrp) may
+ * have changed. Subsystems can optionally subscribe to this event
+ * by implementing ->css_e_css_changed() which is invoked if any of
+ * the effective csses seen from the css's cgroup may have changed.
+ */
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
+ struct cgroup_subsys_state *css;
+
+ if (!ss->css_e_css_changed || !this_css)
+ continue;
+
+ css_for_each_descendant_pre(css, this_css)
+ if (css != this_css)
+ ss->css_e_css_changed(css);
+ }
+
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+
+err_undo_css:
+ cgrp->subtree_control = old_sc;
+ cgrp->child_subsys_mask = old_ss;
+
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (!css)
+ continue;
+
+ if (css_enable & (1 << ssid))
+ kill_css(css);
+ else
+ cgroup_clear_dir(child, 1 << ssid);
+ }
+ }
+ goto out_unlock;
+}
+
+static int cgroup_populated_show(struct seq_file *seq, void *v)
+{
+ seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ return 0;
+}
+
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp = of->kn->parent->priv;
+ struct cftype *cft = of->kn->priv;
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (cft->write)
+ return cft->write(of, buf, nbytes, off);
+
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
+
+ if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else {
+ ret = -EINVAL;
+ }
+
+ return ret ?: nbytes;
+}
+
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
+{
+ return seq_cft(seq)->seq_start(seq, ppos);
+}
+
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ return seq_cft(seq)->seq_next(seq, v, ppos);
+}
+
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
+{
+ seq_cft(seq)->seq_stop(seq, v);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
+
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
+
+ if (cft->read_u64)
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_show = cgroup_seqfile_show,
+};
+
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = cgroup_file_write,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
+};
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
+{
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
+ return -ENOTDIR;
+ if (kn->parent != new_parent)
+ return -EIO;
+
+ /*
+ * This isn't a proper migration and its usefulness is very
+ * limited. Disallow on the default hierarchy.
+ */
+ if (cgroup_on_dfl(cgrp))
+ return -EPERM;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
+}
+
+/* set uid and gid of cgroup dirs and files to that of the creator */
+static int cgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
+{
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
+ int ret;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft), 0, cft->kf_ops, cft,
+ NULL, key);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ if (cft->seq_show == cgroup_populated_show)
+ cgrp->populated_kn = kn;
+ return 0;
+}
+
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails. If addition fails, this
+ * function doesn't remove files already added. The caller is responsible
+ * for cleaning up.
+ */
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add)
+{
+ struct cftype *cft;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
+ continue;
+
+ if (is_add) {
+ ret = cgroup_add_file(cgrp, cft);
+ if (ret) {
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
+ return ret;
+ }
+ } else {
+ cgroup_rm_file(cgrp, cft);
+ }
+ }
+ return 0;
+}
+
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
+{
+ LIST_HEAD(pending);
+ struct cgroup_subsys *ss = cfts[0].ss;
+ struct cgroup *root = &ss->root->cgrp;
+ struct cgroup_subsys_state *css;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* add/rm files for all cgroups created before */
+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_is_dead(cgrp))
+ continue;
+
+ ret = cgroup_addrm_files(cgrp, cfts, is_add);
+ if (ret)
+ break;
+ }
+
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
+ return ret;
+}
+
+static void cgroup_exit_cftypes(struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ }
+}
+
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ cgroup_exit_cftypes(cfts);
+ return -ENOMEM;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
+ cft->ss = ss;
+ }
+
+ return 0;
+}
+
+static int cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cfts || !cfts[0].ss)
+ return -ENOENT;
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
+ return 0;
+}
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts. Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either. This
+ * function can be called anytime whether @cfts' subsys is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered.
+ */
+int cgroup_rm_cftypes(struct cftype *cfts)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_rm_cftypes_locked(cfts);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ int ret;
+
+ if (ss->disabled)
+ return 0;
+
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
+ if (ret)
+ cgroup_rm_cftypes_locked(cfts);
+
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/**
+ * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the default hierarchy.
+ */
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_ONLY_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
+ * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the legacy hierarchies.
+ */
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ /*
+ * If legacy_flies_on_dfl, we want to show the legacy files on the
+ * dfl hierarchy but iff the target subsystem hasn't been updated
+ * for the dfl hierarchy yet.
+ */
+ if (!cgroup_legacy_files_on_dfl ||
+ ss->dfl_cftypes != ss->legacy_cftypes) {
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ }
+
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ up_read(&css_set_rwsem);
+ return count;
+}
+
+/**
+ * css_next_child - find the next child of a given css
+ * @pos: the current position (%NULL to initiate traversal)
+ * @parent: css whose children to walk
+ *
+ * This function returns the next child of @parent and should be called
+ * under either cgroup_mutex or RCU read lock. The only requirement is
+ * that @parent and @pos are accessible. The next sibling is guaranteed to
+ * be returned regardless of their states.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *parent)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /*
+ * @pos could already have been unlinked from the sibling list.
+ * Once a cgroup is removed, its ->sibling.next is no longer
+ * updated when its next sibling changes. CSS_RELEASED is set when
+ * @pos is taken off list, at which time its next pointer is valid,
+ * and, as releases are serialized, the one pointed to by the next
+ * pointer is guaranteed to not have started release yet. This
+ * implies that if we observe !CSS_RELEASED on @pos in this RCU
+ * critical section, the one pointed to by its next pointer is
+ * guaranteed to not have finished its RCU grace period even if we
+ * have dropped rcu_read_lock() inbetween iterations.
+ *
+ * If @pos has CSS_RELEASED set, its next pointer can't be
+ * dereferenced; however, as each css is given a monotonically
+ * increasing unique serial number and always appended to the
+ * sibling list, the next one can be found by walking the parent's
+ * children until the first css with higher serial number than
+ * @pos's. While this path can be slower, it happens iff iteration
+ * races against release and the race window is very small.
+ */
+ if (!pos) {
+ next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
+ } else if (likely(!(pos->flags & CSS_RELEASED))) {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
+ } else {
+ list_for_each_entry_rcu(next, &parent->children, sibling)
+ if (next->serial_nr > pos->serial_nr)
+ break;
+ }
+
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling.
+ */
+ if (&next->sibling != &parent->children)
+ return next;
+ return NULL;
+}
+
+/**
+ * css_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_pre(). Find the next descendant
+ * to visit for pre-order traversal of @root's descendants. @root is
+ * included in the iteration and the first node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = css_next_child(NULL, pos);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
+ }
+
+ return NULL;
+}
+
+/**
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant, @pos
+ * is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
+ */
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last, *tmp;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ css_for_each_child(tmp, last)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last;
+
+ do {
+ last = pos;
+ pos = css_next_child(NULL, pos);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * css_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_post(). Find the next descendant
+ * to visit for post-order traversal of @root's descendants. @root is
+ * included in the iteration and the last node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return css_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
+ */
+bool css_has_online_children(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *child;
+ bool ret = false;
+
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * css_advance_task_iter - advance a task itererator to the next css_set
+ * @it: the iterator to advance
+ *
+ * Advance @it to the next css_set to walk.
+ */
+static void css_advance_task_iter(struct css_task_iter *it)
+{
+ struct list_head *l = it->cset_pos;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ /* Advance to the next non-empty css_set */
+ do {
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+ } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+
+ it->cset_pos = l;
+
+ if (!list_empty(&cset->tasks))
+ it->task_pos = cset->tasks.next;
+ else
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
+}
+
+/**
+ * css_task_iter_start - initiate task iteration
+ * @css: the css to walk tasks of
+ * @it: the task iterator to use
+ *
+ * Initiate iteration through the tasks of @css. The caller can call
+ * css_task_iter_next() to walk through the tasks until the function
+ * returns NULL. On completion of iteration, css_task_iter_end() must be
+ * called.
+ *
+ * Note that this function acquires a lock which is released when the
+ * iteration finishes. The caller can't sleep while iteration is in
+ * progress.
+ */
+void css_task_iter_start(struct cgroup_subsys_state *css,
+ struct css_task_iter *it)
+ __acquires(css_set_rwsem)
+{
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
+
+ down_read(&css_set_rwsem);
+
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
+
+ css_advance_task_iter(it);
+}
+
+/**
+ * css_task_iter_next - return the next task for the iterator
+ * @it: the task iterator being iterated
+ *
+ * The "next" function for task iteration. @it should have been
+ * initialized via css_task_iter_start(). Returns NULL when the iteration
+ * reaches the end.
+ */
+struct task_struct *css_task_iter_next(struct css_task_iter *it)
+{
+ struct task_struct *res;
+ struct list_head *l = it->task_pos;
+
+ /* If the iterator cg is NULL, we have no tasks */
+ if (!it->cset_pos)
+ return NULL;
+ res = list_entry(l, struct task_struct, cg_list);
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
+ l = l->next;
+
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
+
+ if (l == it->mg_tasks_head)
+ css_advance_task_iter(it);
+ else
+ it->task_pos = l;
+
+ return res;
+}
+
+/**
+ * css_task_iter_end - finish task iteration
+ * @it: the task iterator to finish
+ *
+ * Finish task iteration started by css_task_iter_start().
+ */
+void css_task_iter_end(struct css_task_iter *it)
+ __releases(css_set_rwsem)
+{
+ up_read(&css_set_rwsem);
+}
+
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* all tasks in @from are being moved, all csets are source */
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
+ up_read(&css_set_rwsem);
+
+ ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Migrate tasks one-by-one until @form is empty. This fails iff
+ * ->can_attach() fails.
+ */
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(to, task, false);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&preloaded_csets);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+
+static void pidlist_free(void *p)
+{
+ kvfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
+{
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
+ }
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+ int src, dest = 1;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
+ }
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+after:
+ return dest;
+}
+
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property. In the long term, we
+ * want to do away with it. Explicitly scramble sort order if on the
+ * default hierarchy so that no such expectation exists in the new
+ * interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+ unsigned a = pid & 0x55555555;
+ unsigned b = pid & 0xAAAAAAAA;
+
+ return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+ if (cgroup_on_dfl(cgrp))
+ return pid_fry(pid);
+ else
+ return pid;
+}
+
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+static int fried_cmppid(const void *a, const void *b)
+{
+ return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
+ /* entry not found; create a new one */
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
+ return l;
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+ l->key.type = type;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ return l;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
+ struct css_task_iter it;
+ struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = pidlist_allocate(length);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ if (unlikely(n == length))
+ break;
+ /* get tgid or pid for procs or tasks file respectively */
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
+ }
+ css_task_iter_end(&it);
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ if (cgroup_on_dfl(cgrp))
+ sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+ else
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (type == CGROUP_FILE_PROCS)
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
+ if (!l) {
+ pidlist_free(array);
+ return -ENOMEM;
+ }
+
+ /* store array, freeing old if necessary */
+ pidlist_free(l->list);
+ l->list = array;
+ l->length = length;
+ *lp = l;
+ return 0;
+}
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup *cgrp;
+ struct css_task_iter it;
+ struct task_struct *tsk;
+
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
+ /*
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
+ */
+ rcu_read_lock();
+ cgrp = rcu_dereference(kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ switch (tsk->state) {
+ case TASK_RUNNING:
+ stats->nr_running++;
+ break;
+ case TASK_INTERRUPTIBLE:
+ stats->nr_sleeping++;
+ break;
+ case TASK_UNINTERRUPTIBLE:
+ stats->nr_uninterruptible++;
+ break;
+ case TASK_STOPPED:
+ stats->nr_stopped++;
+ break;
+ default:
+ if (delayacct_is_task_waiting_on_io(tsk))
+ stats->nr_io_wait++;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Initially we receive a position value that corresponds to
+ * one more than the last pid shown (or 0 on the first call or
+ * after a seek to the start). Use a binary-search to find the
+ * next pid to display, if any
+ */
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
+ int index = 0, pid = *pos;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
+
+ if (pid) {
+ int end = l->length;
+
+ while (index < end) {
+ int mid = (index + end) / 2;
+ if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
+ index = mid;
+ break;
+ } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
+ index = mid + 1;
+ else
+ end = mid;
+ }
+ }
+ /* If we're off the end of the array, we're done */
+ if (index >= l->length)
+ return NULL;
+ /* Update the abstract position to be the actual pid that we found */
+ iter = l->list + index;
+ *pos = cgroup_pid_fry(cgrp, *iter);
+ return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
+ /*
+ * Advance to the next pid in the array. If this goes off the
+ * end, we're done
+ */
+ p++;
+ if (p >= end) {
+ return NULL;
+ } else {
+ *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
+ return p;
+ }
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ return 0;
+}
+
+/* cgroup core interface files for the default hierarchy */
+static struct cftype cgroup_dfl_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .seq_show = cgroup_subtree_control_show,
+ .write = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.populated",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_populated_show,
+ },
+ { } /* terminate */
+};
+
+/* cgroup core interface files for the legacy hierarchies */
+static struct cftype cgroup_legacy_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "tasks",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "notify_on_release",
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ },
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
+};
+
+/**
+ * cgroup_populate_dir - create subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be added
+ *
+ * On failure, no file is added.
+ */
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+{
+ struct cgroup_subsys *ss;
+ int i, ret = 0;
+
+ /* process cftsets of each subsystem */
+ for_each_subsys(ss, i) {
+ struct cftype *cfts;
+
+ if (!(subsys_mask & (1 << i)))
+ continue;
+
+ list_for_each_entry(cfts, &ss->cfts, node) {
+ ret = cgroup_addrm_files(cgrp, cfts, true);
+ if (ret < 0)
+ goto err;
+ }
+ }
+ return 0;
+err:
+ cgroup_clear_dir(cgrp, subsys_mask);
+ return ret;
+}
+
+/*
+ * css destruction is four-stage process.
+ *
+ * 1. Destruction starts. Killing of the percpu_ref is initiated.
+ * Implemented in kill_css().
+ *
+ * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
+ * and thus css_tryget_online() is guaranteed to fail, the css can be
+ * offlined by invoking offline_css(). After offlining, the base ref is
+ * put. Implemented in css_killed_work_fn().
+ *
+ * 3. When the percpu_ref reaches zero, the only possible remaining
+ * accessors are inside RCU read sections. css_release() schedules the
+ * RCU callback.
+ *
+ * 4. After the grace period, the css can be freed. Implemented in
+ * css_free_work_fn().
+ *
+ * It is actually hairier because both step 2 and 4 require process context
+ * and thus involve punting to css->destroy_work adding two additional
+ * steps to the already complex sequence.
+ */
+static void css_free_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ percpu_ref_exit(&css->refcnt);
+
+ if (ss) {
+ /* css free path */
+ int id = css->id;
+
+ if (css->parent)
+ css_put(css->parent);
+
+ ss->css_free(css);
+ cgroup_idr_remove(&ss->css_idr, id);
+ cgroup_put(cgrp);
+ } else {
+ /* cgroup free path */
+ atomic_dec(&cgrp->root->nr_cgrps);
+ cgroup_pidlist_destroy_all(cgrp);
+ cancel_work_sync(&cgrp->release_agent_work);
+
+ if (cgroup_parent(cgrp)) {
+ /*
+ * We get a ref to the parent, and put the ref when
+ * this cgroup is being freed, so it's guaranteed
+ * that the parent won't be destroyed before its
+ * children.
+ */
+ cgroup_put(cgroup_parent(cgrp));
+ kernfs_put(cgrp->kn);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is root cgroup's refcnt reaching zero,
+ * which indicates that the root should be
+ * released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
+ }
+}
+
+static void css_free_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct cgroup_subsys_state *css =
+ container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
+
+ INIT_WORK(&css->destroy_work, css_free_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
+
+static void css_release_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ mutex_lock(&cgroup_mutex);
+
+ css->flags |= CSS_RELEASED;
+ list_del_rcu(&css->sibling);
+
+ if (ss) {
+ /* css release path */
+ cgroup_idr_replace(&ss->css_idr, NULL, css->id);
+ if (ss->css_released)
+ ss->css_released(css);
+ } else {
+ /* cgroup release path */
+ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ cgrp->id = -1;
+
+ /*
+ * There are two control paths which try to determine
+ * cgroup from dentry without going through kernfs -
+ * cgroupstats_build() and css_tryget_online_from_dir().
+ * Those are supported by RCU protecting clearing of
+ * cgrp->kn->priv backpointer.
+ */
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+ }
+
+ mutex_unlock(&cgroup_mutex);
+
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+}
+
+static void css_release(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
+
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_get(cgrp);
+
+ memset(css, 0, sizeof(*css));
+ css->cgroup = cgrp;
+ css->ss = ss;
+ INIT_LIST_HEAD(&css->sibling);
+ INIT_LIST_HEAD(&css->children);
+ css->serial_nr = css_serial_nr_next++;
+
+ if (cgroup_parent(cgrp)) {
+ css->parent = cgroup_css(cgroup_parent(cgrp), ss);
+ css_get(css->parent);
+ }
+
+ BUG_ON(cgroup_css(cgrp, ss));
+}
+
+/* invoke ->css_online() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (ss->css_online)
+ ret = ss->css_online(css);
+ if (!ret) {
+ css->flags |= CSS_ONLINE;
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+ }
+ return ret;
+}
+
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
+static void offline_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ if (ss->css_offline)
+ ss->css_offline(css);
+
+ css->flags &= ~CSS_ONLINE;
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
+}
+
+/**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
+ * @visible: whether to create control knobs for the new css or not
+ *
+ * Create a new css associated with @cgrp - @ss pair. On success, the new
+ * css is online and installed in @cgrp with all interface files created if
+ * @visible. Returns 0 on success, -errno on failure.
+ */
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
+ struct cgroup_subsys_state *css;
+ int err;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ css = ss->css_alloc(parent_css);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+
+ init_and_link_css(css, ss, cgrp);
+
+ err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
+ if (err)
+ goto err_free_css;
+
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (err < 0)
+ goto err_free_percpu_ref;
+ css->id = err;
+
+ if (visible) {
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+ }
+
+ /* @css is ready to be brought online now, make it visible */
+ list_add_tail_rcu(&css->sibling, &parent_css->children);
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
+
+ err = online_css(css);
+ if (err)
+ goto err_list_del;
+
+ if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+ cgroup_parent(parent)) {
+ pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
+ if (!strcmp(ss->name, "memory"))
+ pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
+ ss->warned_broken_hierarchy = true;
+ }
+
+ return 0;
+
+err_list_del:
+ list_del_rcu(&css->sibling);
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+err_free_id:
+ cgroup_idr_remove(&ss->css_idr, css->id);
+err_free_percpu_ref:
+ percpu_ref_exit(&css->refcnt);
+err_free_css:
+ call_rcu(&css->rcu_head, css_free_rcu_fn);
+ return err;
+}
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct cgroup *parent, *cgrp;
+ struct cgroup_root *root;
+ struct cgroup_subsys *ss;
+ struct kernfs_node *kn;
+ struct cftype *base_files;
+ int ssid, ret;
+
+ /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
+ */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = cgroup_kn_lock_live(parent_kn);
+ if (!parent)
+ return -ENODEV;
+ root = parent->root;
+
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
+ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
+ if (!cgrp) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
+ if (ret)
+ goto out_free_cgrp;
+
+ /*
+ * Temporarily set the pointer to NULL, so idr_find() won't return
+ * a half-baked cgroup.
+ */
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ if (cgrp->id < 0) {
+ ret = -ENOMEM;
+ goto out_cancel_ref;
+ }
+
+ init_cgroup_housekeeping(cgrp);
+
+ cgrp->self.parent = &parent->self;
+ cgrp->root = root;
+
+ if (notify_on_release(parent))
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
+
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_free_id;
+ }
+ cgrp->kn = kn;
+
+ /*
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
+ */
+ kernfs_get(kn);
+
+ cgrp->self.serial_nr = css_serial_nr_next++;
+
+ /* allocation complete, commit to creation */
+ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get(parent);
+
+ /*
+ * @cgrp is now fully operational. If something fails after this
+ * point, it'll be released via the normal destruction path.
+ */
+ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
+
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ if (cgroup_on_dfl(cgrp))
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(cgrp, base_files, true);
+ if (ret)
+ goto out_destroy;
+
+ /* let's create and online css's */
+ for_each_subsys(ss, ssid) {
+ if (parent->child_subsys_mask & (1 << ssid)) {
+ ret = create_css(cgrp, ss,
+ parent->subtree_control & (1 << ssid));
+ if (ret)
+ goto out_destroy;
+ }
+ }
+
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * subtree_control from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->subtree_control = parent->subtree_control;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ }
+
+ kernfs_activate(kn);
+
+ ret = 0;
+ goto out_unlock;
+
+out_free_id:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_cancel_ref:
+ percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
+ kfree(cgrp);
+out_unlock:
+ cgroup_kn_unlock(parent_kn);
+ return ret;
+
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+ goto out_unlock;
+}
+
+/*
+ * This is called when the refcnt of a css is confirmed to be killed.
+ * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
+ * initate destruction and put the css ref from kill_css().
+ */
+static void css_killed_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+
+ mutex_lock(&cgroup_mutex);
+ offline_css(css);
+ mutex_unlock(&cgroup_mutex);
+
+ css_put(css);
+}
+
+/* css kill confirmation processing requires process context, bounce */
+static void css_killed_ref_fn(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+}
+
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference. ->css_offline() will be invoked
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
+ */
+static void kill_css(struct cgroup_subsys_state *css)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
+ */
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+
+ /*
+ * Killing would put the base ref, but we need to keep it alive
+ * until after ->css_offline().
+ */
+ css_get(css);
+
+ /*
+ * cgroup core guarantees that, by the time ->css_offline() is
+ * invoked, no new css reference will be given out via
+ * css_tryget_online(). We can't simply call percpu_ref_kill() and
+ * proceed to offlining css's because percpu_ref_kill() doesn't
+ * guarantee that the ref is seen as killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each
+ * css is confirmed to be seen as killed on all CPUs.
+ */
+ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected. Also, cgroup core needs to
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked. To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
+ * userland visible parts and start killing the percpu refcnts of
+ * css's. Set up so that the next stage will be kicked off once all
+ * the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ * rest of destruction. Once all cgroup references are gone, the
+ * cgroup is RCU-freed.
+ *
+ * This function implements s1. After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created. As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct cgroup_subsys_state *css;
+ bool empty;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * css_set_rwsem synchronizes access to ->cset_links and prevents
+ * @cgrp from being removed while put_css_set() is in progress.
+ */
+ down_read(&css_set_rwsem);
+ empty = list_empty(&cgrp->cset_links);
+ up_read(&css_set_rwsem);
+ if (!empty)
+ return -EBUSY;
+
+ /*
+ * Make sure there's no live children. We can't test emptiness of
+ * ->self.children as dead children linger on it while being
+ * drained; otherwise, "rmdir parent/child parent" may fail.
+ */
+ if (css_has_online_children(&cgrp->self))
+ return -EBUSY;
+
+ /*
+ * Mark @cgrp dead. This prevents further task migration and child
+ * creation by disabling cgroup_lock_live_group().
+ */
+ cgrp->self.flags &= ~CSS_ONLINE;
+
+ /* initiate massacre of all css's */
+ for_each_css(css, ssid, cgrp)
+ kill_css(css);
+
+ /*
+ * Remove @cgrp directory along with the base files. @cgrp has an
+ * extra ref on its kn.
+ */
+ kernfs_remove(cgrp->kn);
+
+ check_for_release(cgroup_parent(cgrp));
+
+ /* put the base reference */
+ percpu_ref_kill(&cgrp->self.refcnt);
+
+ return 0;
+};
+
+static int cgroup_rmdir(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ cgrp = cgroup_kn_lock_live(kn);
+ if (!cgrp)
+ return 0;
+
+ ret = cgroup_destroy_locked(cgrp);
+
+ cgroup_kn_unlock(kn);
+ return ret;
+}
+
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .remount_fs = cgroup_remount,
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .rename = cgroup_rename,
+};
+
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
+{
+ struct cgroup_subsys_state *css;
+
+ printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+
+ mutex_lock(&cgroup_mutex);
+
+ idr_init(&ss->css_idr);
+ INIT_LIST_HEAD(&ss->cfts);
+
+ /* Create the root cgroup state for this subsystem */
+ ss->root = &cgrp_dfl_root;
+ css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ /* We don't handle early failures gracefully */
+ BUG_ON(IS_ERR(css));
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+
+ /*
+ * Root csses are never destroyed and we can't initialize
+ * percpu_ref during early init. Disable refcnting.
+ */
+ css->flags |= CSS_NO_REF;
+
+ if (early) {
+ /* allocation can't be done safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ }
+
+ /* Update the init_css_set to contain a subsys
+ * pointer to this state - since the subsystem is
+ * newly registered, all tasks and hence the
+ * init_css_set is in the subsystem's root cgroup. */
+ init_css_set.subsys[ss->id] = css;
+
+ need_forkexit_callback |= ss->fork || ss->exit;
+
+ /* At system boot, before all subsystems have been
+ * registered, no tasks have been forked, so we don't
+ * need to invoke fork callbacks here. */
+ BUG_ON(!list_empty(&init_task.tasks));
+
+ BUG_ON(online_css(css));
+
+ mutex_unlock(&cgroup_mutex);
+}
+
+/**
+ * cgroup_init_early - cgroup initialization at system boot
+ *
+ * Initialize cgroups at system boot, and initialize any
+ * subsystems that request early init.
+ */
+int __init cgroup_init_early(void)
+{
+ static struct cgroup_sb_opts __initdata opts;
+ struct cgroup_subsys *ss;
+ int i;
+
+ init_cgroup_root(&cgrp_dfl_root, &opts);
+ cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
+
+ RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
+
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
+
+ if (ss->early_init)
+ cgroup_init_subsys(ss, true);
+ }
+ return 0;
+}
+
+/**
+ * cgroup_init - cgroup initialization
+ *
+ * Register cgroup filesystem and /proc file, and initialize
+ * any subsystems that didn't request early init.
+ */
+int __init cgroup_init(void)
+{
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid, err;
+
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+
+ mutex_lock(&cgroup_mutex);
+
+ /* Add init_css_set to the hash table */
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
+
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+
+ mutex_unlock(&cgroup_mutex);
+
+ for_each_subsys(ss, ssid) {
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
+
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
+
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
+
+ /*
+ * Setting dfl_root subsys_mask needs to consider the
+ * disabled flag and cftype registration needs kmalloc,
+ * both of which aren't available during early_init.
+ */
+ if (ss->disabled)
+ continue;
+
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+
+ if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+ ss->dfl_cftypes = ss->legacy_cftypes;
+
+ if (!ss->dfl_cftypes)
+ cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+
+ if (ss->dfl_cftypes == ss->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+ } else {
+ WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
+ }
+
+ if (ss->bind)
+ ss->bind(init_css_set.subsys[ssid]);
+ }
+
+ err = sysfs_create_mount_point(fs_kobj, "cgroup");
+ if (err)
+ return err;
+
+ err = register_filesystem(&cgroup_fs_type);
+ if (err < 0) {
+ sysfs_remove_mount_point(fs_kobj, "cgroup");
+ return err;
+ }
+
+ proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+ return 0;
+}
+
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
+
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+
+ return 0;
+}
+core_initcall(cgroup_wq_init);
+
+/*
+ * proc_cgroup_show()
+ * - Print task's cgroup paths into seq_file, one line for each hierarchy
+ * - Used for /proc/<pid>/cgroup.
+ */
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf, *path;
+ int retval;
+ struct cgroup_root *root;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ mutex_lock(&cgroup_mutex);
+ down_read(&css_set_rwsem);
+
+ for_each_root(root) {
+ struct cgroup_subsys *ss;
+ struct cgroup *cgrp;
+ int ssid, count = 0;
+
+ if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ continue;
+
+ seq_printf(m, "%d:", root->hierarchy_id);
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (strlen(root->name))
+ seq_printf(m, "%sname=%s", count ? "," : "",
+ root->name);
+ seq_putc(m, ':');
+ cgrp = task_cgroup_from_root(tsk, root);
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
+ goto out_unlock;
+ }
+ seq_puts(m, path);
+ seq_putc(m, '\n');
+ }
+
+ retval = 0;
+out_unlock:
+ up_read(&css_set_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ kfree(buf);
+out:
+ return retval;
+}
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
+ mutex_lock(&cgroup_mutex);
+
+ for_each_subsys(ss, i)
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+static const struct file_operations proc_cgroupstats_operations = {
+ .open = cgroupstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * cgroup_fork - initialize cgroup related fields during copy_process()
+ * @child: pointer to task_struct of forking parent process.
+ *
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the parent's css_set. Empty cg_list indicates that
+ * @child isn't holding reference to its css_set.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
+ INIT_LIST_HEAD(&child->cg_list);
+}
+
+/**
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_task_iter_start() - to guarantee that the new task ends up on its
+ * list.
+ */
+void cgroup_post_fork(struct task_struct *child)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ /*
+ * This may race against cgroup_enable_task_cg_lists(). As that
+ * function sets use_task_css_set_links before grabbing
+ * tasklist_lock and we just went through tasklist_lock to add
+ * @child, it's guaranteed that either we see the set
+ * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
+ * @child during its iteration.
+ *
+ * If we won the race, @child is associated with %current's
+ * css_set. Grabbing css_set_rwsem guarantees both that the
+ * association is stable, and, on completion of the parent's
+ * migration, @child is visible in the source of migration or
+ * already in the destination cgroup. This guarantee is necessary
+ * when implementing operations which need to migrate all tasks of
+ * a cgroup to another.
+ *
+ * Note that if we lose to cgroup_enable_task_cg_lists(), @child
+ * will remain in init_css_set. This is safe because all tasks are
+ * in the init_css_set before cg_links is enabled and there's no
+ * operation which transfers all tasks out of init_css_set.
+ */
+ if (use_task_css_set_links) {
+ struct css_set *cset;
+
+ down_write(&css_set_rwsem);
+ cset = task_css_set(current);
+ if (list_empty(&child->cg_list)) {
+ rcu_assign_pointer(child->cgroups, cset);
+ list_add(&child->cg_list, &cset->tasks);
+ get_css_set(cset);
+ }
+ up_write(&css_set_rwsem);
+ }
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for_each_subsys(ss, i)
+ if (ss->fork)
+ ss->fork(child);
+ }
+}
+
+/**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cgroup from @tsk and release it.
+ *
+ * Note that cgroups marked notify_on_release force every task in
+ * them to take the global cgroup_mutex mutex when exiting.
+ * This could impact scaling on very large systems. Be reluctant to
+ * use notify_on_release cgroups where very high task exit scaling
+ * is required on large systems.
+ *
+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
+ * call cgroup_exit() while the task is still competent to handle
+ * notify_on_release(), then leave the task attached to the root cgroup in
+ * each hierarchy for the remainder of its exit. No need to bother with
+ * init_css_set refcnting. init_css_set never goes away and we can't race
+ * with migration path - PF_EXITING is visible to migration path.
+ */
+void cgroup_exit(struct task_struct *tsk)
+{
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ bool put_cset = false;
+ int i;
+
+ /*
+ * Unlink from @tsk from its css_set. As migration path can't race
+ * with us, we can check cg_list without grabbing css_set_rwsem.
+ */
+ if (!list_empty(&tsk->cg_list)) {
+ down_write(&css_set_rwsem);
+ list_del_init(&tsk->cg_list);
+ up_write(&css_set_rwsem);
+ put_cset = true;
+ }
+
+ /* Reassign the task to the init_css_set. */
+ cset = task_css_set(tsk);
+ RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
+
+ if (need_forkexit_callback) {
+ /* see cgroup_post_fork() for details */
+ for_each_subsys(ss, i) {
+ if (ss->exit) {
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
+
+ ss->exit(css, old_css, tsk);
+ }
+ }
+ }
+
+ if (put_cset)
+ put_css_set(cset);
+}
+
+static void check_for_release(struct cgroup *cgrp)
+{
+ if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+static void cgroup_release_agent(struct work_struct *work)
+{
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *argv[3], *envp[3];
+
+ mutex_lock(&cgroup_mutex);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = path;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
+}
+
+static int __init cgroup_disable(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ for_each_subsys(ss, i) {
+ if (!strcmp(token, ss->name)) {
+ ss->disabled = 1;
+ printk(KERN_INFO "Disabling %s control group"
+ " subsystem\n", ss->name);
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("cgroup_disable=", cgroup_disable);
+
+static int __init cgroup_set_legacy_files_on_dfl(char *str)
+{
+ printk("cgroup: using legacy files on the default hierarchy\n");
+ cgroup_legacy_files_on_dfl = true;
+ return 0;
+}
+__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
+
+/**
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
+ * @dentry: directory dentry of interest
+ * @ss: subsystem of interest
+ *
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
+ */
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup_subsys_state *css = NULL;
+ struct cgroup *cgrp;
+
+ /* is @dentry a cgroup dir? */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return ERR_PTR(-EBADF);
+
+ rcu_read_lock();
+
+ /*
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See css_release_work_fn() for details.
+ */
+ cgrp = rcu_dereference(kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
+
+ if (!css || !css_tryget_online(css))
+ css = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+ return css;
+}
+
+/**
+ * css_from_id - lookup css by id
+ * @id: the cgroup id
+ * @ss: cgroup subsys to be looked into
+ *
+ * Returns the css if there's valid one with @id, otherwise returns NULL.
+ * Should be called under rcu_read_lock().
+ */
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+}
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ down_read(&css_set_rwsem);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ up_read(&css_set_rwsem);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+
+ down_read(&css_set_rwsem);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+
+ seq_printf(seq, "css_set %p\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
+ }
+ up_read(&css_set_rwsem);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_has_tasks(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 000000000..92b98cc0e
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,484 @@
+/*
+ * cgroup_freezer.c - control group freezer subsystem
+ *
+ * Copyright IBM Corporation, 2007
+ *
+ * Author : Cedric Le Goater <clg@fr.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
+};
+
+struct freezer {
+ struct cgroup_subsys_state css;
+ unsigned int state;
+};
+
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct freezer, css) : NULL;
+}
+
+static inline struct freezer *task_freezer(struct task_struct *task)
+{
+ return css_freezer(task_css(task, freezer_cgrp_id));
+}
+
+static struct freezer *parent_freezer(struct freezer *freezer)
+{
+ return css_freezer(freezer->css.parent);
+}
+
+bool cgroup_freezing(struct task_struct *task)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
+};
+
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct freezer *freezer;
+
+ freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+ if (!freezer)
+ return ERR_PTR(-ENOMEM);
+
+ return &freezer->css;
+}
+
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
+
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_count if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
+ atomic_dec(&system_freezing_cnt);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
+}
+
+static void freezer_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_freezer(css));
+}
+
+/*
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
+ */
+static void freezer_attach(struct cgroup_subsys_state *new_css,
+ struct cgroup_taskset *tset)
+{
+ struct freezer *freezer = css_freezer(new_css);
+ struct task_struct *task;
+ bool clear_frozen = false;
+
+ mutex_lock(&freezer_mutex);
+
+ /*
+ * Make the new tasks conform to the current state of @new_css.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
+ *
+ * Tasks in @tset are on @new_css but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
+ */
+ cgroup_taskset_for_each(task, tset) {
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = true;
+ }
+ }
+
+ /* propagate FROZEN clearing upwards */
+ while (clear_frozen && (freezer = parent_freezer(freezer))) {
+ freezer->state &= ~CGROUP_FROZEN;
+ clear_frozen = freezer->state & CGROUP_FREEZING;
+ }
+
+ mutex_unlock(&freezer_mutex);
+}
+
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
+static void freezer_fork(struct task_struct *task)
+{
+ struct freezer *freezer;
+
+ /*
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
+ */
+ if (task_css_is_root(task, freezer_cgrp_id))
+ return;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
+ freeze_task(task);
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+}
+
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @css: css of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @css, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
+ */
+static void update_if_frozen(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+ struct cgroup_subsys_state *pos;
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ return;
+
+ /* are all (live) children frozen? */
+ rcu_read_lock();
+ css_for_each_child(pos, css) {
+ struct freezer *child = css_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ /* are all tasks frozen? */
+ css_task_iter_start(css, &it);
+
+ while ((task = css_task_iter_next(&it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
+ }
+
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
+ css_task_iter_end(&it);
+}
+
+static int freezer_read(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(m), *pos;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ /* update states bottom-up */
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ update_if_frozen(pos);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+
+ seq_puts(m, freezer_state_strs(css_freezer(css)->state));
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void freeze_cgroup(struct freezer *freezer)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ freeze_task(task);
+ css_task_iter_end(&it);
+}
+
+static void unfreeze_cgroup(struct freezer *freezer)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&freezer->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ __thaw_task(task);
+ css_task_iter_end(&it);
+}
+
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
+{
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
+
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
+ atomic_inc(&system_freezing_cnt);
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
+ }
+}
+
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
+{
+ struct cgroup_subsys_state *pos;
+
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, &freezer->css) {
+ struct freezer *pos_f = css_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
+
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ if (pos_f == freezer)
+ freezer_apply_state(pos_f, freeze,
+ CGROUP_FREEZING_SELF);
+ else
+ freezer_apply_state(pos_f,
+ parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+}
+
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ bool freeze;
+
+ buf = strstrip(buf);
+
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
+ else
+ return -EINVAL;
+
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "state",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = freezer_read,
+ .write = freezer_write,
+ },
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys freezer_cgrp_subsys = {
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
+ .attach = freezer_attach,
+ .fork = freezer_fork,
+ .legacy_cftypes = files,
+};
diff --git a/kernel/compat.c b/kernel/compat.c
new file mode 100644
index 000000000..333d364be
--- /dev/null
+++ b/kernel/compat.c
@@ -0,0 +1,1176 @@
+/*
+ * linux/kernel/compat.c
+ *
+ * Kernel compatibililty routines for e.g. 32 bit syscall support
+ * on 64 bit kernels.
+ *
+ * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/signal.h>
+#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/security.h>
+#include <linux/timex.h>
+#include <linux/export.h>
+#include <linux/migrate.h>
+#include <linux/posix-timers.h>
+#include <linux/times.h>
+#include <linux/ptrace.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
+{
+ memset(txc, 0, sizeof(struct timex));
+
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+ __get_user(txc->modes, &utp->modes) ||
+ __get_user(txc->offset, &utp->offset) ||
+ __get_user(txc->freq, &utp->freq) ||
+ __get_user(txc->maxerror, &utp->maxerror) ||
+ __get_user(txc->esterror, &utp->esterror) ||
+ __get_user(txc->status, &utp->status) ||
+ __get_user(txc->constant, &utp->constant) ||
+ __get_user(txc->precision, &utp->precision) ||
+ __get_user(txc->tolerance, &utp->tolerance) ||
+ __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc->tick, &utp->tick) ||
+ __get_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc->jitter, &utp->jitter) ||
+ __get_user(txc->shift, &utp->shift) ||
+ __get_user(txc->stabil, &utp->stabil) ||
+ __get_user(txc->jitcnt, &utp->jitcnt) ||
+ __get_user(txc->calcnt, &utp->calcnt) ||
+ __get_user(txc->errcnt, &utp->errcnt) ||
+ __get_user(txc->stbcnt, &utp->stbcnt))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
+{
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+ __put_user(txc->modes, &utp->modes) ||
+ __put_user(txc->offset, &utp->offset) ||
+ __put_user(txc->freq, &utp->freq) ||
+ __put_user(txc->maxerror, &utp->maxerror) ||
+ __put_user(txc->esterror, &utp->esterror) ||
+ __put_user(txc->status, &utp->status) ||
+ __put_user(txc->constant, &utp->constant) ||
+ __put_user(txc->precision, &utp->precision) ||
+ __put_user(txc->tolerance, &utp->tolerance) ||
+ __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc->tick, &utp->tick) ||
+ __put_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc->jitter, &utp->jitter) ||
+ __put_user(txc->shift, &utp->shift) ||
+ __put_user(txc->stabil, &utp->stabil) ||
+ __put_user(txc->jitcnt, &utp->jitcnt) ||
+ __put_user(txc->calcnt, &utp->calcnt) ||
+ __put_user(txc->errcnt, &utp->errcnt) ||
+ __put_user(txc->stbcnt, &utp->stbcnt) ||
+ __put_user(txc->tai, &utp->tai))
+ return -EFAULT;
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
+{
+ if (tv) {
+ struct timeval ktv;
+ do_gettimeofday(&ktv);
+ if (compat_put_timeval(&ktv, tv))
+ return -EFAULT;
+ }
+ if (tz) {
+ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+ struct timezone __user *, tz)
+{
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
+
+ if (tv) {
+ if (compat_get_timeval(&user_tv, tv))
+ return -EFAULT;
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+ }
+ if (tz) {
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
+ return -EFAULT;
+ }
+
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+}
+
+static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+ __get_user(tv->tv_sec, &ctv->tv_sec) ||
+ __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+ __put_user(tv->tv_sec, &ctv->tv_sec) ||
+ __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+
+static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
+{
+ return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
+ __get_user(ts->tv_sec, &cts->tv_sec) ||
+ __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
+{
+ return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
+ __put_user(ts->tv_sec, &cts->tv_sec) ||
+ __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+}
+
+int compat_get_timeval(struct timeval *tv, const void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_get_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_get_timeval);
+
+int compat_put_timeval(const struct timeval *tv, void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
+ else
+ return __compat_put_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_put_timeval);
+
+int compat_get_timespec(struct timespec *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec);
+
+int compat_put_timespec(const struct timespec *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec);
+
+int compat_convert_timespec(struct timespec __user **kts,
+ const void __user *cts)
+{
+ struct timespec ts;
+ struct timespec __user *uts;
+
+ if (!cts || COMPAT_USE_64BIT_TIME) {
+ *kts = (struct timespec __user *)cts;
+ return 0;
+ }
+
+ uts = compat_alloc_user_space(sizeof(ts));
+ if (!uts)
+ return -EFAULT;
+ if (compat_get_timespec(&ts, cts))
+ return -EFAULT;
+ if (copy_to_user(uts, &ts, sizeof(ts)))
+ return -EFAULT;
+
+ *kts = uts;
+ return 0;
+}
+
+static long compat_nanosleep_restart(struct restart_block *restart)
+{
+ struct compat_timespec __user *rmtp;
+ struct timespec rmt;
+ mm_segment_t oldfs;
+ long ret;
+
+ restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = hrtimer_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if (ret == -ERESTART_RESTARTBLOCK) {
+ rmtp = restart->nanosleep.compat_rmtp;
+
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
+{
+ struct timespec tu, rmt;
+ mm_segment_t oldfs;
+ long ret;
+
+ if (compat_get_timespec(&tu, rqtp))
+ return -EFAULT;
+
+ if (!timespec_valid(&tu))
+ return -EINVAL;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = hrtimer_nanosleep(&tu,
+ rmtp ? (struct timespec __user *)&rmt : NULL,
+ HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ set_fs(oldfs);
+
+ /*
+ * hrtimer_nanosleep() can only return 0 or
+ * -ERESTART_RESTARTBLOCK here because:
+ *
+ * - we call it with HRTIMER_MODE_REL and therefor exclude the
+ * -ERESTARTNOHAND return path.
+ *
+ * - we supply the rmtp argument from the task stack (due to
+ * the necessary compat conversion. So the update cannot
+ * fail, which excludes the -EFAULT return path as well. If
+ * it fails nevertheless we have a bigger problem and wont
+ * reach this place anymore.
+ *
+ * - if the return value is 0, we do not have to update rmtp
+ * because there is no remaining time.
+ *
+ * We check for -ERESTART_RESTARTBLOCK nevertheless if the
+ * core implementation decides to return random nonsense.
+ */
+ if (ret == -ERESTART_RESTARTBLOCK) {
+ struct restart_block *restart = &current->restart_block;
+
+ restart->fn = compat_nanosleep_restart;
+ restart->nanosleep.compat_rmtp = rmtp;
+
+ if (rmtp && compat_put_timespec(&rmt, rmtp))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+static inline long get_compat_itimerval(struct itimerval *o,
+ struct compat_itimerval __user *i)
+{
+ return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
+ (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
+ __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
+ __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
+ __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
+}
+
+static inline long put_compat_itimerval(struct compat_itimerval __user *o,
+ struct itimerval *i)
+{
+ return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
+ (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
+ __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
+ __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
+ __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
+}
+
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
+{
+ struct itimerval kit;
+ int error;
+
+ error = do_getitimer(which, &kit);
+ if (!error && put_compat_itimerval(it, &kit))
+ error = -EFAULT;
+ return error;
+}
+
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
+{
+ struct itimerval kin, kout;
+ int error;
+
+ if (in) {
+ if (get_compat_itimerval(&kin, in))
+ return -EFAULT;
+ } else
+ memset(&kin, 0, sizeof(kin));
+
+ error = do_setitimer(which, &kin, out ? &kout : NULL);
+ if (error || !out)
+ return error;
+ if (put_compat_itimerval(out, &kout))
+ return -EFAULT;
+ return 0;
+}
+
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+ return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
+{
+ if (tbuf) {
+ struct tms tms;
+ struct compat_tms tmp;
+
+ do_sys_times(&tms);
+ /* Convert our struct tms to the compat version. */
+ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+ tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+ tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+ tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
+ if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+ return -EFAULT;
+ }
+ force_successful_syscall_return();
+ return compat_jiffies_to_clock_t(jiffies);
+}
+
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
+/*
+ * Assumption: old_sigset_t and compat_old_sigset_t are both
+ * types that can be passed to put_user()/get_user().
+ */
+
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
+{
+ old_sigset_t s;
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_sigpending((old_sigset_t __user *) &s);
+ set_fs(old_fs);
+ if (ret == 0)
+ ret = put_user(s, set);
+ return ret;
+}
+
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
+{
+ memcpy(blocked->sig, &set, sizeof(set));
+}
+
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+#endif
+
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+
+ if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
+ __get_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __get_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+
+ if (r.rlim_cur == COMPAT_RLIM_INFINITY)
+ r.rlim_cur = RLIM_INFINITY;
+ if (r.rlim_max == COMPAT_RLIM_INFINITY)
+ r.rlim_max = RLIM_INFINITY;
+ return do_prlimit(current, resource, &r, NULL);
+}
+
+#ifdef COMPAT_RLIM_OLD_INFINITY
+
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
+ set_fs(old_fs);
+
+ if (!ret) {
+ if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
+ r.rlim_cur = COMPAT_RLIM_INFINITY;
+ if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
+ r.rlim_max = COMPAT_RLIM_INFINITY;
+
+ if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+ __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+#endif
+
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ int ret;
+
+ ret = do_prlimit(current, resource, NULL, &r);
+ if (!ret) {
+ if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+ r.rlim_cur = COMPAT_RLIM_INFINITY;
+ if (r.rlim_max > COMPAT_RLIM_INFINITY)
+ r.rlim_max = COMPAT_RLIM_INFINITY;
+
+ if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
+ __put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ __put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
+{
+ if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
+ __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
+ __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
+ __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
+ __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
+ __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
+ __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
+ __put_user(r->ru_idrss, &ru->ru_idrss) ||
+ __put_user(r->ru_isrss, &ru->ru_isrss) ||
+ __put_user(r->ru_minflt, &ru->ru_minflt) ||
+ __put_user(r->ru_majflt, &ru->ru_majflt) ||
+ __put_user(r->ru_nswap, &ru->ru_nswap) ||
+ __put_user(r->ru_inblock, &ru->ru_inblock) ||
+ __put_user(r->ru_oublock, &ru->ru_oublock) ||
+ __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
+ __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
+ __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
+ __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
+ __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+ return -EFAULT;
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
+{
+ if (!ru) {
+ return sys_wait4(pid, stat_addr, options, NULL);
+ } else {
+ struct rusage r;
+ int ret;
+ unsigned int status;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs (KERNEL_DS);
+ ret = sys_wait4(pid,
+ (stat_addr ?
+ (unsigned int __user *) &status : NULL),
+ options, (struct rusage __user *) &r);
+ set_fs (old_fs);
+
+ if (ret > 0) {
+ if (put_compat_rusage(&r, ru))
+ return -EFAULT;
+ if (stat_addr && put_user(status, stat_addr))
+ return -EFAULT;
+ }
+ return ret;
+ }
+}
+
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, uinfo, int, options,
+ struct compat_rusage __user *, uru)
+{
+ siginfo_t info;
+ struct rusage ru;
+ long ret;
+ mm_segment_t old_fs = get_fs();
+
+ memset(&info, 0, sizeof(info));
+
+ set_fs(KERNEL_DS);
+ ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
+ uru ? (struct rusage __user *)&ru : NULL);
+ set_fs(old_fs);
+
+ if ((ret < 0) || (info.si_signo == 0))
+ return ret;
+
+ if (uru) {
+ /* sys_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ ret = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ ret = put_compat_rusage(&ru, uru);
+ if (ret)
+ return -EFAULT;
+ }
+
+ BUG_ON(info.si_code & __SI_MASK);
+ info.si_code |= __SI_CHLD;
+ return copy_siginfo_to_user32(uinfo, &info);
+}
+
+static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
+ unsigned len, struct cpumask *new_mask)
+{
+ unsigned long *k;
+
+ if (len < cpumask_size())
+ memset(new_mask, 0, cpumask_size());
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
+ k = cpumask_bits(new_mask);
+ return compat_get_bitmap(k, user_mask_ptr, len * 8);
+}
+
+COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
+ unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval)
+ goto out;
+
+ retval = sched_setaffinity(pid, new_mask);
+out:
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
+ compat_ulong_t __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(compat_ulong_t)-1))
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
+int get_compat_itimerspec(struct itimerspec *dst,
+ const struct compat_itimerspec __user *src)
+{
+ if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
+ __compat_get_timespec(&dst->it_value, &src->it_value))
+ return -EFAULT;
+ return 0;
+}
+
+int put_compat_itimerspec(struct compat_itimerspec __user *dst,
+ const struct itimerspec *src)
+{
+ if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
+ __compat_put_timespec(&src->it_value, &dst->it_value))
+ return -EFAULT;
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+ struct compat_sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
+{
+ struct sigevent __user *event = NULL;
+
+ if (timer_event_spec) {
+ struct sigevent kevent;
+
+ event = compat_alloc_user_space(sizeof(*event));
+ if (get_compat_sigevent(&kevent, timer_event_spec) ||
+ copy_to_user(event, &kevent, sizeof(*event)))
+ return -EFAULT;
+ }
+
+ return sys_timer_create(which_clock, event, created_timer_id);
+}
+
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ struct compat_itimerspec __user *, new,
+ struct compat_itimerspec __user *, old)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct itimerspec newts, oldts;
+
+ if (!new)
+ return -EINVAL;
+ if (get_compat_itimerspec(&newts, new))
+ return -EFAULT;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_timer_settime(timer_id, flags,
+ (struct itimerspec __user *) &newts,
+ (struct itimerspec __user *) &oldts);
+ set_fs(oldfs);
+ if (!err && old && put_compat_itimerspec(old, &oldts))
+ return -EFAULT;
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct compat_itimerspec __user *, setting)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct itimerspec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_timer_gettime(timer_id,
+ (struct itimerspec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && put_compat_itimerspec(setting, &ts))
+ return -EFAULT;
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ if (compat_get_timespec(&ts, tp))
+ return -EFAULT;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_settime(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_gettime(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && compat_put_timespec(&ts, tp))
+ return -EFAULT;
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+ struct compat_timex __user *, utp)
+{
+ struct timex txc;
+ mm_segment_t oldfs;
+ int err, ret;
+
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+ set_fs(oldfs);
+
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec ts;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_getres(which_clock,
+ (struct timespec __user *) &ts);
+ set_fs(oldfs);
+ if (!err && tp && compat_put_timespec(&ts, tp))
+ return -EFAULT;
+ return err;
+}
+
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec tu;
+ struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
+
+ restart->nanosleep.rmtp = (struct timespec __user *) &tu;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = clock_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ compat_put_timespec(&tu, rmtp))
+ return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->nanosleep.compat_rmtp = rmtp;
+ }
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+ struct compat_timespec __user *, rqtp,
+ struct compat_timespec __user *, rmtp)
+{
+ long err;
+ mm_segment_t oldfs;
+ struct timespec in, out;
+ struct restart_block *restart;
+
+ if (compat_get_timespec(&in, rqtp))
+ return -EFAULT;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_clock_nanosleep(which_clock, flags,
+ (struct timespec __user *) &in,
+ (struct timespec __user *) &out);
+ set_fs(oldfs);
+
+ if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+ compat_put_timespec(&out, rmtp))
+ return -EFAULT;
+
+ if (err == -ERESTART_RESTARTBLOCK) {
+ restart = &current->restart_block;
+ restart->fn = compat_clock_nanosleep_restart;
+ restart->nanosleep.compat_rmtp = rmtp;
+ }
+ return err;
+}
+
+/*
+ * We currently only need the following fields from the sigevent
+ * structure: sigev_value, sigev_signo, sig_notify and (sometimes
+ * sigev_notify_thread_id). The others are handled in user mode.
+ * We also assume that copying sigev_value.sival_int is sufficient
+ * to keep all the bits of sigev_value.sival_ptr intact.
+ */
+int get_compat_sigevent(struct sigevent *event,
+ const struct compat_sigevent __user *u_event)
+{
+ memset(event, 0, sizeof(*event));
+ return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+ __get_user(event->sigev_value.sival_int,
+ &u_event->sigev_value.sival_int) ||
+ __get_user(event->sigev_signo, &u_event->sigev_signo) ||
+ __get_user(event->sigev_notify, &u_event->sigev_notify) ||
+ __get_user(event->sigev_notify_thread_id,
+ &u_event->sigev_notify_thread_id))
+ ? -EFAULT : 0;
+}
+
+long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
+ unsigned long bitmap_size)
+{
+ int i, j;
+ unsigned long m;
+ compat_ulong_t um;
+ unsigned long nr_compat_longs;
+
+ /* align bitmap up to nearest compat_long_t boundary */
+ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+ if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
+ return -EFAULT;
+
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+ for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+ m = 0;
+
+ for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+ /*
+ * We dont want to read past the end of the userspace
+ * bitmap. We must however ensure the end of the
+ * kernel bitmap is zeroed.
+ */
+ if (nr_compat_longs) {
+ nr_compat_longs--;
+ if (__get_user(um, umask))
+ return -EFAULT;
+ } else {
+ um = 0;
+ }
+
+ umask++;
+ m |= (long)um << (j * BITS_PER_COMPAT_LONG);
+ }
+ *mask++ = m;
+ }
+
+ return 0;
+}
+
+long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
+ unsigned long bitmap_size)
+{
+ int i, j;
+ unsigned long m;
+ compat_ulong_t um;
+ unsigned long nr_compat_longs;
+
+ /* align bitmap up to nearest compat_long_t boundary */
+ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+ if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
+ return -EFAULT;
+
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+ for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+ m = *mask++;
+
+ for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+ um = m;
+
+ /*
+ * We dont want to write past the end of the userspace
+ * bitmap.
+ */
+ if (nr_compat_longs) {
+ nr_compat_longs--;
+ if (__put_user(um, umask))
+ return -EFAULT;
+ }
+
+ umask++;
+ m >>= 4*sizeof(um);
+ m >>= 4*sizeof(um);
+ }
+ }
+
+ return 0;
+}
+
+void
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+{
+ switch (_NSIG_WORDS) {
+ case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
+ case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
+ case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
+ case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+ }
+}
+EXPORT_SYMBOL_GPL(sigset_from_compat);
+
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+ compat_sigset_t s32;
+ sigset_t s;
+ struct timespec t;
+ siginfo_t info;
+ long ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&s, &s32);
+
+ if (uts) {
+ if (compat_get_timespec(&t, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+
+/* compat_time_t is a 32 bit "long" and needs to get converted. */
+
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
+{
+ compat_time_t i;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ i = tv.tv_sec;
+
+ if (tloc) {
+ if (put_user(i,tloc))
+ return -EFAULT;
+ }
+ force_successful_syscall_return();
+ return i;
+}
+
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
+{
+ struct timespec tv;
+ int err;
+
+ if (get_user(tv.tv_sec, tptr))
+ return -EFAULT;
+
+ tv.tv_nsec = 0;
+
+ err = security_settime(&tv, NULL);
+ if (err)
+ return err;
+
+ do_settimeofday(&tv);
+ return 0;
+}
+
+#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
+{
+ struct timex txc;
+ int err, ret;
+
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
+
+ ret = do_adjtimex(&txc);
+
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
+
+ return ret;
+}
+
+#ifdef CONFIG_NUMA
+COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
+ compat_uptr_t __user *, pages32,
+ const int __user *, nodes,
+ int __user *, status,
+ int, flags)
+{
+ const void __user * __user *pages;
+ int i;
+
+ pages = compat_alloc_user_space(nr_pages * sizeof(void *));
+ for (i = 0; i < nr_pages; i++) {
+ compat_uptr_t p;
+
+ if (get_user(p, pages32 + i) ||
+ put_user(compat_ptr(p), pages + i))
+ return -EFAULT;
+ }
+ return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+
+COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
+ compat_ulong_t, maxnode,
+ const compat_ulong_t __user *, old_nodes,
+ const compat_ulong_t __user *, new_nodes)
+{
+ unsigned long __user *old = NULL;
+ unsigned long __user *new = NULL;
+ nodemask_t tmp_mask;
+ unsigned long nr_bits;
+ unsigned long size;
+
+ nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+ size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+ if (old_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+ return -EFAULT;
+ old = compat_alloc_user_space(new_nodes ? size * 2 : size);
+ if (new_nodes)
+ new = old + size / sizeof(unsigned long);
+ if (copy_to_user(old, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ if (new_nodes) {
+ if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+ return -EFAULT;
+ if (new == NULL)
+ new = compat_alloc_user_space(size);
+ if (copy_to_user(new, nodes_addr(tmp_mask), size))
+ return -EFAULT;
+ }
+ return sys_migrate_pages(pid, nr_bits + 1, old, new);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
+{
+ struct timespec t;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
+ set_fs(old_fs);
+ if (compat_put_timespec(&t, interval))
+ return -EFAULT;
+ return ret;
+}
+
+/*
+ * Allocate user-space memory for the duration of a single system call,
+ * in order to marshall parameters inside a compat thunk.
+ */
+void __user *compat_alloc_user_space(unsigned long len)
+{
+ void __user *ptr;
+
+ /* If len would occupy more than half of the entire compat space... */
+ if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
+ return NULL;
+
+ ptr = arch_compat_alloc_user_space(len);
+
+ if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
+ return NULL;
+
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/configs.c b/kernel/configs.c
new file mode 100644
index 000000000..c18b1f1ae
--- /dev/null
+++ b/kernel/configs.c
@@ -0,0 +1,99 @@
+/*
+ * kernel/configs.c
+ * Echo the kernel .config file used to build the kernel
+ *
+ * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
+ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
+ * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
+ * Copyright (C) 2002 Hewlett-Packard Company
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+
+/**************************************************/
+/* the actual current config file */
+
+/*
+ * Define kernel_config_data and kernel_config_data_size, which contains the
+ * wrapped and compressed configuration file. The file is first compressed
+ * with gzip and then bounded by two eight byte magic numbers to allow
+ * extraction from a binary kernel image:
+ *
+ * IKCFG_ST
+ * <image>
+ * IKCFG_ED
+ */
+#define MAGIC_START "IKCFG_ST"
+#define MAGIC_END "IKCFG_ED"
+#include "config_data.h"
+
+
+#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
+#define kernel_config_data_size \
+ (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+
+#ifdef CONFIG_IKCONFIG_PROC
+
+static ssize_t
+ikconfig_read_current(struct file *file, char __user *buf,
+ size_t len, loff_t * offset)
+{
+ return simple_read_from_buffer(buf, len, offset,
+ kernel_config_data + MAGIC_SIZE,
+ kernel_config_data_size);
+}
+
+static const struct file_operations ikconfig_file_ops = {
+ .owner = THIS_MODULE,
+ .read = ikconfig_read_current,
+ .llseek = default_llseek,
+};
+
+static int __init ikconfig_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ /* create the current config file */
+ entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
+ &ikconfig_file_ops);
+ if (!entry)
+ return -ENOMEM;
+
+ proc_set_size(entry, kernel_config_data_size);
+
+ return 0;
+}
+
+static void __exit ikconfig_cleanup(void)
+{
+ remove_proc_entry("config.gz", NULL);
+}
+
+module_init(ikconfig_init);
+module_exit(ikconfig_cleanup);
+
+#endif /* CONFIG_IKCONFIG_PROC */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Randy Dunlap");
+MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000..c2de56ab0
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_KERNEL_XZ=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SLOB=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 000000000..72d59a1a6
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,195 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(context_tracking_enabled);
+
+DEFINE_PER_CPU(struct context_tracking, context_tracking);
+EXPORT_SYMBOL_GPL(context_tracking);
+
+void context_tracking_cpu_set(int cpu)
+{
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_key_slow_inc(&context_tracking_enabled);
+ }
+}
+
+/**
+ * context_tracking_enter - Inform the context tracking that the CPU is going
+ * enter user or guest space mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to user or guest space, when it's guaranteed the remaining kernel
+ * instructions to execute won't use any RCU read side critical section
+ * because this function sets RCU in extended quiescent state.
+ */
+void context_tracking_enter(enum ctx_state state)
+{
+ unsigned long flags;
+
+ /*
+ * Repeat the user_enter() check here because some archs may be calling
+ * this from asm and if no CPU needs context tracking, they shouldn't
+ * go further. Repeat the check here until they support the inline static
+ * key check.
+ */
+ if (!context_tracking_is_enabled())
+ return;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ /* Kernel threads aren't supposed to go to userspace */
+ WARN_ON_ONCE(!current->mm);
+
+ local_irq_save(flags);
+ if ( __this_cpu_read(context_tracking.state) != state) {
+ if (__this_cpu_read(context_tracking.active)) {
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ if (state == CONTEXT_USER) {
+ trace_user_enter(0);
+ vtime_user_enter(current);
+ }
+ rcu_user_enter();
+ }
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ __this_cpu_write(context_tracking.state, state);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_enter);
+EXPORT_SYMBOL_GPL(context_tracking_enter);
+
+void context_tracking_user_enter(void)
+{
+ context_tracking_enter(CONTEXT_USER);
+}
+NOKPROBE_SYMBOL(context_tracking_user_enter);
+
+/**
+ * context_tracking_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from user or
+ * guest space before any use of RCU read side critical section. This
+ * potentially include any high level kernel code like syscalls, exceptions,
+ * signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
+void context_tracking_exit(enum ctx_state state)
+{
+ unsigned long flags;
+
+ if (!context_tracking_is_enabled())
+ return;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ if (__this_cpu_read(context_tracking.state) == state) {
+ if (__this_cpu_read(context_tracking.active)) {
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
+ rcu_user_exit();
+ if (state == CONTEXT_USER) {
+ vtime_user_exit(current);
+ trace_user_exit(0);
+ }
+ }
+ __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(context_tracking_exit);
+EXPORT_SYMBOL_GPL(context_tracking_exit);
+
+void context_tracking_user_exit(void)
+{
+ context_tracking_exit(CONTEXT_USER);
+}
+NOKPROBE_SYMBOL(context_tracking_user_exit);
+
+/**
+ * __context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
+void __context_tracking_task_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init context_tracking_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ context_tracking_cpu_set(cpu);
+}
+#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
new file mode 100644
index 000000000..94bbe4695
--- /dev/null
+++ b/kernel/cpu.c
@@ -0,0 +1,825 @@
+/* CPU control.
+ * (C) 2001, 2002, 2003, 2004 Rusty Russell
+ *
+ * This code is licenced under the GPL.
+ */
+#include <linux/proc_fs.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/unistd.h>
+#include <linux/cpu.h>
+#include <linux/oom.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/bug.h>
+#include <linux/kthread.h>
+#include <linux/stop_machine.h>
+#include <linux/mutex.h>
+#include <linux/gfp.h>
+#include <linux/suspend.h>
+#include <linux/lockdep.h>
+#include <linux/tick.h>
+#include <trace/events/power.h>
+
+#include "smpboot.h"
+
+#ifdef CONFIG_SMP
+/* Serializes the updates to cpu_online_mask, cpu_present_mask */
+static DEFINE_MUTEX(cpu_add_remove_lock);
+
+/*
+ * The following two APIs (cpu_maps_update_begin/done) must be used when
+ * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
+ * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
+ * hotplug callback (un)registration performed using __register_cpu_notifier()
+ * or __unregister_cpu_notifier().
+ */
+void cpu_maps_update_begin(void)
+{
+ mutex_lock(&cpu_add_remove_lock);
+}
+EXPORT_SYMBOL(cpu_notifier_register_begin);
+
+void cpu_maps_update_done(void)
+{
+ mutex_unlock(&cpu_add_remove_lock);
+}
+EXPORT_SYMBOL(cpu_notifier_register_done);
+
+static RAW_NOTIFIER_HEAD(cpu_chain);
+
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static struct {
+ struct task_struct *active_writer;
+ /* wait queue to wake up the active_writer */
+ wait_queue_head_t wq;
+ /* verifies that no writer will get active while readers are active */
+ struct mutex lock;
+ /*
+ * Also blocks the new readers during
+ * an ongoing cpu hotplug operation.
+ */
+ atomic_t refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} cpu_hotplug = {
+ .active_writer = NULL,
+ .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
+ .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "cpu_hotplug.lock" },
+#endif
+};
+
+/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
+#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire_tryread() \
+ lock_map_acquire_tryread(&cpu_hotplug.dep_map)
+#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
+#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+
+
+void get_online_cpus(void)
+{
+ might_sleep();
+ if (cpu_hotplug.active_writer == current)
+ return;
+ cpuhp_lock_acquire_read();
+ mutex_lock(&cpu_hotplug.lock);
+ atomic_inc(&cpu_hotplug.refcount);
+ mutex_unlock(&cpu_hotplug.lock);
+}
+EXPORT_SYMBOL_GPL(get_online_cpus);
+
+bool try_get_online_cpus(void)
+{
+ if (cpu_hotplug.active_writer == current)
+ return true;
+ if (!mutex_trylock(&cpu_hotplug.lock))
+ return false;
+ cpuhp_lock_acquire_tryread();
+ atomic_inc(&cpu_hotplug.refcount);
+ mutex_unlock(&cpu_hotplug.lock);
+ return true;
+}
+EXPORT_SYMBOL_GPL(try_get_online_cpus);
+
+void put_online_cpus(void)
+{
+ int refcount;
+
+ if (cpu_hotplug.active_writer == current)
+ return;
+
+ refcount = atomic_dec_return(&cpu_hotplug.refcount);
+ if (WARN_ON(refcount < 0)) /* try to fix things up */
+ atomic_inc(&cpu_hotplug.refcount);
+
+ if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
+ wake_up(&cpu_hotplug.wq);
+
+ cpuhp_lock_release();
+
+}
+EXPORT_SYMBOL_GPL(put_online_cpus);
+
+/*
+ * This ensures that the hotplug operation can begin only when the
+ * refcount goes to zero.
+ *
+ * Note that during a cpu-hotplug operation, the new readers, if any,
+ * will be blocked by the cpu_hotplug.lock
+ *
+ * Since cpu_hotplug_begin() is always called after invoking
+ * cpu_maps_update_begin(), we can be sure that only one writer is active.
+ *
+ * Note that theoretically, there is a possibility of a livelock:
+ * - Refcount goes to zero, last reader wakes up the sleeping
+ * writer.
+ * - Last reader unlocks the cpu_hotplug.lock.
+ * - A new reader arrives at this moment, bumps up the refcount.
+ * - The writer acquires the cpu_hotplug.lock finds the refcount
+ * non zero and goes to sleep again.
+ *
+ * However, this is very difficult to achieve in practice since
+ * get_online_cpus() not an api which is called all that often.
+ *
+ */
+void cpu_hotplug_begin(void)
+{
+ DEFINE_WAIT(wait);
+
+ cpu_hotplug.active_writer = current;
+ cpuhp_lock_acquire();
+
+ for (;;) {
+ mutex_lock(&cpu_hotplug.lock);
+ prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (likely(!atomic_read(&cpu_hotplug.refcount)))
+ break;
+ mutex_unlock(&cpu_hotplug.lock);
+ schedule();
+ }
+ finish_wait(&cpu_hotplug.wq, &wait);
+}
+
+void cpu_hotplug_done(void)
+{
+ cpu_hotplug.active_writer = NULL;
+ mutex_unlock(&cpu_hotplug.lock);
+ cpuhp_lock_release();
+}
+
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 1;
+ cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ cpu_maps_update_done();
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/* Need to know about CPUs going up/down? */
+int __ref register_cpu_notifier(struct notifier_block *nb)
+{
+ int ret;
+ cpu_maps_update_begin();
+ ret = raw_notifier_chain_register(&cpu_chain, nb);
+ cpu_maps_update_done();
+ return ret;
+}
+
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+ return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
+static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+ int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+ nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_notify(unsigned long val, void *v)
+{
+ return __cpu_notify(val, v, -1, NULL);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+ BUG_ON(cpu_notify(val, v));
+}
+EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
+
+void __ref unregister_cpu_notifier(struct notifier_block *nb)
+{
+ cpu_maps_update_begin();
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+ cpu_maps_update_done();
+}
+EXPORT_SYMBOL(unregister_cpu_notifier);
+
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
+/**
+ * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
+ * @cpu: a CPU id
+ *
+ * This function walks all processes, finds a valid mm struct for each one and
+ * then clears a corresponding bit in mm's cpumask. While this all sounds
+ * trivial, there are various non-obvious corner cases, which this function
+ * tries to solve in a safe manner.
+ *
+ * Also note that the function uses a somewhat relaxed locking scheme, so it may
+ * be called only for an already offlined CPU.
+ */
+void clear_tasks_mm_cpumask(int cpu)
+{
+ struct task_struct *p;
+
+ /*
+ * This function is called after the cpu is taken down and marked
+ * offline, so its not like new tasks will ever get this cpu set in
+ * their mm mask. -- Peter Zijlstra
+ * Thus, we may use rcu_read_lock() here, instead of grabbing
+ * full-fledged tasklist_lock.
+ */
+ WARN_ON(cpu_online(cpu));
+ rcu_read_lock();
+ for_each_process(p) {
+ struct task_struct *t;
+
+ /*
+ * Main thread might exit, but other threads may still have
+ * a valid mm. Find one.
+ */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ task_unlock(t);
+ }
+ rcu_read_unlock();
+}
+
+static inline void check_for_tasks(int dead_cpu)
+{
+ struct task_struct *g, *p;
+
+ read_lock_irq(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (!p->on_rq)
+ continue;
+ /*
+ * We do the check with unlocked task_rq(p)->lock.
+ * Order the reading to do not warn about a task,
+ * which was running on this cpu in the past, and
+ * it's just been woken on another cpu.
+ */
+ rmb();
+ if (task_cpu(p) != dead_cpu)
+ continue;
+
+ pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+ p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+ } while_each_thread(g, p);
+ read_unlock_irq(&tasklist_lock);
+}
+
+struct take_cpu_down_param {
+ unsigned long mod;
+ void *hcpu;
+};
+
+/* Take this CPU down. */
+static int __ref take_cpu_down(void *_param)
+{
+ struct take_cpu_down_param *param = _param;
+ int err;
+
+ /* Ensure this CPU doesn't handle any more interrupts. */
+ err = __cpu_disable();
+ if (err < 0)
+ return err;
+
+ cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Give up timekeeping duties */
+ tick_handover_do_timer();
+ /* Park the stopper thread */
+ kthread_park(current);
+ return 0;
+}
+
+/* Requires cpu_add_remove_lock to be held */
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+{
+ int err, nr_calls = 0;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct take_cpu_down_param tcd_param = {
+ .mod = mod,
+ .hcpu = hcpu,
+ };
+
+ if (num_online_cpus() == 1)
+ return -EBUSY;
+
+ if (!cpu_online(cpu))
+ return -EINVAL;
+
+ cpu_hotplug_begin();
+
+ err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (err) {
+ nr_calls--;
+ __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
+ pr_warn("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
+ goto out_release;
+ }
+
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so explicitly call both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+#ifdef CONFIG_PREEMPT
+ synchronize_sched();
+#endif
+ synchronize_rcu();
+
+ smpboot_park_threads(cpu);
+
+ /*
+ * So now all preempt/rcu users must observe !cpu_active().
+ */
+
+ err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ if (err) {
+ /* CPU didn't die: tell everyone. Can't complain. */
+ smpboot_unpark_threads(cpu);
+ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+ goto out_release;
+ }
+ BUG_ON(cpu_online(cpu));
+
+ /*
+ * The migration_call() CPU_DYING callback will have removed all
+ * runnable tasks from the cpu, there's only the idle task left now
+ * that the migration thread is done doing the stop_machine thing.
+ *
+ * Wait for the stop thread to go away.
+ */
+ while (!per_cpu(cpu_dead_idle, cpu))
+ cpu_relax();
+ smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
+ per_cpu(cpu_dead_idle, cpu) = false;
+
+ hotplug_cpu__broadcast_tick_pull(cpu);
+ /* This actually kills the CPU. */
+ __cpu_die(cpu);
+
+ /* CPU is completely dead: tell everyone. Too late to complain. */
+ tick_cleanup_dead_cpu(cpu);
+ cpu_notify_nofail(CPU_DEAD | mod, hcpu);
+
+ check_for_tasks(cpu);
+
+out_release:
+ cpu_hotplug_done();
+ if (!err)
+ cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
+ return err;
+}
+
+int __ref cpu_down(unsigned int cpu)
+{
+ int err;
+
+ cpu_maps_update_begin();
+
+ if (cpu_hotplug_disabled) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = _cpu_down(cpu, 0);
+
+out:
+ cpu_maps_update_done();
+ return err;
+}
+EXPORT_SYMBOL(cpu_down);
+#endif /*CONFIG_HOTPLUG_CPU*/
+
+/*
+ * Unpark per-CPU smpboot kthreads at CPU-online time.
+ */
+static int smpboot_thread_call(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_ONLINE:
+ smpboot_unpark_threads(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block smpboot_thread_notifier = {
+ .notifier_call = smpboot_thread_call,
+ .priority = CPU_PRI_SMPBOOT,
+};
+
+void __cpuinit smpboot_thread_init(void)
+{
+ register_cpu_notifier(&smpboot_thread_notifier);
+}
+
+/* Requires cpu_add_remove_lock to be held */
+static int _cpu_up(unsigned int cpu, int tasks_frozen)
+{
+ int ret, nr_calls = 0;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct task_struct *idle;
+
+ cpu_hotplug_begin();
+
+ if (cpu_online(cpu) || !cpu_present(cpu)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
+
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
+ ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+ if (ret) {
+ nr_calls--;
+ pr_warn("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
+ goto out_notify;
+ }
+
+ /* Arch-specific enabling code. */
+ ret = __cpu_up(cpu, idle);
+ if (ret != 0)
+ goto out_notify;
+ BUG_ON(!cpu_online(cpu));
+
+ /* Now call notifier in preparation. */
+ cpu_notify(CPU_ONLINE | mod, hcpu);
+
+out_notify:
+ if (ret != 0)
+ __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+out:
+ cpu_hotplug_done();
+
+ return ret;
+}
+
+int cpu_up(unsigned int cpu)
+{
+ int err = 0;
+
+ if (!cpu_possible(cpu)) {
+ pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
+ cpu);
+#if defined(CONFIG_IA64)
+ pr_err("please check additional_cpus= boot parameter\n");
+#endif
+ return -EINVAL;
+ }
+
+ err = try_online_node(cpu_to_node(cpu));
+ if (err)
+ return err;
+
+ cpu_maps_update_begin();
+
+ if (cpu_hotplug_disabled) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = _cpu_up(cpu, 0);
+
+out:
+ cpu_maps_update_done();
+ return err;
+}
+EXPORT_SYMBOL_GPL(cpu_up);
+
+#ifdef CONFIG_PM_SLEEP_SMP
+static cpumask_var_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+ int cpu, first_cpu, error = 0;
+
+ cpu_maps_update_begin();
+ first_cpu = cpumask_first(cpu_online_mask);
+ /*
+ * We take down all of the non-boot CPUs in one shot to avoid races
+ * with the userspace trying to use the CPU hotplug at the same time
+ */
+ cpumask_clear(frozen_cpus);
+
+ pr_info("Disabling non-boot CPUs ...\n");
+ for_each_online_cpu(cpu) {
+ if (cpu == first_cpu)
+ continue;
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
+ error = _cpu_down(cpu, 1);
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
+ if (!error)
+ cpumask_set_cpu(cpu, frozen_cpus);
+ else {
+ pr_err("Error taking CPU%d down: %d\n", cpu, error);
+ break;
+ }
+ }
+
+ if (!error) {
+ BUG_ON(num_online_cpus() > 1);
+ /* Make sure the CPUs won't be enabled by someone else */
+ cpu_hotplug_disabled = 1;
+ } else {
+ pr_err("Non-boot CPUs are not disabled\n");
+ }
+ cpu_maps_update_done();
+ return error;
+}
+
+void __weak arch_enable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_enable_nonboot_cpus_end(void)
+{
+}
+
+void __ref enable_nonboot_cpus(void)
+{
+ int cpu, error;
+
+ /* Allow everyone to use the CPU hotplug again */
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ if (cpumask_empty(frozen_cpus))
+ goto out;
+
+ pr_info("Enabling non-boot CPUs ...\n");
+
+ arch_enable_nonboot_cpus_begin();
+
+ for_each_cpu(cpu, frozen_cpus) {
+ trace_suspend_resume(TPS("CPU_ON"), cpu, true);
+ error = _cpu_up(cpu, 1);
+ trace_suspend_resume(TPS("CPU_ON"), cpu, false);
+ if (!error) {
+ pr_info("CPU%d is up\n", cpu);
+ continue;
+ }
+ pr_warn("Error taking CPU%d up: %d\n", cpu, error);
+ }
+
+ arch_enable_nonboot_cpus_end();
+
+ cpumask_clear(frozen_cpus);
+out:
+ cpu_maps_update_done();
+}
+
+static int __init alloc_frozen_cpus(void)
+{
+ if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
+ return -ENOMEM;
+ return 0;
+}
+core_initcall(alloc_frozen_cpus);
+
+/*
+ * When callbacks for CPU hotplug notifications are being executed, we must
+ * ensure that the state of the system with respect to the tasks being frozen
+ * or not, as reported by the notification, remains unchanged *throughout the
+ * duration* of the execution of the callbacks.
+ * Hence we need to prevent the freezer from racing with regular CPU hotplug.
+ *
+ * This synchronization is implemented by mutually excluding regular CPU
+ * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
+ * Hibernate notifications.
+ */
+static int
+cpu_hotplug_pm_callback(struct notifier_block *nb,
+ unsigned long action, void *ptr)
+{
+ switch (action) {
+
+ case PM_SUSPEND_PREPARE:
+ case PM_HIBERNATION_PREPARE:
+ cpu_hotplug_disable();
+ break;
+
+ case PM_POST_SUSPEND:
+ case PM_POST_HIBERNATION:
+ cpu_hotplug_enable();
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+
+static int __init cpu_hotplug_pm_sync_init(void)
+{
+ /*
+ * cpu_hotplug_pm_callback has higher priority than x86
+ * bsp_pm_callback which depends on cpu_hotplug_pm_callback
+ * to disable cpu hotplug to avoid cpu hotplug race.
+ */
+ pm_notifier(cpu_hotplug_pm_callback, 0);
+ return 0;
+}
+core_initcall(cpu_hotplug_pm_sync_init);
+
+#endif /* CONFIG_PM_SLEEP_SMP */
+
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+ unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+ if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
+ val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+ cpu_notify(val, (void *)(long)cpu);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * cpu_bit_bitmap[] is a special, "compressed" data structure that
+ * represents all NR_CPUS bits binary values of 1<<nr.
+ *
+ * It is used by cpumask_of() to get a constant address to a CPU
+ * mask value that has a single bit set only.
+ */
+
+/* cpu_bit_bitmap[0] is empty - so we can back into it */
+#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
+#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
+#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
+#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
+
+const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
+
+ MASK_DECLARE_8(0), MASK_DECLARE_8(8),
+ MASK_DECLARE_8(16), MASK_DECLARE_8(24),
+#if BITS_PER_LONG > 32
+ MASK_DECLARE_8(32), MASK_DECLARE_8(40),
+ MASK_DECLARE_8(48), MASK_DECLARE_8(56),
+#endif
+};
+EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
+
+#ifdef CONFIG_INIT_ALL_POSSIBLE
+static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
+ = CPU_BITS_ALL;
+#else
+static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+#endif
+const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
+EXPORT_SYMBOL(cpu_possible_mask);
+
+static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
+EXPORT_SYMBOL(cpu_online_mask);
+
+static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
+EXPORT_SYMBOL(cpu_present_mask);
+
+static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
+EXPORT_SYMBOL(cpu_active_mask);
+
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+ if (possible)
+ cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
+ else
+ cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
+}
+
+void set_cpu_present(unsigned int cpu, bool present)
+{
+ if (present)
+ cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
+ else
+ cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
+}
+
+void set_cpu_online(unsigned int cpu, bool online)
+{
+ if (online) {
+ cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
+ cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
+}
+
+void set_cpu_active(unsigned int cpu, bool active)
+{
+ if (active)
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ else
+ cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
+}
+
+void init_cpu_present(const struct cpumask *src)
+{
+ cpumask_copy(to_cpumask(cpu_present_bits), src);
+}
+
+void init_cpu_possible(const struct cpumask *src)
+{
+ cpumask_copy(to_cpumask(cpu_possible_bits), src);
+}
+
+void init_cpu_online(const struct cpumask *src)
+{
+ cpumask_copy(to_cpumask(cpu_online_bits), src);
+}
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 000000000..9656a3c36
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author:
+ * Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpu_pm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/syscore_ops.h>
+
+static DEFINE_RWLOCK(cpu_pm_notifier_lock);
+static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+
+static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+ nr_to_call, nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+/**
+ * cpu_pm_register_notifier - register a driver with cpu_pm
+ * @nb: notifier block to register
+ *
+ * Add a driver to a list of drivers that are notified about
+ * CPU and CPU cluster low power entry and exit.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_register.
+ */
+int cpu_pm_register_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+ write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
+
+/**
+ * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
+ * @nb: notifier block to be unregistered
+ *
+ * Remove a driver from the CPU PM notifier list.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_unregister.
+ */
+int cpu_pm_unregister_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+ write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
+
+/**
+ * cpu_pm_enter - CPU low power entry notifier
+ *
+ * Notifies listeners that a single CPU is entering a low power state that may
+ * cause some blocks in the same power domain as the cpu to reset.
+ *
+ * Must be called on the affected CPU with interrupts disabled. Platform is
+ * responsible for ensuring that cpu_pm_enter is not called twice on the same
+ * CPU before cpu_pm_exit is called. Notified drivers can include VFP
+ * co-processor, interrupt controller and its PM extensions, local CPU
+ * timers context save/restore which shouldn't be interrupted. Hence it
+ * must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_enter(void)
+{
+ int nr_calls;
+ int ret = 0;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
+ if (ret)
+ /*
+ * Inform listeners (nr_calls - 1) about failure of CPU PM
+ * PM entry who are notified earlier to prepare for it.
+ */
+ cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_enter);
+
+/**
+ * cpu_pm_exit - CPU low power exit notifier
+ *
+ * Notifies listeners that a single CPU is exiting a low power state that may
+ * have caused some blocks in the same power domain as the cpu to reset.
+ *
+ * Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_exit(void)
+{
+ int ret;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_exit);
+
+/**
+ * cpu_cluster_pm_enter - CPU cluster low power entry notifier
+ *
+ * Notifies listeners that all cpus in a power domain are entering a low power
+ * state that may cause some blocks in the same power domain to reset.
+ *
+ * Must be called after cpu_pm_enter has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_enter(void)
+{
+ int nr_calls;
+ int ret = 0;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
+ if (ret)
+ /*
+ * Inform listeners (nr_calls - 1) about failure of CPU cluster
+ * PM entry who are notified earlier to prepare for it.
+ */
+ cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
+
+/**
+ * cpu_cluster_pm_exit - CPU cluster low power exit notifier
+ *
+ * Notifies listeners that all cpus in a power domain are exiting form a
+ * low power state that may have caused some blocks in the same power domain
+ * to reset.
+ *
+ * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and its PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_exit(void)
+{
+ int ret;
+
+ read_lock(&cpu_pm_notifier_lock);
+ ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+ read_unlock(&cpu_pm_notifier_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
+
+#ifdef CONFIG_PM
+static int cpu_pm_suspend(void)
+{
+ int ret;
+
+ ret = cpu_pm_enter();
+ if (ret)
+ return ret;
+
+ ret = cpu_cluster_pm_enter();
+ return ret;
+}
+
+static void cpu_pm_resume(void)
+{
+ cpu_cluster_pm_exit();
+ cpu_pm_exit();
+}
+
+static struct syscore_ops cpu_pm_syscore_ops = {
+ .suspend = cpu_pm_suspend,
+ .resume = cpu_pm_resume,
+};
+
+static int cpu_pm_init(void)
+{
+ register_syscore_ops(&cpu_pm_syscore_ops);
+ return 0;
+}
+core_initcall(cpu_pm_init);
+#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
new file mode 100644
index 000000000..ee14e3a35
--- /dev/null
+++ b/kernel/cpuset.c
@@ -0,0 +1,2700 @@
+/*
+ * kernel/cpuset.c
+ *
+ * Processor and Memory placement constraints for sets of tasks.
+ *
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2007 Silicon Graphics, Inc.
+ * Copyright (C) 2006 Google, Inc
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * 2006 Rework by Paul Menage to use generic cgroups
+ * 2008 Rework of the scheduler domains and CPU hotplug handling
+ * by Max Krasnyansky
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/mempolicy.h>
+#include <linux/mm.h>
+#include <linux/memory.h>
+#include <linux/export.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/backing-dev.h>
+#include <linux/sort.h>
+
+#include <asm/uaccess.h>
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
+#include <linux/wait.h>
+
+struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+ int cnt; /* unprocessed events count */
+ int val; /* most recent output value */
+ time_t time; /* clock (secs) when val computed */
+ spinlock_t lock; /* guards read or write of above */
+};
+
+struct cpuset {
+ struct cgroup_subsys_state css;
+
+ unsigned long flags; /* "unsigned long" so bitops work */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierachy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
+
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
+
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
+ /* partition number for rebuild_sched_domains() */
+ int pn;
+
+ /* for custom sched domain */
+ int relax_domain_level;
+};
+
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuset, css) : NULL;
+}
+
+/* Retrieve the cpuset for a task */
+static inline struct cpuset *task_cs(struct task_struct *task)
+{
+ return css_cs(task_css(task, cpuset_cgrp_id));
+}
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
+}
+
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_ONLINE,
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_MEM_HARDWALL,
+ CS_MEMORY_MIGRATE,
+ CS_SCHED_LOAD_BALANCE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
+static struct cpuset top_cpuset = {
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
+};
+
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
+/*
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
+ *
+ * A task must hold both locks to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_lock and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
+ * everyone else.
+ *
+ * Calls to the kernel memory allocator can not be made while holding
+ * callback_lock, as that would risk double tripping on callback_lock
+ * from one of the callbacks into the cpuset code from within
+ * __alloc_pages().
+ *
+ * If a task is only holding callback_lock, then it has read-only
+ * access to cpusets.
+ *
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
+ *
+ * The cpuset_common_file_read() handlers only hold callback_lock across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ *
+ * Accessing a task's cpuset should be done in accordance with the
+ * guidelines for accessing subsystem state in kernel/cgroup.c
+ */
+
+static DEFINE_MUTEX(cpuset_mutex);
+static DEFINE_SPINLOCK(callback_lock);
+
+/*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+
+static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
+ * users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead
+ */
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name, void *data)
+{
+ struct file_system_type *cgroup_fs = get_fs_type("cgroup");
+ struct dentry *ret = ERR_PTR(-ENODEV);
+ if (cgroup_fs) {
+ char mountopts[] =
+ "cpuset,noprefix,"
+ "release_agent=/sbin/cpuset_release_agent";
+ ret = cgroup_fs->mount(cgroup_fs, flags,
+ unused_dev_name, mountopts);
+ put_filesystem(cgroup_fs);
+ }
+ return ret;
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .mount = cpuset_mount,
+};
+
+/*
+ * Return in pmask the portion of a cpusets's cpus_allowed that
+ * are online. If none are online, walk up the cpuset hierarchy
+ * until we find one that does have some online cpus. The top
+ * cpuset always has some cpus online.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of cpu_online_mask.
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+{
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+ cs = parent_cs(cs);
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+}
+
+/*
+ * Return in *pmask the portion of a cpusets's mems_allowed that
+ * are online, with memory. If none are online with memory, walk
+ * up the cpuset hierarchy until we find one that does have some
+ * online mems. The top cpuset always has some mems online.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of node_states[N_MEMORY].
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
+{
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
+ cs = parent_cs(cs);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
+}
+
+/*
+ * update task's spread flag if cpuset's page/slab spread flag is set
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void cpuset_update_task_spread_flag(struct cpuset *cs,
+ struct task_struct *tsk)
+{
+ if (is_spread_page(cs))
+ task_set_spread_page(tsk);
+ else
+ task_clear_spread_page(tsk);
+
+ if (is_spread_slab(cs))
+ task_set_spread_slab(tsk);
+ else
+ task_clear_spread_slab(tsk);
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set. Call holding cpuset_mutex.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+ return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ nodes_subset(p->mems_allowed, q->mems_allowed) &&
+ is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+ is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/**
+ * alloc_trial_cpuset - allocate a trial cpuset
+ * @cs: the cpuset that the trial cpuset duplicates
+ */
+static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
+{
+ struct cpuset *trial;
+
+ trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
+ if (!trial)
+ return NULL;
+
+ if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ return trial;
+
+free_cpus:
+ free_cpumask_var(trial->cpus_allowed);
+free_cs:
+ kfree(trial);
+ return NULL;
+}
+
+/**
+ * free_trial_cpuset - free the trial cpuset
+ * @trial: the trial cpuset to be freed
+ */
+static void free_trial_cpuset(struct cpuset *trial)
+{
+ free_cpumask_var(trial->effective_cpus);
+ free_cpumask_var(trial->cpus_allowed);
+ kfree(trial);
+}
+
+/*
+ * validate_change() - Used to validate that any proposed cpuset change
+ * follows the structural rules for cpusets.
+ *
+ * If we replaced the flag and mask values of the current cpuset
+ * (cur) with those values in the trial cpuset (trial), would
+ * our various subset and exclusive rules still be valid? Presumes
+ * cpuset_mutex held.
+ *
+ * 'cur' is the address of an actual, in-use cpuset. Operations
+ * such as list traversal that depend on the actual address of the
+ * cpuset in the list must use cur below, not trial.
+ *
+ * 'trial' is the address of bulk structure copy of cur, with
+ * perhaps one or more of the fields cpus_allowed, mems_allowed,
+ * or flags changed to new, trial values.
+ *
+ * Return 0 if valid, -errno if not.
+ */
+
+static int validate_change(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* Remaining checks don't apply to root cpuset */
+ ret = 0;
+ if (cur == &top_cpuset)
+ goto out;
+
+ par = parent_cs(cur);
+
+ /* On legacy hiearchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
+ goto out;
+
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ goto out;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ goto out;
+ }
+
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
+ }
+
+ /*
+ * We can't shrink if we won't have enough room for SCHED_DEADLINE
+ * tasks.
+ */
+ ret = -EBUSY;
+ if (is_cpu_exclusive(cur) &&
+ !cpuset_cpumask_can_shrink(cur->cpus_allowed,
+ trial->cpus_allowed))
+ goto out;
+
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
+ */
+static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
+}
+
+static void
+update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ if (dattr->relax_domain_level < c->relax_domain_level)
+ dattr->relax_domain_level = c->relax_domain_level;
+ return;
+}
+
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * generate_sched_domains()
+ *
+ * This function builds a partial partition of the systems CPUs
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to kernel/sched/core.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
+ *
+ * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
+ * for a background explanation of this.
+ *
+ * Does not return errors, on the theory that the callers of this
+ * routine would rather not worry about failures to rebuild sched
+ * domains when operating in the severe memory shortage situations
+ * that could cause allocation failures below.
+ *
+ * Must be called with cpuset_mutex held.
+ *
+ * The three key local variables below are:
+ * q - a linked-list queue of cpuset pointers, used to implement a
+ * top-down scan of all cpusets. This scan loads a pointer
+ * to each cpuset marked is_sched_load_balance into the
+ * array 'csa'. For our purposes, rebuilding the schedulers
+ * sched domains, we can ignore !is_sched_load_balance cpusets.
+ * csa - (for CpuSet Array) Array of pointers to all the cpusets
+ * that need to be load balanced, for convenient iterative
+ * access by the subsequent code that finds the best partition,
+ * i.e the set of domains (subsets) of CPUs such that the
+ * cpus_allowed of every cpuset marked is_sched_load_balance
+ * is a subset of one of these domains, while there are as
+ * many such domains as possible, each as small as possible.
+ * doms - Conversion of 'csa' to an array of cpumasks, for passing to
+ * the kernel/sched/core.c routine partition_sched_domains() in a
+ * convenient format, that can be easily compared to the prior
+ * value to determine what partition elements (sched domains)
+ * were changed (added or removed.)
+ *
+ * Finding the best partition (set of domains):
+ * The triple nested loops below over i, j, k scan over the
+ * load balanced cpusets (using the array of cpuset pointers in
+ * csa[]) looking for pairs of cpusets that have overlapping
+ * cpus_allowed, but which don't have the same 'pn' partition
+ * number and gives them in the same partition number. It keeps
+ * looping on the 'restart' label until it can no longer find
+ * any such pairs.
+ *
+ * The union of the cpus_allowed masks from the set of
+ * all cpusets having the same 'pn' value then form the one
+ * element of the partition (one sched domain) to be passed to
+ * partition_sched_domains().
+ */
+static int generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct cpuset *cp; /* scans q */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j, k; /* indices for partition finding loops */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
+ goto done;
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (is_sched_load_balance(&top_cpuset)) {
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ non_isolated_cpus);
+
+ goto done;
+ }
+
+ csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
+ continue;
+ /*
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ continue;
+
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ }
+ rcu_read_unlock();
+
+ for (i = 0; i < csn; i++)
+ csa[i]->pn = i;
+ ndoms = csn;
+
+restart:
+ /* Find the best partition (set of sched domains) */
+ for (i = 0; i < csn; i++) {
+ struct cpuset *a = csa[i];
+ int apn = a->pn;
+
+ for (j = 0; j < csn; j++) {
+ struct cpuset *b = csa[j];
+ int bpn = b->pn;
+
+ if (apn != bpn && cpusets_overlap(a, b)) {
+ for (k = 0; k < csn; k++) {
+ struct cpuset *c = csa[k];
+
+ if (c->pn == bpn)
+ c->pn = apn;
+ }
+ ndoms--; /* one less element */
+ goto restart;
+ }
+ }
+ }
+
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
+ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
+
+ for (nslot = 0, i = 0; i < csn; i++) {
+ struct cpuset *a = csa[i];
+ struct cpumask *dp;
+ int apn = a->pn;
+
+ if (apn < 0) {
+ /* Skip completed partitions */
+ continue;
+ }
+
+ dp = doms[nslot];
+
+ if (nslot == ndoms) {
+ static int warnings = 10;
+ if (warnings) {
+ pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
+ nslot, ndoms, csn, i, apn);
+ warnings--;
+ }
+ continue;
+ }
+
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ for (j = i; j < csn; j++) {
+ struct cpuset *b = csa[j];
+
+ if (apn == b->pn) {
+ cpumask_or(dp, dp, b->effective_cpus);
+ cpumask_and(dp, dp, non_isolated_cpus);
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, b);
+
+ /* Done with this partition */
+ b->pn = -1;
+ }
+ }
+ nslot++;
+ }
+ BUG_ON(nslot != ndoms);
+
+done:
+ free_cpumask_var(non_isolated_cpus);
+ kfree(csa);
+
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
+
+/*
+ * Rebuild scheduler domains.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * Call with cpuset_mutex held. Takes get_online_cpus().
+ */
+static void rebuild_sched_domains_locked(void)
+{
+ struct sched_domain_attr *attr;
+ cpumask_var_t *doms;
+ int ndoms;
+
+ lockdep_assert_held(&cpuset_mutex);
+ get_online_cpus();
+
+ /*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ goto out;
+
+ /* Generate domain masks and attrs */
+ ndoms = generate_sched_domains(&doms, &attr);
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+out:
+ put_online_cpus();
+}
+#else /* !CONFIG_SMP */
+static void rebuild_sched_domains_locked(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+void rebuild_sched_domains(void)
+{
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
+}
+
+/**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
+ */
+static void update_tasks_cpumask(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
+ css_task_iter_end(&it);
+}
+
+/*
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ spin_unlock_irq(&callback_lock);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
+ update_tasks_cpumask(cp);
+
+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
+ /*
+ * An empty cpus_allowed is ok only if the cpuset has no tasks.
+ * Since cpulist_parse() fails on an empty mask, we special case
+ * that parsing. The validate_change() call ensures that cpusets
+ * with tasks have cpus.
+ */
+ if (!*buf) {
+ cpumask_clear(trialcs->cpus_allowed);
+ } else {
+ retval = cpulist_parse(buf, trialcs->cpus_allowed);
+ if (retval < 0)
+ return retval;
+
+ if (!cpumask_subset(trialcs->cpus_allowed,
+ top_cpuset.cpus_allowed))
+ return -EINVAL;
+ }
+
+ /* Nothing to do if the cpus didn't change */
+ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ return 0;
+
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
+ return retval;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ spin_unlock_irq(&callback_lock);
+
+ /* use trialcs->cpus_allowed as a temp variable */
+ update_cpumasks_hier(cs, trialcs->cpus_allowed);
+ return 0;
+}
+
+/*
+ * cpuset_migrate_mm
+ *
+ * Migrate memory region from one set of nodes to another.
+ *
+ * Temporarilly set tasks mems_allowed to target nodes of migration,
+ * so that the migration code can allocate pages on these nodes.
+ *
+ * While the mm_struct we are migrating is typically from some
+ * other task, the task_struct mems_allowed that we are hacking
+ * is for our current task, which must allocate new pages for that
+ * migrating memory region.
+ */
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to)
+{
+ struct task_struct *tsk = current;
+
+ tsk->mems_allowed = *to;
+
+ do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+
+ rcu_read_lock();
+ guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
+ rcu_read_unlock();
+}
+
+/*
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be set
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ nodemask_t *newmems)
+{
+ bool need_loop;
+
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return;
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return;
+
+ task_lock(tsk);
+ /*
+ * Determine if a loop is necessary if another thread is doing
+ * read_mems_allowed_begin(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
+ */
+ need_loop = task_has_mempolicy(tsk) ||
+ !nodes_intersects(*newmems, tsk->mems_allowed);
+
+ if (need_loop) {
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
+ }
+
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+ tsk->mems_allowed = *newmems;
+
+ if (need_loop) {
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ }
+
+ task_unlock(tsk);
+}
+
+static void *cpuset_being_rebound;
+
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ *
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
+ */
+static void update_tasks_nodemask(struct cpuset *cs)
+{
+ static nodemask_t newmems; /* protected by cpuset_mutex */
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
+
+ guarantee_online_mems(cs, &newmems);
+
+ /*
+ * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * take while holding tasklist_lock. Forks can happen - the
+ * mpol_dup() cpuset_being_rebound check will catch such forks,
+ * and rebind their vma mempolicies too. Because we still hold
+ * the global cpuset_mutex, we know that no other rebind effort
+ * will be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
+ */
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
+
+ /* We're done rebinding vmas to this cpuset's new mems_allowed. */
+ cpuset_being_rebound = NULL;
+}
+
+/*
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
+ *
+ * When configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hiearchy, effective_mems will be the same with mems_allowed.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cp->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
+ update_tasks_nodemask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
+ *
+ * Call with cpuset_mutex held. May take callback_lock during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+
+ /*
+ * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
+ * it's read-only
+ */
+ if (cs == &top_cpuset) {
+ retval = -EACCES;
+ goto done;
+ }
+
+ /*
+ * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+ * Since nodelist_parse() fails on an empty mask, we special case
+ * that parsing. The validate_change() call ensures that cpusets
+ * with tasks have memory.
+ */
+ if (!*buf) {
+ nodes_clear(trialcs->mems_allowed);
+ } else {
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ goto done;
+
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
+ goto done;
+ }
+ }
+
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
+ retval = 0; /* Too easy - nothing to do */
+ goto done;
+ }
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
+ goto done;
+
+ spin_lock_irq(&callback_lock);
+ cs->mems_allowed = trialcs->mems_allowed;
+ spin_unlock_irq(&callback_lock);
+
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &cs->mems_allowed);
+done:
+ return retval;
+}
+
+int current_cpuset_is_being_rebound(void)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
+{
+#ifdef CONFIG_SMP
+ if (val < -1 || val >= sched_domain_level_max)
+ return -EINVAL;
+#endif
+
+ if (val != cs->relax_domain_level) {
+ cs->relax_domain_level = val;
+ if (!cpumask_empty(cs->cpus_allowed) &&
+ is_sched_load_balance(cs))
+ rebuild_sched_domains_locked();
+ }
+
+ return 0;
+}
+
+/**
+ * update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ *
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
+ */
+static void update_tasks_flags(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset_update_task_spread_flag(cs, task);
+ css_task_iter_end(&it);
+}
+
+/*
+ * update_flag - read a 0 or a 1 in a file and update associated flag
+ * bit: the bit to update (see cpuset_flagbits_t)
+ * cs: the cpuset to update
+ * turning_on: whether the flag is being set or cleared
+ *
+ * Call with cpuset_mutex held.
+ */
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on)
+{
+ struct cpuset *trialcs;
+ int balance_flag_changed;
+ int spread_flag_changed;
+ int err;
+
+ trialcs = alloc_trial_cpuset(cs);
+ if (!trialcs)
+ return -ENOMEM;
+
+ if (turning_on)
+ set_bit(bit, &trialcs->flags);
+ else
+ clear_bit(bit, &trialcs->flags);
+
+ err = validate_change(cs, trialcs);
+ if (err < 0)
+ goto out;
+
+ balance_flag_changed = (is_sched_load_balance(cs) !=
+ is_sched_load_balance(trialcs));
+
+ spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+ || (is_spread_page(cs) != is_spread_page(trialcs)));
+
+ spin_lock_irq(&callback_lock);
+ cs->flags = trialcs->flags;
+ spin_unlock_irq(&callback_lock);
+
+ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
+ rebuild_sched_domains_locked();
+
+ if (spread_flag_changed)
+ update_tasks_flags(cs);
+out:
+ free_trial_cpuset(trialcs);
+ return err;
+}
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter. There are four routines:
+ * fmeter_init() - initialize a frequency meter.
+ * fmeter_markevent() - called each time the event happens.
+ * fmeter_getrate() - returns the recent rate of such events.
+ * fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR). The time unit
+ * is 1 second. Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter. If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds. At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000. At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event. At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933 /* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
+#define FM_SCALE 1000 /* faux fixed point scale */
+
+/* Initialize a frequency meter */
+static void fmeter_init(struct fmeter *fmp)
+{
+ fmp->cnt = 0;
+ fmp->val = 0;
+ fmp->time = 0;
+ spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+ time_t now = get_seconds();
+ time_t ticks = now - fmp->time;
+
+ if (ticks == 0)
+ return;
+
+ ticks = min(FM_MAXTICKS, ticks);
+ while (ticks-- > 0)
+ fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+ fmp->time = now;
+
+ fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+ fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+ spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+ int val;
+
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ val = fmp->val;
+ spin_unlock(&fmp->lock);
+ return val;
+}
+
+static struct cpuset *cpuset_attach_old_cs;
+
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct cpuset *cs = css_cs(css);
+ struct task_struct *task;
+ int ret;
+
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+
+ mutex_lock(&cpuset_mutex);
+
+ /* allow moving tasks into an empty cpuset if on default hierarchy */
+ ret = -ENOSPC;
+ if (!cgroup_on_dfl(css->cgroup) &&
+ (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
+ goto out_unlock;
+
+ cgroup_taskset_for_each(task, tset) {
+ ret = task_can_attach(task, cs->cpus_allowed);
+ if (ret)
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
+
+static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ css_cs(css)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
+static void cpuset_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ /* static buf protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_to;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ struct task_struct *leader = cgroup_taskset_first(tset);
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *oldcs = cpuset_attach_old_cs;
+
+ mutex_lock(&cpuset_mutex);
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
+ cgroup_taskset_for_each(task, tset) {
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flag(cs, task);
+ }
+
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ mm = get_task_mm(leader);
+ if (mm) {
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+ /*
+ * old_mems_allowed is the same with mems_allowed here, except
+ * if this task is being moved automatically due to hotplug.
+ * In that case @mems_allowed has been updated and is empty,
+ * so @old_mems_allowed is the right nodesets that we migrate
+ * mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+ }
+ mmput(mm);
+ }
+
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+ FILE_MEMORY_MIGRATE,
+ FILE_CPULIST,
+ FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
+ FILE_CPU_EXCLUSIVE,
+ FILE_MEM_EXCLUSIVE,
+ FILE_MEM_HARDWALL,
+ FILE_SCHED_LOAD_BALANCE,
+ FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ FILE_MEMORY_PRESSURE_ENABLED,
+ FILE_MEMORY_PRESSURE,
+ FILE_SPREAD_PAGE,
+ FILE_SPREAD_SLAB,
+} cpuset_filetype_t;
+
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = 0;
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_EXCLUSIVE:
+ retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_HARDWALL:
+ retval = update_flag(CS_MEM_HARDWALL, cs, val);
+ break;
+ case FILE_SCHED_LOAD_BALANCE:
+ retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
+ break;
+ case FILE_MEMORY_MIGRATE:
+ retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
+ break;
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ cpuset_memory_pressure_enabled = !!val;
+ break;
+ case FILE_MEMORY_PRESSURE:
+ retval = -EACCES;
+ break;
+ case FILE_SPREAD_PAGE:
+ retval = update_flag(CS_SPREAD_PAGE, cs, val);
+ break;
+ case FILE_SPREAD_SLAB:
+ retval = update_flag(CS_SPREAD_SLAB, cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return retval;
+}
+
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ retval = update_relax_domain_level(cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return retval;
+}
+
+/*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ struct cpuset *trialcs;
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+ * proceeding, so that we don't end up keep removing tasks added
+ * after execution capability is restored.
+ *
+ * cpuset_hotplug_work calls back into cgroup core via
+ * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * operation like this one can lead to a deadlock through kernfs
+ * active_ref protection. Let's break the protection. Losing the
+ * protection is okay as we check whether @cs is online after
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * hierarchies.
+ */
+ css_get(&cs->css);
+ kernfs_break_active_protection(of->kn);
+ flush_work(&cpuset_hotplug_work);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ trialcs = alloc_trial_cpuset(cs);
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ switch (of_cft(of)->private) {
+ case FILE_CPULIST:
+ retval = update_cpumask(cs, trialcs, buf);
+ break;
+ case FILE_MEMLIST:
+ retval = update_nodemask(cs, trialcs, buf);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+
+ free_trial_cpuset(trialcs);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ kernfs_unbreak_active_protection(of->kn);
+ css_put(&cs->css);
+ return retval ?: nbytes;
+}
+
+/*
+ * These ascii lists should be read in a single call, by using a user
+ * buffer large enough to hold the entire map. If read in smaller
+ * chunks, there is no guarantee of atomicity. Since the display format
+ * used, list of ranges of sequential numbers, is variable length,
+ * and since these maps can change value dynamically, one could read
+ * gibberish by doing partial reads while a list was changing.
+ */
+static int cpuset_common_seq_show(struct seq_file *sf, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(sf));
+ cpuset_filetype_t type = seq_cft(sf)->private;
+ int ret = 0;
+
+ spin_lock_irq(&callback_lock);
+
+ switch (type) {
+ case FILE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+ break;
+ case FILE_MEMLIST:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
+ break;
+ case FILE_EFFECTIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ spin_unlock_irq(&callback_lock);
+ return ret;
+}
+
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ return is_cpu_exclusive(cs);
+ case FILE_MEM_EXCLUSIVE:
+ return is_mem_exclusive(cs);
+ case FILE_MEM_HARDWALL:
+ return is_mem_hardwall(cs);
+ case FILE_SCHED_LOAD_BALANCE:
+ return is_sched_load_balance(cs);
+ case FILE_MEMORY_MIGRATE:
+ return is_memory_migrate(cs);
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ return cpuset_memory_pressure_enabled;
+ case FILE_MEMORY_PRESSURE:
+ return fmeter_getrate(&cs->fmeter);
+ case FILE_SPREAD_PAGE:
+ return is_spread_page(cs);
+ case FILE_SPREAD_SLAB:
+ return is_spread_slab(cs);
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ return cs->relax_domain_level;
+ default:
+ BUG();
+ }
+
+ /* Unrechable but makes gcc happy */
+ return 0;
+}
+
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+static struct cftype files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ },
+
+ {
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpu_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_CPU_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_hardwall",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_HARDWALL,
+ },
+
+ {
+ .name = "sched_load_balance",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_LOAD_BALANCE,
+ },
+
+ {
+ .name = "sched_relax_domain_level",
+ .read_s64 = cpuset_read_s64,
+ .write_s64 = cpuset_write_s64,
+ .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ },
+
+ {
+ .name = "memory_migrate",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_MIGRATE,
+ },
+
+ {
+ .name = "memory_pressure",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE,
+ .mode = S_IRUGO,
+ },
+
+ {
+ .name = "memory_spread_page",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_PAGE,
+ },
+
+ {
+ .name = "memory_spread_slab",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_SLAB,
+ },
+
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
+
+ { } /* terminate */
+};
+
+/*
+ * cpuset_css_alloc - allocate a cpuset css
+ * cgrp: control group that the new cpuset will be part of
+ */
+
+static struct cgroup_subsys_state *
+cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cpuset *cs;
+
+ if (!parent_css)
+ return &top_cpuset.css;
+
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ cpumask_clear(cs->cpus_allowed);
+ nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+
+ return &cs->css;
+
+free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int cpuset_css_online(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ cpuset_inc();
+
+ spin_lock_irq(&callback_lock);
+ if (cgroup_on_dfl(cs->css.cgroup)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ spin_unlock_irq(&callback_lock);
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * histrical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ spin_unlock_irq(&callback_lock);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
+}
+
+/*
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call rebuild_sched_domains_locked().
+ */
+
+static void cpuset_css_offline(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ cpuset_dec();
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+static void cpuset_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->cpus_allowed);
+ kfree(cs);
+}
+
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ spin_lock_irq(&callback_lock);
+
+ if (cgroup_on_dfl(root_css->cgroup)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ spin_unlock_irq(&callback_lock);
+ mutex_unlock(&cpuset_mutex);
+}
+
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .bind = cpuset_bind,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
+
+/**
+ * cpuset_init - initialize cpusets at system boot
+ *
+ * Description: Initialize top_cpuset and the cpuset internal file system,
+ **/
+
+int __init cpuset_init(void)
+{
+ int err = 0;
+
+ if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+ BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();
+
+ cpumask_setall(top_cpuset.cpus_allowed);
+ nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);
+
+ fmeter_init(&top_cpuset.fmeter);
+ set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
+ top_cpuset.relax_domain_level = -1;
+
+ err = register_filesystem(&cpuset_fs_type);
+ if (err < 0)
+ return err;
+
+ if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
+ BUG();
+
+ return 0;
+}
+
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = parent_cs(cs);
+ while (cpumask_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent_cs(parent);
+
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
+}
+
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migratecd to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources,
+ * This is full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+}
+
+/**
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
+ * @cs: cpuset in interest
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
+ */
+static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
+retry:
+ wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
+
+ mutex_lock(&cpuset_mutex);
+
+ /*
+ * We have raced with task attaching. We wait until attaching
+ * is finished, so we won't attach a task to an empty cpuset.
+ */
+ if (cs->attach_in_progress) {
+ mutex_unlock(&cpuset_mutex);
+ goto retry;
+ }
+
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+
+ if (cgroup_on_dfl(cs->css.cgroup))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ *
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
+ *
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
+ *
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
+ */
+static void cpuset_hotplug_workfn(struct work_struct *work)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+
+ mutex_lock(&cpuset_mutex);
+
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
+
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
+
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
+ spin_unlock_irq(&callback_lock);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
+
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
+ spin_unlock_irq(&callback_lock);
+ update_tasks_nodemask(&top_cpuset);
+ }
+
+ mutex_unlock(&cpuset_mutex);
+
+ /* if cpus or mems changed, we need to propagate to descendants */
+ if (cpus_updated || mems_updated) {
+ struct cpuset *cs;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
+ continue;
+ rcu_read_unlock();
+
+ cpuset_hotplug_update_tasks(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+ }
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated)
+ rebuild_sched_domains();
+}
+
+void cpuset_update_active_cpus(bool cpu_online)
+{
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+ * otherwise, the scheduler will get confused and put tasks to the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
+}
+
+/*
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
+ */
+static int cpuset_track_online_nodes(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ schedule_work(&cpuset_hotplug_work);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
+
+/**
+ * cpuset_init_smp - initialize cpus_allowed
+ *
+ * Description: Finish top cpuset after cpu, node maps are initialized
+ */
+void __init cpuset_init_smp(void)
+{
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
+ top_cpuset.mems_allowed = node_states[N_MEMORY];
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of cpu_online_mask, even if this means going outside the
+ * tasks cpuset.
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
+ rcu_read_lock();
+ guarantee_online_cpus(task_cs(tsk), pmask);
+ rcu_read_unlock();
+ spin_unlock_irqrestore(&callback_lock, flags);
+}
+
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+{
+ rcu_read_lock();
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
+ rcu_read_unlock();
+
+ /*
+ * We own tsk->cpus_allowed, nobody can change it under us.
+ *
+ * But we used cs && cs->cpus_allowed lockless and thus can
+ * race with cgroup_attach_task() or update_cpumask() and get
+ * the wrong tsk->cpus_allowed. However, both cases imply the
+ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+ * which takes task_rq_lock().
+ *
+ * If we are called after it dropped the lock we must see all
+ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+ * set any mask even if it is not right from task_cs() pov,
+ * the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
+ */
+}
+
+void __init cpuset_init_current_mems_allowed(void)
+{
+ nodes_setall(current->mems_allowed);
+}
+
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of node_states[N_MEMORY], even if this means going outside the
+ * tasks cpuset.
+ **/
+
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+{
+ nodemask_t mask;
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
+ rcu_read_lock();
+ guarantee_online_mems(task_cs(tsk), &mask);
+ rcu_read_unlock();
+ spin_unlock_irqrestore(&callback_lock, flags);
+
+ return mask;
+}
+
+/**
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * @nodemask: the nodemask to be checked
+ *
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
+ */
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
+{
+ return nodes_intersects(*nodemask, current->mems_allowed);
+}
+
+/*
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset. Call holding
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
+ */
+static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
+{
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
+ return cs;
+}
+
+/**
+ * cpuset_node_allowed - Can we allocate on a memory node?
+ * @node: is this an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate. If @node is set in
+ * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current tasks cpuset
+ * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest enclosing hardwalled ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires callback_lock. The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current tasks mems_allowed came up empty on the first pass over
+ * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the callback_lock.
+ *
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
+ *
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * affect that:
+ * in_interrupt - any node ok (current task context irrelevant)
+ * GFP_ATOMIC - any node ok
+ * TIF_MEMDIE - any node ok
+ * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
+ * GFP_USER - only nodes in current tasks mems allowed ok.
+ */
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+{
+ struct cpuset *cs; /* current cpuset ancestors */
+ int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;
+
+ if (in_interrupt())
+ return 1;
+ if (node_isset(node, current->mems_allowed))
+ return 1;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return 1;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return 0;
+
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return 1;
+
+ /* Not hardwall and node outside mems_allowed: scan up cpusets */
+ spin_lock_irqsave(&callback_lock, flags);
+
+ rcu_read_lock();
+ cs = nearest_hardwall_ancestor(task_cs(current));
+ allowed = node_isset(node, cs->mems_allowed);
+ rcu_read_unlock();
+
+ spin_unlock_irqrestore(&callback_lock, flags);
+ return allowed;
+}
+
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online. So it
+ * should not be possible for the following code to return an
+ * offline node. But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start. The zonelist passed to
+ * __alloc_pages() will include all nodes. If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+static int cpuset_spread_node(int *rotor)
+{
+ int node;
+
+ node = next_node(*rotor, current->mems_allowed);
+ if (node == MAX_NUMNODES)
+ node = first_node(current->mems_allowed);
+ *rotor = node;
+ return node;
+}
+
+int cpuset_mem_spread_node(void)
+{
+ if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_mem_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+ if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_slab_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
+/**
+ * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
+ * @tsk1: pointer to task_struct of some task.
+ * @tsk2: pointer to task_struct of some other task.
+ *
+ * Description: Return true if @tsk1's mems_allowed intersects the
+ * mems_allowed of @tsk2. Used by the OOM killer to determine if
+ * one of the task's memory usage might impact the memory available
+ * to the other.
+ **/
+
+int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
+ const struct task_struct *tsk2)
+{
+ return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+}
+
+/**
+ * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
+ * @tsk: pointer to task_struct of some task.
+ *
+ * Description: Prints @task's name, cpuset name, and cached copy of its
+ * mems_allowed to the kernel log.
+ */
+void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+{
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+
+ cgrp = task_cs(tsk)->css.cgroup;
+ pr_info("%s cpuset=", tsk->comm);
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+
+ rcu_read_unlock();
+}
+
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled __read_mostly;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernels page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure". Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+ rcu_read_lock();
+ fmeter_markevent(&task_cs(current)->fmeter);
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
+ * doesn't really matter if tsk->cpuset changes after we read it,
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * anyway.
+ */
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf, *p;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ retval = -ENAMETOOLONG;
+ rcu_read_lock();
+ css = task_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path(css->cgroup, buf, PATH_MAX);
+ rcu_read_unlock();
+ if (!p)
+ goto out_free;
+ seq_puts(m, p);
+ seq_putc(m, '\n');
+ retval = 0;
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+#endif /* CONFIG_PROC_PID_CPUSET */
+
+/* Display task mems_allowed in /proc/<pid>/status file. */
+void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
+{
+ seq_printf(m, "Mems_allowed:\t%*pb\n",
+ nodemask_pr_args(&task->mems_allowed));
+ seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+ nodemask_pr_args(&task->mems_allowed));
+}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000..b64e238b5
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,46 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+EXPORT_SYMBOL_GPL(elfcorehdr_addr);
+
+/*
+ * stores the size of elf header of crash image
+ */
+unsigned long long elfcorehdr_size;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ *
+ * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ if (*end == '@') {
+ elfcorehdr_size = elfcorehdr_addr;
+ elfcorehdr_addr = memparse(end + 1, &end);
+ }
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
new file mode 100644
index 000000000..ec1c07667
--- /dev/null
+++ b/kernel/cred.c
@@ -0,0 +1,810 @@
+/* Task credentials management - see Documentation/security/credentials.txt
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#include <linux/export.h>
+#include <linux/cred.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/init_task.h>
+#include <linux/security.h>
+#include <linux/binfmts.h>
+#include <linux/cn_proc.h>
+
+#if 0
+#define kdebug(FMT, ...) \
+ printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#else
+#define kdebug(FMT, ...) \
+ no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#endif
+
+static struct kmem_cache *cred_jar;
+
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
+/*
+ * The initial credentials for the initial task
+ */
+struct cred init_cred = {
+ .usage = ATOMIC_INIT(4),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ .subscribers = ATOMIC_INIT(2),
+ .magic = CRED_MAGIC,
+#endif
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
+ .securebits = SECUREBITS_DEFAULT,
+ .cap_inheritable = CAP_EMPTY_SET,
+ .cap_permitted = CAP_FULL_SET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
+ .user = INIT_USER,
+ .user_ns = &init_user_ns,
+ .group_info = &init_groups,
+};
+
+static inline void set_cred_subscribers(struct cred *cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ atomic_set(&cred->subscribers, n);
+#endif
+}
+
+static inline int read_cred_subscribers(const struct cred *cred)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ return atomic_read(&cred->subscribers);
+#else
+ return 0;
+#endif
+}
+
+static inline void alter_cred_subscribers(const struct cred *_cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ struct cred *cred = (struct cred *) _cred;
+
+ atomic_add(n, &cred->subscribers);
+#endif
+}
+
+/*
+ * The RCU callback to actually dispose of a set of credentials
+ */
+static void put_cred_rcu(struct rcu_head *rcu)
+{
+ struct cred *cred = container_of(rcu, struct cred, rcu);
+
+ kdebug("put_cred_rcu(%p)", cred);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ if (cred->magic != CRED_MAGIC_DEAD ||
+ atomic_read(&cred->usage) != 0 ||
+ read_cred_subscribers(cred) != 0)
+ panic("CRED: put_cred_rcu() sees %p with"
+ " mag %x, put %p, usage %d, subscr %d\n",
+ cred, cred->magic, cred->put_addr,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+#else
+ if (atomic_read(&cred->usage) != 0)
+ panic("CRED: put_cred_rcu() sees %p with usage %d\n",
+ cred, atomic_read(&cred->usage));
+#endif
+
+ security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
+ key_put(cred->thread_keyring);
+ key_put(cred->request_key_auth);
+ if (cred->group_info)
+ put_group_info(cred->group_info);
+ free_uid(cred->user);
+ put_user_ns(cred->user_ns);
+ kmem_cache_free(cred_jar, cred);
+}
+
+/**
+ * __put_cred - Destroy a set of credentials
+ * @cred: The record to release
+ *
+ * Destroy a set of credentials on which no references remain.
+ */
+void __put_cred(struct cred *cred)
+{
+ kdebug("__put_cred(%p{%d,%d})", cred,
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+
+ BUG_ON(atomic_read(&cred->usage) != 0);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(cred) != 0);
+ cred->magic = CRED_MAGIC_DEAD;
+ cred->put_addr = __builtin_return_address(0);
+#endif
+ BUG_ON(cred == current->cred);
+ BUG_ON(cred == current->real_cred);
+
+ call_rcu(&cred->rcu, put_cred_rcu);
+}
+EXPORT_SYMBOL(__put_cred);
+
+/*
+ * Clean up a task's credentials when it exits
+ */
+void exit_creds(struct task_struct *tsk)
+{
+ struct cred *cred;
+
+ kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ cred = (struct cred *) tsk->real_cred;
+ tsk->real_cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+
+ cred = (struct cred *) tsk->cred;
+ tsk->cred = NULL;
+ validate_creds(cred);
+ alter_cred_subscribers(cred, -1);
+ put_cred(cred);
+}
+
+/**
+ * get_task_cred - Get another task's objective credentials
+ * @task: The task to query
+ *
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away. Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must also make sure task doesn't get deleted, either by holding a
+ * ref on task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+const struct cred *get_task_cred(struct task_struct *task)
+{
+ const struct cred *cred;
+
+ rcu_read_lock();
+
+ do {
+ cred = __task_cred((task));
+ BUG_ON(!cred);
+ } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+
+ rcu_read_unlock();
+ return cred;
+}
+
+/*
+ * Allocate blank credentials, such that the credentials can be filled in at a
+ * later date without risk of ENOMEM.
+ */
+struct cred *cred_alloc_blank(void)
+{
+ struct cred *new;
+
+ new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
+
+ if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ goto error;
+
+ return new;
+
+error:
+ abort_creds(new);
+ return NULL;
+}
+
+/**
+ * prepare_creds - Prepare a new set of credentials for modification
+ *
+ * Prepare a new set of task credentials for modification. A task's creds
+ * shouldn't generally be modified directly, therefore this function is used to
+ * prepare a new copy, which the caller then modifies and then commits by
+ * calling commit_creds().
+ *
+ * Preparation involves making a copy of the objective creds for modification.
+ *
+ * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
+ *
+ * Call commit_creds() or abort_creds() to clean up.
+ */
+struct cred *prepare_creds(void)
+{
+ struct task_struct *task = current;
+ const struct cred *old;
+ struct cred *new;
+
+ validate_process_creds();
+
+ new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ kdebug("prepare_creds() alloc %p", new);
+
+ old = task->cred;
+ memcpy(new, old, sizeof(struct cred));
+
+ atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
+ get_group_info(new->group_info);
+ get_uid(new->user);
+ get_user_ns(new->user_ns);
+
+#ifdef CONFIG_KEYS
+ key_get(new->session_keyring);
+ key_get(new->process_keyring);
+ key_get(new->thread_keyring);
+ key_get(new->request_key_auth);
+#endif
+
+#ifdef CONFIG_SECURITY
+ new->security = NULL;
+#endif
+
+ if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ goto error;
+ validate_creds(new);
+ return new;
+
+error:
+ abort_creds(new);
+ return NULL;
+}
+EXPORT_SYMBOL(prepare_creds);
+
+/*
+ * Prepare credentials for current to perform an execve()
+ * - The caller must hold ->cred_guard_mutex
+ */
+struct cred *prepare_exec_creds(void)
+{
+ struct cred *new;
+
+ new = prepare_creds();
+ if (!new)
+ return new;
+
+#ifdef CONFIG_KEYS
+ /* newly exec'd tasks don't get a thread keyring */
+ key_put(new->thread_keyring);
+ new->thread_keyring = NULL;
+
+ /* inherit the session keyring; new process keyring */
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
+#endif
+
+ return new;
+}
+
+/*
+ * Copy credentials for the new process created by fork()
+ *
+ * We share if we can, but under some circumstances we have to generate a new
+ * set.
+ *
+ * The new process gets the current process's subjective credentials as its
+ * objective and subjective credentials
+ */
+int copy_creds(struct task_struct *p, unsigned long clone_flags)
+{
+ struct cred *new;
+ int ret;
+
+ if (
+#ifdef CONFIG_KEYS
+ !p->cred->thread_keyring &&
+#endif
+ clone_flags & CLONE_THREAD
+ ) {
+ p->real_cred = get_cred(p->cred);
+ get_cred(p->cred);
+ alter_cred_subscribers(p->cred, 2);
+ kdebug("share_creds(%p{%d,%d})",
+ p->cred, atomic_read(&p->cred->usage),
+ read_cred_subscribers(p->cred));
+ atomic_inc(&p->cred->user->processes);
+ return 0;
+ }
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ if (clone_flags & CLONE_NEWUSER) {
+ ret = create_user_ns(new);
+ if (ret < 0)
+ goto error_put;
+ }
+
+#ifdef CONFIG_KEYS
+ /* new threads get their own thread keyrings if their parent already
+ * had one */
+ if (new->thread_keyring) {
+ key_put(new->thread_keyring);
+ new->thread_keyring = NULL;
+ if (clone_flags & CLONE_THREAD)
+ install_thread_keyring_to_cred(new);
+ }
+
+ /* The process keyring is only shared between the threads in a process;
+ * anything outside of those threads doesn't inherit.
+ */
+ if (!(clone_flags & CLONE_THREAD)) {
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
+ }
+#endif
+
+ atomic_inc(&new->user->processes);
+ p->cred = p->real_cred = get_cred(new);
+ alter_cred_subscribers(new, 2);
+ validate_creds(new);
+ return 0;
+
+error_put:
+ put_cred(new);
+ return ret;
+}
+
+static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
+{
+ const struct user_namespace *set_ns = set->user_ns;
+ const struct user_namespace *subset_ns = subset->user_ns;
+
+ /* If the two credentials are in the same user namespace see if
+ * the capabilities of subset are a subset of set.
+ */
+ if (set_ns == subset_ns)
+ return cap_issubset(subset->cap_permitted, set->cap_permitted);
+
+ /* The credentials are in a different user namespaces
+ * therefore one is a subset of the other only if a set is an
+ * ancestor of subset and set->euid is owner of subset or one
+ * of subsets ancestors.
+ */
+ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
+ if ((set_ns == subset_ns->parent) &&
+ uid_eq(subset_ns->owner, set->euid))
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * commit_creds - Install new credentials upon the current task
+ * @new: The credentials to be assigned
+ *
+ * Install a new set of credentials to the current task, using RCU to replace
+ * the old set. Both the objective and the subjective credentials pointers are
+ * updated. This function may not be called if the subjective credentials are
+ * in an overridden state.
+ *
+ * This function eats the caller's reference to the new credentials.
+ *
+ * Always returns 0 thus allowing this function to be tail-called at the end
+ * of, say, sys_setgid().
+ */
+int commit_creds(struct cred *new)
+{
+ struct task_struct *task = current;
+ const struct cred *old = task->real_cred;
+
+ kdebug("commit_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+ BUG_ON(task->cred != old);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(old) < 2);
+ validate_creds(old);
+ validate_creds(new);
+#endif
+ BUG_ON(atomic_read(&new->usage) < 1);
+
+ get_cred(new); /* we will require a ref for the subj creds too */
+
+ /* dumpability changes */
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new)) {
+ if (task->mm)
+ set_dumpable(task->mm, suid_dumpable);
+ task->pdeath_signal = 0;
+ smp_wmb();
+ }
+
+ /* alter the thread keyring */
+ if (!uid_eq(new->fsuid, old->fsuid))
+ key_fsuid_changed(task);
+ if (!gid_eq(new->fsgid, old->fsgid))
+ key_fsgid_changed(task);
+
+ /* do it
+ * RLIMIT_NPROC limits on user->processes have already been checked
+ * in set_user().
+ */
+ alter_cred_subscribers(new, 2);
+ if (new->user != old->user)
+ atomic_inc(&new->user->processes);
+ rcu_assign_pointer(task->real_cred, new);
+ rcu_assign_pointer(task->cred, new);
+ if (new->user != old->user)
+ atomic_dec(&old->user->processes);
+ alter_cred_subscribers(old, -2);
+
+ /* send notifications */
+ if (!uid_eq(new->uid, old->uid) ||
+ !uid_eq(new->euid, old->euid) ||
+ !uid_eq(new->suid, old->suid) ||
+ !uid_eq(new->fsuid, old->fsuid))
+ proc_id_connector(task, PROC_EVENT_UID);
+
+ if (!gid_eq(new->gid, old->gid) ||
+ !gid_eq(new->egid, old->egid) ||
+ !gid_eq(new->sgid, old->sgid) ||
+ !gid_eq(new->fsgid, old->fsgid))
+ proc_id_connector(task, PROC_EVENT_GID);
+
+ /* release the old obj and subj refs both */
+ put_cred(old);
+ put_cred(old);
+ return 0;
+}
+EXPORT_SYMBOL(commit_creds);
+
+/**
+ * abort_creds - Discard a set of credentials and unlock the current task
+ * @new: The credentials that were going to be applied
+ *
+ * Discard a set of credentials that were under construction and unlock the
+ * current task.
+ */
+void abort_creds(struct cred *new)
+{
+ kdebug("abort_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ BUG_ON(read_cred_subscribers(new) != 0);
+#endif
+ BUG_ON(atomic_read(&new->usage) < 1);
+ put_cred(new);
+}
+EXPORT_SYMBOL(abort_creds);
+
+/**
+ * override_creds - Override the current process's subjective credentials
+ * @new: The credentials to be assigned
+ *
+ * Install a set of temporary override subjective credentials on the current
+ * process, returning the old set for later reversion.
+ */
+const struct cred *override_creds(const struct cred *new)
+{
+ const struct cred *old = current->cred;
+
+ kdebug("override_creds(%p{%d,%d})", new,
+ atomic_read(&new->usage),
+ read_cred_subscribers(new));
+
+ validate_creds(old);
+ validate_creds(new);
+ get_cred(new);
+ alter_cred_subscribers(new, 1);
+ rcu_assign_pointer(current->cred, new);
+ alter_cred_subscribers(old, -1);
+
+ kdebug("override_creds() = %p{%d,%d}", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
+ return old;
+}
+EXPORT_SYMBOL(override_creds);
+
+/**
+ * revert_creds - Revert a temporary subjective credentials override
+ * @old: The credentials to be restored
+ *
+ * Revert a temporary set of override subjective credentials to an old set,
+ * discarding the override set.
+ */
+void revert_creds(const struct cred *old)
+{
+ const struct cred *override = current->cred;
+
+ kdebug("revert_creds(%p{%d,%d})", old,
+ atomic_read(&old->usage),
+ read_cred_subscribers(old));
+
+ validate_creds(old);
+ validate_creds(override);
+ alter_cred_subscribers(old, 1);
+ rcu_assign_pointer(current->cred, old);
+ alter_cred_subscribers(override, -1);
+ put_cred(override);
+}
+EXPORT_SYMBOL(revert_creds);
+
+/*
+ * initialise the credentials stuff
+ */
+void __init cred_init(void)
+{
+ /* allocate a slab in which we can store credentials */
+ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+}
+
+/**
+ * prepare_kernel_cred - Prepare a set of credentials for a kernel service
+ * @daemon: A userspace daemon to be used as a reference
+ *
+ * Prepare a set of credentials for a kernel service. This can then be used to
+ * override a task's own credentials so that work can be done on behalf of that
+ * task that requires a different subjective context.
+ *
+ * @daemon is used to provide a base for the security record, but can be NULL.
+ * If @daemon is supplied, then the security data will be derived from that;
+ * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ *
+ * The caller may change these controls afterwards if desired.
+ *
+ * Returns the new credentials or NULL if out of memory.
+ *
+ * Does not take, and does not return holding current->cred_replace_mutex.
+ */
+struct cred *prepare_kernel_cred(struct task_struct *daemon)
+{
+ const struct cred *old;
+ struct cred *new;
+
+ new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ kdebug("prepare_kernel_cred() alloc %p", new);
+
+ if (daemon)
+ old = get_task_cred(daemon);
+ else
+ old = get_cred(&init_cred);
+
+ validate_creds(old);
+
+ *new = *old;
+ atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
+ get_uid(new->user);
+ get_user_ns(new->user_ns);
+ get_group_info(new->group_info);
+
+#ifdef CONFIG_KEYS
+ new->session_keyring = NULL;
+ new->process_keyring = NULL;
+ new->thread_keyring = NULL;
+ new->request_key_auth = NULL;
+ new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+#endif
+
+#ifdef CONFIG_SECURITY
+ new->security = NULL;
+#endif
+ if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ goto error;
+
+ put_cred(old);
+ validate_creds(new);
+ return new;
+
+error:
+ put_cred(new);
+ put_cred(old);
+ return NULL;
+}
+EXPORT_SYMBOL(prepare_kernel_cred);
+
+/**
+ * set_security_override - Set the security ID in a set of credentials
+ * @new: The credentials to alter
+ * @secid: The LSM security ID to set
+ *
+ * Set the LSM security ID in a set of credentials so that the subjective
+ * security is overridden when an alternative set of credentials is used.
+ */
+int set_security_override(struct cred *new, u32 secid)
+{
+ return security_kernel_act_as(new, secid);
+}
+EXPORT_SYMBOL(set_security_override);
+
+/**
+ * set_security_override_from_ctx - Set the security ID in a set of credentials
+ * @new: The credentials to alter
+ * @secctx: The LSM security context to generate the security ID from.
+ *
+ * Set the LSM security ID in a set of credentials so that the subjective
+ * security is overridden when an alternative set of credentials is used. The
+ * security ID is specified in string form as a security context to be
+ * interpreted by the LSM.
+ */
+int set_security_override_from_ctx(struct cred *new, const char *secctx)
+{
+ u32 secid;
+ int ret;
+
+ ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
+ if (ret < 0)
+ return ret;
+
+ return set_security_override(new, secid);
+}
+EXPORT_SYMBOL(set_security_override_from_ctx);
+
+/**
+ * set_create_files_as - Set the LSM file create context in a set of credentials
+ * @new: The credentials to alter
+ * @inode: The inode to take the context from
+ *
+ * Change the LSM file creation context in a set of credentials to be the same
+ * as the object context of the specified inode, so that the new inodes have
+ * the same MAC context as that inode.
+ */
+int set_create_files_as(struct cred *new, struct inode *inode)
+{
+ new->fsuid = inode->i_uid;
+ new->fsgid = inode->i_gid;
+ return security_kernel_create_files_as(new, inode);
+}
+EXPORT_SYMBOL(set_create_files_as);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+
+bool creds_are_invalid(const struct cred *cred)
+{
+ if (cred->magic != CRED_MAGIC)
+ return true;
+#ifdef CONFIG_SECURITY_SELINUX
+ /*
+ * cred->security == NULL if security_cred_alloc_blank() or
+ * security_prepare_creds() returned an error.
+ */
+ if (selinux_is_enabled() && cred->security) {
+ if ((unsigned long) cred->security < PAGE_SIZE)
+ return true;
+ if ((*(u32 *)cred->security & 0xffffff00) ==
+ (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
+ return true;
+ }
+#endif
+ return false;
+}
+EXPORT_SYMBOL(creds_are_invalid);
+
+/*
+ * dump invalid credentials
+ */
+static void dump_invalid_creds(const struct cred *cred, const char *label,
+ const struct task_struct *tsk)
+{
+ printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+ label, cred,
+ cred == &init_cred ? "[init]" : "",
+ cred == tsk->real_cred ? "[real]" : "",
+ cred == tsk->cred ? "[eff]" : "");
+ printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+ cred->magic, cred->put_addr);
+ printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+ atomic_read(&cred->usage),
+ read_cred_subscribers(cred));
+ printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+ from_kuid_munged(&init_user_ns, cred->uid),
+ from_kuid_munged(&init_user_ns, cred->euid),
+ from_kuid_munged(&init_user_ns, cred->suid),
+ from_kuid_munged(&init_user_ns, cred->fsuid));
+ printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+ from_kgid_munged(&init_user_ns, cred->gid),
+ from_kgid_munged(&init_user_ns, cred->egid),
+ from_kgid_munged(&init_user_ns, cred->sgid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
+#ifdef CONFIG_SECURITY
+ printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+ if ((unsigned long) cred->security >= PAGE_SIZE &&
+ (((unsigned long) cred->security & 0xffffff00) !=
+ (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
+ printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+ ((u32*)cred->security)[0],
+ ((u32*)cred->security)[1]);
+#endif
+}
+
+/*
+ * report use of invalid credentials
+ */
+void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+{
+ printk(KERN_ERR "CRED: Invalid credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+ dump_invalid_creds(cred, "Specified", current);
+ BUG();
+}
+EXPORT_SYMBOL(__invalid_creds);
+
+/*
+ * check the credentials on a process
+ */
+void __validate_process_creds(struct task_struct *tsk,
+ const char *file, unsigned line)
+{
+ if (tsk->cred == tsk->real_cred) {
+ if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ } else {
+ if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
+ read_cred_subscribers(tsk->cred) < 1 ||
+ creds_are_invalid(tsk->real_cred) ||
+ creds_are_invalid(tsk->cred)))
+ goto invalid_creds;
+ }
+ return;
+
+invalid_creds:
+ printk(KERN_ERR "CRED: Invalid process credentials\n");
+ printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+
+ dump_invalid_creds(tsk->real_cred, "Real", tsk);
+ if (tsk->cred != tsk->real_cred)
+ dump_invalid_creds(tsk->cred, "Effective", tsk);
+ else
+ printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+ BUG();
+}
+EXPORT_SYMBOL(__validate_process_creds);
+
+/*
+ * check creds for do_exit()
+ */
+void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+ kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+ tsk->real_cred, tsk->cred,
+ atomic_read(&tsk->cred->usage),
+ read_cred_subscribers(tsk->cred));
+
+ __validate_process_creds(tsk, __FILE__, __LINE__);
+}
+
+#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000..a85edc339
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the linux kernel debugger
+#
+
+obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
+obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000..0874e2edd
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,1088 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#define pr_fmt(fmt) "KGDB: " fmt
+
+#include <linux/pid_namespace.h>
+#include <linux/clocksource.h>
+#include <linux/serial_core.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/console.h>
+#include <linux/threads.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/pid.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+#include <linux/rcupdate.h>
+
+#include <asm/cacheflush.h>
+#include <asm/byteorder.h>
+#include <linux/atomic.h>
+
+#include "debug_core.h"
+
+static int kgdb_break_asap;
+
+struct debuggerinfo_struct kgdb_info[NR_CPUS];
+
+/**
+ * kgdb_connected - Is a host GDB connected to us?
+ */
+int kgdb_connected;
+EXPORT_SYMBOL_GPL(kgdb_connected);
+
+/* All the KGDB handlers are installed */
+int kgdb_io_module_registered;
+
+/* Guard for recursive entry */
+static int exception_level;
+
+struct kgdb_io *dbg_io_ops;
+static DEFINE_SPINLOCK(kgdb_registration_lock);
+
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
+/* kgdb console driver is loaded */
+static int kgdb_con_registered;
+/* determine if kgdb console output should be used */
+static int kgdb_use_con;
+/* Flag for alternate operations for early debugging */
+bool dbg_is_early = true;
+/* Next cpu to become the master debug core */
+int dbg_switch_cpu;
+
+/* Use kdb or gdbserver mode */
+int dbg_kdb_mode = 1;
+
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
+module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
+
+/*
+ * Holds information about breakpoints in a kernel. These breakpoints are
+ * added and removed by gdb.
+ */
+static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
+ [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
+};
+
+/*
+ * The CPU# of the active CPU, or -1 if none:
+ */
+atomic_t kgdb_active = ATOMIC_INIT(-1);
+EXPORT_SYMBOL_GPL(kgdb_active);
+static DEFINE_RAW_SPINLOCK(dbg_master_lock);
+static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
+
+/*
+ * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
+ * bootup code (which might not have percpu set up yet):
+ */
+static atomic_t masters_in_kgdb;
+static atomic_t slaves_in_kgdb;
+static atomic_t kgdb_break_tasklet_var;
+atomic_t kgdb_setting_breakpoint;
+
+struct task_struct *kgdb_usethread;
+struct task_struct *kgdb_contthread;
+
+int kgdb_single_step;
+static pid_t kgdb_sstep_pid;
+
+/* to keep track of the CPU which is doing the single stepping*/
+atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
+
+/*
+ * If you are debugging a problem where roundup (the collection of
+ * all other CPUs) is a problem [this should be extremely rare],
+ * then use the nokgdbroundup option to avoid roundup. In that case
+ * the other CPUs might interfere with your debugging context, so
+ * use this with care:
+ */
+static int kgdb_do_roundup = 1;
+
+static int __init opt_nokgdbroundup(char *str)
+{
+ kgdb_do_roundup = 0;
+
+ return 0;
+}
+
+early_param("nokgdbroundup", opt_nokgdbroundup);
+
+/*
+ * Finally, some KGDB code :-)
+ */
+
+/*
+ * Weak aliases for breakpoint management,
+ * can be overriden by architectures when needed:
+ */
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ return err;
+}
+
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+ struct kgdb_bkpt tmp;
+ int err;
+ /* Validate setting the breakpoint and then removing it. If the
+ * remove fails, the kernel needs to emit a bad message because we
+ * are deep trouble not being able to put things back the way we
+ * found them.
+ */
+ tmp.bpt_addr = addr;
+ err = kgdb_arch_set_breakpoint(&tmp);
+ if (err)
+ return err;
+ err = kgdb_arch_remove_breakpoint(&tmp);
+ if (err)
+ pr_err("Critical breakpoint error, kernel memory destroyed at: %lx\n",
+ addr);
+ return err;
+}
+
+unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+ return instruction_pointer(regs);
+}
+
+int __weak kgdb_arch_init(void)
+{
+ return 0;
+}
+
+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ return 0;
+}
+
+/*
+ * Some architectures need cache flushes when we set/clear a
+ * breakpoint:
+ */
+static void kgdb_flush_swbreak_addr(unsigned long addr)
+{
+ if (!CACHE_FLUSH_IS_SAFE)
+ return;
+
+ if (current->mm) {
+ int i;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ if (!current->vmacache[i])
+ continue;
+ flush_cache_range(current->vmacache[i],
+ addr, addr + BREAK_INSTR_SIZE);
+ }
+ }
+
+ /* Force flush instruction cache if it was outside the mm */
+ flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * SW breakpoint management:
+ */
+int dbg_activate_sw_breakpoints(void)
+{
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_SET)
+ continue;
+
+ error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
+ if (error) {
+ ret = error;
+ pr_info("BP install failed: %lx\n",
+ kgdb_break[i].bpt_addr);
+ continue;
+ }
+
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
+ kgdb_break[i].state = BP_ACTIVE;
+ }
+ return ret;
+}
+
+int dbg_set_sw_break(unsigned long addr)
+{
+ int err = kgdb_validate_break_address(addr);
+ int breakno = -1;
+ int i;
+
+ if (err)
+ return err;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return -EEXIST;
+ }
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_REMOVED &&
+ kgdb_break[i].bpt_addr == addr) {
+ breakno = i;
+ break;
+ }
+ }
+
+ if (breakno == -1) {
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_UNDEFINED) {
+ breakno = i;
+ break;
+ }
+ }
+ }
+
+ if (breakno == -1)
+ return -E2BIG;
+
+ kgdb_break[breakno].state = BP_SET;
+ kgdb_break[breakno].type = BP_BREAKPOINT;
+ kgdb_break[breakno].bpt_addr = addr;
+
+ return 0;
+}
+
+int dbg_deactivate_sw_breakpoints(void)
+{
+ int error;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ continue;
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
+ if (error) {
+ pr_info("BP remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
+ ret = error;
+ }
+
+ kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
+ kgdb_break[i].state = BP_SET;
+ }
+ return ret;
+}
+
+int dbg_remove_sw_break(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_SET) &&
+ (kgdb_break[i].bpt_addr == addr)) {
+ kgdb_break[i].state = BP_REMOVED;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int kgdb_isremovedbreak(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if ((kgdb_break[i].state == BP_REMOVED) &&
+ (kgdb_break[i].bpt_addr == addr))
+ return 1;
+ }
+ return 0;
+}
+
+int dbg_remove_all_break(void)
+{
+ int error;
+ int i;
+
+ /* Clear memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state != BP_ACTIVE)
+ goto setundefined;
+ error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
+ if (error)
+ pr_err("breakpoint remove failed: %lx\n",
+ kgdb_break[i].bpt_addr);
+setundefined:
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+
+ /* Clear hardware breakpoints. */
+ if (arch_kgdb_ops.remove_all_hw_break)
+ arch_kgdb_ops.remove_all_hw_break();
+
+ return 0;
+}
+
+/*
+ * Return true if there is a valid kgdb I/O module. Also if no
+ * debugger is attached a message can be printed to the console about
+ * waiting for the debugger to attach.
+ *
+ * The print_wait argument is only to be true when called from inside
+ * the core kgdb_handle_exception, because it will wait for the
+ * debugger to attach.
+ */
+static int kgdb_io_ready(int print_wait)
+{
+ if (!dbg_io_ops)
+ return 0;
+ if (kgdb_connected)
+ return 1;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ return 1;
+ if (print_wait) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ pr_crit("waiting... or $3#33 for KDB\n");
+#else
+ pr_crit("Waiting for remote debugger\n");
+#endif
+ }
+ return 1;
+}
+
+static int kgdb_reenter_check(struct kgdb_state *ks)
+{
+ unsigned long addr;
+
+ if (atomic_read(&kgdb_active) != raw_smp_processor_id())
+ return 0;
+
+ /* Panic on recursive debugger calls: */
+ exception_level++;
+ addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ dbg_deactivate_sw_breakpoints();
+
+ /*
+ * If the break point removed ok at the place exception
+ * occurred, try to recover and print a warning to the end
+ * user because the user planted a breakpoint in a place that
+ * KGDB needs in order to function.
+ */
+ if (dbg_remove_sw_break(addr) == 0) {
+ exception_level = 0;
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ pr_crit("re-enter error: breakpoint removed %lx\n", addr);
+ WARN_ON_ONCE(1);
+
+ return 1;
+ }
+ dbg_remove_all_break();
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+
+ if (exception_level > 1) {
+ dump_stack();
+ panic("Recursive entry to debugger");
+ }
+
+ pr_crit("re-enter exception: ALL breakpoints killed\n");
+#ifdef CONFIG_KGDB_KDB
+ /* Allow kdb to debug itself one level */
+ return 0;
+#endif
+ dump_stack();
+ panic("Recursive entry to debugger");
+
+ return 1;
+}
+
+static void dbg_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+}
+
+static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
+ int exception_state)
+{
+ unsigned long flags;
+ int sstep_tries = 100;
+ int error;
+ int cpu;
+ int trace_on = 0;
+ int online_cpus = num_online_cpus();
+ u64 time_left;
+
+ kgdb_info[ks->cpu].enter_kgdb++;
+ kgdb_info[ks->cpu].exception_state |= exception_state;
+
+ if (exception_state == DCPU_WANT_MASTER)
+ atomic_inc(&masters_in_kgdb);
+ else
+ atomic_inc(&slaves_in_kgdb);
+
+ if (arch_kgdb_ops.disable_hw_break)
+ arch_kgdb_ops.disable_hw_break(regs);
+
+acquirelock:
+ /*
+ * Interrupts will be restored by the 'trap return' code, except when
+ * single stepping.
+ */
+ local_irq_save(flags);
+
+ cpu = ks->cpu;
+ kgdb_info[cpu].debuggerinfo = regs;
+ kgdb_info[cpu].task = current;
+ kgdb_info[cpu].ret_state = 0;
+ kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
+
+ /* Make sure the above info reaches the primary CPU */
+ smp_mb();
+
+ if (exception_level == 1) {
+ if (raw_spin_trylock(&dbg_master_lock))
+ atomic_xchg(&kgdb_active, cpu);
+ goto cpu_master_loop;
+ }
+
+ /*
+ * CPU will loop if it is a slave or request to become a kgdb
+ * master cpu and acquire the kgdb_active lock:
+ */
+ while (1) {
+cpu_loop:
+ if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
+ kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
+ goto cpu_master_loop;
+ } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+ if (raw_spin_trylock(&dbg_master_lock)) {
+ atomic_xchg(&kgdb_active, cpu);
+ break;
+ }
+ } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
+ if (!raw_spin_is_locked(&dbg_slave_lock))
+ goto return_normal;
+ } else {
+return_normal:
+ /* Return to normal operation by executing any
+ * hw breakpoint fixup.
+ */
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ if (trace_on)
+ tracing_on();
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic();
+ atomic_dec(&slaves_in_kgdb);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+ return 0;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * For single stepping, try to only enter on the processor
+ * that was single stepping. To guard against a deadlock, the
+ * kernel will only try for the value of sstep_tries before
+ * giving up and continuing on.
+ */
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
+ (kgdb_info[cpu].task &&
+ kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
+ atomic_set(&kgdb_active, -1);
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+
+ goto acquirelock;
+ }
+
+ if (!kgdb_io_ready(1)) {
+ kgdb_info[cpu].ret_state = 1;
+ goto kgdb_restore; /* No I/O connection, resume the system */
+ }
+
+ /*
+ * Don't enter if we have hit a removed breakpoint.
+ */
+ if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
+ goto kgdb_restore;
+
+ /* Call the I/O driver's pre_exception routine */
+ if (dbg_io_ops->pre_exception)
+ dbg_io_ops->pre_exception();
+
+ /*
+ * Get the passive CPU lock which will hold all the non-primary
+ * CPU in a spin state while the debugger is active
+ */
+ if (!kgdb_single_step)
+ raw_spin_lock(&dbg_slave_lock);
+
+#ifdef CONFIG_SMP
+ /* If send_ready set, slaves are already waiting */
+ if (ks->send_ready)
+ atomic_set(ks->send_ready, 1);
+
+ /* Signal the other CPUs to enter kgdb_wait() */
+ else if ((!kgdb_single_step) && kgdb_do_roundup)
+ kgdb_roundup_cpus(flags);
+#endif
+
+ /*
+ * Wait for the other CPUs to be notified and be waiting for us:
+ */
+ time_left = loops_per_jiffy * HZ;
+ while (kgdb_do_roundup && --time_left &&
+ (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
+ online_cpus)
+ cpu_relax();
+ if (!time_left)
+ pr_crit("Timed out waiting for secondary CPUs.\n");
+
+ /*
+ * At this point the primary processor is completely
+ * in the debugger and all secondary CPUs are quiescent
+ */
+ dbg_deactivate_sw_breakpoints();
+ kgdb_single_step = 0;
+ kgdb_contthread = current;
+ exception_level = 0;
+ trace_on = tracing_is_on();
+ if (trace_on)
+ tracing_off();
+
+ while (1) {
+cpu_master_loop:
+ if (dbg_kdb_mode) {
+ kgdb_connected = 1;
+ error = kdb_stub(ks);
+ if (error == -1)
+ continue;
+ kgdb_connected = 0;
+ } else {
+ error = gdb_serial_stub(ks);
+ }
+
+ if (error == DBG_PASS_EVENT) {
+ dbg_kdb_mode = !dbg_kdb_mode;
+ } else if (error == DBG_SWITCH_CPU_EVENT) {
+ kgdb_info[dbg_switch_cpu].exception_state |=
+ DCPU_NEXT_MASTER;
+ goto cpu_loop;
+ } else {
+ kgdb_info[cpu].ret_state = error;
+ break;
+ }
+ }
+
+ /* Call the I/O driver's post_exception routine */
+ if (dbg_io_ops->post_exception)
+ dbg_io_ops->post_exception();
+
+ if (!kgdb_single_step) {
+ raw_spin_unlock(&dbg_slave_lock);
+ /* Wait till all the CPUs have quit from the debugger. */
+ while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
+ cpu_relax();
+ }
+
+kgdb_restore:
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+ int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+ if (kgdb_info[sstep_cpu].task)
+ kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+ else
+ kgdb_sstep_pid = 0;
+ }
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ if (trace_on)
+ tracing_on();
+
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic();
+ atomic_dec(&masters_in_kgdb);
+ /* Free kgdb_active */
+ atomic_set(&kgdb_active, -1);
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
+ local_irq_restore(flags);
+
+ return kgdb_info[cpu].ret_state;
+}
+
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * Locking hierarchy:
+ * interface locks, if any (begin_session)
+ * kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+ int ret = 0;
+
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(0);
+ /*
+ * Avoid entering the debugger if we were triggered due to an oops
+ * but panic_timeout indicates the system should automatically
+ * reboot on panic. We don't want to get stuck waiting for input
+ * on such systems, especially if its "just" an oops.
+ */
+ if (signo != SIGTRAP && panic_timeout)
+ return 1;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = raw_smp_processor_id();
+ ks->ex_vector = evector;
+ ks->signo = signo;
+ ks->err_code = ecode;
+ ks->linux_regs = regs;
+
+ if (kgdb_reenter_check(ks))
+ goto out; /* Ouch, double exception ! */
+ if (kgdb_info[ks->cpu].enter_kgdb != 0)
+ goto out;
+
+ ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+out:
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(1);
+ return ret;
+}
+
+/*
+ * GDB places a breakpoint at this function to know dynamically
+ * loaded objects. It's not defined static so that only one instance with this
+ * name exists in the kernel.
+ */
+
+static int module_event(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ return 0;
+}
+
+static struct notifier_block dbg_module_load_nb = {
+ .notifier_call = module_event,
+};
+
+int kgdb_nmicallback(int cpu, void *regs)
+{
+#ifdef CONFIG_SMP
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->linux_regs = regs;
+
+ if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
+ raw_spin_is_locked(&dbg_master_lock)) {
+ kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
+ atomic_t *send_ready)
+{
+#ifdef CONFIG_SMP
+ if (!kgdb_io_ready(0) || !send_ready)
+ return 1;
+
+ if (kgdb_info[cpu].enter_kgdb == 0) {
+ struct kgdb_state kgdb_var;
+ struct kgdb_state *ks = &kgdb_var;
+
+ memset(ks, 0, sizeof(struct kgdb_state));
+ ks->cpu = cpu;
+ ks->ex_vector = trapnr;
+ ks->signo = SIGTRAP;
+ ks->err_code = err_code;
+ ks->linux_regs = regs;
+ ks->send_ready = send_ready;
+ kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ return 0;
+ }
+#endif
+ return 1;
+}
+
+static void kgdb_console_write(struct console *co, const char *s,
+ unsigned count)
+{
+ unsigned long flags;
+
+ /* If we're debugging, or KGDB has not connected, don't try
+ * and print. */
+ if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
+ return;
+
+ local_irq_save(flags);
+ gdbstub_msg_write(s, count);
+ local_irq_restore(flags);
+}
+
+static struct console kgdbcons = {
+ .name = "kgdb",
+ .write = kgdb_console_write,
+ .flags = CON_PRINTBUFFER | CON_ENABLED,
+ .index = -1,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void sysrq_handle_dbg(int key)
+{
+ if (!dbg_io_ops) {
+ pr_crit("ERROR: No KGDB I/O module available\n");
+ return;
+ }
+ if (!kgdb_connected) {
+#ifdef CONFIG_KGDB_KDB
+ if (!dbg_kdb_mode)
+ pr_crit("KGDB or $3#33 for KDB\n");
+#else
+ pr_crit("Entering KGDB\n");
+#endif
+ }
+
+ kgdb_breakpoint();
+}
+
+static struct sysrq_key_op sysrq_dbg_op = {
+ .handler = sysrq_handle_dbg,
+ .help_msg = "debug(g)",
+ .action_msg = "DEBUG",
+};
+#endif
+
+static int kgdb_panic_event(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ /*
+ * Avoid entering the debugger if we were triggered due to a panic
+ * We don't want to get stuck waiting for input from user in such case.
+ * panic_timeout indicates the system should automatically
+ * reboot on panic.
+ */
+ if (panic_timeout)
+ return NOTIFY_DONE;
+
+ if (dbg_kdb_mode)
+ kdb_printf("PANIC: %s\n", (char *)data);
+ kgdb_breakpoint();
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kgdb_panic_event_nb = {
+ .notifier_call = kgdb_panic_event,
+ .priority = INT_MAX,
+};
+
+void __weak kgdb_arch_late(void)
+{
+}
+
+void __init dbg_late_init(void)
+{
+ dbg_is_early = false;
+ if (kgdb_io_module_registered)
+ kgdb_arch_late();
+ kdb_init(KDB_INIT_FULL);
+}
+
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+ /*
+ * Take the following action on reboot notify depending on value:
+ * 1 == Enter debugger
+ * 0 == [the default] detatch debug client
+ * -1 == Do nothing... and use this until the board resets
+ */
+ switch (kgdbreboot) {
+ case 1:
+ kgdb_breakpoint();
+ case -1:
+ goto done;
+ }
+ if (!dbg_kdb_mode)
+ gdbstub_exit(code);
+done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+ .notifier_call = dbg_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX,
+};
+
+static void kgdb_register_callbacks(void)
+{
+ if (!kgdb_io_module_registered) {
+ kgdb_io_module_registered = 1;
+ kgdb_arch_init();
+ if (!dbg_is_early)
+ kgdb_arch_late();
+ register_module_notifier(&dbg_module_load_nb);
+ register_reboot_notifier(&dbg_reboot_notifier);
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+#ifdef CONFIG_MAGIC_SYSRQ
+ register_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_use_con && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+ }
+}
+
+static void kgdb_unregister_callbacks(void)
+{
+ /*
+ * When this routine is called KGDB should unregister from the
+ * panic handler and clean up, making sure it is not handling any
+ * break exceptions at the time.
+ */
+ if (kgdb_io_module_registered) {
+ kgdb_io_module_registered = 0;
+ unregister_reboot_notifier(&dbg_reboot_notifier);
+ unregister_module_notifier(&dbg_module_load_nb);
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &kgdb_panic_event_nb);
+ kgdb_arch_exit();
+#ifdef CONFIG_MAGIC_SYSRQ
+ unregister_sysrq_key('g', &sysrq_dbg_op);
+#endif
+ if (kgdb_con_registered) {
+ unregister_console(&kgdbcons);
+ kgdb_con_registered = 0;
+ }
+ }
+}
+
+/*
+ * There are times a tasklet needs to be used vs a compiled in
+ * break point so as to cause an exception outside a kgdb I/O module,
+ * such as is the case with kgdboe, where calling a breakpoint in the
+ * I/O driver itself would be fatal.
+ */
+static void kgdb_tasklet_bpt(unsigned long ing)
+{
+ kgdb_breakpoint();
+ atomic_set(&kgdb_break_tasklet_var, 0);
+}
+
+static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
+
+void kgdb_schedule_breakpoint(void)
+{
+ if (atomic_read(&kgdb_break_tasklet_var) ||
+ atomic_read(&kgdb_active) != -1 ||
+ atomic_read(&kgdb_setting_breakpoint))
+ return;
+ atomic_inc(&kgdb_break_tasklet_var);
+ tasklet_schedule(&kgdb_tasklet_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
+
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ pr_crit("Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
+
+/**
+ * kgdb_register_io_module - register KGDB IO module
+ * @new_dbg_io_ops: the io ops vector
+ *
+ * Register it with the KGDB core.
+ */
+int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
+{
+ int err;
+
+ spin_lock(&kgdb_registration_lock);
+
+ if (dbg_io_ops) {
+ spin_unlock(&kgdb_registration_lock);
+
+ pr_err("Another I/O driver is already registered with KGDB\n");
+ return -EBUSY;
+ }
+
+ if (new_dbg_io_ops->init) {
+ err = new_dbg_io_ops->init();
+ if (err) {
+ spin_unlock(&kgdb_registration_lock);
+ return err;
+ }
+ }
+
+ dbg_io_ops = new_dbg_io_ops;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
+
+ /* Arm KGDB now. */
+ kgdb_register_callbacks();
+
+ if (kgdb_break_asap)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kgdb_register_io_module);
+
+/**
+ * kkgdb_unregister_io_module - unregister KGDB IO module
+ * @old_dbg_io_ops: the io ops vector
+ *
+ * Unregister it with the KGDB core.
+ */
+void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
+{
+ BUG_ON(kgdb_connected);
+
+ /*
+ * KGDB is no longer able to communicate out, so
+ * unregister our callbacks and reset state.
+ */
+ kgdb_unregister_callbacks();
+
+ spin_lock(&kgdb_registration_lock);
+
+ WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
+ dbg_io_ops = NULL;
+
+ spin_unlock(&kgdb_registration_lock);
+
+ pr_info("Unregistered I/O driver %s, debugger disabled\n",
+ old_dbg_io_ops->name);
+}
+EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
+
+int dbg_io_get_char(void)
+{
+ int ret = dbg_io_ops->read_char();
+ if (ret == NO_POLL_CHAR)
+ return -1;
+ if (!dbg_kdb_mode)
+ return ret;
+ if (ret == 127)
+ return 8;
+ return ret;
+}
+
+/**
+ * kgdb_breakpoint - generate breakpoint exception
+ *
+ * This function will generate a breakpoint exception. It is used at the
+ * beginning of a program to sync up with a debugger and can be used
+ * otherwise as a quick means to stop program execution and "break" into
+ * the debugger.
+ */
+noinline void kgdb_breakpoint(void)
+{
+ atomic_inc(&kgdb_setting_breakpoint);
+ wmb(); /* Sync point before breakpoint */
+ arch_kgdb_breakpoint();
+ wmb(); /* Sync point after breakpoint */
+ atomic_dec(&kgdb_setting_breakpoint);
+}
+EXPORT_SYMBOL_GPL(kgdb_breakpoint);
+
+static int __init opt_kgdb_wait(char *str)
+{
+ kgdb_break_asap = 1;
+
+ kdb_init(KDB_INIT_EARLY);
+ if (kgdb_io_module_registered)
+ kgdb_initial_breakpoint();
+
+ return 0;
+}
+
+early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000..127d9bc49
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,85 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _DEBUG_CORE_H_
+#define _DEBUG_CORE_H_
+/*
+ * These are the private implementation headers between the kernel
+ * debugger core and the debugger front end code.
+ */
+
+/* kernel debug core data structures */
+struct kgdb_state {
+ int ex_vector;
+ int signo;
+ int err_code;
+ int cpu;
+ int pass_exception;
+ unsigned long thr_query;
+ unsigned long threadid;
+ long kgdb_usethreadid;
+ struct pt_regs *linux_regs;
+ atomic_t *send_ready;
+};
+
+/* Exception state values */
+#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
+#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
+#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
+#define DCPU_SSTEP 0x8 /* CPU is single stepping */
+
+struct debuggerinfo_struct {
+ void *debuggerinfo;
+ struct task_struct *task;
+ int exception_state;
+ int ret_state;
+ int irq_depth;
+ int enter_kgdb;
+};
+
+extern struct debuggerinfo_struct kgdb_info[];
+
+/* kernel debug core break point routines */
+extern int dbg_remove_all_break(void);
+extern int dbg_set_sw_break(unsigned long addr);
+extern int dbg_remove_sw_break(unsigned long addr);
+extern int dbg_activate_sw_breakpoints(void);
+extern int dbg_deactivate_sw_breakpoints(void);
+
+/* polled character access to i/o module */
+extern int dbg_io_get_char(void);
+
+/* stub return value for switching between the gdbstub and kdb */
+#define DBG_PASS_EVENT -12345
+/* Switch from one cpu to another */
+#define DBG_SWITCH_CPU_EVENT -123456
+extern int dbg_switch_cpu;
+
+/* gdbstub interface functions */
+extern int gdb_serial_stub(struct kgdb_state *ks);
+extern void gdbstub_msg_write(const char *s, int len);
+
+/* gdbstub functions used for kdb <-> gdbstub transition */
+extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
+extern int dbg_kdb_mode;
+
+#ifdef CONFIG_KGDB_KDB
+extern int kdb_stub(struct kgdb_state *ks);
+extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
+#else /* ! CONFIG_KGDB_KDB */
+static inline int kdb_stub(struct kgdb_state *ks)
+{
+ return DBG_PASS_EVENT;
+}
+#endif /* CONFIG_KGDB_KDB */
+
+#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000..19d9a578c
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1145 @@
+/*
+ * Kernel Debug Core
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2009 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/serial_core.h>
+#include <linux/reboot.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/unaligned.h>
+#include "debug_core.h"
+
+#define KGDB_MAX_THREAD_QUERY 17
+
+/* Our I/O buffers. */
+static char remcom_in_buffer[BUFMAX];
+static char remcom_out_buffer[BUFMAX];
+static int gdbstub_use_prev_in_buf;
+static int gdbstub_prev_in_buf_pos;
+
+/* Storage for the registers, in GDB format. */
+static unsigned long gdb_regs[(NUMREGBYTES +
+ sizeof(unsigned long) - 1) /
+ sizeof(unsigned long)];
+
+/*
+ * GDB remote protocol parser:
+ */
+
+#ifdef CONFIG_KGDB_KDB
+static int gdbstub_read_wait(void)
+{
+ int ret = -1;
+ int i;
+
+ if (unlikely(gdbstub_use_prev_in_buf)) {
+ if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
+ return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
+ else
+ gdbstub_use_prev_in_buf = 0;
+ }
+
+ /* poll any additional I/O interfaces that are defined */
+ while (ret < 0)
+ for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
+ ret = kdb_poll_funcs[i]();
+ if (ret > 0)
+ break;
+ }
+ return ret;
+}
+#else
+static int gdbstub_read_wait(void)
+{
+ int ret = dbg_io_ops->read_char();
+ while (ret == NO_POLL_CHAR)
+ ret = dbg_io_ops->read_char();
+ return ret;
+}
+#endif
+/* scan for the sequence $<data>#<checksum> */
+static void get_packet(char *buffer)
+{
+ unsigned char checksum;
+ unsigned char xmitcsum;
+ int count;
+ char ch;
+
+ do {
+ /*
+ * Spin and wait around for the start character, ignore all
+ * other characters:
+ */
+ while ((ch = (gdbstub_read_wait())) != '$')
+ /* nothing */;
+
+ kgdb_connected = 1;
+ checksum = 0;
+ xmitcsum = -1;
+
+ count = 0;
+
+ /*
+ * now, read until a # or end of buffer is found:
+ */
+ while (count < (BUFMAX - 1)) {
+ ch = gdbstub_read_wait();
+ if (ch == '#')
+ break;
+ checksum = checksum + ch;
+ buffer[count] = ch;
+ count = count + 1;
+ }
+
+ if (ch == '#') {
+ xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
+ xmitcsum += hex_to_bin(gdbstub_read_wait());
+
+ if (checksum != xmitcsum)
+ /* failed checksum */
+ dbg_io_ops->write_char('-');
+ else
+ /* successful transfer */
+ dbg_io_ops->write_char('+');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ }
+ buffer[count] = 0;
+ } while (checksum != xmitcsum);
+}
+
+/*
+ * Send the packet in buffer.
+ * Check for gdb connection if asked for.
+ */
+static void put_packet(char *buffer)
+{
+ unsigned char checksum;
+ int count;
+ char ch;
+
+ /*
+ * $<packet info>#<checksum>.
+ */
+ while (1) {
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+ count = 0;
+
+ while ((ch = buffer[count])) {
+ dbg_io_ops->write_char(ch);
+ checksum += ch;
+ count++;
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+
+ /* Now see what we get in reply. */
+ ch = gdbstub_read_wait();
+
+ if (ch == 3)
+ ch = gdbstub_read_wait();
+
+ /* If we get an ACK, we are done. */
+ if (ch == '+')
+ return;
+
+ /*
+ * If we get the start of another packet, this means
+ * that GDB is attempting to reconnect. We will NAK
+ * the packet being sent, and stop trying to send this
+ * packet.
+ */
+ if (ch == '$') {
+ dbg_io_ops->write_char('-');
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+ return;
+ }
+ }
+}
+
+static char gdbmsgbuf[BUFMAX + 1];
+
+void gdbstub_msg_write(const char *s, int len)
+{
+ char *bufptr;
+ int wcount;
+ int i;
+
+ if (len == 0)
+ len = strlen(s);
+
+ /* 'O'utput */
+ gdbmsgbuf[0] = 'O';
+
+ /* Fill and send buffers... */
+ while (len > 0) {
+ bufptr = gdbmsgbuf + 1;
+
+ /* Calculate how many this time */
+ if ((len << 1) > (BUFMAX - 2))
+ wcount = (BUFMAX - 2) >> 1;
+ else
+ wcount = len;
+
+ /* Pack in hex chars */
+ for (i = 0; i < wcount; i++)
+ bufptr = hex_byte_pack(bufptr, s[i]);
+ *bufptr = '\0';
+
+ /* Move up */
+ s += wcount;
+ len -= wcount;
+
+ /* Write packet */
+ put_packet(gdbmsgbuf);
+ }
+}
+
+/*
+ * Convert the memory pointed to by mem into hex, placing result in
+ * buf. Return a pointer to the last char put in buf (null). May
+ * return an error.
+ */
+char *kgdb_mem2hex(char *mem, char *buf, int count)
+{
+ char *tmp;
+ int err;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory copy. Hex conversion will work against this one.
+ */
+ tmp = buf + count;
+
+ err = probe_kernel_read(tmp, mem, count);
+ if (err)
+ return NULL;
+ while (count > 0) {
+ buf = hex_byte_pack(buf, *tmp);
+ tmp++;
+ count--;
+ }
+ *buf = 0;
+
+ return buf;
+}
+
+/*
+ * Convert the hex array pointed to by buf into binary to be placed in
+ * mem. Return a pointer to the character AFTER the last byte
+ * written. May return an error.
+ */
+int kgdb_hex2mem(char *buf, char *mem, int count)
+{
+ char *tmp_raw;
+ char *tmp_hex;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory that is converted from hex.
+ */
+ tmp_raw = buf + count * 2;
+
+ tmp_hex = tmp_raw - 1;
+ while (tmp_hex >= buf) {
+ tmp_raw--;
+ *tmp_raw = hex_to_bin(*tmp_hex--);
+ *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
+ }
+
+ return probe_kernel_write(mem, tmp_raw, count);
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int kgdb_hex2long(char **ptr, unsigned long *long_val)
+{
+ int hex_val;
+ int num = 0;
+ int negate = 0;
+
+ *long_val = 0;
+
+ if (**ptr == '-') {
+ negate = 1;
+ (*ptr)++;
+ }
+ while (**ptr) {
+ hex_val = hex_to_bin(**ptr);
+ if (hex_val < 0)
+ break;
+
+ *long_val = (*long_val << 4) | hex_val;
+ num++;
+ (*ptr)++;
+ }
+
+ if (negate)
+ *long_val = -*long_val;
+
+ return num;
+}
+
+/*
+ * Copy the binary array pointed to by buf into mem. Fix $, #, and
+ * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
+ * The input buf is overwitten with the result to write to mem.
+ */
+static int kgdb_ebin2mem(char *buf, char *mem, int count)
+{
+ int size = 0;
+ char *c = buf;
+
+ while (count-- > 0) {
+ c[size] = *buf++;
+ if (c[size] == 0x7d)
+ c[size] = *buf++ ^ 0x20;
+ size++;
+ }
+
+ return probe_kernel_write(mem, c, size);
+}
+
+#if DBG_MAX_REG_NUM > 0
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_get_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_set_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Write memory due to an 'M' or 'X' packet. */
+static int write_mem_msg(int binary)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long addr;
+ unsigned long length;
+ int err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
+ if (binary)
+ err = kgdb_ebin2mem(ptr, (char *)addr, length);
+ else
+ err = kgdb_hex2mem(ptr, (char *)addr, length);
+ if (err)
+ return err;
+ if (CACHE_FLUSH_IS_SAFE)
+ flush_icache_range(addr, addr + length);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void error_packet(char *pkt, int error)
+{
+ error = -error;
+ pkt[0] = 'E';
+ pkt[1] = hex_asc[(error / 10)];
+ pkt[2] = hex_asc[(error % 10)];
+ pkt[3] = '\0';
+}
+
+/*
+ * Thread ID accessors. We represent a flat TID space to GDB, where
+ * the per CPU idle threads (which under Linux all have PID 0) are
+ * remapped to negative TIDs.
+ */
+
+#define BUF_THREAD_ID_SIZE 8
+
+static char *pack_threadid(char *pkt, unsigned char *id)
+{
+ unsigned char *limit;
+ int lzero = 1;
+
+ limit = id + (BUF_THREAD_ID_SIZE / 2);
+ while (id < limit) {
+ if (!lzero || *id != 0) {
+ pkt = hex_byte_pack(pkt, *id);
+ lzero = 0;
+ }
+ id++;
+ }
+
+ if (lzero)
+ pkt = hex_byte_pack(pkt, 0);
+
+ return pkt;
+}
+
+static void int_to_threadref(unsigned char *id, int value)
+{
+ put_unaligned_be32(value, id);
+}
+
+static struct task_struct *getthread(struct pt_regs *regs, int tid)
+{
+ /*
+ * Non-positive TIDs are remapped to the cpu shadow information
+ */
+ if (tid == 0 || tid == -1)
+ tid = -atomic_read(&kgdb_active) - 2;
+ if (tid < -1 && tid > -NR_CPUS - 2) {
+ if (kgdb_info[-tid - 2].task)
+ return kgdb_info[-tid - 2].task;
+ else
+ return idle_task(-tid - 2);
+ }
+ if (tid <= 0) {
+ printk(KERN_ERR "KGDB: Internal thread select error\n");
+ dump_stack();
+ return NULL;
+ }
+
+ /*
+ * find_task_by_pid_ns() does not take the tasklist lock anymore
+ * but is nicely RCU locked - hence is a pretty resilient
+ * thing to use:
+ */
+ return find_task_by_pid_ns(tid, &init_pid_ns);
+}
+
+
+/*
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
+ */
+static inline int shadow_pid(int realpid)
+{
+ if (realpid)
+ return realpid;
+
+ return -raw_smp_processor_id() - 2;
+}
+
+/*
+ * All the functions that start with gdb_cmd are the various
+ * operations to implement the handlers for the gdbserial protocol
+ * where KGDB is communicating with an external debugger
+ */
+
+/* Handle the '?' status packets */
+static void gdb_cmd_status(struct kgdb_state *ks)
+{
+ /*
+ * We know that this packet is only sent
+ * during initial connect. So to be safe,
+ * we clear out our breakpoints now in case
+ * GDB is reconnecting.
+ */
+ dbg_remove_all_break();
+
+ remcom_out_buffer[0] = 'S';
+ hex_byte_pack(&remcom_out_buffer[1], ks->signo);
+}
+
+static void gdb_get_regs_helper(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ void *local_debuggerinfo;
+ int i;
+
+ thread = kgdb_usethread;
+ if (!thread) {
+ thread = kgdb_info[ks->cpu].task;
+ local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
+ } else {
+ local_debuggerinfo = NULL;
+ for_each_online_cpu(i) {
+ /*
+ * Try to find the task on some other
+ * or possibly this node if we do not
+ * find the matching task then we try
+ * to approximate the results.
+ */
+ if (thread == kgdb_info[i].task)
+ local_debuggerinfo = kgdb_info[i].debuggerinfo;
+ }
+ }
+
+ /*
+ * All threads that don't have debuggerinfo should be
+ * in schedule() sleeping, since all other CPUs
+ * are in kgdb_wait, and thus have debuggerinfo.
+ */
+ if (local_debuggerinfo) {
+ pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
+ } else {
+ /*
+ * Pull stuff saved during switch_to; nothing
+ * else is accessible (or even particularly
+ * relevant).
+ *
+ * This should be enough for a stack trace.
+ */
+ sleeping_thread_to_gdb_regs(gdb_regs, thread);
+ }
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+ gdb_get_regs_helper(ks);
+ kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
+}
+
+/* Handle the 'G' set registers request */
+static void gdb_cmd_setregs(struct kgdb_state *ks)
+{
+ kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
+
+ if (kgdb_usethread && kgdb_usethread != current) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+ }
+}
+
+/* Handle the 'm' memory read bytes */
+static void gdb_cmd_memread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ unsigned long length;
+ unsigned long addr;
+ char *err;
+
+ if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
+ kgdb_hex2long(&ptr, &length) > 0) {
+ err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
+ if (!err)
+ error_packet(remcom_out_buffer, -EINVAL);
+ } else {
+ error_packet(remcom_out_buffer, -EINVAL);
+ }
+}
+
+/* Handle the 'M' memory write bytes */
+static void gdb_cmd_memwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(0);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+#if DBG_MAX_REG_NUM > 0
+static char *gdb_hex_reg_helper(int regnum, char *out)
+{
+ int i;
+ int offset = 0;
+
+ for (i = 0; i < regnum; i++)
+ offset += dbg_reg_def[i].size;
+ return kgdb_mem2hex((char *)gdb_regs + offset, out,
+ dbg_reg_def[i].size);
+}
+
+/* Handle the 'p' individual regster get */
+static void gdb_cmd_reg_get(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (regnum >= DBG_MAX_REG_NUM) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ gdb_get_regs_helper(ks);
+ gdb_hex_reg_helper(regnum, remcom_out_buffer);
+}
+
+/* Handle the 'P' individual regster set */
+static void gdb_cmd_reg_set(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+ int i = 0;
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (*ptr++ != '=' ||
+ !(!kgdb_usethread || kgdb_usethread == current) ||
+ !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ memset(gdb_regs, 0, sizeof(gdb_regs));
+ while (i < sizeof(gdb_regs) * 2)
+ if (hex_to_bin(ptr[i]) >= 0)
+ i++;
+ else
+ break;
+ i = i / 2;
+ kgdb_hex2mem(ptr, (char *)gdb_regs, i);
+ dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
+/* Handle the 'X' memory binary write bytes */
+static void gdb_cmd_binwrite(struct kgdb_state *ks)
+{
+ int err = write_mem_msg(1);
+
+ if (err)
+ error_packet(remcom_out_buffer, err);
+ else
+ strcpy(remcom_out_buffer, "OK");
+}
+
+/* Handle the 'D' or 'k', detach or kill packets */
+static void gdb_cmd_detachkill(struct kgdb_state *ks)
+{
+ int error;
+
+ /* The detach case */
+ if (remcom_in_buffer[0] == 'D') {
+ error = dbg_remove_all_break();
+ if (error < 0) {
+ error_packet(remcom_out_buffer, error);
+ } else {
+ strcpy(remcom_out_buffer, "OK");
+ kgdb_connected = 0;
+ }
+ put_packet(remcom_out_buffer);
+ } else {
+ /*
+ * Assume the kill case, with no exit code checking,
+ * trying to force detach the debugger:
+ */
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ }
+}
+
+/* Handle the 'R' reboot packets */
+static int gdb_cmd_reboot(struct kgdb_state *ks)
+{
+ /* For now, only honor R0 */
+ if (strcmp(remcom_in_buffer, "R0") == 0) {
+ printk(KERN_CRIT "Executing emergency reboot\n");
+ strcpy(remcom_out_buffer, "OK");
+ put_packet(remcom_out_buffer);
+
+ /*
+ * Execution should not return from
+ * machine_emergency_restart()
+ */
+ machine_emergency_restart();
+ kgdb_connected = 0;
+
+ return 1;
+ }
+ return 0;
+}
+
+/* Handle the 'q' query packets */
+static void gdb_cmd_query(struct kgdb_state *ks)
+{
+ struct task_struct *g;
+ struct task_struct *p;
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+ int i;
+ int cpu;
+ int finished = 0;
+
+ switch (remcom_in_buffer[1]) {
+ case 's':
+ case 'f':
+ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
+ break;
+
+ i = 0;
+ remcom_out_buffer[0] = 'm';
+ ptr = remcom_out_buffer + 1;
+ if (remcom_in_buffer[1] == 'f') {
+ /* Each cpu is a shadow thread */
+ for_each_online_cpu(cpu) {
+ ks->thr_query = 0;
+ int_to_threadref(thref, -cpu - 2);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ i++;
+ }
+ }
+
+ do_each_thread(g, p) {
+ if (i >= ks->thr_query && !finished) {
+ int_to_threadref(thref, p->pid);
+ ptr = pack_threadid(ptr, thref);
+ *(ptr++) = ',';
+ ks->thr_query++;
+ if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+ finished = 1;
+ }
+ i++;
+ } while_each_thread(g, p);
+
+ *(--ptr) = '\0';
+ break;
+
+ case 'C':
+ /* Current thread id */
+ strcpy(remcom_out_buffer, "QC");
+ ks->threadid = shadow_pid(current->pid);
+ int_to_threadref(thref, ks->threadid);
+ pack_threadid(remcom_out_buffer + 2, thref);
+ break;
+ case 'T':
+ if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
+ break;
+
+ ks->threadid = 0;
+ ptr = remcom_in_buffer + 17;
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!getthread(ks->linux_regs, ks->threadid)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ if ((int)ks->threadid > 0) {
+ kgdb_mem2hex(getthread(ks->linux_regs,
+ ks->threadid)->comm,
+ remcom_out_buffer, 16);
+ } else {
+ static char tmpstr[23 + BUF_THREAD_ID_SIZE];
+
+ sprintf(tmpstr, "shadowCPU%d",
+ (int)(-ks->threadid - 2));
+ kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
+ }
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case 'R':
+ if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
+ int len = strlen(remcom_in_buffer + 6);
+
+ if ((len % 2) != 0) {
+ strcpy(remcom_out_buffer, "E01");
+ break;
+ }
+ kgdb_hex2mem(remcom_in_buffer + 6,
+ remcom_out_buffer, len);
+ len = len / 2;
+ remcom_out_buffer[len++] = 0;
+
+ kdb_common_init_state(ks);
+ kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
+ strcpy(remcom_out_buffer, "OK");
+ }
+ break;
+#endif
+ }
+}
+
+/* Handle the 'H' task query packets */
+static void gdb_cmd_task(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ char *ptr;
+
+ switch (remcom_in_buffer[1]) {
+ case 'g':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_usethread = thread;
+ ks->kgdb_usethreadid = ks->threadid;
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ case 'c':
+ ptr = &remcom_in_buffer[2];
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!ks->threadid) {
+ kgdb_contthread = NULL;
+ } else {
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (!thread && ks->threadid > 0) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_contthread = thread;
+ }
+ strcpy(remcom_out_buffer, "OK");
+ break;
+ }
+}
+
+/* Handle the 'T' thread query packets */
+static void gdb_cmd_thread(struct kgdb_state *ks)
+{
+ char *ptr = &remcom_in_buffer[1];
+ struct task_struct *thread;
+
+ kgdb_hex2long(&ptr, &ks->threadid);
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (thread)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, -EINVAL);
+}
+
+/* Handle the 'z' or 'Z' breakpoint remove or set packets */
+static void gdb_cmd_break(struct kgdb_state *ks)
+{
+ /*
+ * Since GDB-5.3, it's been drafted that '0' is a software
+ * breakpoint, '1' is a hardware breakpoint, so let's do that.
+ */
+ char *bpt_type = &remcom_in_buffer[1];
+ char *ptr = &remcom_in_buffer[2];
+ unsigned long addr;
+ unsigned long length;
+ int error = 0;
+
+ if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
+ /* Unsupported */
+ if (*bpt_type > '4')
+ return;
+ } else {
+ if (*bpt_type != '0' && *bpt_type != '1')
+ /* Unsupported. */
+ return;
+ }
+
+ /*
+ * Test if this is a hardware breakpoint, and
+ * if we support it:
+ */
+ if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
+ /* Unsupported. */
+ return;
+
+ if (*(ptr++) != ',') {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (!kgdb_hex2long(&ptr, &addr)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (*(ptr++) != ',' ||
+ !kgdb_hex2long(&ptr, &length)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+
+ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
+ error = dbg_set_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
+ error = dbg_remove_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'Z')
+ error = arch_kgdb_ops.set_hw_breakpoint(addr,
+ (int)length, *bpt_type - '0');
+ else if (remcom_in_buffer[0] == 'z')
+ error = arch_kgdb_ops.remove_hw_breakpoint(addr,
+ (int) length, *bpt_type - '0');
+
+ if (error == 0)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, error);
+}
+
+/* Handle the 'C' signal / exception passing packets */
+static int gdb_cmd_exception_pass(struct kgdb_state *ks)
+{
+ /* C09 == pass exception
+ * C15 == detach kgdb, pass exception
+ */
+ if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'c';
+
+ } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
+
+ ks->pass_exception = 1;
+ remcom_in_buffer[0] = 'D';
+ dbg_remove_all_break();
+ kgdb_connected = 0;
+ return 1;
+
+ } else {
+ gdbstub_msg_write("KGDB only knows signal 9 (pass)"
+ " and 15 (pass and disconnect)\n"
+ "Executing a continue without signal passing\n", 0);
+ remcom_in_buffer[0] = 'c';
+ }
+
+ /* Indicate fall through */
+ return -1;
+}
+
+/*
+ * This function performs all gdbserial command procesing
+ */
+int gdb_serial_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ int tmp;
+
+ /* Initialize comm buffer and globals. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+ kgdb_usethread = kgdb_info[ks->cpu].task;
+ ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+ ks->pass_exception = 0;
+
+ if (kgdb_connected) {
+ unsigned char thref[BUF_THREAD_ID_SIZE];
+ char *ptr;
+
+ /* Reply to host that an exception has occurred */
+ ptr = remcom_out_buffer;
+ *ptr++ = 'T';
+ ptr = hex_byte_pack(ptr, ks->signo);
+ ptr += strlen(strcpy(ptr, "thread:"));
+ int_to_threadref(thref, shadow_pid(current->pid));
+ ptr = pack_threadid(ptr, thref);
+ *ptr++ = ';';
+ put_packet(remcom_out_buffer);
+ }
+
+ while (1) {
+ error = 0;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ get_packet(remcom_in_buffer);
+
+ switch (remcom_in_buffer[0]) {
+ case '?': /* gdbserial status */
+ gdb_cmd_status(ks);
+ break;
+ case 'g': /* return the value of the CPU registers */
+ gdb_cmd_getregs(ks);
+ break;
+ case 'G': /* set the value of the CPU registers - return OK */
+ gdb_cmd_setregs(ks);
+ break;
+ case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
+ gdb_cmd_memread(ks);
+ break;
+ case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_memwrite(ks);
+ break;
+#if DBG_MAX_REG_NUM > 0
+ case 'p': /* pXX Return gdb register XX (in hex) */
+ gdb_cmd_reg_get(ks);
+ break;
+ case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
+ gdb_cmd_reg_set(ks);
+ break;
+#endif /* DBG_MAX_REG_NUM > 0 */
+ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_binwrite(ks);
+ break;
+ /* kill or detach. KGDB should treat this like a
+ * continue.
+ */
+ case 'D': /* Debugger detach */
+ case 'k': /* Debugger detach via kill */
+ gdb_cmd_detachkill(ks);
+ goto default_handle;
+ case 'R': /* Reboot */
+ if (gdb_cmd_reboot(ks))
+ goto default_handle;
+ break;
+ case 'q': /* query command */
+ gdb_cmd_query(ks);
+ break;
+ case 'H': /* task related */
+ gdb_cmd_task(ks);
+ break;
+ case 'T': /* Query thread status */
+ gdb_cmd_thread(ks);
+ break;
+ case 'z': /* Break point remove */
+ case 'Z': /* Break point set */
+ gdb_cmd_break(ks);
+ break;
+#ifdef CONFIG_KGDB_KDB
+ case '3': /* Escape into back into kdb */
+ if (remcom_in_buffer[1] == '\0') {
+ gdb_cmd_detachkill(ks);
+ return DBG_PASS_EVENT;
+ }
+#endif
+ case 'C': /* Exception passing */
+ tmp = gdb_cmd_exception_pass(ks);
+ if (tmp > 0)
+ goto default_handle;
+ if (tmp == 0)
+ break;
+ /* Fall through on tmp < 0 */
+ case 'c': /* Continue packet */
+ case 's': /* Single step packet */
+ if (kgdb_contthread && kgdb_contthread != current) {
+ /* Can't switch threads in kgdb */
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ dbg_activate_sw_breakpoints();
+ /* Fall through to default processing */
+ default:
+default_handle:
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ /*
+ * Leave cmd processing on error, detach,
+ * kill, continue, or single step.
+ */
+ if (error >= 0 || remcom_in_buffer[0] == 'D' ||
+ remcom_in_buffer[0] == 'k') {
+ error = 0;
+ goto kgdb_exit;
+ }
+
+ }
+
+ /* reply to the request */
+ put_packet(remcom_out_buffer);
+ }
+
+kgdb_exit:
+ if (ks->pass_exception)
+ error = 1;
+ return error;
+}
+
+int gdbstub_state(struct kgdb_state *ks, char *cmd)
+{
+ int error;
+
+ switch (cmd[0]) {
+ case 'e':
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ return error;
+ case 's':
+ case 'c':
+ strcpy(remcom_in_buffer, cmd);
+ return 0;
+ case '$':
+ strcpy(remcom_in_buffer, cmd);
+ gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
+ gdbstub_prev_in_buf_pos = 0;
+ return 0;
+ }
+ dbg_io_ops->write_char('+');
+ put_packet(remcom_out_buffer);
+ return 0;
+}
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+ unsigned char checksum, ch, buffer[3];
+ int loop;
+
+ if (!kgdb_connected)
+ return;
+ kgdb_connected = 0;
+
+ if (!dbg_io_ops || dbg_kdb_mode)
+ return;
+
+ buffer[0] = 'W';
+ buffer[1] = hex_asc_hi(status);
+ buffer[2] = hex_asc_lo(status);
+
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+
+ for (loop = 0; loop < 3; loop++) {
+ ch = buffer[loop];
+ checksum += ch;
+ dbg_io_ops->write_char(ch);
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+ /* make sure the output is flushed, lest the bootloader clobber it */
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
+}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000..396d12eda
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
+gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000..d4fc58f4b
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+#
+
+CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
+obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
+obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
+
+clean-files := gen-kdb_cmds.c
+
+quiet_cmd_gen-kdb = GENKDB $@
+ cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
+ /^\#/{next} \
+ /^[ \t]*$$/{next} \
+ {gsub(/"/, "\\\"", $$0); \
+ print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
+ END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
+ $(filter-out %/Makefile,$^) > $@#
+
+$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
+ $(call cmd,gen-kdb)
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000..e1dbf4a2c
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,560 @@
+/*
+ * Kernel Debugger Architecture Independent Breakpoint Handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdb.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include "kdb_private.h"
+
+/*
+ * Table of kdb_breakpoints
+ */
+kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
+
+static void kdb_setsinglestep(struct pt_regs *regs)
+{
+ KDB_STATE_SET(DOING_SS);
+}
+
+static char *kdb_rwtypes[] = {
+ "Instruction(i)",
+ "Instruction(Register)",
+ "Data Write",
+ "I/O",
+ "Data Access"
+};
+
+static char *kdb_bptype(kdb_bp_t *bp)
+{
+ if (bp->bp_type < 0 || bp->bp_type > 4)
+ return "";
+
+ return kdb_rwtypes[bp->bp_type];
+}
+
+static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
+{
+ int nextarg = *nextargp;
+ int diag;
+
+ bp->bph_length = 1;
+ if ((argc + 1) != nextarg) {
+ if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0)
+ bp->bp_type = BP_ACCESS_WATCHPOINT;
+ else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
+ bp->bp_type = BP_WRITE_WATCHPOINT;
+ else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0)
+ bp->bp_type = BP_HARDWARE_BREAKPOINT;
+ else
+ return KDB_ARGCOUNT;
+
+ bp->bph_length = 1;
+
+ nextarg++;
+
+ if ((argc + 1) != nextarg) {
+ unsigned long len;
+
+ diag = kdbgetularg((char *)argv[nextarg],
+ &len);
+ if (diag)
+ return diag;
+
+
+ if (len > 8)
+ return KDB_BADLENGTH;
+
+ bp->bph_length = len;
+ nextarg++;
+ }
+
+ if ((argc + 1) != nextarg)
+ return KDB_ARGCOUNT;
+ }
+
+ *nextargp = nextarg;
+ return 0;
+}
+
+static int _kdb_bp_remove(kdb_bp_t *bp)
+{
+ int ret = 1;
+ if (!bp->bp_installed)
+ return ret;
+ if (!bp->bp_type)
+ ret = dbg_remove_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0)
+ bp->bp_installed = 0;
+ return ret;
+}
+
+static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ if (KDB_DEBUG(BP))
+ kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
+
+ /*
+ * Setup single step
+ */
+ kdb_setsinglestep(regs);
+
+ /*
+ * Reset delay attribute
+ */
+ bp->bp_delay = 0;
+ bp->bp_delayed = 1;
+}
+
+static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
+{
+ int ret;
+ /*
+ * Install the breakpoint, if it is not already installed.
+ */
+
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: bp_installed %d\n",
+ __func__, bp->bp_installed);
+ if (!KDB_STATE(SSBPT))
+ bp->bp_delay = 0;
+ if (bp->bp_installed)
+ return 1;
+ if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
+ if (KDB_DEBUG(BP))
+ kdb_printf("%s: delayed bp\n", __func__);
+ kdb_handle_bp(regs, bp);
+ return 0;
+ }
+ if (!bp->bp_type)
+ ret = dbg_set_sw_break(bp->bp_addr);
+ else
+ ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
+ bp->bph_length,
+ bp->bp_type);
+ if (ret == 0) {
+ bp->bp_installed = 1;
+ } else {
+ kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
+ __func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!bp->bp_type) {
+ kdb_printf("Software breakpoints are unavailable.\n"
+ " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " OR use hw breaks: help bph\n");
+ }
+#endif
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_bp_install
+ *
+ * Install kdb_breakpoints prior to returning from the
+ * kernel debugger. This allows the kdb_breakpoints to be set
+ * upon functions that are used internally by kdb, such as
+ * printk(). This function is only called once per kdb session.
+ */
+void kdb_bp_install(struct pt_regs *regs)
+{
+ int i;
+
+ for (i = 0; i < KDB_MAXBPT; i++) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_install(regs, bp);
+ }
+}
+
+/*
+ * kdb_bp_remove
+ *
+ * Remove kdb_breakpoints upon entry to the kernel debugger.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+void kdb_bp_remove(void)
+{
+ int i;
+
+ for (i = KDB_MAXBPT - 1; i >= 0; i--) {
+ kdb_bp_t *bp = &kdb_breakpoints[i];
+
+ if (KDB_DEBUG(BP)) {
+ kdb_printf("%s: bp %d bp_enabled %d\n",
+ __func__, i, bp->bp_enabled);
+ }
+ if (bp->bp_enabled)
+ _kdb_bp_remove(bp);
+ }
+}
+
+
+/*
+ * kdb_printbp
+ *
+ * Internal function to format and print a breakpoint entry.
+ *
+ * Parameters:
+ * None.
+ * Outputs:
+ * None.
+ * Returns:
+ * None.
+ * Locking:
+ * None.
+ * Remarks:
+ */
+
+static void kdb_printbp(kdb_bp_t *bp, int i)
+{
+ kdb_printf("%s ", kdb_bptype(bp));
+ kdb_printf("BP #%d at ", i);
+ kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
+
+ if (bp->bp_enabled)
+ kdb_printf("\n is enabled");
+ else
+ kdb_printf("\n is disabled");
+
+ kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+ bp->bp_addr, bp->bp_type, bp->bp_installed);
+
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_bp
+ *
+ * Handle the bp commands.
+ *
+ * [bp|bph] <addr-expression> [DATAR|DATAW]
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * bp Set breakpoint on all cpus. Only use hardware assist if need.
+ * bph Set breakpoint on all cpus. Force hardware register
+ */
+
+static int kdb_bp(int argc, const char **argv)
+{
+ int i, bpno;
+ kdb_bp_t *bp, *bp_check;
+ int diag;
+ char *symname = NULL;
+ long offset = 0ul;
+ int nextarg;
+ kdb_bp_t template = {0};
+
+ if (argc == 0) {
+ /*
+ * Display breakpoint table
+ */
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
+ bpno++, bp++) {
+ if (bp->bp_free)
+ continue;
+ kdb_printbp(bp, bpno);
+ }
+
+ return 0;
+ }
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
+ &offset, &symname);
+ if (diag)
+ return diag;
+ if (!template.bp_addr)
+ return KDB_BADINT;
+
+ /*
+ * Find an empty bp structure to allocate
+ */
+ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
+ if (bp->bp_free)
+ break;
+ }
+
+ if (bpno == KDB_MAXBPT)
+ return KDB_TOOMANYBPT;
+
+ if (strcmp(argv[0], "bph") == 0) {
+ template.bp_type = BP_HARDWARE_BREAKPOINT;
+ diag = kdb_parsebp(argc, argv, &nextarg, &template);
+ if (diag)
+ return diag;
+ } else {
+ template.bp_type = BP_BREAKPOINT;
+ }
+
+ /*
+ * Check for clashing breakpoints.
+ *
+ * Note, in this design we can't have hardware breakpoints
+ * enabled for both read and write on the same address.
+ */
+ for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp_check++) {
+ if (!bp_check->bp_free &&
+ bp_check->bp_addr == template.bp_addr) {
+ kdb_printf("You already have a breakpoint at "
+ kdb_bfd_vma_fmt0 "\n", template.bp_addr);
+ return KDB_DUPBPT;
+ }
+ }
+
+ template.bp_enabled = 1;
+
+ /*
+ * Actually allocate the breakpoint found earlier
+ */
+ *bp = template;
+ bp->bp_free = 0;
+
+ kdb_printbp(bp, bpno);
+
+ return 0;
+}
+
+/*
+ * kdb_bc
+ *
+ * Handles the 'bc', 'be', and 'bd' commands
+ *
+ * [bd|bc|be] <breakpoint-number>
+ * [bd|bc|be] *
+ *
+ * Parameters:
+ * argc Count of arguments in argv
+ * argv Space delimited command line arguments
+ * Outputs:
+ * None.
+ * Returns:
+ * Zero for success, a kdb diagnostic for failure
+ * Locking:
+ * None.
+ * Remarks:
+ */
+static int kdb_bc(int argc, const char **argv)
+{
+ unsigned long addr;
+ kdb_bp_t *bp = NULL;
+ int lowbp = KDB_MAXBPT;
+ int highbp = 0;
+ int done = 0;
+ int i;
+ int diag = 0;
+
+ int cmd; /* KDBCMD_B? */
+#define KDBCMD_BC 0
+#define KDBCMD_BE 1
+#define KDBCMD_BD 2
+
+ if (strcmp(argv[0], "be") == 0)
+ cmd = KDBCMD_BE;
+ else if (strcmp(argv[0], "bd") == 0)
+ cmd = KDBCMD_BD;
+ else
+ cmd = KDBCMD_BC;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ if (strcmp(argv[1], "*") == 0) {
+ lowbp = 0;
+ highbp = KDB_MAXBPT;
+ } else {
+ diag = kdbgetularg(argv[1], &addr);
+ if (diag)
+ return diag;
+
+ /*
+ * For addresses less than the maximum breakpoint number,
+ * assume that the breakpoint number is desired.
+ */
+ if (addr < KDB_MAXBPT) {
+ bp = &kdb_breakpoints[addr];
+ lowbp = highbp = addr;
+ highbp++;
+ } else {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
+ i++, bp++) {
+ if (bp->bp_addr == addr) {
+ lowbp = highbp = i;
+ highbp++;
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Now operate on the set of breakpoints matching the input
+ * criteria (either '*' for all, or an individual breakpoint).
+ */
+ for (bp = &kdb_breakpoints[lowbp], i = lowbp;
+ i < highbp;
+ i++, bp++) {
+ if (bp->bp_free)
+ continue;
+
+ done++;
+
+ switch (cmd) {
+ case KDBCMD_BC:
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " cleared\n",
+ i, bp->bp_addr);
+
+ bp->bp_addr = 0;
+ bp->bp_free = 1;
+
+ break;
+ case KDBCMD_BE:
+ bp->bp_enabled = 1;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " enabled",
+ i, bp->bp_addr);
+
+ kdb_printf("\n");
+ break;
+ case KDBCMD_BD:
+ if (!bp->bp_enabled)
+ break;
+
+ bp->bp_enabled = 0;
+
+ kdb_printf("Breakpoint %d at "
+ kdb_bfd_vma_fmt " disabled\n",
+ i, bp->bp_addr);
+
+ break;
+ }
+ if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
+ bp->bp_delay = 0;
+ KDB_STATE_CLEAR(SSBPT);
+ }
+ }
+
+ return (!done) ? KDB_BPTNOTFOUND : 0;
+}
+
+/*
+ * kdb_ss
+ *
+ * Process the 'ss' (Single Step) command.
+ *
+ * ss
+ *
+ * Parameters:
+ * argc Argument count
+ * argv Argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * KDB_CMD_SS for success, a kdb error if failure.
+ * Locking:
+ * None.
+ * Remarks:
+ *
+ * Set the arch specific option to trigger a debug trap after the next
+ * instruction.
+ */
+
+static int kdb_ss(int argc, const char **argv)
+{
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ /*
+ * Set trace flag and go.
+ */
+ KDB_STATE_SET(DOING_SS);
+ return KDB_CMD_SS;
+}
+
+/* Initialize the breakpoint table and register breakpoint commands. */
+
+void __init kdb_initbptab(void)
+{
+ int i;
+ kdb_bp_t *bp;
+
+ /*
+ * First time initialization.
+ */
+ memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
+ bp->bp_free = 1;
+
+ kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
+ "Set/Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
+ "Display breakpoints", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
+ kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
+ "[datar [length]|dataw [length]] Set hw brk", 0,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("bc", kdb_bc, "<bpnum>",
+ "Clear Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("be", kdb_bc, "<bpnum>",
+ "Enable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+ kdb_register_flags("bd", kdb_bc, "<bpnum>",
+ "Disable Breakpoint", 0,
+ KDB_ENABLE_FLOW_CTRL);
+
+ kdb_register_flags("ss", kdb_ss, "",
+ "Single Step", 1,
+ KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ /*
+ * Architecture dependent initialization.
+ */
+}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000..fe15fff5d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
+/*
+ * Kernel Debugger Architecture Independent Stack Traceback
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/kdb.h>
+#include <linux/nmi.h>
+#include "kdb_private.h"
+
+
+static void kdb_show_stack(struct task_struct *p, void *addr)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+ kdb_trap_printk++;
+ kdb_set_current_task(p);
+ if (addr) {
+ show_stack((struct task_struct *)p, addr);
+ } else if (kdb_current_regs) {
+#ifdef CONFIG_X86
+ show_stack(p, &kdb_current_regs->sp);
+#else
+ show_stack(p, NULL);
+#endif
+ } else {
+ show_stack(p, NULL);
+ }
+ console_loglevel = old_lvl;
+ kdb_trap_printk--;
+}
+
+/*
+ * kdb_bt
+ *
+ * This function implements the 'bt' command. Print a stack
+ * traceback.
+ *
+ * bt [<address-expression>] (addr-exp is for alternate stacks)
+ * btp <pid> Kernel stack for <pid>
+ * btt <address-expression> Kernel stack for task structure at
+ * <address-expression>
+ * bta [DRSTCZEUIMA] All useful processes, optionally
+ * filtered by state
+ * btc [<cpu>] The current process on one cpu,
+ * default is all cpus
+ *
+ * bt <address-expression> refers to a address on the stack, that location
+ * is assumed to contain a return address.
+ *
+ * btt <address-expression> refers to the address of a struct task.
+ *
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Outputs:
+ * None.
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ * Locking:
+ * none.
+ * Remarks:
+ * Backtrack works best when the code uses frame pointers. But even
+ * without frame pointers we should get a reasonable trace.
+ *
+ * mds comes in handy when examining the stack to do a manual traceback or
+ * to get a starting point for bt <address-expression>.
+ */
+
+static int
+kdb_bt1(struct task_struct *p, unsigned long mask,
+ int argcount, int btaprompt)
+{
+ char buffer[2];
+ if (kdb_getarea(buffer[0], (unsigned long)p) ||
+ kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
+ return KDB_BADADDR;
+ if (!kdb_task_state(p, mask))
+ return 0;
+ kdb_printf("Stack traceback for pid %d\n", p->pid);
+ kdb_ps1(p);
+ kdb_show_stack(p, NULL);
+ if (btaprompt) {
+ kdb_getstr(buffer, sizeof(buffer),
+ "Enter <q> to end, <cr> to continue:");
+ if (buffer[0] == 'q') {
+ kdb_printf("\n");
+ return 1;
+ }
+ }
+ touch_nmi_watchdog();
+ return 0;
+}
+
+int
+kdb_bt(int argc, const char **argv)
+{
+ int diag;
+ int argcount = 5;
+ int btaprompt = 1;
+ int nextarg;
+ unsigned long addr;
+ long offset;
+
+ /* Prompt after each proc in bta */
+ kdbgetintenv("BTAPROMPT", &btaprompt);
+
+ if (strcmp(argv[0], "bta") == 0) {
+ struct task_struct *g, *p;
+ unsigned long cpu;
+ unsigned long mask = kdb_task_state_string(argc ? argv[1] :
+ NULL);
+ if (argc == 0)
+ kdb_ps_suppressed();
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ }
+ /* Now the inactive tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (task_curr(p))
+ continue;
+ if (kdb_bt1(p, mask, argcount, btaprompt))
+ return 0;
+ } kdb_while_each_thread(g, p);
+ } else if (strcmp(argv[0], "btp") == 0) {
+ struct task_struct *p;
+ unsigned long pid;
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &pid);
+ if (diag)
+ return diag;
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p) {
+ kdb_set_current_task(p);
+ return kdb_bt1(p, ~0UL, argcount, 0);
+ }
+ kdb_printf("No process with pid == %ld found\n", pid);
+ return 0;
+ } else if (strcmp(argv[0], "btt") == 0) {
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+ diag = kdbgetularg((char *)argv[1], &addr);
+ if (diag)
+ return diag;
+ kdb_set_current_task((struct task_struct *)addr);
+ return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
+ } else if (strcmp(argv[0], "btc") == 0) {
+ unsigned long cpu = ~0;
+ struct task_struct *save_current_task = kdb_current_task;
+ char buf[80];
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+ if (argc == 1) {
+ diag = kdbgetularg((char *)argv[1], &cpu);
+ if (diag)
+ return diag;
+ }
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ if (cpu != ~0) {
+ if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
+ kdb_printf("no process for cpu %ld\n", cpu);
+ return 0;
+ }
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ return 0;
+ }
+ kdb_printf("btc: cpu status: ");
+ kdb_parse("cpu\n");
+ for_each_online_cpu(cpu) {
+ sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ kdb_parse(buf);
+ touch_nmi_watchdog();
+ }
+ kdb_set_current_task(save_current_task);
+ return 0;
+ } else {
+ if (argc) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ kdb_show_stack(kdb_current_task, (void *)addr);
+ return 0;
+ } else {
+ return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
+ }
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000..9834ad303
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,31 @@
+# Initial commands for kdb, alter to suit your needs.
+# These commands are executed in kdb_init() context, no SMP, no
+# processes. Commands that require process data (including stack or
+# registers) are not reliable this early. set and bp commands should
+# be safe. Global breakpoint commands affect each cpu as it is booted.
+
+# Standard debugging information for first level support, just type archkdb
+# or archkdbcpu or archkdbshort at the kdb prompt.
+
+defcmd dumpcommon "" "Common kdb debugging"
+ set BTAPROMPT 0
+ set LINES 10000
+ -summary
+ -cpu
+ -ps
+ -dmesg 600
+ -bt
+endefcmd
+
+defcmd dumpall "" "First line debugging"
+ pid R
+ -dumpcommon
+ -bta
+endefcmd
+
+defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
+ pid R
+ -dumpcommon
+ -btc
+endefcmd
+
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000..15e1a7af5
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,186 @@
+/*
+ * Created by: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kdebug.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include "kdb_private.h"
+#include "../debug_core.h"
+
+/*
+ * KDB interface to KGDB internals
+ */
+get_char_func kdb_poll_funcs[] = {
+ dbg_io_get_char,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+};
+EXPORT_SYMBOL_GPL(kdb_poll_funcs);
+
+int kdb_poll_idx = 1;
+EXPORT_SYMBOL_GPL(kdb_poll_idx);
+
+static struct kgdb_state *kdb_ks;
+
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
+int kdb_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ kdb_bp_t *bp;
+ unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ kdb_reason_t reason = KDB_REASON_OOPS;
+ kdb_dbtrap_t db_result = KDB_DB_NOBPT;
+ int i;
+
+ kdb_ks = ks;
+ if (KDB_STATE(REENTRY)) {
+ reason = KDB_REASON_SWITCH;
+ KDB_STATE_CLEAR(REENTRY);
+ addr = instruction_pointer(ks->linux_regs);
+ }
+ ks->pass_exception = 0;
+ if (atomic_read(&kgdb_setting_breakpoint))
+ reason = KDB_REASON_KEYBOARD;
+
+ if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
+ reason = KDB_REASON_SYSTEM_NMI;
+
+ else if (in_nmi())
+ reason = KDB_REASON_NMI;
+
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ if (addr != instruction_pointer(ks->linux_regs))
+ kgdb_arch_set_pc(ks->linux_regs, addr);
+ break;
+ }
+ }
+ if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
+ for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
+ if (bp->bp_free)
+ continue;
+ if (bp->bp_addr == addr) {
+ bp->bp_delay = 1;
+ bp->bp_delayed = 1;
+ /*
+ * SSBPT is set when the kernel debugger must single step a
+ * task in order to re-establish an instruction breakpoint
+ * which uses the instruction replacement mechanism. It is
+ * cleared by any action that removes the need to single-step
+ * the breakpoint.
+ */
+ reason = KDB_REASON_BREAK;
+ db_result = KDB_DB_BPT;
+ KDB_STATE_SET(SSBPT);
+ break;
+ }
+ }
+ }
+
+ if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
+ ks->signo == SIGTRAP) {
+ reason = KDB_REASON_SSTEP;
+ db_result = KDB_DB_BPT;
+ }
+ /* Set initial kdb state variables */
+ KDB_STATE_CLEAR(KGDB_TRANS);
+ kdb_common_init_state(ks);
+ /* Remove any breakpoints as needed by kdb and clear single step */
+ kdb_bp_remove();
+ KDB_STATE_CLEAR(DOING_SS);
+ KDB_STATE_SET(PAGER);
+ /* zero out any offline cpu data */
+ for_each_present_cpu(i) {
+ if (!cpu_online(i)) {
+ kgdb_info[i].debuggerinfo = NULL;
+ kgdb_info[i].task = NULL;
+ }
+ }
+ if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
+ ks->pass_exception = 1;
+ KDB_FLAG_SET(CATASTROPHIC);
+ }
+ /* set CATASTROPHIC if the system contains unresponsive processors */
+ for_each_online_cpu(i)
+ if (!kgdb_info[i].enter_kgdb)
+ KDB_FLAG_SET(CATASTROPHIC);
+ if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
+ KDB_STATE_CLEAR(SSBPT);
+ KDB_STATE_CLEAR(DOING_SS);
+ } else {
+ /* Start kdb main loop */
+ error = kdb_main_loop(KDB_REASON_ENTER, reason,
+ ks->err_code, db_result, ks->linux_regs);
+ }
+ /*
+ * Upon exit from the kdb main loop setup break points and restart
+ * the system based on the requested continue state
+ */
+ kdb_common_deinit_state();
+ KDB_STATE_CLEAR(PAGER);
+ kdbnearsym_cleanup();
+ if (error == KDB_CMD_KGDB) {
+ if (KDB_STATE(DOING_KGDB))
+ KDB_STATE_CLEAR(DOING_KGDB);
+ return DBG_PASS_EVENT;
+ }
+ kdb_bp_install(ks->linux_regs);
+ dbg_activate_sw_breakpoints();
+ /* Set the exit state to a single step or a continue */
+ if (KDB_STATE(DOING_SS))
+ gdbstub_state(ks, "s");
+ else
+ gdbstub_state(ks, "c");
+
+ KDB_FLAG_CLEAR(CATASTROPHIC);
+
+ /* Invoke arch specific exception handling prior to system resume */
+ kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
+ if (ks->pass_exception)
+ kgdb_info[ks->cpu].ret_state = 1;
+ if (error == KDB_CMD_CPU) {
+ KDB_STATE_SET(REENTRY);
+ /*
+ * Force clear the single step bit because kdb emulates this
+ * differently vs the gdbstub
+ */
+ kgdb_single_step = 0;
+ dbg_deactivate_sw_breakpoints();
+ return DBG_SWITCH_CPU_EVENT;
+ }
+ return kgdb_info[ks->cpu].ret_state;
+}
+
+void kdb_gdb_state_pass(char *buf)
+{
+ gdbstub_state(kdb_ks, buf);
+}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000..6697a3d87
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,874 @@
+/*
+ * Kernel Debugger Architecture Independent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/console.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/kallsyms.h>
+#include "kdb_private.h"
+
+#define CMD_BUFLEN 256
+char kdb_prompt_str[CMD_BUFLEN];
+
+int kdb_trap_printk;
+
+static int kgdb_transition_check(char *buffer)
+{
+ if (buffer[0] != '+' && buffer[0] != '$') {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ } else {
+ int slen = strlen(buffer);
+ if (slen > 3 && buffer[slen - 3] == '#') {
+ kdb_gdb_state_pass(buffer);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int kdb_read_get_key(char *buffer, size_t bufsize)
+{
+#define ESCAPE_UDELAY 1000
+#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
+ char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
+ char *ped = escape_data;
+ int escape_delay = 0;
+ get_char_func *f, *f_escape = NULL;
+ int key;
+
+ for (f = &kdb_poll_funcs[0]; ; ++f) {
+ if (*f == NULL) {
+ /* Reset NMI watchdog once per poll loop */
+ touch_nmi_watchdog();
+ f = &kdb_poll_funcs[0];
+ }
+ if (escape_delay == 2) {
+ *ped = '\0';
+ ped = escape_data;
+ --escape_delay;
+ }
+ if (escape_delay == 1) {
+ key = *ped++;
+ if (!*ped)
+ --escape_delay;
+ break;
+ }
+ key = (*f)();
+ if (key == -1) {
+ if (escape_delay) {
+ udelay(ESCAPE_UDELAY);
+ --escape_delay;
+ }
+ continue;
+ }
+ if (bufsize <= 2) {
+ if (key == '\r')
+ key = '\n';
+ *buffer++ = key;
+ *buffer = '\0';
+ return -1;
+ }
+ if (escape_delay == 0 && key == '\e') {
+ escape_delay = ESCAPE_DELAY;
+ ped = escape_data;
+ f_escape = f;
+ }
+ if (escape_delay) {
+ *ped++ = key;
+ if (f_escape != f) {
+ escape_delay = 2;
+ continue;
+ }
+ if (ped - escape_data == 1) {
+ /* \e */
+ continue;
+ } else if (ped - escape_data == 2) {
+ /* \e<something> */
+ if (key != '[')
+ escape_delay = 2;
+ continue;
+ } else if (ped - escape_data == 3) {
+ /* \e[<something> */
+ int mapkey = 0;
+ switch (key) {
+ case 'A': /* \e[A, up arrow */
+ mapkey = 16;
+ break;
+ case 'B': /* \e[B, down arrow */
+ mapkey = 14;
+ break;
+ case 'C': /* \e[C, right arrow */
+ mapkey = 6;
+ break;
+ case 'D': /* \e[D, left arrow */
+ mapkey = 2;
+ break;
+ case '1': /* dropthrough */
+ case '3': /* dropthrough */
+ /* \e[<1,3,4>], may be home, del, end */
+ case '4':
+ mapkey = -1;
+ break;
+ }
+ if (mapkey != -1) {
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ }
+ continue;
+ } else if (ped - escape_data == 4) {
+ /* \e[<1,3,4><something> */
+ int mapkey = 0;
+ if (key == '~') {
+ switch (escape_data[2]) {
+ case '1': /* \e[1~, home */
+ mapkey = 1;
+ break;
+ case '3': /* \e[3~, del */
+ mapkey = 4;
+ break;
+ case '4': /* \e[4~, end */
+ mapkey = 5;
+ break;
+ }
+ }
+ if (mapkey > 0) {
+ escape_data[0] = mapkey;
+ escape_data[1] = '\0';
+ }
+ escape_delay = 2;
+ continue;
+ }
+ }
+ break; /* A key to process */
+ }
+ return key;
+}
+
+/*
+ * kdb_read
+ *
+ * This function reads a string of characters, terminated by
+ * a newline, or by reaching the end of the supplied buffer,
+ * from the current kernel debugger console device.
+ * Parameters:
+ * buffer - Address of character buffer to receive input characters.
+ * bufsize - size, in bytes, of the character buffer
+ * Returns:
+ * Returns a pointer to the buffer containing the received
+ * character string. This string will be terminated by a
+ * newline character.
+ * Locking:
+ * No locks are required to be held upon entry to this
+ * function. It is not reentrant - it relies on the fact
+ * that while kdb is running on only one "master debug" cpu.
+ * Remarks:
+ *
+ * The buffer size must be >= 2. A buffer size of 2 means that the caller only
+ * wants a single key.
+ *
+ * An escape key could be the start of a vt100 control sequence such as \e[D
+ * (left arrow) or it could be a character in its own right. The standard
+ * method for detecting the difference is to wait for 2 seconds to see if there
+ * are any other characters. kdb is complicated by the lack of a timer service
+ * (interrupts are off), by multiple input sources and by the need to sometimes
+ * return after just one key. Escape sequence processing has to be done as
+ * states in the polling loop.
+ */
+
+static char *kdb_read(char *buffer, size_t bufsize)
+{
+ char *cp = buffer;
+ char *bufend = buffer+bufsize-2; /* Reserve space for newline
+ * and null byte */
+ char *lastchar;
+ char *p_tmp;
+ char tmp;
+ static char tmpbuffer[CMD_BUFLEN];
+ int len = strlen(buffer);
+ int len_tmp;
+ int tab = 0;
+ int count;
+ int i;
+ int diag, dtab_count;
+ int key;
+
+
+ diag = kdbgetintenv("DTABCOUNT", &dtab_count);
+ if (diag)
+ dtab_count = 30;
+
+ if (len > 0) {
+ cp += len;
+ if (*(buffer+len-1) == '\n')
+ cp--;
+ }
+
+ lastchar = cp;
+ *cp = '\0';
+ kdb_printf("%s", buffer);
+poll_again:
+ key = kdb_read_get_key(buffer, bufsize);
+ if (key == -1)
+ return buffer;
+ if (key != 9)
+ tab = 0;
+ switch (key) {
+ case 8: /* backspace */
+ if (cp > buffer) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp-1, tmpbuffer, lastchar - cp);
+ }
+ *(--lastchar) = '\0';
+ --cp;
+ kdb_printf("\b%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 13: /* enter */
+ *lastchar++ = '\n';
+ *lastchar++ = '\0';
+ if (!KDB_STATE(KGDB_TRANS)) {
+ KDB_STATE_SET(KGDB_TRANS);
+ kdb_printf("%s", buffer);
+ }
+ kdb_printf("\n");
+ return buffer;
+ case 4: /* Del */
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
+ memcpy(cp, tmpbuffer, lastchar - cp - 1);
+ *(--lastchar) = '\0';
+ kdb_printf("%s \r", cp);
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ }
+ break;
+ case 1: /* Home */
+ if (cp > buffer) {
+ kdb_printf("\r");
+ kdb_printf(kdb_prompt_str);
+ cp = buffer;
+ }
+ break;
+ case 5: /* End */
+ if (cp < lastchar) {
+ kdb_printf("%s", cp);
+ cp = lastchar;
+ }
+ break;
+ case 2: /* Left */
+ if (cp > buffer) {
+ kdb_printf("\b");
+ --cp;
+ }
+ break;
+ case 14: /* Down */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 6: /* Right */
+ if (cp < lastchar) {
+ kdb_printf("%c", *cp);
+ ++cp;
+ }
+ break;
+ case 16: /* Up */
+ memset(tmpbuffer, ' ',
+ strlen(kdb_prompt_str) + (lastchar-buffer));
+ *(tmpbuffer+strlen(kdb_prompt_str) +
+ (lastchar-buffer)) = '\0';
+ kdb_printf("\r%s\r", tmpbuffer);
+ *lastchar = (char)key;
+ *(lastchar+1) = '\0';
+ return lastchar;
+ case 9: /* Tab */
+ if (tab < 2)
+ ++tab;
+ p_tmp = buffer;
+ while (*p_tmp == ' ')
+ p_tmp++;
+ if (p_tmp > cp)
+ break;
+ memcpy(tmpbuffer, p_tmp, cp-p_tmp);
+ *(tmpbuffer + (cp-p_tmp)) = '\0';
+ p_tmp = strrchr(tmpbuffer, ' ');
+ if (p_tmp)
+ ++p_tmp;
+ else
+ p_tmp = tmpbuffer;
+ len = strlen(p_tmp);
+ count = kallsyms_symbol_complete(p_tmp,
+ sizeof(tmpbuffer) -
+ (p_tmp - tmpbuffer));
+ if (tab == 2 && count > 0) {
+ kdb_printf("\n%d symbols are found.", count);
+ if (count > dtab_count) {
+ count = dtab_count;
+ kdb_printf(" But only first %d symbols will"
+ " be printed.\nYou can change the"
+ " environment variable DTABCOUNT.",
+ count);
+ }
+ kdb_printf("\n");
+ for (i = 0; i < count; i++) {
+ if (kallsyms_symbol_next(p_tmp, i) < 0)
+ break;
+ kdb_printf("%s ", p_tmp);
+ *(p_tmp + len) = '\0';
+ }
+ if (i >= dtab_count)
+ kdb_printf("...");
+ kdb_printf("\n");
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ } else if (tab != 2 && count > 0) {
+ len_tmp = strlen(p_tmp);
+ strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
+ len_tmp = strlen(p_tmp);
+ strncpy(cp, p_tmp+len, len_tmp-len + 1);
+ len = len_tmp - len;
+ kdb_printf("%s", cp);
+ cp += len;
+ lastchar += len;
+ }
+ kdb_nextline = 1; /* reset output line number */
+ break;
+ default:
+ if (key >= 32 && lastchar < bufend) {
+ if (cp < lastchar) {
+ memcpy(tmpbuffer, cp, lastchar - cp);
+ memcpy(cp+1, tmpbuffer, lastchar - cp);
+ *++lastchar = '\0';
+ *cp = key;
+ kdb_printf("%s\r", cp);
+ ++cp;
+ tmp = *cp;
+ *cp = '\0';
+ kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", buffer);
+ *cp = tmp;
+ } else {
+ *++lastchar = '\0';
+ *cp++ = key;
+ /* The kgdb transition check will hide
+ * printed characters if we think that
+ * kgdb is connecting, until the check
+ * fails */
+ if (!KDB_STATE(KGDB_TRANS)) {
+ if (kgdb_transition_check(buffer))
+ return buffer;
+ } else {
+ kdb_printf("%c", key);
+ }
+ }
+ /* Special escape to kgdb */
+ if (lastchar - buffer >= 5 &&
+ strcmp(lastchar - 5, "$?#3f") == 0) {
+ kdb_gdb_state_pass(lastchar - 5);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return buffer;
+ }
+ if (lastchar - buffer >= 11 &&
+ strcmp(lastchar - 11, "$qSupported") == 0) {
+ kdb_gdb_state_pass(lastchar - 11);
+ strcpy(buffer, "kgdb");
+ KDB_STATE_SET(DOING_KGDB);
+ return buffer;
+ }
+ }
+ break;
+ }
+ goto poll_again;
+}
+
+/*
+ * kdb_getstr
+ *
+ * Print the prompt string and read a command from the
+ * input device.
+ *
+ * Parameters:
+ * buffer Address of buffer to receive command
+ * bufsize Size of buffer in bytes
+ * prompt Pointer to string to use as prompt string
+ * Returns:
+ * Pointer to command buffer.
+ * Locking:
+ * None.
+ * Remarks:
+ * For SMP kernels, the processor number will be
+ * substituted for %d, %x or %o in the prompt.
+ */
+
+char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
+{
+ if (prompt && kdb_prompt_str != prompt)
+ strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
+ kdb_printf(kdb_prompt_str);
+ kdb_nextline = 1; /* Prompt and input resets line number */
+ return kdb_read(buffer, bufsize);
+}
+
+/*
+ * kdb_input_flush
+ *
+ * Get rid of any buffered console input.
+ *
+ * Parameters:
+ * none
+ * Returns:
+ * nothing
+ * Locking:
+ * none
+ * Remarks:
+ * Call this function whenever you want to flush input. If there is any
+ * outstanding input, it ignores all characters until there has been no
+ * data for approximately 1ms.
+ */
+
+static void kdb_input_flush(void)
+{
+ get_char_func *f;
+ int res;
+ int flush_delay = 1;
+ while (flush_delay) {
+ flush_delay--;
+empty:
+ touch_nmi_watchdog();
+ for (f = &kdb_poll_funcs[0]; *f; ++f) {
+ res = (*f)();
+ if (res != -1) {
+ flush_delay = 1;
+ goto empty;
+ }
+ }
+ if (flush_delay)
+ mdelay(1);
+ }
+}
+
+/*
+ * kdb_printf
+ *
+ * Print a string to the output device(s).
+ *
+ * Parameters:
+ * printf-like format and optional args.
+ * Returns:
+ * 0
+ * Locking:
+ * None.
+ * Remarks:
+ * use 'kdbcons->write()' to avoid polluting 'log_buf' with
+ * kdb output.
+ *
+ * If the user is doing a cmd args | grep srch
+ * then kdb_grepping_flag is set.
+ * In that case we need to accumulate full lines (ending in \n) before
+ * searching for the pattern.
+ */
+
+static char kdb_buffer[256]; /* A bit too big to go on stack */
+static char *next_avail = kdb_buffer;
+static int size_avail;
+static int suspend_grep;
+
+/*
+ * search arg1 to see if it contains arg2
+ * (kdmain.c provides flags for ^pat and pat$)
+ *
+ * return 1 for found, 0 for not found
+ */
+static int kdb_search_string(char *searched, char *searchfor)
+{
+ char firstchar, *cp;
+ int len1, len2;
+
+ /* not counting the newline at the end of "searched" */
+ len1 = strlen(searched)-1;
+ len2 = strlen(searchfor);
+ if (len1 < len2)
+ return 0;
+ if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
+ return 0;
+ if (kdb_grep_leading) {
+ if (!strncmp(searched, searchfor, len2))
+ return 1;
+ } else if (kdb_grep_trailing) {
+ if (!strncmp(searched+len1-len2, searchfor, len2))
+ return 1;
+ } else {
+ firstchar = *searchfor;
+ cp = searched;
+ while ((cp = strchr(cp, firstchar))) {
+ if (!strncmp(cp, searchfor, len2))
+ return 1;
+ cp++;
+ }
+ }
+ return 0;
+}
+
+int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
+{
+ int diag;
+ int linecount;
+ int colcount;
+ int logging, saved_loglevel = 0;
+ int saved_trap_printk;
+ int got_printf_lock = 0;
+ int retlen = 0;
+ int fnd, len;
+ char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
+ char *moreprompt = "more> ";
+ struct console *c = console_drivers;
+ static DEFINE_SPINLOCK(kdb_printf_lock);
+ unsigned long uninitialized_var(flags);
+
+ preempt_disable();
+ saved_trap_printk = kdb_trap_printk;
+ kdb_trap_printk = 0;
+
+ /* Serialize kdb_printf if multiple cpus try to write at once.
+ * But if any cpu goes recursive in kdb, just print the output,
+ * even if it is interleaved with any other text.
+ */
+ if (!KDB_STATE(PRINTF_LOCK)) {
+ KDB_STATE_SET(PRINTF_LOCK);
+ spin_lock_irqsave(&kdb_printf_lock, flags);
+ got_printf_lock = 1;
+ atomic_inc(&kdb_event);
+ } else {
+ __acquire(kdb_printf_lock);
+ }
+
+ diag = kdbgetintenv("LINES", &linecount);
+ if (diag || linecount <= 1)
+ linecount = 24;
+
+ diag = kdbgetintenv("COLUMNS", &colcount);
+ if (diag || colcount <= 1)
+ colcount = 80;
+
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (diag)
+ logging = 0;
+
+ if (!kdb_grepping_flag || suspend_grep) {
+ /* normally, every vsnprintf starts a new buffer */
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ }
+ vsnprintf(next_avail, size_avail, fmt, ap);
+
+ /*
+ * If kdb_parse() found that the command was cmd xxx | grep yyy
+ * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
+ *
+ * Accumulate the print data up to a newline before searching it.
+ * (vsnprintf does null-terminate the string that it generates)
+ */
+
+ /* skip the search if prints are temporarily unconditional */
+ if (!suspend_grep && kdb_grepping_flag) {
+ cp = strchr(kdb_buffer, '\n');
+ if (!cp) {
+ /*
+ * Special cases that don't end with newlines
+ * but should be written without one:
+ * The "[nn]kdb> " prompt should
+ * appear at the front of the buffer.
+ *
+ * The "[nn]more " prompt should also be
+ * (MOREPROMPT -> moreprompt)
+ * written * but we print that ourselves,
+ * we set the suspend_grep flag to make
+ * it unconditional.
+ *
+ */
+ if (next_avail == kdb_buffer) {
+ /*
+ * these should occur after a newline,
+ * so they will be at the front of the
+ * buffer
+ */
+ cp2 = kdb_buffer;
+ len = strlen(kdb_prompt_str);
+ if (!strncmp(cp2, kdb_prompt_str, len)) {
+ /*
+ * We're about to start a new
+ * command, so we can go back
+ * to normal mode.
+ */
+ kdb_grepping_flag = 0;
+ goto kdb_printit;
+ }
+ }
+ /* no newline; don't search/write the buffer
+ until one is there */
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+
+ /*
+ * The newline is present; print through it or discard
+ * it, depending on the results of the search.
+ */
+ cp++; /* to byte after the newline */
+ replaced_byte = *cp; /* remember what/where it was */
+ cphold = cp;
+ *cp = '\0'; /* end the string for our search */
+
+ /*
+ * We now have a newline at the end of the string
+ * Only continue with this output if it contains the
+ * search string.
+ */
+ fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
+ if (!fnd) {
+ /*
+ * At this point the complete line at the start
+ * of kdb_buffer can be discarded, as it does
+ * not contain what the user is looking for.
+ * Shift the buffer left.
+ */
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ goto kdb_print_out;
+ }
+ if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+ /*
+ * This was a interactive search (using '/' at more
+ * prompt) and it has completed. Clear the flag.
+ */
+ kdb_grepping_flag = 0;
+ /*
+ * at this point the string is a full line and
+ * should be printed, up to the null.
+ */
+ }
+kdb_printit:
+
+ /*
+ * Write to all consoles.
+ */
+ retlen = strlen(kdb_buffer);
+ cp = (char *) printk_skip_level(kdb_buffer);
+ if (!dbg_kdb_mode && kgdb_connected) {
+ gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
+ } else {
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = retlen - (cp - kdb_buffer);
+ cp2 = cp;
+ while (len--) {
+ dbg_io_ops->write_char(*cp2);
+ cp2++;
+ }
+ }
+ while (c) {
+ c->write(c, cp, retlen - (cp - kdb_buffer), 7); /* 7 == KERN_DEBUG */
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+ }
+ if (logging) {
+ saved_loglevel = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_SILENT;
+ if (printk_get_level(kdb_buffer) || src == KDB_MSGSRC_PRINTK)
+ printk("%s", kdb_buffer);
+ else
+ pr_info("%s", kdb_buffer);
+ }
+
+ if (KDB_STATE(PAGER)) {
+ /*
+ * Check printed string to decide how to bump the
+ * kdb_nextline to control when the more prompt should
+ * show up.
+ */
+ int got = 0;
+ len = retlen;
+ while (len--) {
+ if (kdb_buffer[len] == '\n') {
+ kdb_nextline++;
+ got = 0;
+ } else if (kdb_buffer[len] == '\r') {
+ got = 0;
+ } else {
+ got++;
+ }
+ }
+ kdb_nextline += got / (colcount + 1);
+ }
+
+ /* check for having reached the LINES number of printed lines */
+ if (kdb_nextline >= linecount) {
+ char buf1[16] = "";
+
+ /* Watch out for recursion here. Any routine that calls
+ * kdb_printf will come back through here. And kdb_read
+ * uses kdb_printf to echo on serial consoles ...
+ */
+ kdb_nextline = 1; /* In case of recursion */
+
+ /*
+ * Pause until cr.
+ */
+ moreprompt = kdbgetenv("MOREPROMPT");
+ if (moreprompt == NULL)
+ moreprompt = "more> ";
+
+ kdb_input_flush();
+ c = console_drivers;
+
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
+ len = strlen(moreprompt);
+ cp = moreprompt;
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+ while (c) {
+ c->write(c, moreprompt, strlen(moreprompt), 7); /* 7 == KERN_DEBUG */
+ touch_nmi_watchdog();
+ c = c->next;
+ }
+
+ if (logging)
+ printk("%s", moreprompt);
+
+ kdb_read(buf1, 2); /* '2' indicates to return
+ * immediately after getting one key. */
+ kdb_nextline = 1; /* Really set output line 1 */
+
+ /* empty and reset the buffer: */
+ kdb_buffer[0] = '\0';
+ next_avail = kdb_buffer;
+ size_avail = sizeof(kdb_buffer);
+ if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
+ /* user hit q or Q */
+ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
+ KDB_STATE_CLEAR(PAGER);
+ /* end of command output; back to normal mode */
+ kdb_grepping_flag = 0;
+ kdb_printf("\n");
+ } else if (buf1[0] == ' ') {
+ kdb_printf("\r");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '\n') {
+ kdb_nextline = linecount - 1;
+ kdb_printf("\r");
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] == '/' && !kdb_grepping_flag) {
+ kdb_printf("\r");
+ kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
+ kdbgetenv("SEARCHPROMPT") ?: "search> ");
+ *strchrnul(kdb_grep_string, '\n') = '\0';
+ kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
+ suspend_grep = 1; /* for this recursion */
+ } else if (buf1[0] && buf1[0] != '\n') {
+ /* user hit something other than enter */
+ suspend_grep = 1; /* for this recursion */
+ if (buf1[0] != '/')
+ kdb_printf(
+ "\nOnly 'q', 'Q' or '/' are processed at "
+ "more prompt, input ignored\n");
+ else
+ kdb_printf("\n'/' cannot be used during | "
+ "grep filtering, input ignored\n");
+ } else if (kdb_grepping_flag) {
+ /* user hit enter */
+ suspend_grep = 1; /* for this recursion */
+ kdb_printf("\n");
+ }
+ kdb_input_flush();
+ }
+
+ /*
+ * For grep searches, shift the printed string left.
+ * replaced_byte contains the character that was overwritten with
+ * the terminating null, and cphold points to the null.
+ * Then adjust the notion of available space in the buffer.
+ */
+ if (kdb_grepping_flag && !suspend_grep) {
+ *cphold = replaced_byte;
+ strcpy(kdb_buffer, cphold);
+ len = strlen(kdb_buffer);
+ next_avail = kdb_buffer + len;
+ size_avail = sizeof(kdb_buffer) - len;
+ }
+
+kdb_print_out:
+ suspend_grep = 0; /* end of what may have been a recursive call */
+ if (logging)
+ console_loglevel = saved_loglevel;
+ if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
+ got_printf_lock = 0;
+ spin_unlock_irqrestore(&kdb_printf_lock, flags);
+ KDB_STATE_CLEAR(PRINTF_LOCK);
+ atomic_dec(&kdb_event);
+ } else {
+ __release(kdb_printf_lock);
+ }
+ kdb_trap_printk = saved_trap_printk;
+ preempt_enable();
+ return retlen;
+}
+
+int kdb_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int r;
+
+ va_start(ap, fmt);
+ r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
+ va_end(ap);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000..118527aa6
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,263 @@
+/*
+ * Kernel Debugger Architecture Dependent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kdb.h>
+#include <linux/keyboard.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/io.h>
+
+/* Keyboard Controller Registers on normal PCs. */
+
+#define KBD_STATUS_REG 0x64 /* Status register (R) */
+#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
+
+/* Status Register Bits */
+
+#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
+#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
+
+static int kbd_exists;
+static int kbd_last_ret;
+
+/*
+ * Check if the keyboard controller has a keypress for us.
+ * Some parts (Enter Release, LED change) are still blocking polled here,
+ * but hopefully they are all short.
+ */
+int kdb_get_kbd_char(void)
+{
+ int scancode, scanstatus;
+ static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
+ static int shift_key; /* Shift next keypress */
+ static int ctrl_key;
+ u_short keychar;
+
+ if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
+ (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
+ kbd_exists = 0;
+ return -1;
+ }
+ kbd_exists = 1;
+
+ if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ return -1;
+
+ /*
+ * Fetch the scancode
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ /*
+ * Ignore mouse events.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ return -1;
+
+ /*
+ * Ignore release, trigger on make
+ * (except for shift keys, where we want to
+ * keep the shift state so long as the key is
+ * held down).
+ */
+
+ if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
+ /*
+ * Next key may use shift table
+ */
+ if ((scancode & 0x80) == 0)
+ shift_key = 1;
+ else
+ shift_key = 0;
+ return -1;
+ }
+
+ if ((scancode&0x7f) == 0x1d) {
+ /*
+ * Left ctrl key
+ */
+ if ((scancode & 0x80) == 0)
+ ctrl_key = 1;
+ else
+ ctrl_key = 0;
+ return -1;
+ }
+
+ if ((scancode & 0x80) != 0) {
+ if (scancode == 0x9c)
+ kbd_last_ret = 0;
+ return -1;
+ }
+
+ scancode &= 0x7f;
+
+ /*
+ * Translate scancode
+ */
+
+ if (scancode == 0x3a) {
+ /*
+ * Toggle caps lock
+ */
+ shift_lock ^= 1;
+
+#ifdef KDB_BLINK_LED
+ kdb_toggleled(0x4);
+#endif
+ return -1;
+ }
+
+ if (scancode == 0x0e) {
+ /*
+ * Backspace
+ */
+ return 8;
+ }
+
+ /* Special Key */
+ switch (scancode) {
+ case 0xF: /* Tab */
+ return 9;
+ case 0x53: /* Del */
+ return 4;
+ case 0x47: /* Home */
+ return 1;
+ case 0x4F: /* End */
+ return 5;
+ case 0x4B: /* Left */
+ return 2;
+ case 0x48: /* Up */
+ return 16;
+ case 0x50: /* Down */
+ return 14;
+ case 0x4D: /* Right */
+ return 6;
+ }
+
+ if (scancode == 0xe0)
+ return -1;
+
+ /*
+ * For Japanese 86/106 keyboards
+ * See comment in drivers/char/pc_keyb.c.
+ * - Masahiro Adegawa
+ */
+ if (scancode == 0x73)
+ scancode = 0x59;
+ else if (scancode == 0x7d)
+ scancode = 0x7c;
+
+ if (!shift_lock && !shift_key && !ctrl_key) {
+ keychar = plain_map[scancode];
+ } else if ((shift_lock || shift_key) && key_maps[1]) {
+ keychar = key_maps[1][scancode];
+ } else if (ctrl_key && key_maps[4]) {
+ keychar = key_maps[4][scancode];
+ } else {
+ keychar = 0x0020;
+ kdb_printf("Unknown state/scancode (%d)\n", scancode);
+ }
+ keychar &= 0x0fff;
+ if (keychar == '\t')
+ keychar = ' ';
+ switch (KTYP(keychar)) {
+ case KT_LETTER:
+ case KT_LATIN:
+ if (isprint(keychar))
+ break; /* printable characters */
+ /* drop through */
+ case KT_SPEC:
+ if (keychar == K_ENTER)
+ break;
+ /* drop through */
+ default:
+ return -1; /* ignore unprintables */
+ }
+
+ if (scancode == 0x1c) {
+ kbd_last_ret = 1;
+ return 13;
+ }
+
+ return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+ int scancode, scanstatus;
+
+ /*
+ * Nothing to clean up, since either
+ * ENTER was never pressed, or has already
+ * gotten cleaned up.
+ */
+ if (!kbd_last_ret)
+ return;
+
+ kbd_last_ret = 0;
+ /*
+ * Enter key. Need to absorb the break code here, lest it gets
+ * leaked out if we exit KDB as the result of processing 'g'.
+ *
+ * This has several interesting implications:
+ * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+ * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
+ * only get a break code at the end of the repeated
+ * sequence. This means we can't propagate the repeated key
+ * press, and must swallow it away.
+ * + Need to handle possible PS/2 mouse input.
+ * + Need to handle mashed keys.
+ */
+
+ while (1) {
+ while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+ cpu_relax();
+
+ /*
+ * Fetch the scancode.
+ */
+ scancode = inb(KBD_DATA_REG);
+ scanstatus = inb(KBD_STATUS_REG);
+
+ /*
+ * Skip mouse input.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ continue;
+
+ /*
+ * If we see 0xe0, this is either a break code for KP
+ * ENTER, or a repeat make for KP ENTER. Either way,
+ * since the second byte is equivalent to an ENTER,
+ * skip the 0xe0 and try again.
+ *
+ * If we see 0x1c, this must be a repeat ENTER or KP
+ * ENTER (and we swallowed 0xe0 before). Try again.
+ *
+ * We can also see make and break codes for other keys
+ * mashed before or after pressing ENTER. Thus, if we
+ * see anything other than 0x9c, we have to try again.
+ *
+ * Note, if you held some key as ENTER was depressed,
+ * that break code would get leaked out.
+ */
+ if (scancode != 0x9c)
+ continue;
+
+ return;
+ }
+}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000..412134549
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2954 @@
+/*
+ * Kernel Debugger Architecture Independent Main Code
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
+ * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/ctype.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/smp.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/nmi.h>
+#include <linux/time.h>
+#include <linux/ptrace.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/kdebug.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "kdb."
+
+static int kdb_cmd_enabled = CONFIG_KDB_DEFAULT_ENABLE;
+module_param_named(cmd_enable, kdb_cmd_enabled, int, 0600);
+
+char kdb_grep_string[KDB_GREP_STRLEN];
+int kdb_grepping_flag;
+EXPORT_SYMBOL(kdb_grepping_flag);
+int kdb_grep_leading;
+int kdb_grep_trailing;
+
+/*
+ * Kernel debugger state flags
+ */
+int kdb_flags;
+atomic_t kdb_event;
+
+/*
+ * kdb_lock protects updates to kdb_initial_cpu. Used to
+ * single thread processors through the kernel debugger.
+ */
+int kdb_initial_cpu = -1; /* cpu number that owns kdb */
+int kdb_nextline = 1;
+int kdb_state; /* General KDB state */
+
+struct task_struct *kdb_current_task;
+EXPORT_SYMBOL(kdb_current_task);
+struct pt_regs *kdb_current_regs;
+
+const char *kdb_diemsg;
+static int kdb_go_count;
+#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
+static unsigned int kdb_continue_catastrophic =
+ CONFIG_KDB_CONTINUE_CATASTROPHIC;
+#else
+static unsigned int kdb_continue_catastrophic;
+#endif
+
+/* kdb_commands describes the available commands. */
+static kdbtab_t *kdb_commands;
+#define KDB_BASE_CMD_MAX 50
+static int kdb_max_commands = KDB_BASE_CMD_MAX;
+static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
+#define for_each_kdbcmd(cmd, num) \
+ for ((cmd) = kdb_base_commands, (num) = 0; \
+ num < kdb_max_commands; \
+ num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
+
+typedef struct _kdbmsg {
+ int km_diag; /* kdb diagnostic */
+ char *km_msg; /* Corresponding message text */
+} kdbmsg_t;
+
+#define KDBMSG(msgnum, text) \
+ { KDB_##msgnum, text }
+
+static kdbmsg_t kdbmsgs[] = {
+ KDBMSG(NOTFOUND, "Command Not Found"),
+ KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
+ KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
+ "8 is only allowed on 64 bit systems"),
+ KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
+ KDBMSG(NOTENV, "Cannot find environment variable"),
+ KDBMSG(NOENVVALUE, "Environment variable should have value"),
+ KDBMSG(NOTIMP, "Command not implemented"),
+ KDBMSG(ENVFULL, "Environment full"),
+ KDBMSG(ENVBUFFULL, "Environment buffer full"),
+ KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
+#ifdef CONFIG_CPU_XSCALE
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
+#else
+ KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
+#endif
+ KDBMSG(DUPBPT, "Duplicate breakpoint address"),
+ KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
+ KDBMSG(BADMODE, "Invalid IDMODE"),
+ KDBMSG(BADINT, "Illegal numeric value"),
+ KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
+ KDBMSG(BADREG, "Invalid register name"),
+ KDBMSG(BADCPUNUM, "Invalid cpu number"),
+ KDBMSG(BADLENGTH, "Invalid length field"),
+ KDBMSG(NOBP, "No Breakpoint exists"),
+ KDBMSG(BADADDR, "Invalid address"),
+ KDBMSG(NOPERM, "Permission denied"),
+};
+#undef KDBMSG
+
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
+
+
+/*
+ * Initial environment. This is all kept static and local to
+ * this file. We don't want to rely on the memory allocation
+ * mechanisms in the kernel, so we use a very limited allocate-only
+ * heap for new and altered environment variables. The entire
+ * environment is limited to a fixed number of entries (add more
+ * to __env[] if required) and a fixed amount of heap (add more to
+ * KDB_ENVBUFSIZE if required).
+ */
+
+static char *__env[] = {
+#if defined(CONFIG_SMP)
+ "PROMPT=[%d]kdb> ",
+#else
+ "PROMPT=kdb> ",
+#endif
+ "MOREPROMPT=more> ",
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+};
+
+static const int __nenv = ARRAY_SIZE(__env);
+
+struct task_struct *kdb_curr_task(int cpu)
+{
+ struct task_struct *p = curr_task(cpu);
+#ifdef _TIF_MCA_INIT
+ if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
+ p = krp->p;
+#endif
+ return p;
+}
+
+/*
+ * Check whether the flags of the current command and the permissions
+ * of the kdb console has allow a command to be run.
+ */
+static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+ bool no_args)
+{
+ /* permissions comes from userspace so needs massaging slightly */
+ permissions &= KDB_ENABLE_MASK;
+ permissions |= KDB_ENABLE_ALWAYS_SAFE;
+
+ /* some commands change group when launched with no arguments */
+ if (no_args)
+ permissions |= permissions << KDB_ENABLE_NO_ARGS_SHIFT;
+
+ flags |= KDB_ENABLE_ALL;
+
+ return permissions & flags;
+}
+
+/*
+ * kdbgetenv - This function will return the character string value of
+ * an environment variable.
+ * Parameters:
+ * match A character string representing an environment variable.
+ * Returns:
+ * NULL No environment variable matches 'match'
+ * char* Pointer to string value of environment variable.
+ */
+char *kdbgetenv(const char *match)
+{
+ char **ep = __env;
+ int matchlen = strlen(match);
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ char *e = *ep++;
+
+ if (!e)
+ continue;
+
+ if ((strncmp(match, e, matchlen) == 0)
+ && ((e[matchlen] == '\0')
+ || (e[matchlen] == '='))) {
+ char *cp = strchr(e, '=');
+ return cp ? ++cp : "";
+ }
+ }
+ return NULL;
+}
+
+/*
+ * kdballocenv - This function is used to allocate bytes for
+ * environment entries.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long representation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ * Remarks:
+ * We use a static environment buffer (envbuffer) to hold the values
+ * of dynamically generated environment variables (see kdb_set). Buffer
+ * space once allocated is never free'd, so over time, the amount of space
+ * (currently 512 bytes) will be exhausted if env variables are changed
+ * frequently.
+ */
+static char *kdballocenv(size_t bytes)
+{
+#define KDB_ENVBUFSIZE 512
+ static char envbuffer[KDB_ENVBUFSIZE];
+ static int envbufsize;
+ char *ep = NULL;
+
+ if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
+ ep = &envbuffer[envbufsize];
+ envbufsize += bytes;
+ }
+ return ep;
+}
+
+/*
+ * kdbgetulenv - This function will return the value of an unsigned
+ * long-valued environment variable.
+ * Parameters:
+ * match A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of the env variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+static int kdbgetulenv(const char *match, unsigned long *value)
+{
+ char *ep;
+
+ ep = kdbgetenv(match);
+ if (!ep)
+ return KDB_NOTENV;
+ if (strlen(ep) == 0)
+ return KDB_NOENVVALUE;
+
+ *value = simple_strtoul(ep, NULL, 0);
+
+ return 0;
+}
+
+/*
+ * kdbgetintenv - This function will return the value of an
+ * integer-valued environment variable.
+ * Parameters:
+ * match A character string representing an integer-valued env variable
+ * Outputs:
+ * *value the integer representation of the environment variable 'match'
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetintenv(const char *match, int *value)
+{
+ unsigned long val;
+ int diag;
+
+ diag = kdbgetulenv(match, &val);
+ if (!diag)
+ *value = (int) val;
+ return diag;
+}
+
+/*
+ * kdbgetularg - This function will convert a numeric string into an
+ * unsigned long value.
+ * Parameters:
+ * arg A character string representing a numeric value
+ * Outputs:
+ * *value the unsigned long represntation of arg.
+ * Returns:
+ * Zero on success, a kdb diagnostic on failure.
+ */
+int kdbgetularg(const char *arg, unsigned long *value)
+{
+ char *endp;
+ unsigned long val;
+
+ val = simple_strtoul(arg, &endp, 0);
+
+ if (endp == arg) {
+ /*
+ * Also try base 16, for us folks too lazy to type the
+ * leading 0x...
+ */
+ val = simple_strtoul(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+ char *endp;
+ u64 val;
+
+ val = simple_strtoull(arg, &endp, 0);
+
+ if (endp == arg) {
+
+ val = simple_strtoull(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
+/*
+ * kdb_set - This function implements the 'set' command. Alter an
+ * existing environment variable or create a new one.
+ */
+int kdb_set(int argc, const char **argv)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ /*
+ * we can be invoked two ways:
+ * set var=value argv[1]="var", argv[2]="value"
+ * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
+ * - if the latter, shift 'em down.
+ */
+ if (argc == 3) {
+ argv[2] = argv[3];
+ argc--;
+ }
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ /*
+ * Check for internal variables
+ */
+ if (strcmp(argv[1], "KDBDEBUG") == 0) {
+ unsigned int debugflags;
+ char *cp;
+
+ debugflags = simple_strtoul(argv[2], &cp, 0);
+ if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+ kdb_printf("kdb: illegal debug flags '%s'\n",
+ argv[2]);
+ return 0;
+ }
+ kdb_flags = (kdb_flags &
+ ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+ | (debugflags << KDB_DEBUG_FLAG_SHIFT);
+
+ return 0;
+ }
+
+ /*
+ * Tokenizer squashed the '=' sign. argv[1] is variable
+ * name, argv[2] = value.
+ */
+ varlen = strlen(argv[1]);
+ vallen = strlen(argv[2]);
+ ep = kdballocenv(varlen + vallen + 2);
+ if (ep == (char *)0)
+ return KDB_ENVBUFFULL;
+
+ sprintf(ep, "%s=%s", argv[1], argv[2]);
+
+ ep[varlen+vallen+1] = '\0';
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], argv[1], varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't existing variable. Fit into slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+static int kdb_check_regs(void)
+{
+ if (!kdb_current_regs) {
+ kdb_printf("No current kdb registers."
+ " You may need to select another task\n");
+ return KDB_BADREG;
+ }
+ return 0;
+}
+
+/*
+ * kdbgetaddrarg - This function is responsible for parsing an
+ * address-expression and returning the value of the expression,
+ * symbol name, and offset to the caller.
+ *
+ * The argument may consist of a numeric value (decimal or
+ * hexidecimal), a symbol name, a register name (preceded by the
+ * percent sign), an environment variable with a numeric value
+ * (preceded by a dollar sign) or a simple arithmetic expression
+ * consisting of a symbol name, +/-, and a numeric constant value
+ * (offset).
+ * Parameters:
+ * argc - count of arguments in argv
+ * argv - argument vector
+ * *nextarg - index to next unparsed argument in argv[]
+ * regs - Register state at time of KDB entry
+ * Outputs:
+ * *value - receives the value of the address-expression
+ * *offset - receives the offset specified, if any
+ * *name - receives the symbol name, if any
+ * *nextarg - index to next unparsed argument in argv[]
+ * Returns:
+ * zero is returned on success, a kdb diagnostic code is
+ * returned on error.
+ */
+int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
+ unsigned long *value, long *offset,
+ char **name)
+{
+ unsigned long addr;
+ unsigned long off = 0;
+ int positive;
+ int diag;
+ int found = 0;
+ char *symname;
+ char symbol = '\0';
+ char *cp;
+ kdb_symtab_t symtab;
+
+ /*
+ * If the enable flags prohibit both arbitrary memory access
+ * and flow control then there are no reasonable grounds to
+ * provide symbol lookup.
+ */
+ if (!kdb_check_flags(KDB_ENABLE_MEM_READ | KDB_ENABLE_FLOW_CTRL,
+ kdb_cmd_enabled, false))
+ return KDB_NOPERM;
+
+ /*
+ * Process arguments which follow the following syntax:
+ *
+ * symbol | numeric-address [+/- numeric-offset]
+ * %register
+ * $environment-variable
+ */
+
+ if (*nextarg > argc)
+ return KDB_ARGCOUNT;
+
+ symname = (char *)argv[*nextarg];
+
+ /*
+ * If there is no whitespace between the symbol
+ * or address and the '+' or '-' symbols, we
+ * remember the character and replace it with a
+ * null so the symbol/value can be properly parsed
+ */
+ cp = strpbrk(symname, "+-");
+ if (cp != NULL) {
+ symbol = *cp;
+ *cp++ = '\0';
+ }
+
+ if (symname[0] == '$') {
+ diag = kdbgetulenv(&symname[1], &addr);
+ if (diag)
+ return diag;
+ } else if (symname[0] == '%') {
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+ /* Implement register values with % at a later time as it is
+ * arch optional.
+ */
+ return KDB_NOTIMP;
+ } else {
+ found = kdbgetsymval(symname, &symtab);
+ if (found) {
+ addr = symtab.sym_start;
+ } else {
+ diag = kdbgetularg(argv[*nextarg], &addr);
+ if (diag)
+ return diag;
+ }
+ }
+
+ if (!found)
+ found = kdbnearsym(addr, &symtab);
+
+ (*nextarg)++;
+
+ if (name)
+ *name = symname;
+ if (value)
+ *value = addr;
+ if (offset && name && *name)
+ *offset = addr - symtab.sym_start;
+
+ if ((*nextarg > argc)
+ && (symbol == '\0'))
+ return 0;
+
+ /*
+ * check for +/- and offset
+ */
+
+ if (symbol == '\0') {
+ if ((argv[*nextarg][0] != '+')
+ && (argv[*nextarg][0] != '-')) {
+ /*
+ * Not our argument. Return.
+ */
+ return 0;
+ } else {
+ positive = (argv[*nextarg][0] == '+');
+ (*nextarg)++;
+ }
+ } else
+ positive = (symbol == '+');
+
+ /*
+ * Now there must be an offset!
+ */
+ if ((*nextarg > argc)
+ && (symbol == '\0')) {
+ return KDB_INVADDRFMT;
+ }
+
+ if (!symbol) {
+ cp = (char *)argv[*nextarg];
+ (*nextarg)++;
+ }
+
+ diag = kdbgetularg(cp, &off);
+ if (diag)
+ return diag;
+
+ if (!positive)
+ off = -off;
+
+ if (offset)
+ *offset += off;
+
+ if (value)
+ *value += off;
+
+ return 0;
+}
+
+static void kdb_cmderror(int diag)
+{
+ int i;
+
+ if (diag >= 0) {
+ kdb_printf("no error detected (diagnostic is %d)\n", diag);
+ return;
+ }
+
+ for (i = 0; i < __nkdb_err; i++) {
+ if (kdbmsgs[i].km_diag == diag) {
+ kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
+ return;
+ }
+ }
+
+ kdb_printf("Unknown diag %d\n", -diag);
+}
+
+/*
+ * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
+ * command which defines one command as a set of other commands,
+ * terminated by endefcmd. kdb_defcmd processes the initial
+ * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
+ * the following commands until 'endefcmd'.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+struct defcmd_set {
+ int count;
+ int usable;
+ char *name;
+ char *usage;
+ char *help;
+ char **command;
+};
+static struct defcmd_set *defcmd_set;
+static int defcmd_set_count;
+static int defcmd_in_progress;
+
+/* Forward references */
+static int kdb_exec_defcmd(int argc, const char **argv);
+
+static int kdb_defcmd2(const char *cmdstr, const char *argv0)
+{
+ struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
+ char **save_command = s->command;
+ if (strcmp(argv0, "endefcmd") == 0) {
+ defcmd_in_progress = 0;
+ if (!s->count)
+ s->usable = 0;
+ if (s->usable)
+ /* macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is
+ * safety checked individually.
+ */
+ kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
+ s->help, 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ return 0;
+ }
+ if (!s->usable)
+ return KDB_NOTIMP;
+ s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
+ if (!s->command) {
+ kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+ cmdstr);
+ s->usable = 0;
+ return KDB_NOTIMP;
+ }
+ memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
+ s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
+ kfree(save_command);
+ return 0;
+}
+
+static int kdb_defcmd(int argc, const char **argv)
+{
+ struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ if (defcmd_in_progress) {
+ kdb_printf("kdb: nested defcmd detected, assuming missing "
+ "endefcmd\n");
+ kdb_defcmd2("endefcmd", "endefcmd");
+ }
+ if (argc == 0) {
+ int i;
+ for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
+ s->usage, s->help);
+ for (i = 0; i < s->count; ++i)
+ kdb_printf("%s", s->command[i]);
+ kdb_printf("endefcmd\n");
+ }
+ return 0;
+ }
+ if (argc != 3)
+ return KDB_ARGCOUNT;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
+ return KDB_NOTIMP;
+ }
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
+ memcpy(defcmd_set, save_defcmd_set,
+ defcmd_set_count * sizeof(*defcmd_set));
+ s = defcmd_set + defcmd_set_count;
+ memset(s, 0, sizeof(*s));
+ s->usable = 1;
+ s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
+ s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
+ s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
+ if (s->usage[0] == '"') {
+ strcpy(s->usage, argv[2]+1);
+ s->usage[strlen(s->usage)-1] = '\0';
+ }
+ if (s->help[0] == '"') {
+ strcpy(s->help, argv[3]+1);
+ s->help[strlen(s->help)-1] = '\0';
+ }
+ ++defcmd_set_count;
+ defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
+ return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
+}
+
+/*
+ * kdb_exec_defcmd - Execute the set of commands associated with this
+ * defcmd name.
+ * Inputs:
+ * argc argument count
+ * argv argument vector
+ * Returns:
+ * zero for success, a kdb diagnostic if error
+ */
+static int kdb_exec_defcmd(int argc, const char **argv)
+{
+ int i, ret;
+ struct defcmd_set *s;
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+ for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
+ if (strcmp(s->name, argv[0]) == 0)
+ break;
+ }
+ if (i == defcmd_set_count) {
+ kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
+ argv[0]);
+ return KDB_NOTIMP;
+ }
+ for (i = 0; i < s->count; ++i) {
+ /* Recursive use of kdb_parse, do not use argv after
+ * this point */
+ argv = NULL;
+ kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
+ ret = kdb_parse(s->command[i]);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Command history */
+#define KDB_CMD_HISTORY_COUNT 32
+#define CMD_BUFLEN 200 /* kdb_printf: max printline
+ * size == 256 */
+static unsigned int cmd_head, cmd_tail;
+static unsigned int cmdptr;
+static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
+static char cmd_cur[CMD_BUFLEN];
+
+/*
+ * The "str" argument may point to something like | grep xyz
+ */
+static void parse_grep(const char *str)
+{
+ int len;
+ char *cp = (char *)str, *cp2;
+
+ /* sanity check: we should have been called with the \ first */
+ if (*cp != '|')
+ return;
+ cp++;
+ while (isspace(*cp))
+ cp++;
+ if (strncmp(cp, "grep ", 5)) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ cp += 5;
+ while (isspace(*cp))
+ cp++;
+ cp2 = strchr(cp, '\n');
+ if (cp2)
+ *cp2 = '\0'; /* remove the trailing newline */
+ len = strlen(cp);
+ if (len == 0) {
+ kdb_printf("invalid 'pipe', see grephelp\n");
+ return;
+ }
+ /* now cp points to a nonzero length search string */
+ if (*cp == '"') {
+ /* allow it be "x y z" by removing the "'s - there must
+ be two of them */
+ cp++;
+ cp2 = strchr(cp, '"');
+ if (!cp2) {
+ kdb_printf("invalid quoted string, see grephelp\n");
+ return;
+ }
+ *cp2 = '\0'; /* end the string where the 2nd " was */
+ }
+ kdb_grep_leading = 0;
+ if (*cp == '^') {
+ kdb_grep_leading = 1;
+ cp++;
+ }
+ len = strlen(cp);
+ kdb_grep_trailing = 0;
+ if (*(cp+len-1) == '$') {
+ kdb_grep_trailing = 1;
+ *(cp+len-1) = '\0';
+ }
+ len = strlen(cp);
+ if (!len)
+ return;
+ if (len >= KDB_GREP_STRLEN) {
+ kdb_printf("search string too long\n");
+ return;
+ }
+ strcpy(kdb_grep_string, cp);
+ kdb_grepping_flag++;
+ return;
+}
+
+/*
+ * kdb_parse - Parse the command line, search the command table for a
+ * matching command and invoke the command function. This
+ * function may be called recursively, if it is, the second call
+ * will overwrite argv and cbuf. It is the caller's
+ * responsibility to save their argv if they recursively call
+ * kdb_parse().
+ * Parameters:
+ * cmdstr The input command line to be parsed.
+ * regs The registers at the time kdb was entered.
+ * Returns:
+ * Zero for success, a kdb diagnostic if failure.
+ * Remarks:
+ * Limited to 20 tokens.
+ *
+ * Real rudimentary tokenization. Basically only whitespace
+ * is considered a token delimeter (but special consideration
+ * is taken of the '=' sign as used by the 'set' command).
+ *
+ * The algorithm used to tokenize the input string relies on
+ * there being at least one whitespace (or otherwise useless)
+ * character between tokens as the character immediately following
+ * the token is altered in-place to a null-byte to terminate the
+ * token string.
+ */
+
+#define MAXARGC 20
+
+int kdb_parse(const char *cmdstr)
+{
+ static char *argv[MAXARGC];
+ static int argc;
+ static char cbuf[CMD_BUFLEN+2];
+ char *cp;
+ char *cpp, quoted;
+ kdbtab_t *tp;
+ int i, escaped, ignore_errors = 0, check_grep = 0;
+
+ /*
+ * First tokenize the command string.
+ */
+ cp = (char *)cmdstr;
+
+ if (KDB_FLAG(CMD_INTERRUPT)) {
+ /* Previous command was interrupted, newline must not
+ * repeat the command */
+ KDB_FLAG_CLEAR(CMD_INTERRUPT);
+ KDB_STATE_SET(PAGER);
+ argc = 0; /* no repeat */
+ }
+
+ if (*cp != '\n' && *cp != '\0') {
+ argc = 0;
+ cpp = cbuf;
+ while (*cp) {
+ /* skip whitespace */
+ while (isspace(*cp))
+ cp++;
+ if ((*cp == '\0') || (*cp == '\n') ||
+ (*cp == '#' && !defcmd_in_progress))
+ break;
+ /* special case: check for | grep pattern */
+ if (*cp == '|') {
+ check_grep++;
+ break;
+ }
+ if (cpp >= cbuf + CMD_BUFLEN) {
+ kdb_printf("kdb_parse: command buffer "
+ "overflow, command ignored\n%s\n",
+ cmdstr);
+ return KDB_NOTFOUND;
+ }
+ if (argc >= MAXARGC - 1) {
+ kdb_printf("kdb_parse: too many arguments, "
+ "command ignored\n%s\n", cmdstr);
+ return KDB_NOTFOUND;
+ }
+ argv[argc++] = cpp;
+ escaped = 0;
+ quoted = '\0';
+ /* Copy to next unquoted and unescaped
+ * whitespace or '=' */
+ while (*cp && *cp != '\n' &&
+ (escaped || quoted || !isspace(*cp))) {
+ if (cpp >= cbuf + CMD_BUFLEN)
+ break;
+ if (escaped) {
+ escaped = 0;
+ *cpp++ = *cp++;
+ continue;
+ }
+ if (*cp == '\\') {
+ escaped = 1;
+ ++cp;
+ continue;
+ }
+ if (*cp == quoted)
+ quoted = '\0';
+ else if (*cp == '\'' || *cp == '"')
+ quoted = *cp;
+ *cpp = *cp++;
+ if (*cpp == '=' && !quoted)
+ break;
+ ++cpp;
+ }
+ *cpp++ = '\0'; /* Squash a ws or '=' character */
+ }
+ }
+ if (!argc)
+ return 0;
+ if (check_grep)
+ parse_grep(cp);
+ if (defcmd_in_progress) {
+ int result = kdb_defcmd2(cmdstr, argv[0]);
+ if (!defcmd_in_progress) {
+ argc = 0; /* avoid repeat on endefcmd */
+ *(argv[0]) = '\0';
+ }
+ return result;
+ }
+ if (argv[0][0] == '-' && argv[0][1] &&
+ (argv[0][1] < '0' || argv[0][1] > '9')) {
+ ignore_errors = 1;
+ ++argv[0];
+ }
+
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+
+ if (tp->cmd_minlen
+ && (strlen(argv[0]) <= tp->cmd_minlen)) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ tp->cmd_minlen) == 0) {
+ break;
+ }
+ }
+
+ if (strcmp(argv[0], tp->cmd_name) == 0)
+ break;
+ }
+ }
+
+ /*
+ * If we don't find a command by this name, see if the first
+ * few characters of this match any of the known commands.
+ * e.g., md1c20 should match md.
+ */
+ if (i == kdb_max_commands) {
+ for_each_kdbcmd(tp, i) {
+ if (tp->cmd_name) {
+ if (strncmp(argv[0],
+ tp->cmd_name,
+ strlen(tp->cmd_name)) == 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ if (i < kdb_max_commands) {
+ int result;
+
+ if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ return KDB_NOPERM;
+
+ KDB_STATE_SET(CMD);
+ result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ if (result && ignore_errors && result > KDB_CMD_GO)
+ result = 0;
+ KDB_STATE_CLEAR(CMD);
+
+ if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ return result;
+
+ argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ if (argv[argc])
+ *(argv[argc]) = '\0';
+ return result;
+ }
+
+ /*
+ * If the input with which we were presented does not
+ * map to an existing command, attempt to parse it as an
+ * address argument and display the result. Useful for
+ * obtaining the address of a variable, or the nearest symbol
+ * to an address contained in a register.
+ */
+ {
+ unsigned long value;
+ char *name = NULL;
+ long offset;
+ int nextarg = 0;
+
+ if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
+ &value, &offset, &name)) {
+ return KDB_NOTFOUND;
+ }
+
+ kdb_printf("%s = ", argv[0]);
+ kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
+ kdb_printf("\n");
+ return 0;
+ }
+}
+
+
+static int handle_ctrl_cmd(char *cmd)
+{
+#define CTRL_P 16
+#define CTRL_N 14
+
+ /* initial situation */
+ if (cmd_head == cmd_tail)
+ return 0;
+ switch (*cmd) {
+ case CTRL_P:
+ if (cmdptr != cmd_tail)
+ cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ case CTRL_N:
+ if (cmdptr != cmd_head)
+ cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
+ strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kdb_reboot - This function implements the 'reboot' command. Reboot
+ * the system immediately, or loop for ever on failure.
+ */
+static int kdb_reboot(int argc, const char **argv)
+{
+ emergency_restart();
+ kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
+ while (1)
+ cpu_relax();
+ /* NOTREACHED */
+ return 0;
+}
+
+static void kdb_dumpregs(struct pt_regs *regs)
+{
+ int old_lvl = console_loglevel;
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+ kdb_trap_printk++;
+ show_regs(regs);
+ kdb_trap_printk--;
+ kdb_printf("\n");
+ console_loglevel = old_lvl;
+}
+
+void kdb_set_current_task(struct task_struct *p)
+{
+ kdb_current_task = p;
+
+ if (kdb_task_has_cpu(p)) {
+ kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
+ return;
+ }
+ kdb_current_regs = NULL;
+}
+
+/*
+ * kdb_local - The main code for kdb. This routine is invoked on a
+ * specific processor, it is not global. The main kdb() routine
+ * ensures that only one processor at a time is in this routine.
+ * This code is called with the real reason code on the first
+ * entry to a kdb session, thereafter it is called with reason
+ * SWITCH, even if the user goes back to the original cpu.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * regs The exception frame at time of fault/breakpoint.
+ * db_result Result code from the break or debug point.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ * KDB_CMD_GO User typed 'go'.
+ * KDB_CMD_CPU User switched to another cpu.
+ * KDB_CMD_SS Single step.
+ */
+static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
+ kdb_dbtrap_t db_result)
+{
+ char *cmdbuf;
+ int diag;
+ struct task_struct *kdb_current =
+ kdb_curr_task(raw_smp_processor_id());
+
+ KDB_DEBUG_STATE("kdb_local 1", reason);
+ kdb_go_count = 0;
+ if (reason == KDB_REASON_DEBUG) {
+ /* special case below */
+ } else {
+ kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+ kdb_current, kdb_current ? kdb_current->pid : 0);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ }
+
+ switch (reason) {
+ case KDB_REASON_DEBUG:
+ {
+ /*
+ * If re-entering kdb after a single step
+ * command, don't print the message.
+ */
+ switch (db_result) {
+ case KDB_DB_BPT:
+ kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+ kdb_current, kdb_current->pid);
+#if defined(CONFIG_SMP)
+ kdb_printf("on processor %d ", raw_smp_processor_id());
+#endif
+ kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ case KDB_DB_SS:
+ break;
+ case KDB_DB_SSBPT:
+ KDB_DEBUG_STATE("kdb_local 4", reason);
+ return 1; /* kdba_db_trap did the work */
+ default:
+ kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
+ db_result);
+ break;
+ }
+
+ }
+ break;
+ case KDB_REASON_ENTER:
+ if (KDB_STATE(KEYBOARD))
+ kdb_printf("due to Keyboard Entry\n");
+ else
+ kdb_printf("due to KDB_ENTER()\n");
+ break;
+ case KDB_REASON_KEYBOARD:
+ KDB_STATE_SET(KEYBOARD);
+ kdb_printf("due to Keyboard Entry\n");
+ break;
+ case KDB_REASON_ENTER_SLAVE:
+ /* drop through, slaves only get released via cpu switch */
+ case KDB_REASON_SWITCH:
+ kdb_printf("due to cpu switch\n");
+ break;
+ case KDB_REASON_OOPS:
+ kdb_printf("Oops: %s\n", kdb_diemsg);
+ kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ kdb_dumpregs(regs);
+ break;
+ case KDB_REASON_SYSTEM_NMI:
+ kdb_printf("due to System NonMaskable Interrupt\n");
+ break;
+ case KDB_REASON_NMI:
+ kdb_printf("due to NonMaskable Interrupt @ "
+ kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ case KDB_REASON_SSTEP:
+ case KDB_REASON_BREAK:
+ kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
+ reason == KDB_REASON_BREAK ?
+ "Breakpoint" : "SS trap", instruction_pointer(regs));
+ /*
+ * Determine if this breakpoint is one that we
+ * are interested in.
+ */
+ if (db_result != KDB_DB_BPT) {
+ kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
+ db_result);
+ KDB_DEBUG_STATE("kdb_local 6", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+ break;
+ case KDB_REASON_RECURSE:
+ kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
+ instruction_pointer(regs));
+ break;
+ default:
+ kdb_printf("kdb: unexpected reason code: %d\n", reason);
+ KDB_DEBUG_STATE("kdb_local 8", reason);
+ return 0; /* Not for us, dismiss it */
+ }
+
+ while (1) {
+ /*
+ * Initialize pager context.
+ */
+ kdb_nextline = 1;
+ KDB_STATE_CLEAR(SUPPRESS);
+ kdb_grepping_flag = 0;
+ /* ensure the old search does not leak into '/' commands */
+ kdb_grep_string[0] = '\0';
+
+ cmdbuf = cmd_cur;
+ *cmdbuf = '\0';
+ *(cmd_hist[cmd_head]) = '\0';
+
+do_full_getstr:
+#if defined(CONFIG_SMP)
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
+ raw_smp_processor_id());
+#else
+ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
+#endif
+ if (defcmd_in_progress)
+ strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
+
+ /*
+ * Fetch command from keyboard
+ */
+ cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
+ if (*cmdbuf != '\n') {
+ if (*cmdbuf < 32) {
+ if (cmdptr == cmd_head) {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ *(cmd_hist[cmd_head] +
+ strlen(cmd_hist[cmd_head])-1) = '\0';
+ }
+ if (!handle_ctrl_cmd(cmdbuf))
+ *(cmd_cur+strlen(cmd_cur)-1) = '\0';
+ cmdbuf = cmd_cur;
+ goto do_full_getstr;
+ } else {
+ strncpy(cmd_hist[cmd_head], cmd_cur,
+ CMD_BUFLEN);
+ }
+
+ cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
+ if (cmd_head == cmd_tail)
+ cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
+ }
+
+ cmdptr = cmd_head;
+ diag = kdb_parse(cmdbuf);
+ if (diag == KDB_NOTFOUND) {
+ kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
+ diag = 0;
+ }
+ if (diag == KDB_CMD_GO
+ || diag == KDB_CMD_CPU
+ || diag == KDB_CMD_SS
+ || diag == KDB_CMD_KGDB)
+ break;
+
+ if (diag)
+ kdb_cmderror(diag);
+ }
+ KDB_DEBUG_STATE("kdb_local 9", diag);
+ return diag;
+}
+
+
+/*
+ * kdb_print_state - Print the state data for the current processor
+ * for debugging.
+ * Inputs:
+ * text Identifies the debug point
+ * value Any integer value to be printed, e.g. reason code.
+ */
+void kdb_print_state(const char *text, int value)
+{
+ kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
+ text, raw_smp_processor_id(), value, kdb_initial_cpu,
+ kdb_state);
+}
+
+/*
+ * kdb_main_loop - After initial setup and assignment of the
+ * controlling cpu, all cpus are in this loop. One cpu is in
+ * control and will issue the kdb prompt, the others will spin
+ * until 'go' or cpu switch.
+ *
+ * To get a consistent view of the kernel stacks for all
+ * processes, this routine is invoked from the main kdb code via
+ * an architecture specific routine. kdba_main_loop is
+ * responsible for making the kernel stacks consistent for all
+ * processes, there should be no difference between a blocked
+ * process and a running process as far as kdb is concerned.
+ * Inputs:
+ * reason The reason KDB was invoked
+ * error The hardware-defined error code
+ * reason2 kdb's current reason code.
+ * Initially error but can change
+ * according to kdb state.
+ * db_result Result code from break or debug point.
+ * regs The exception frame at time of fault/breakpoint.
+ * should always be valid.
+ * Returns:
+ * 0 KDB was invoked for an event which it wasn't responsible
+ * 1 KDB handled the event for which it was invoked.
+ */
+int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
+ kdb_dbtrap_t db_result, struct pt_regs *regs)
+{
+ int result = 1;
+ /* Stay in kdb() until 'go', 'ss[b]' or an error */
+ while (1) {
+ /*
+ * All processors except the one that is in control
+ * will spin here.
+ */
+ KDB_DEBUG_STATE("kdb_main_loop 1", reason);
+ while (KDB_STATE(HOLD_CPU)) {
+ /* state KDB is turned off by kdb_cpu to see if the
+ * other cpus are still live, each cpu in this loop
+ * turns it back on.
+ */
+ if (!KDB_STATE(KDB))
+ KDB_STATE_SET(KDB);
+ }
+
+ KDB_STATE_CLEAR(SUPPRESS);
+ KDB_DEBUG_STATE("kdb_main_loop 2", reason);
+ if (KDB_STATE(LEAVING))
+ break; /* Another cpu said 'go' */
+ /* Still using kdb, this processor is in control */
+ result = kdb_local(reason2, error, regs, db_result);
+ KDB_DEBUG_STATE("kdb_main_loop 3", result);
+
+ if (result == KDB_CMD_CPU)
+ break;
+
+ if (result == KDB_CMD_SS) {
+ KDB_STATE_SET(DOING_SS);
+ break;
+ }
+
+ if (result == KDB_CMD_KGDB) {
+ if (!KDB_STATE(DOING_KGDB))
+ kdb_printf("Entering please attach debugger "
+ "or use $D#44+ or $3#33\n");
+ break;
+ }
+ if (result && result != 1 && result != KDB_CMD_GO)
+ kdb_printf("\nUnexpected kdb_local return code %d\n",
+ result);
+ KDB_DEBUG_STATE("kdb_main_loop 4", reason);
+ break;
+ }
+ if (KDB_STATE(DOING_SS))
+ KDB_STATE_CLEAR(SSBPT);
+
+ /* Clean up any keyboard devices before leaving */
+ kdb_kbd_cleanup_state();
+
+ return result;
+}
+
+/*
+ * kdb_mdr - This function implements the guts of the 'mdr', memory
+ * read command.
+ * mdr <addr arg>,<byte count>
+ * Inputs:
+ * addr Start address
+ * count Number of bytes
+ * Returns:
+ * Always 0. Any errors are detected and printed by kdb_getarea.
+ */
+static int kdb_mdr(unsigned long addr, unsigned int count)
+{
+ unsigned char c;
+ while (count--) {
+ if (kdb_getarea(c, addr))
+ return 0;
+ kdb_printf("%02x", c);
+ addr++;
+ }
+ kdb_printf("\n");
+ return 0;
+}
+
+/*
+ * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
+ * 'md8' 'mdr' and 'mds' commands.
+ *
+ * md|mds [<addr arg> [<line count> [<radix>]]]
+ * mdWcN [<addr arg> [<line count> [<radix>]]]
+ * where W = is the width (1, 2, 4 or 8) and N is the count.
+ * for eg., md1c20 reads 20 bytes, 1 at a time.
+ * mdr <addr arg>,<byte count>
+ */
+static void kdb_md_line(const char *fmtstr, unsigned long addr,
+ int symbolic, int nosect, int bytesperword,
+ int num, int repeat, int phys)
+{
+ /* print just one line of data */
+ kdb_symtab_t symtab;
+ char cbuf[32];
+ char *c = cbuf;
+ int i;
+ unsigned long word;
+
+ memset(cbuf, '\0', sizeof(cbuf));
+ if (phys)
+ kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
+ else
+ kdb_printf(kdb_machreg_fmt0 " ", addr);
+
+ for (i = 0; i < num && repeat--; i++) {
+ if (phys) {
+ if (kdb_getphysword(&word, addr, bytesperword))
+ break;
+ } else if (kdb_getword(&word, addr, bytesperword))
+ break;
+ kdb_printf(fmtstr, word);
+ if (symbolic)
+ kdbnearsym(word, &symtab);
+ else
+ memset(&symtab, 0, sizeof(symtab));
+ if (symtab.sym_name) {
+ kdb_symbol_print(word, &symtab, 0);
+ if (!nosect) {
+ kdb_printf("\n");
+ kdb_printf(" %s %s "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt " "
+ kdb_machreg_fmt, symtab.mod_name,
+ symtab.sec_name, symtab.sec_start,
+ symtab.sym_start, symtab.sym_end);
+ }
+ addr += bytesperword;
+ } else {
+ union {
+ u64 word;
+ unsigned char c[8];
+ } wc;
+ unsigned char *cp;
+#ifdef __BIG_ENDIAN
+ cp = wc.c + 8 - bytesperword;
+#else
+ cp = wc.c;
+#endif
+ wc.word = word;
+#define printable_char(c) \
+ ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
+ switch (bytesperword) {
+ case 8:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 4;
+ case 4:
+ *c++ = printable_char(*cp++);
+ *c++ = printable_char(*cp++);
+ addr += 2;
+ case 2:
+ *c++ = printable_char(*cp++);
+ addr++;
+ case 1:
+ *c++ = printable_char(*cp++);
+ addr++;
+ break;
+ }
+#undef printable_char
+ }
+ }
+ kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
+ " ", cbuf);
+}
+
+static int kdb_md(int argc, const char **argv)
+{
+ static unsigned long last_addr;
+ static int last_radix, last_bytesperword, last_repeat;
+ int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
+ int nosect = 0;
+ char fmtchar, fmtstr[64];
+ unsigned long addr;
+ unsigned long word;
+ long offset = 0;
+ int symbolic = 0;
+ int valid = 0;
+ int phys = 0;
+
+ kdbgetintenv("MDCOUNT", &mdcount);
+ kdbgetintenv("RADIX", &radix);
+ kdbgetintenv("BYTESPERWORD", &bytesperword);
+
+ /* Assume 'md <addr>' and start with environment values */
+ repeat = mdcount * 16 / bytesperword;
+
+ if (strcmp(argv[0], "mdr") == 0) {
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ valid = 1;
+ } else if (isdigit(argv[0][2])) {
+ bytesperword = (int)(argv[0][2] - '0');
+ if (bytesperword == 0) {
+ bytesperword = last_bytesperword;
+ if (bytesperword == 0)
+ bytesperword = 4;
+ }
+ last_bytesperword = bytesperword;
+ repeat = mdcount * 16 / bytesperword;
+ if (!argv[0][3])
+ valid = 1;
+ else if (argv[0][3] == 'c' && argv[0][4]) {
+ char *p;
+ repeat = simple_strtoul(argv[0] + 4, &p, 10);
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ valid = !*p;
+ }
+ last_repeat = repeat;
+ } else if (strcmp(argv[0], "md") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mds") == 0)
+ valid = 1;
+ else if (strcmp(argv[0], "mdp") == 0) {
+ phys = valid = 1;
+ }
+ if (!valid)
+ return KDB_NOTFOUND;
+
+ if (argc == 0) {
+ if (last_addr == 0)
+ return KDB_ARGCOUNT;
+ addr = last_addr;
+ radix = last_radix;
+ bytesperword = last_bytesperword;
+ repeat = last_repeat;
+ mdcount = ((repeat * bytesperword) + 15) / 16;
+ }
+
+ if (argc) {
+ unsigned long val;
+ int diag, nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+ &offset, NULL);
+ if (diag)
+ return diag;
+ if (argc > nextarg+2)
+ return KDB_ARGCOUNT;
+
+ if (argc >= nextarg) {
+ diag = kdbgetularg(argv[nextarg], &val);
+ if (!diag) {
+ mdcount = (int) val;
+ repeat = mdcount * 16 / bytesperword;
+ }
+ }
+ if (argc >= nextarg+1) {
+ diag = kdbgetularg(argv[nextarg+1], &val);
+ if (!diag)
+ radix = (int) val;
+ }
+ }
+
+ if (strcmp(argv[0], "mdr") == 0)
+ return kdb_mdr(addr, mdcount);
+
+ switch (radix) {
+ case 10:
+ fmtchar = 'd';
+ break;
+ case 16:
+ fmtchar = 'x';
+ break;
+ case 8:
+ fmtchar = 'o';
+ break;
+ default:
+ return KDB_BADRADIX;
+ }
+
+ last_radix = radix;
+
+ if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+
+ switch (bytesperword) {
+ case 8:
+ sprintf(fmtstr, "%%16.16l%c ", fmtchar);
+ break;
+ case 4:
+ sprintf(fmtstr, "%%8.8l%c ", fmtchar);
+ break;
+ case 2:
+ sprintf(fmtstr, "%%4.4l%c ", fmtchar);
+ break;
+ case 1:
+ sprintf(fmtstr, "%%2.2l%c ", fmtchar);
+ break;
+ default:
+ return KDB_BADWIDTH;
+ }
+
+ last_repeat = repeat;
+ last_bytesperword = bytesperword;
+
+ if (strcmp(argv[0], "mds") == 0) {
+ symbolic = 1;
+ /* Do not save these changes as last_*, they are temporary mds
+ * overrides.
+ */
+ bytesperword = KDB_WORD_SIZE;
+ repeat = mdcount;
+ kdbgetintenv("NOSECT", &nosect);
+ }
+
+ /* Round address down modulo BYTESPERWORD */
+
+ addr &= ~(bytesperword-1);
+
+ while (repeat > 0) {
+ unsigned long a;
+ int n, z, num = (symbolic ? 1 : (16 / bytesperword));
+
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
+ if (phys) {
+ if (kdb_getphysword(&word, a, bytesperword)
+ || word)
+ break;
+ } else if (kdb_getword(&word, a, bytesperword) || word)
+ break;
+ }
+ n = min(num, repeat);
+ kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
+ num, repeat, phys);
+ addr += bytesperword * n;
+ repeat -= n;
+ z = (z + num - 1) / num;
+ if (z > 2) {
+ int s = num * (z-2);
+ kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
+ " zero suppressed\n",
+ addr, addr + bytesperword * s - 1);
+ addr += bytesperword * s;
+ repeat -= s;
+ }
+ }
+ last_addr = addr;
+
+ return 0;
+}
+
+/*
+ * kdb_mm - This function implements the 'mm' command.
+ * mm address-expression new-value
+ * Remarks:
+ * mm works on machine words, mmW works on bytes.
+ */
+static int kdb_mm(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset = 0;
+ unsigned long contents;
+ int nextarg;
+ int width;
+
+ if (argv[0][2] && !isdigit(argv[0][2]))
+ return KDB_NOTFOUND;
+
+ if (argc < 2)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg > argc)
+ return KDB_ARGCOUNT;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
+ if (diag)
+ return diag;
+
+ if (nextarg != argc + 1)
+ return KDB_ARGCOUNT;
+
+ width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
+ diag = kdb_putword(addr, contents, width);
+ if (diag)
+ return diag;
+
+ kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
+
+ return 0;
+}
+
+/*
+ * kdb_go - This function implements the 'go' command.
+ * go [address-expression]
+ */
+static int kdb_go(int argc, const char **argv)
+{
+ unsigned long addr;
+ int diag;
+ int nextarg;
+ long offset;
+
+ if (raw_smp_processor_id() != kdb_initial_cpu) {
+ kdb_printf("go must execute on the entry cpu, "
+ "please use \"cpu %d\" and then execute go\n",
+ kdb_initial_cpu);
+ return KDB_BADCPUNUM;
+ }
+ if (argc == 1) {
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg,
+ &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ } else if (argc) {
+ return KDB_ARGCOUNT;
+ }
+
+ diag = KDB_CMD_GO;
+ if (KDB_FLAG(CATASTROPHIC)) {
+ kdb_printf("Catastrophic error detected\n");
+ kdb_printf("kdb_continue_catastrophic=%d, ",
+ kdb_continue_catastrophic);
+ if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
+ kdb_printf("type go a second time if you really want "
+ "to continue\n");
+ return 0;
+ }
+ if (kdb_continue_catastrophic == 2) {
+ kdb_printf("forcing reboot\n");
+ kdb_reboot(0, NULL);
+ }
+ kdb_printf("attempting to continue\n");
+ }
+ return diag;
+}
+
+/*
+ * kdb_rd - This function implements the 'rd' command.
+ */
+static int kdb_rd(int argc, const char **argv)
+{
+ int len = kdb_check_regs();
+#if DBG_MAX_REG_NUM > 0
+ int i;
+ char *rname;
+ int rsize;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (len)
+ return len;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ rsize = dbg_reg_def[i].size * 2;
+ if (rsize > 16)
+ rsize = 2;
+ if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
+ len = 0;
+ kdb_printf("\n");
+ }
+ if (len)
+ len += kdb_printf(" ");
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ rname = dbg_get_reg(i, &reg8, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %02x", rname, reg8);
+ break;
+ case 16:
+ rname = dbg_get_reg(i, &reg16, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %04x", rname, reg16);
+ break;
+ case 32:
+ rname = dbg_get_reg(i, &reg32, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %08x", rname, reg32);
+ break;
+ case 64:
+ rname = dbg_get_reg(i, &reg64, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %016llx", rname, reg64);
+ break;
+ default:
+ len += kdb_printf("%s: ??", dbg_reg_def[i].name);
+ }
+ }
+ kdb_printf("\n");
+#else
+ if (len)
+ return len;
+
+ kdb_dumpregs(kdb_current_regs);
+#endif
+ return 0;
+}
+
+/*
+ * kdb_rm - This function implements the 'rm' (register modify) command.
+ * rm register-name new-contents
+ * Remarks:
+ * Allows register modification with the same restrictions as gdb
+ */
+static int kdb_rm(int argc, const char **argv)
+{
+#if DBG_MAX_REG_NUM > 0
+ int diag;
+ const char *rname;
+ int i;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+ /*
+ * Allow presence or absence of leading '%' symbol.
+ */
+ rname = argv[1];
+ if (*rname == '%')
+ rname++;
+
+ diag = kdbgetu64arg(argv[2], &reg64);
+ if (diag)
+ return diag;
+
+ diag = kdb_check_regs();
+ if (diag)
+ return diag;
+
+ diag = KDB_BADREG;
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ if (strcmp(rname, dbg_reg_def[i].name) == 0) {
+ diag = 0;
+ break;
+ }
+ }
+ if (!diag) {
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ reg8 = reg64;
+ dbg_set_reg(i, &reg8, kdb_current_regs);
+ break;
+ case 16:
+ reg16 = reg64;
+ dbg_set_reg(i, &reg16, kdb_current_regs);
+ break;
+ case 32:
+ reg32 = reg64;
+ dbg_set_reg(i, &reg32, kdb_current_regs);
+ break;
+ case 64:
+ dbg_set_reg(i, &reg64, kdb_current_regs);
+ break;
+ }
+ }
+ return diag;
+#else
+ kdb_printf("ERROR: Register set currently not implemented\n");
+ return 0;
+#endif
+}
+
+#if defined(CONFIG_MAGIC_SYSRQ)
+/*
+ * kdb_sr - This function implements the 'sr' (SYSRQ key) command
+ * which interfaces to the soi-disant MAGIC SYSRQ functionality.
+ * sr <magic-sysrq-code>
+ */
+static int kdb_sr(int argc, const char **argv)
+{
+ bool check_mask =
+ !kdb_check_flags(KDB_ENABLE_ALL, kdb_cmd_enabled, false);
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ kdb_trap_printk++;
+ __handle_sysrq(*argv[1], check_mask);
+ kdb_trap_printk--;
+
+ return 0;
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+/*
+ * kdb_ef - This function implements the 'regs' (display exception
+ * frame) command. This command takes an address and expects to
+ * find an exception frame at that address, formats and prints
+ * it.
+ * regs address-expression
+ * Remarks:
+ * Not done yet.
+ */
+static int kdb_ef(int argc, const char **argv)
+{
+ int diag;
+ unsigned long addr;
+ long offset;
+ int nextarg;
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ nextarg = 1;
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+ if (diag)
+ return diag;
+ show_regs((struct pt_regs *)addr);
+ return 0;
+}
+
+#if defined(CONFIG_MODULES)
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+static int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, kdb_modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u 0x%p ", mod->name,
+ mod->core_size, (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4d ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%p", mod->module_core);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
+
+#endif /* CONFIG_MODULES */
+
+/*
+ * kdb_env - This function implements the 'env' command. Display the
+ * current environment variables.
+ */
+
+static int kdb_env(int argc, const char **argv)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+
+ if (KDB_DEBUG(MASK))
+ kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
+
+ return 0;
+}
+
+#ifdef CONFIG_PRINTK
+/*
+ * kdb_dmesg - This function implements the 'dmesg' command to display
+ * the contents of the syslog buffer.
+ * dmesg [lines] [adjust]
+ */
+static int kdb_dmesg(int argc, const char **argv)
+{
+ int diag;
+ int logging;
+ int lines = 0;
+ int adjust = 0;
+ int n = 0;
+ int skip = 0;
+ struct kmsg_dumper dumper = { .active = 1 };
+ size_t len;
+ char buf[201];
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+ if (argc) {
+ char *cp;
+ lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ lines = 0;
+ if (argc > 1) {
+ adjust = simple_strtoul(argv[2], &cp, 0);
+ if (*cp || adjust < 0)
+ adjust = 0;
+ }
+ }
+
+ /* disable LOGGING if set */
+ diag = kdbgetintenv("LOGGING", &logging);
+ if (!diag && logging) {
+ const char *setargs[] = { "set", "LOGGING", "0" };
+ kdb_set(2, setargs);
+ }
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ n++;
+
+ if (lines < 0) {
+ if (adjust >= n)
+ kdb_printf("buffer only contains %d lines, nothing "
+ "printed\n", n);
+ else if (adjust - lines >= n)
+ kdb_printf("buffer only contains %d lines, last %d "
+ "lines printed\n", n, n - adjust);
+ skip = adjust;
+ lines = abs(lines);
+ } else if (lines > 0) {
+ skip = n - lines - adjust;
+ lines = abs(lines);
+ if (adjust >= n) {
+ kdb_printf("buffer only contains %d lines, "
+ "nothing printed\n", n);
+ skip = n;
+ } else if (skip < 0) {
+ lines += skip;
+ skip = 0;
+ kdb_printf("buffer only contains %d lines, first "
+ "%d lines printed\n", n, lines);
+ }
+ } else {
+ lines = n;
+ }
+
+ if (skip >= n || skip < 0)
+ return 0;
+
+ kmsg_dump_rewind_nolock(&dumper);
+ while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ if (skip) {
+ skip--;
+ continue;
+ }
+ if (!lines--)
+ break;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
+ kdb_printf("%.*s\n", (int)len - 1, buf);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_PRINTK */
+
+/* Make sure we balance enable/disable calls, must disable first. */
+static atomic_t kdb_nmi_disabled;
+
+static int kdb_disable_nmi(int argc, const char *argv[])
+{
+ if (atomic_read(&kdb_nmi_disabled))
+ return 0;
+ atomic_set(&kdb_nmi_disabled, 1);
+ arch_kgdb_ops.enable_nmi(0);
+ return 0;
+}
+
+static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
+{
+ if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
+ return -EINVAL;
+ arch_kgdb_ops.enable_nmi(1);
+ return 0;
+}
+
+static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
+ .set = kdb_param_enable_nmi,
+};
+module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
+
+/*
+ * kdb_cpu - This function implements the 'cpu' command.
+ * cpu [<cpunum>]
+ * Returns:
+ * KDB_CMD_CPU for success, a kdb diagnostic if error
+ */
+static void kdb_cpu_status(void)
+{
+ int i, start_cpu, first_print = 1;
+ char state, prev_state = '?';
+
+ kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
+ kdb_printf("Available cpus: ");
+ for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
+ if (!cpu_online(i)) {
+ state = 'F'; /* cpu is offline */
+ } else if (!kgdb_info[i].enter_kgdb) {
+ state = 'D'; /* cpu is online but unresponsive */
+ } else {
+ state = ' '; /* cpu is responding to kdb */
+ if (kdb_task_state_char(KDB_TSK(i)) == 'I')
+ state = 'I'; /* idle task */
+ }
+ if (state != prev_state) {
+ if (prev_state != '?') {
+ if (!first_print)
+ kdb_printf(", ");
+ first_print = 0;
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ prev_state = state;
+ start_cpu = i;
+ }
+ }
+ /* print the trailing cpus, ignoring them if they are all offline */
+ if (prev_state != 'F') {
+ if (!first_print)
+ kdb_printf(", ");
+ kdb_printf("%d", start_cpu);
+ if (start_cpu < i-1)
+ kdb_printf("-%d", i-1);
+ if (prev_state != ' ')
+ kdb_printf("(%c)", prev_state);
+ }
+ kdb_printf("\n");
+}
+
+static int kdb_cpu(int argc, const char **argv)
+{
+ unsigned long cpunum;
+ int diag;
+
+ if (argc == 0) {
+ kdb_cpu_status();
+ return 0;
+ }
+
+ if (argc != 1)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetularg(argv[1], &cpunum);
+ if (diag)
+ return diag;
+
+ /*
+ * Validate cpunum
+ */
+ if ((cpunum >= CONFIG_NR_CPUS) || !kgdb_info[cpunum].enter_kgdb)
+ return KDB_BADCPUNUM;
+
+ dbg_switch_cpu = cpunum;
+
+ /*
+ * Switch to other cpu
+ */
+ return KDB_CMD_CPU;
+}
+
+/* The user may not realize that ps/bta with no parameters does not print idle
+ * or sleeping system daemon processes, so tell them how many were suppressed.
+ */
+void kdb_ps_suppressed(void)
+{
+ int idle = 0, daemon = 0;
+ unsigned long mask_I = kdb_task_state_string("I"),
+ mask_M = kdb_task_state_string("M");
+ unsigned long cpu;
+ const struct task_struct *p, *g;
+ for_each_online_cpu(cpu) {
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask_I))
+ ++idle;
+ }
+ kdb_do_each_thread(g, p) {
+ if (kdb_task_state(p, mask_M))
+ ++daemon;
+ } kdb_while_each_thread(g, p);
+ if (idle || daemon) {
+ if (idle)
+ kdb_printf("%d idle process%s (state I)%s\n",
+ idle, idle == 1 ? "" : "es",
+ daemon ? " and " : "");
+ if (daemon)
+ kdb_printf("%d sleeping system daemon (state M) "
+ "process%s", daemon,
+ daemon == 1 ? "" : "es");
+ kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
+ }
+}
+
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ * ps [DRSTCZEUIMA] All processes, optionally filtered by state
+ */
+void kdb_ps1(const struct task_struct *p)
+{
+ int cpu;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return;
+
+ cpu = kdb_process_cpu(p);
+ kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ (void *)p, p->pid, p->parent->pid,
+ kdb_task_has_cpu(p), kdb_process_cpu(p),
+ kdb_task_state_char(p),
+ (void *)(&p->thread),
+ p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
+ p->comm);
+ if (kdb_task_has_cpu(p)) {
+ if (!KDB_TSK(cpu)) {
+ kdb_printf(" Error: no saved data for this cpu\n");
+ } else {
+ if (KDB_TSK(cpu) != p)
+ kdb_printf(" Error: does not match running "
+ "process table (0x%p)\n", KDB_TSK(cpu));
+ }
+ }
+}
+
+static int kdb_ps(int argc, const char **argv)
+{
+ struct task_struct *g, *p;
+ unsigned long mask, cpu;
+
+ if (argc == 0)
+ kdb_ps_suppressed();
+ kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
+ (int)(2*sizeof(void *))+2, "Task Addr",
+ (int)(2*sizeof(void *))+2, "Thread");
+ mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ /* Run the active tasks first */
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ p = kdb_curr_task(cpu);
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ }
+ kdb_printf("\n");
+ /* Now the real tasks */
+ kdb_do_each_thread(g, p) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (kdb_task_state(p, mask))
+ kdb_ps1(p);
+ } kdb_while_each_thread(g, p);
+
+ return 0;
+}
+
+/*
+ * kdb_pid - This function implements the 'pid' command which switches
+ * the currently active process.
+ * pid [<pid> | R]
+ */
+static int kdb_pid(int argc, const char **argv)
+{
+ struct task_struct *p;
+ unsigned long val;
+ int diag;
+
+ if (argc > 1)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ if (strcmp(argv[1], "R") == 0) {
+ p = KDB_TSK(kdb_initial_cpu);
+ } else {
+ diag = kdbgetularg(argv[1], &val);
+ if (diag)
+ return KDB_BADINT;
+
+ p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
+ if (!p) {
+ kdb_printf("No task with pid=%d\n", (pid_t)val);
+ return 0;
+ }
+ }
+ kdb_set_current_task(p);
+ }
+ kdb_printf("KDB current process is %s(pid=%d)\n",
+ kdb_current_task->comm,
+ kdb_current_task->pid);
+
+ return 0;
+}
+
+static int kdb_kgdb(int argc, const char **argv)
+{
+ return KDB_CMD_KGDB;
+}
+
+/*
+ * kdb_help - This function implements the 'help' and '?' commands.
+ */
+static int kdb_help(int argc, const char **argv)
+{
+ kdbtab_t *kt;
+ int i;
+
+ kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
+ kdb_printf("-----------------------------"
+ "-----------------------------\n");
+ for_each_kdbcmd(kt, i) {
+ char *space = "";
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
+ }
+ return 0;
+}
+
+/*
+ * kdb_kill - This function implements the 'kill' commands.
+ */
+static int kdb_kill(int argc, const char **argv)
+{
+ long sig, pid;
+ char *endp;
+ struct task_struct *p;
+ struct siginfo info;
+
+ if (argc != 2)
+ return KDB_ARGCOUNT;
+
+ sig = simple_strtol(argv[1], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (sig >= 0) {
+ kdb_printf("Invalid signal parameter.<-signal>\n");
+ return 0;
+ }
+ sig = -sig;
+
+ pid = simple_strtol(argv[2], &endp, 0);
+ if (*endp)
+ return KDB_BADINT;
+ if (pid <= 0) {
+ kdb_printf("Process ID must be large than 0.\n");
+ return 0;
+ }
+
+ /* Find the process. */
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (!p) {
+ kdb_printf("The specified process isn't found.\n");
+ return 0;
+ }
+ p = p->group_leader;
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = pid; /* same capabilities as process being signalled */
+ info.si_uid = 0; /* kdb has root authority */
+ kdb_send_sig_info(p, &info);
+ return 0;
+}
+
+struct kdb_tm {
+ int tm_sec; /* seconds */
+ int tm_min; /* minutes */
+ int tm_hour; /* hours */
+ int tm_mday; /* day of the month */
+ int tm_mon; /* month */
+ int tm_year; /* year */
+};
+
+static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
+{
+ /* This will work from 1970-2099, 2100 is not a leap year */
+ static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
+ 31, 30, 31, 30, 31 };
+ memset(tm, 0, sizeof(*tm));
+ tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
+ tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
+ (2 * 365 + 1); /* shift base from 1970 to 1968 */
+ tm->tm_min = tm->tm_sec / 60 % 60;
+ tm->tm_hour = tm->tm_sec / 60 / 60;
+ tm->tm_sec = tm->tm_sec % 60;
+ tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
+ tm->tm_mday %= (4*365+1);
+ mon_day[1] = 29;
+ while (tm->tm_mday >= mon_day[tm->tm_mon]) {
+ tm->tm_mday -= mon_day[tm->tm_mon];
+ if (++tm->tm_mon == 12) {
+ tm->tm_mon = 0;
+ ++tm->tm_year;
+ mon_day[1] = 28;
+ }
+ }
+ ++tm->tm_mday;
+}
+
+/*
+ * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
+ * I cannot call that code directly from kdb, it has an unconditional
+ * cli()/sti() and calls routines that take locks which can stop the debugger.
+ */
+static void kdb_sysinfo(struct sysinfo *val)
+{
+ struct timespec uptime;
+ ktime_get_ts(&uptime);
+ memset(val, 0, sizeof(*val));
+ val->uptime = uptime.tv_sec;
+ val->loads[0] = avenrun[0];
+ val->loads[1] = avenrun[1];
+ val->loads[2] = avenrun[2];
+ val->procs = nr_threads-1;
+ si_meminfo(val);
+
+ return;
+}
+
+/*
+ * kdb_summary - This function implements the 'summary' command.
+ */
+static int kdb_summary(int argc, const char **argv)
+{
+ struct timespec now;
+ struct kdb_tm tm;
+ struct sysinfo val;
+
+ if (argc)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
+ kdb_printf("release %s\n", init_uts_ns.name.release);
+ kdb_printf("version %s\n", init_uts_ns.name.version);
+ kdb_printf("machine %s\n", init_uts_ns.name.machine);
+ kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
+ kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
+ kdb_printf("ccversion %s\n", __stringify(CCVERSION));
+
+ now = __current_kernel_time();
+ kdb_gmtime(&now, &tm);
+ kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
+ "tz_minuteswest %d\n",
+ 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec,
+ sys_tz.tz_minuteswest);
+
+ kdb_sysinfo(&val);
+ kdb_printf("uptime ");
+ if (val.uptime > (24*60*60)) {
+ int days = val.uptime / (24*60*60);
+ val.uptime %= (24*60*60);
+ kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
+ }
+ kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
+
+ /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+ kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
+ LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
+ LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
+ LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
+#undef LOAD_INT
+#undef LOAD_FRAC
+ /* Display in kilobytes */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
+ "Buffers: %8lu kB\n",
+ K(val.totalram), K(val.freeram), K(val.bufferram));
+ return 0;
+}
+
+/*
+ * kdb_per_cpu - This function implements the 'per_cpu' command.
+ */
+static int kdb_per_cpu(int argc, const char **argv)
+{
+ char fmtstr[64];
+ int cpu, diag, nextarg = 1;
+ unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
+
+ if (argc < 1 || argc > 3)
+ return KDB_ARGCOUNT;
+
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
+ if (diag)
+ return diag;
+
+ if (argc >= 2) {
+ diag = kdbgetularg(argv[2], &bytesperword);
+ if (diag)
+ return diag;
+ }
+ if (!bytesperword)
+ bytesperword = KDB_WORD_SIZE;
+ else if (bytesperword > KDB_WORD_SIZE)
+ return KDB_BADWIDTH;
+ sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
+ if (argc >= 3) {
+ diag = kdbgetularg(argv[3], &whichcpu);
+ if (diag)
+ return diag;
+ if (!cpu_online(whichcpu)) {
+ kdb_printf("cpu %ld is not online\n", whichcpu);
+ return KDB_BADCPUNUM;
+ }
+ }
+
+ /* Most architectures use __per_cpu_offset[cpu], some use
+ * __per_cpu_offset(cpu), smp has no __per_cpu_offset.
+ */
+#ifdef __per_cpu_offset
+#define KDB_PCU(cpu) __per_cpu_offset(cpu)
+#else
+#ifdef CONFIG_SMP
+#define KDB_PCU(cpu) __per_cpu_offset[cpu]
+#else
+#define KDB_PCU(cpu) 0
+#endif
+#endif
+ for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
+ if (whichcpu != ~0UL && whichcpu != cpu)
+ continue;
+ addr = symaddr + KDB_PCU(cpu);
+ diag = kdb_getword(&val, addr, bytesperword);
+ if (diag) {
+ kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
+ "read, diag=%d\n", cpu, addr, diag);
+ continue;
+ }
+ kdb_printf("%5d ", cpu);
+ kdb_md_line(fmtstr, addr,
+ bytesperword == KDB_WORD_SIZE,
+ 1, bytesperword, 1, 1, 0);
+ }
+#undef KDB_PCU
+ return 0;
+}
+
+/*
+ * display help for the use of cmd | grep pattern
+ */
+static int kdb_grep_help(int argc, const char **argv)
+{
+ kdb_printf("Usage of cmd args | grep pattern:\n");
+ kdb_printf(" Any command's output may be filtered through an ");
+ kdb_printf("emulated 'pipe'.\n");
+ kdb_printf(" 'grep' is just a key word.\n");
+ kdb_printf(" The pattern may include a very limited set of "
+ "metacharacters:\n");
+ kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
+ kdb_printf(" And if there are spaces in the pattern, you may "
+ "quote it:\n");
+ kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
+ " or \"^pat tern$\"\n");
+ return 0;
+}
+
+/*
+ * kdb_register_flags - This function is used to register a kernel
+ * debugger command.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * repeat Does the command auto repeat on enter?
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+#define kdb_command_extend 50 /* arbitrary */
+int kdb_register_flags(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen,
+ kdb_cmdflags_t flags)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * Brute force method to determine duplicates
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kdb_printf("Duplicate kdb command registered: "
+ "%s, func %p help %s\n", cmd, func, help);
+ return 1;
+ }
+ }
+
+ /*
+ * Insert command into first available location in table
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name == NULL)
+ break;
+ }
+
+ if (i >= kdb_max_commands) {
+ kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
+ kdb_command_extend) * sizeof(*new), GFP_KDB);
+ if (!new) {
+ kdb_printf("Could not allocate new kdb_command "
+ "table\n");
+ return 1;
+ }
+ if (kdb_commands) {
+ memcpy(new, kdb_commands,
+ (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
+ kfree(kdb_commands);
+ }
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
+ kdb_command_extend * sizeof(*new));
+ kdb_commands = new;
+ kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
+ kdb_max_commands += kdb_command_extend;
+ }
+
+ kp->cmd_name = cmd;
+ kp->cmd_func = func;
+ kp->cmd_usage = usage;
+ kp->cmd_help = help;
+ kp->cmd_minlen = minlen;
+ kp->cmd_flags = flags;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kdb_register_flags);
+
+
+/*
+ * kdb_register - Compatibility register function for commands that do
+ * not need to specify a repeat state. Equivalent to
+ * kdb_register_flags with flags set to 0.
+ * Inputs:
+ * cmd Command name
+ * func Function to execute the command
+ * usage A simple usage string showing arguments
+ * help A simple help string describing command
+ * Returns:
+ * zero for success, one if a duplicate command.
+ */
+int kdb_register(char *cmd,
+ kdb_func_t func,
+ char *usage,
+ char *help,
+ short minlen)
+{
+ return kdb_register_flags(cmd, func, usage, help, minlen, 0);
+}
+EXPORT_SYMBOL_GPL(kdb_register);
+
+/*
+ * kdb_unregister - This function is used to unregister a kernel
+ * debugger command. It is generally called when a module which
+ * implements kdb commands is unloaded.
+ * Inputs:
+ * cmd Command name
+ * Returns:
+ * zero for success, one command not registered.
+ */
+int kdb_unregister(char *cmd)
+{
+ int i;
+ kdbtab_t *kp;
+
+ /*
+ * find the command.
+ */
+ for_each_kdbcmd(kp, i) {
+ if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
+ kp->cmd_name = NULL;
+ return 0;
+ }
+ }
+
+ /* Couldn't find it. */
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kdb_unregister);
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ int i;
+ kdbtab_t *kp;
+
+ for_each_kdbcmd(kp, i)
+ kp->cmd_name = NULL;
+
+ kdb_register_flags("md", kdb_md, "<vaddr>",
+ "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
+ "Display Raw Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
+ "Display Physical Memory", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mds", kdb_md, "<vaddr>",
+ "Display Memory Symbolically", 0,
+ KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
+ "Modify Memory Contents", 0,
+ KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
+ kdb_register_flags("go", kdb_go, "[<vaddr>]",
+ "Continue Execution", 1,
+ KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("rd", kdb_rd, "",
+ "Display Registers", 0,
+ KDB_ENABLE_REG_READ);
+ kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
+ "Modify Registers", 0,
+ KDB_ENABLE_REG_WRITE);
+ kdb_register_flags("ef", kdb_ef, "<vaddr>",
+ "Display exception frame", 0,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
+ "Stack traceback", 1,
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("btp", kdb_bt, "<pid>",
+ "Display stack for process <pid>", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btc", kdb_bt, "",
+ "Backtrace current process on each cpu", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("btt", kdb_bt, "<vaddr>",
+ "Backtrace process given its struct task address", 0,
+ KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
+ kdb_register_flags("env", kdb_env, "",
+ "Show environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("set", kdb_set, "",
+ "Set environment variables", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("help", kdb_help, "",
+ "Display Help Message", 1,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("?", kdb_help, "",
+ "Display Help Message", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
+ "Switch to new cpu", 0,
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
+ kdb_register_flags("kgdb", kdb_kgdb, "",
+ "Enter kgdb mode", 0, 0);
+ kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
+ "Display active task list", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("pid", kdb_pid, "<pidnum>",
+ "Switch to another task", 0,
+ KDB_ENABLE_INSPECT);
+ kdb_register_flags("reboot", kdb_reboot, "",
+ "Reboot the machine immediately", 0,
+ KDB_ENABLE_REBOOT);
+#if defined(CONFIG_MODULES)
+ kdb_register_flags("lsmod", kdb_lsmod, "",
+ "List loaded kernel modules", 0,
+ KDB_ENABLE_INSPECT);
+#endif
+#if defined(CONFIG_MAGIC_SYSRQ)
+ kdb_register_flags("sr", kdb_sr, "<key>",
+ "Magic SysRq key", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+#endif
+#if defined(CONFIG_PRINTK)
+ kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
+ "Display syslog buffer", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+#endif
+ if (arch_kgdb_ops.enable_nmi) {
+ kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ }
+ kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
+ "Define a set of commands, down to endefcmd", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
+ "Send a signal to a process", 0,
+ KDB_ENABLE_SIGNAL);
+ kdb_register_flags("summary", kdb_summary, "",
+ "Summarize the system", 4,
+ KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
+ "Display per_cpu variables", 3,
+ KDB_ENABLE_MEM_READ);
+ kdb_register_flags("grephelp", kdb_grep_help, "",
+ "Display help on | grep", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
+}
+
+/* Execute any commands defined in kdb_cmds. */
+static void __init kdb_cmd_init(void)
+{
+ int i, diag;
+ for (i = 0; kdb_cmds[i]; ++i) {
+ diag = kdb_parse(kdb_cmds[i]);
+ if (diag)
+ kdb_printf("kdb command %s failed, kdb diag %d\n",
+ kdb_cmds[i], diag);
+ }
+ if (defcmd_in_progress) {
+ kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
+ kdb_parse("endefcmd");
+ }
+}
+
+/* Initialize kdb_printf, breakpoint tables and kdb state */
+void __init kdb_init(int lvl)
+{
+ static int kdb_init_lvl = KDB_NOT_INITIALIZED;
+ int i;
+
+ if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
+ return;
+ for (i = kdb_init_lvl; i < lvl; i++) {
+ switch (i) {
+ case KDB_NOT_INITIALIZED:
+ kdb_inittab(); /* Initialize Command Table */
+ kdb_initbptab(); /* Initialize Breakpoints */
+ break;
+ case KDB_INIT_EARLY:
+ kdb_cmd_init(); /* Build kdb_cmds tables */
+ break;
+ }
+ }
+ kdb_init_lvl = lvl;
+}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000..75014d7f4
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,261 @@
+#ifndef _KDBPRIVATE_H
+#define _KDBPRIVATE_H
+
+/*
+ * Kernel Debugger Architecture Independent Private Headers
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ */
+
+#include <linux/kgdb.h>
+#include "../debug_core.h"
+
+/* Kernel Debugger Command codes. Must not overlap with error codes. */
+#define KDB_CMD_GO (-1001)
+#define KDB_CMD_CPU (-1002)
+#define KDB_CMD_SS (-1003)
+#define KDB_CMD_KGDB (-1005)
+
+/* Internal debug flags */
+#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
+#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
+#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
+#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
+#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
+#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
+#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
+#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
+
+#define KDB_DEBUG(flag) (kdb_flags & \
+ (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
+#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
+ kdb_print_state(text, value)
+
+#if BITS_PER_LONG == 32
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%08lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%08lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%08x"
+#define kdb_f_count_fmt "%d"
+
+#elif BITS_PER_LONG == 64
+
+#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
+
+#define kdb_machreg_fmt "0x%lx"
+#define kdb_machreg_fmt0 "0x%016lx"
+#define kdb_bfd_vma_fmt "0x%lx"
+#define kdb_bfd_vma_fmt0 "0x%016lx"
+#define kdb_elfw_addr_fmt "0x%x"
+#define kdb_elfw_addr_fmt0 "0x%016x"
+#define kdb_f_count_fmt "%ld"
+
+#endif
+
+/*
+ * KDB_MAXBPT describes the total number of breakpoints
+ * supported by this architecure.
+ */
+#define KDB_MAXBPT 16
+
+/* Symbol table format returned by kallsyms. */
+typedef struct __ksymtab {
+ unsigned long value; /* Address of symbol */
+ const char *mod_name; /* Module containing symbol or
+ * "kernel" */
+ unsigned long mod_start;
+ unsigned long mod_end;
+ const char *sec_name; /* Section containing symbol */
+ unsigned long sec_start;
+ unsigned long sec_end;
+ const char *sym_name; /* Full symbol name, including
+ * any version */
+ unsigned long sym_start;
+ unsigned long sym_end;
+ } kdb_symtab_t;
+extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
+
+/* Exported Symbols for kernel loadable modules to use. */
+extern int kdb_getarea_size(void *, unsigned long, size_t);
+extern int kdb_putarea_size(unsigned long, void *, size_t);
+
+/*
+ * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
+ * names, not pointers. The underlying *_size functions take pointers.
+ */
+#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
+#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
+
+extern int kdb_getphysword(unsigned long *word,
+ unsigned long addr, size_t size);
+extern int kdb_getword(unsigned long *, unsigned long, size_t);
+extern int kdb_putword(unsigned long, unsigned long, size_t);
+
+extern int kdbgetularg(const char *, unsigned long *);
+extern int kdbgetu64arg(const char *, u64 *);
+extern char *kdbgetenv(const char *);
+extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
+ long *, char **);
+extern int kdbgetsymval(const char *, kdb_symtab_t *);
+extern int kdbnearsym(unsigned long, kdb_symtab_t *);
+extern void kdbnearsym_cleanup(void);
+extern char *kdb_strdup(const char *str, gfp_t type);
+extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
+
+/* Routine for debugging the debugger state. */
+extern void kdb_print_state(const char *, int);
+
+extern int kdb_state;
+#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
+#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
+#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
+#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
+ * kdb control */
+#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
+#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
+#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
+ * after one ss, independent of
+ * DOING_SS */
+#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
+#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
+#define KDB_STATE_PAGER 0x00000400 /* pager is available */
+#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
+ * back to initial cpu */
+#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
+#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
+#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
+#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
+ * adjusted */
+#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
+#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
+ * keyboard on this cpu */
+#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
+#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
+#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
+#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
+ * specific use */
+
+#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
+#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
+#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
+
+extern int kdb_nextline; /* Current number of lines displayed */
+
+typedef struct _kdb_bp {
+ unsigned long bp_addr; /* Address breakpoint is present at */
+ unsigned int bp_free:1; /* This entry is available */
+ unsigned int bp_enabled:1; /* Breakpoint is active in register */
+ unsigned int bp_type:4; /* Uses hardware register */
+ unsigned int bp_installed:1; /* Breakpoint is installed */
+ unsigned int bp_delay:1; /* Do delayed bp handling */
+ unsigned int bp_delayed:1; /* Delayed breakpoint */
+ unsigned int bph_length; /* HW break length */
+} kdb_bp_t;
+
+#ifdef CONFIG_KGDB_KDB
+extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
+
+/* The KDB shell command table */
+typedef struct _kdbtab {
+ char *cmd_name; /* Command name */
+ kdb_func_t cmd_func; /* Function to execute command */
+ char *cmd_usage; /* Usage String for this command */
+ char *cmd_help; /* Help message for this command */
+ short cmd_minlen; /* Minimum legal # command
+ * chars required */
+ kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
+} kdbtab_t;
+
+extern int kdb_bt(int, const char **); /* KDB display back trace */
+
+/* KDB breakpoint management functions */
+extern void kdb_initbptab(void);
+extern void kdb_bp_install(struct pt_regs *);
+extern void kdb_bp_remove(void);
+
+typedef enum {
+ KDB_DB_BPT, /* Breakpoint */
+ KDB_DB_SS, /* Single-step trap */
+ KDB_DB_SSBPT, /* Single step over breakpoint */
+ KDB_DB_NOBPT /* Spurious breakpoint */
+} kdb_dbtrap_t;
+
+extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
+ int, kdb_dbtrap_t, struct pt_regs *);
+
+/* Miscellaneous functions and data areas */
+extern int kdb_grepping_flag;
+#define KDB_GREPPING_FLAG_SEARCH 0x8000
+extern char kdb_grep_string[];
+#define KDB_GREP_STRLEN 256
+extern int kdb_grep_leading;
+extern int kdb_grep_trailing;
+extern char *kdb_cmds[];
+extern unsigned long kdb_task_state_string(const char *);
+extern char kdb_task_state_char (const struct task_struct *);
+extern unsigned long kdb_task_state(const struct task_struct *p,
+ unsigned long mask);
+extern void kdb_ps_suppressed(void);
+extern void kdb_ps1(const struct task_struct *p);
+extern void kdb_print_nameval(const char *name, unsigned long val);
+extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
+extern void kdb_meminfo_proc_show(void);
+extern char *kdb_getstr(char *, size_t, const char *);
+extern void kdb_gdb_state_pass(char *buf);
+
+/* Defines for kdb_symbol_print */
+#define KDB_SP_SPACEB 0x0001 /* Space before string */
+#define KDB_SP_SPACEA 0x0002 /* Space after string */
+#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
+#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
+#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
+#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
+#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
+
+#define KDB_TSK(cpu) kgdb_info[cpu].task
+#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
+
+extern struct task_struct *kdb_curr_task(int);
+
+#define kdb_task_has_cpu(p) (task_curr(p))
+
+/* Simplify coexistence with NPTL */
+#define kdb_do_each_thread(g, p) do_each_thread(g, p)
+#define kdb_while_each_thread(g, p) while_each_thread(g, p)
+
+#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
+
+extern void *debug_kmalloc(size_t size, gfp_t flags);
+extern void debug_kfree(void *);
+extern void debug_kusage(void);
+
+extern void kdb_set_current_task(struct task_struct *);
+extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
+#ifdef CONFIG_MODULES
+extern struct list_head *kdb_modules;
+#endif /* CONFIG_MODULES */
+
+extern char kdb_prompt_str[];
+
+#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
+
+#endif /* CONFIG_KGDB_KDB */
+#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000..d35cc2d3a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
+/*
+ * Kernel Debugger Architecture Independent Support Functions
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
+ * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
+ */
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kallsyms.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+#include <linux/ptrace.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/kdb.h>
+#include <linux/slab.h>
+#include "kdb_private.h"
+
+/*
+ * kdbgetsymval - Return the address of the given symbol.
+ *
+ * Parameters:
+ * symname Character string containing symbol name
+ * symtab Structure to receive results
+ * Returns:
+ * 0 Symbol not found, symtab zero filled
+ * 1 Symbol mapped to module/symbol/section, data in symtab
+ */
+int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
+{
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
+ symtab);
+ memset(symtab, 0, sizeof(*symtab));
+ symtab->sym_start = kallsyms_lookup_name(symname);
+ if (symtab->sym_start) {
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 1, "
+ "symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
+ return 1;
+ }
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbgetsymval: returns 0\n");
+ return 0;
+}
+EXPORT_SYMBOL(kdbgetsymval);
+
+static char *kdb_name_table[100]; /* arbitrary size */
+
+/*
+ * kdbnearsym - Return the name of the symbol with the nearest address
+ * less than 'addr'.
+ *
+ * Parameters:
+ * addr Address to check for symbol near
+ * symtab Structure to receive results
+ * Returns:
+ * 0 No sections contain this address, symtab zero filled
+ * 1 Address mapped to module/symbol/section, data in symtab
+ * Remarks:
+ * 2.6 kallsyms has a "feature" where it unpacks the name into a
+ * string. If that string is reused before the caller expects it
+ * then the caller sees its string change without warning. To
+ * avoid cluttering up the main kdb code with lots of kdb_strdup,
+ * tests and kfree calls, kdbnearsym maintains an LRU list of the
+ * last few unique strings. The list is sized large enough to
+ * hold active strings, no kdb caller of kdbnearsym makes more
+ * than ~20 later calls before using a saved value.
+ */
+int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
+{
+ int ret = 0;
+ unsigned long symbolsize = 0;
+ unsigned long offset = 0;
+#define knt1_size 128 /* must be >= kallsyms table size */
+ char *knt1 = NULL;
+
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
+ memset(symtab, 0, sizeof(*symtab));
+
+ if (addr < 4096)
+ goto out;
+ knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
+ if (!knt1) {
+ kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
+ addr);
+ goto out;
+ }
+ symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
+ (char **)(&symtab->mod_name), knt1);
+ if (offset > 8*1024*1024) {
+ symtab->sym_name = NULL;
+ addr = offset = symbolsize = 0;
+ }
+ symtab->sym_start = addr - offset;
+ symtab->sym_end = symtab->sym_start + symbolsize;
+ ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
+
+ if (ret) {
+ int i;
+ /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
+ * set but the buffer passed into kallsyms_lookup is not used,
+ * so it contains garbage. The caller has to work out which
+ * buffer needs to be saved.
+ *
+ * What was Rusty smoking when he wrote that code?
+ */
+ if (symtab->sym_name != knt1) {
+ strncpy(knt1, symtab->sym_name, knt1_size);
+ knt1[knt1_size-1] = '\0';
+ }
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i] &&
+ strcmp(kdb_name_table[i], knt1) == 0)
+ break;
+ }
+ if (i >= ARRAY_SIZE(kdb_name_table)) {
+ debug_kfree(kdb_name_table[0]);
+ memcpy(kdb_name_table, kdb_name_table+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-1));
+ } else {
+ debug_kfree(knt1);
+ knt1 = kdb_name_table[i];
+ memcpy(kdb_name_table+i, kdb_name_table+i+1,
+ sizeof(kdb_name_table[0]) *
+ (ARRAY_SIZE(kdb_name_table)-i-1));
+ }
+ i = ARRAY_SIZE(kdb_name_table) - 1;
+ kdb_name_table[i] = knt1;
+ symtab->sym_name = kdb_name_table[i];
+ knt1 = NULL;
+ }
+
+ if (symtab->mod_name == NULL)
+ symtab->mod_name = "kernel";
+ if (KDB_DEBUG(AR))
+ kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
+ "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
+ symtab->sym_start, symtab->mod_name, symtab->sym_name,
+ symtab->sym_name);
+
+out:
+ debug_kfree(knt1);
+ return ret;
+}
+
+void kdbnearsym_cleanup(void)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
+ if (kdb_name_table[i]) {
+ debug_kfree(kdb_name_table[i]);
+ kdb_name_table[i] = NULL;
+ }
+ }
+}
+
+static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
+
+/*
+ * kallsyms_symbol_complete
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * max_len maximum length that can be returned
+ * Returns:
+ * Number of symbols which match the given prefix.
+ * Notes:
+ * prefix_name is changed to contain the longest unique prefix that
+ * starts with this prefix (tab completion).
+ */
+int kallsyms_symbol_complete(char *prefix_name, int max_len)
+{
+ loff_t pos = 0;
+ int prefix_len = strlen(prefix_name), prev_len = 0;
+ int i, number = 0;
+ const char *name;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strcpy(ks_namebuf, name);
+ /* Work out the longest name that matches the prefix */
+ if (++number == 1) {
+ prev_len = min_t(int, max_len-1,
+ strlen(ks_namebuf));
+ memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
+ ks_namebuf_prev[prev_len] = '\0';
+ continue;
+ }
+ for (i = 0; i < prev_len; i++) {
+ if (ks_namebuf[i] != ks_namebuf_prev[i]) {
+ prev_len = i;
+ ks_namebuf_prev[i] = '\0';
+ break;
+ }
+ }
+ }
+ }
+ if (prev_len > prefix_len)
+ memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
+ return number;
+}
+
+/*
+ * kallsyms_symbol_next
+ *
+ * Parameters:
+ * prefix_name prefix of a symbol name to lookup
+ * flag 0 means search from the head, 1 means continue search.
+ * Returns:
+ * 1 if a symbol matches the given prefix.
+ * 0 if no string found
+ */
+int kallsyms_symbol_next(char *prefix_name, int flag)
+{
+ int prefix_len = strlen(prefix_name);
+ static loff_t pos;
+ const char *name;
+
+ if (!flag)
+ pos = 0;
+
+ while ((name = kdb_walk_kallsyms(&pos))) {
+ if (strncmp(name, prefix_name, prefix_len) == 0) {
+ strncpy(prefix_name, name, strlen(name)+1);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * kdb_symbol_print - Standard method for printing a symbol name and offset.
+ * Inputs:
+ * addr Address to be printed.
+ * symtab Address of symbol data, if NULL this routine does its
+ * own lookup.
+ * punc Punctuation for string, bit field.
+ * Remarks:
+ * The string and its punctuation is only printed if the address
+ * is inside the kernel, except that the value is always printed
+ * when requested.
+ */
+void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
+ unsigned int punc)
+{
+ kdb_symtab_t symtab, *symtab_p2;
+ if (symtab_p) {
+ symtab_p2 = (kdb_symtab_t *)symtab_p;
+ } else {
+ symtab_p2 = &symtab;
+ kdbnearsym(addr, symtab_p2);
+ }
+ if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
+ return;
+ if (punc & KDB_SP_SPACEB)
+ kdb_printf(" ");
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(kdb_machreg_fmt0, addr);
+ if (symtab_p2->sym_name) {
+ if (punc & KDB_SP_VALUE)
+ kdb_printf(" ");
+ if (punc & KDB_SP_PAREN)
+ kdb_printf("(");
+ if (strcmp(symtab_p2->mod_name, "kernel"))
+ kdb_printf("[%s]", symtab_p2->mod_name);
+ kdb_printf("%s", symtab_p2->sym_name);
+ if (addr != symtab_p2->sym_start)
+ kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
+ if (punc & KDB_SP_SYMSIZE)
+ kdb_printf("/0x%lx",
+ symtab_p2->sym_end - symtab_p2->sym_start);
+ if (punc & KDB_SP_PAREN)
+ kdb_printf(")");
+ }
+ if (punc & KDB_SP_SPACEA)
+ kdb_printf(" ");
+ if (punc & KDB_SP_NEWLINE)
+ kdb_printf("\n");
+}
+
+/*
+ * kdb_strdup - kdb equivalent of strdup, for disasm code.
+ * Inputs:
+ * str The string to duplicate.
+ * type Flags to kmalloc for the new string.
+ * Returns:
+ * Address of the new string, NULL if storage could not be allocated.
+ * Remarks:
+ * This is not in lib/string.c because it uses kmalloc which is not
+ * available when string.o is used in boot loaders.
+ */
+char *kdb_strdup(const char *str, gfp_t type)
+{
+ int n = strlen(str)+1;
+ char *s = kmalloc(n, type);
+ if (!s)
+ return NULL;
+ return strcpy(s, str);
+}
+
+/*
+ * kdb_getarea_size - Read an area of data. The kdb equivalent of
+ * copy_from_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * res Pointer to the area to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getarea_size(void *res, unsigned long addr, size_t size)
+{
+ int ret = probe_kernel_read((char *)res, (char *)addr, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_putarea_size - Write an area of data. The kdb equivalent of
+ * copy_to_user, with kdb messages for invalid addresses.
+ * Inputs:
+ * addr Address of the area to write to.
+ * res Pointer to the area holding the data.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putarea_size(unsigned long addr, void *res, size_t size)
+{
+ int ret = probe_kernel_read((char *)addr, (char *)res, size);
+ if (ret) {
+ if (!KDB_STATE(SUPPRESS)) {
+ kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ KDB_STATE_SET(SUPPRESS);
+ }
+ ret = KDB_BADADDR;
+ } else {
+ KDB_STATE_CLEAR(SUPPRESS);
+ }
+ return ret;
+}
+
+/*
+ * kdb_getphys - Read data from a physical address. Validate the
+ * address is in range, use kmap_atomic() to get data
+ * similar to kdb_getarea() - but for phys addresses
+ * Inputs:
+ * res Pointer to the word to receive the result
+ * addr Physical address of the area to copy
+ * size Size of the area
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+static int kdb_getphys(void *res, unsigned long addr, size_t size)
+{
+ unsigned long pfn;
+ void *vaddr;
+ struct page *page;
+
+ pfn = (addr >> PAGE_SHIFT);
+ if (!pfn_valid(pfn))
+ return 1;
+ page = pfn_to_page(pfn);
+ vaddr = kmap_atomic(page);
+ memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
+ kunmap_atomic(vaddr);
+
+ return 0;
+}
+
+/*
+ * kdb_getphysword
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+
+ switch (size) {
+ case 1:
+ diag = kdb_getphys(&w1, addr, sizeof(w1));
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getphys(&w2, addr, sizeof(w2));
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getphys(&w4, addr, sizeof(w4));
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getphys(&w8, addr, sizeof(w8));
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
+ * data as numbers.
+ * Inputs:
+ * word Pointer to the word to receive the result.
+ * addr Address of the area to copy.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ *word = 0; /* Default value if addr or size is invalid */
+ switch (size) {
+ case 1:
+ diag = kdb_getarea(w1, addr);
+ if (!diag)
+ *word = w1;
+ break;
+ case 2:
+ diag = kdb_getarea(w2, addr);
+ if (!diag)
+ *word = w2;
+ break;
+ case 4:
+ diag = kdb_getarea(w4, addr);
+ if (!diag)
+ *word = w4;
+ break;
+ case 8:
+ if (size <= sizeof(*word)) {
+ diag = kdb_getarea(w8, addr);
+ if (!diag)
+ *word = w8;
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_putword - Write a binary value. Unlike kdb_putarea, this
+ * treats data as numbers.
+ * Inputs:
+ * addr Address of the area to write to..
+ * word The value to set.
+ * size Size of the area.
+ * Returns:
+ * 0 for success, < 0 for error.
+ */
+int kdb_putword(unsigned long addr, unsigned long word, size_t size)
+{
+ int diag;
+ __u8 w1;
+ __u16 w2;
+ __u32 w4;
+ __u64 w8;
+ switch (size) {
+ case 1:
+ w1 = word;
+ diag = kdb_putarea(addr, w1);
+ break;
+ case 2:
+ w2 = word;
+ diag = kdb_putarea(addr, w2);
+ break;
+ case 4:
+ w4 = word;
+ diag = kdb_putarea(addr, w4);
+ break;
+ case 8:
+ if (size <= sizeof(word)) {
+ w8 = word;
+ diag = kdb_putarea(addr, w8);
+ break;
+ }
+ /* drop through */
+ default:
+ diag = KDB_BADWIDTH;
+ kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ }
+ return diag;
+}
+
+/*
+ * kdb_task_state_string - Convert a string containing any of the
+ * letters DRSTCZEUIMA to a mask for the process state field and
+ * return the value. If no argument is supplied, return the mask
+ * that corresponds to environment variable PS, DRSTCZEU by
+ * default.
+ * Inputs:
+ * s String to convert
+ * Returns:
+ * Mask for process state.
+ * Notes:
+ * The mask folds data from several sources into a single long value, so
+ * be careful not to overlap the bits. TASK_* bits are in the LSB,
+ * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
+ * is no overlap between TASK_* and EXIT_* but that may not always be
+ * true, so EXIT_* bits are shifted left 16 bits before being stored in
+ * the mask.
+ */
+
+/* unrunnable is < 0 */
+#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
+#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
+#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
+#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
+
+unsigned long kdb_task_state_string(const char *s)
+{
+ long res = 0;
+ if (!s) {
+ s = kdbgetenv("PS");
+ if (!s)
+ s = "DRSTCZEU"; /* default value for ps */
+ }
+ while (*s) {
+ switch (*s) {
+ case 'D':
+ res |= TASK_UNINTERRUPTIBLE;
+ break;
+ case 'R':
+ res |= RUNNING;
+ break;
+ case 'S':
+ res |= TASK_INTERRUPTIBLE;
+ break;
+ case 'T':
+ res |= TASK_STOPPED;
+ break;
+ case 'C':
+ res |= TASK_TRACED;
+ break;
+ case 'Z':
+ res |= EXIT_ZOMBIE << 16;
+ break;
+ case 'E':
+ res |= EXIT_DEAD << 16;
+ break;
+ case 'U':
+ res |= UNRUNNABLE;
+ break;
+ case 'I':
+ res |= IDLE;
+ break;
+ case 'M':
+ res |= DAEMON;
+ break;
+ case 'A':
+ res = ~0UL;
+ break;
+ default:
+ kdb_printf("%s: unknown flag '%c' ignored\n",
+ __func__, *s);
+ break;
+ }
+ ++s;
+ }
+ return res;
+}
+
+/*
+ * kdb_task_state_char - Return the character that represents the task state.
+ * Inputs:
+ * p struct task for the process
+ * Returns:
+ * One character to represent the task state.
+ */
+char kdb_task_state_char (const struct task_struct *p)
+{
+ int cpu;
+ char state;
+ unsigned long tmp;
+
+ if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ return 'E';
+
+ cpu = kdb_process_cpu(p);
+ state = (p->state == 0) ? 'R' :
+ (p->state < 0) ? 'U' :
+ (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
+ (p->state & TASK_STOPPED) ? 'T' :
+ (p->state & TASK_TRACED) ? 'C' :
+ (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
+ (p->exit_state & EXIT_DEAD) ? 'E' :
+ (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ if (is_idle_task(p)) {
+ /* Idle task. Is it really idle, apart from the kdb
+ * interrupt? */
+ if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
+ if (cpu != kdb_initial_cpu)
+ state = 'I'; /* idle task */
+ }
+ } else if (!p->mm && state == 'S') {
+ state = 'M'; /* sleeping system daemon */
+ }
+ return state;
+}
+
+/*
+ * kdb_task_state - Return true if a process has the desired state
+ * given by the mask.
+ * Inputs:
+ * p struct task for the process
+ * mask mask from kdb_task_state_string to select processes
+ * Returns:
+ * True if the process matches at least one criteria defined by the mask.
+ */
+unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+{
+ char state[] = { kdb_task_state_char(p), '\0' };
+ return (mask & kdb_task_state_string(state)) != 0;
+}
+
+/*
+ * kdb_print_nameval - Print a name and its value, converting the
+ * value to a symbol lookup if possible.
+ * Inputs:
+ * name field name to print
+ * val value of field
+ */
+void kdb_print_nameval(const char *name, unsigned long val)
+{
+ kdb_symtab_t symtab;
+ kdb_printf(" %-11.11s ", name);
+ if (kdbnearsym(val, &symtab))
+ kdb_symbol_print(val, &symtab,
+ KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
+ else
+ kdb_printf("0x%lx\n", val);
+}
+
+/* Last ditch allocator for debugging, so we can still debug even when
+ * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
+ * for space usage, not for speed. One smallish memory pool, the free
+ * chain is always in ascending address order to allow coalescing,
+ * allocations are done in brute force best fit.
+ */
+
+struct debug_alloc_header {
+ u32 next; /* offset of next header from start of pool */
+ u32 size;
+ void *caller;
+};
+
+/* The memory returned by this allocator must be aligned, which means
+ * so must the header size. Do not assume that sizeof(struct
+ * debug_alloc_header) is a multiple of the alignment, explicitly
+ * calculate the overhead of this header, including the alignment.
+ * The rest of this code must not use sizeof() on any header or
+ * pointer to a header.
+ */
+#define dah_align 8
+#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
+
+static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
+static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
+static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
+
+/* Locking is awkward. The debug code is called from all contexts,
+ * including non maskable interrupts. A normal spinlock is not safe
+ * in NMI context. Try to get the debug allocator lock, if it cannot
+ * be obtained after a second then give up. If the lock could not be
+ * previously obtained on this cpu then only try once.
+ *
+ * sparse has no annotation for "this function _sometimes_ acquires a
+ * lock", so fudge the acquire/release notation.
+ */
+static DEFINE_SPINLOCK(dap_lock);
+static int get_dap_lock(void)
+ __acquires(dap_lock)
+{
+ static int dap_locked = -1;
+ int count;
+ if (dap_locked == smp_processor_id())
+ count = 1;
+ else
+ count = 1000;
+ while (1) {
+ if (spin_trylock(&dap_lock)) {
+ dap_locked = -1;
+ return 1;
+ }
+ if (!count--)
+ break;
+ udelay(1000);
+ }
+ dap_locked = smp_processor_id();
+ __acquire(dap_lock);
+ return 0;
+}
+
+void *debug_kmalloc(size_t size, gfp_t flags)
+{
+ unsigned int rem, h_offset;
+ struct debug_alloc_header *best, *bestprev, *prev, *h;
+ void *p = NULL;
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return NULL;
+ }
+ h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first_call) {
+ h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
+ dah_first_call = 0;
+ }
+ size = ALIGN(size, dah_align);
+ prev = best = bestprev = NULL;
+ while (1) {
+ if (h->size >= size && (!best || h->size < best->size)) {
+ best = h;
+ bestprev = prev;
+ if (h->size == size)
+ break;
+ }
+ if (!h->next)
+ break;
+ prev = h;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
+ }
+ if (!best)
+ goto out;
+ rem = best->size - size;
+ /* The pool must always contain at least one header */
+ if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
+ goto out;
+ if (rem >= dah_overhead) {
+ best->size = size;
+ h_offset = ((char *)best - debug_alloc_pool) +
+ dah_overhead + best->size;
+ h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
+ h->size = rem - dah_overhead;
+ h->next = best->next;
+ } else
+ h_offset = best->next;
+ best->caller = __builtin_return_address(0);
+ dah_used += best->size;
+ dah_used_max = max(dah_used, dah_used_max);
+ if (bestprev)
+ bestprev->next = h_offset;
+ else
+ dah_first = h_offset;
+ p = (char *)best + dah_overhead;
+ memset(p, POISON_INUSE, best->size - 1);
+ *((char *)p + best->size - 1) = POISON_END;
+out:
+ spin_unlock(&dap_lock);
+ return p;
+}
+
+void debug_kfree(void *p)
+{
+ struct debug_alloc_header *h;
+ unsigned int h_offset;
+ if (!p)
+ return;
+ if ((char *)p < debug_alloc_pool ||
+ (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
+ kfree(p);
+ return;
+ }
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return; /* memory leak, cannot be helped */
+ }
+ h = (struct debug_alloc_header *)((char *)p - dah_overhead);
+ memset(p, POISON_FREE, h->size - 1);
+ *((char *)p + h->size - 1) = POISON_END;
+ h->caller = NULL;
+ dah_used -= h->size;
+ h_offset = (char *)h - debug_alloc_pool;
+ if (h_offset < dah_first) {
+ h->next = dah_first;
+ dah_first = h_offset;
+ } else {
+ struct debug_alloc_header *prev;
+ unsigned int prev_offset;
+ prev = (struct debug_alloc_header *)(debug_alloc_pool +
+ dah_first);
+ while (1) {
+ if (!prev->next || prev->next > h_offset)
+ break;
+ prev = (struct debug_alloc_header *)
+ (debug_alloc_pool + prev->next);
+ }
+ prev_offset = (char *)prev - debug_alloc_pool;
+ if (prev_offset + dah_overhead + prev->size == h_offset) {
+ prev->size += dah_overhead + h->size;
+ memset(h, POISON_FREE, dah_overhead - 1);
+ *((char *)h + dah_overhead - 1) = POISON_END;
+ h = prev;
+ h_offset = prev_offset;
+ } else {
+ h->next = prev->next;
+ prev->next = h_offset;
+ }
+ }
+ if (h_offset + dah_overhead + h->size == h->next) {
+ struct debug_alloc_header *next;
+ next = (struct debug_alloc_header *)
+ (debug_alloc_pool + h->next);
+ h->size += dah_overhead + next->size;
+ h->next = next->next;
+ memset(next, POISON_FREE, dah_overhead - 1);
+ *((char *)next + dah_overhead - 1) = POISON_END;
+ }
+ spin_unlock(&dap_lock);
+}
+
+void debug_kusage(void)
+{
+ struct debug_alloc_header *h_free, *h_used;
+#ifdef CONFIG_IA64
+ /* FIXME: using dah for ia64 unwind always results in a memory leak.
+ * Fix that memory leak first, then set debug_kusage_one_time = 1 for
+ * all architectures.
+ */
+ static int debug_kusage_one_time;
+#else
+ static int debug_kusage_one_time = 1;
+#endif
+ if (!get_dap_lock()) {
+ __release(dap_lock); /* we never actually got it */
+ return;
+ }
+ h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
+ if (dah_first == 0 &&
+ (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
+ dah_first_call))
+ goto out;
+ if (!debug_kusage_one_time)
+ goto out;
+ debug_kusage_one_time = 0;
+ kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
+ __func__, dah_first);
+ if (dah_first) {
+ h_used = (struct debug_alloc_header *)debug_alloc_pool;
+ kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
+ h_used->size);
+ }
+ do {
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+ h_free = (struct debug_alloc_header *)
+ (debug_alloc_pool + h_free->next);
+ } while (h_free->next);
+ h_used = (struct debug_alloc_header *)
+ ((char *)h_free + dah_overhead + h_free->size);
+ if ((char *)h_used - debug_alloc_pool !=
+ sizeof(debug_alloc_pool_aligned))
+ kdb_printf("%s: h_used %p size %d caller %p\n",
+ __func__, h_used, h_used->size, h_used->caller);
+out:
+ spin_unlock(&dap_lock);
+}
+
+/* Maintain a small stack of kdb_flags to allow recursion without disturbing
+ * the global kdb state.
+ */
+
+static int kdb_flags_stack[4], kdb_flags_index;
+
+void kdb_save_flags(void)
+{
+ BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
+ kdb_flags_stack[kdb_flags_index++] = kdb_flags;
+}
+
+void kdb_restore_flags(void)
+{
+ BUG_ON(kdb_flags_index <= 0);
+ kdb_flags = kdb_flags_stack[--kdb_flags_index];
+}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000..d12807d40
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,158 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/taskstats.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+#include <linux/module.h>
+
+int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
+EXPORT_SYMBOL_GPL(delayacct_on);
+struct kmem_cache *delayacct_cache;
+
+static int __init delayacct_setup_disable(char *str)
+{
+ delayacct_on = 0;
+ return 1;
+}
+__setup("nodelayacct", delayacct_setup_disable);
+
+void delayacct_init(void)
+{
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+ tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
+ if (tsk->delays)
+ spin_lock_init(&tsk->delays->lock);
+}
+
+/*
+ * Finish delay accounting for a statistic using its timestamps (@start),
+ * accumalator (@total) and @count
+ */
+static void delayacct_end(u64 *start, u64 *total, u32 *count)
+{
+ s64 ns = ktime_get_ns() - *start;
+ unsigned long flags;
+
+ if (ns > 0) {
+ spin_lock_irqsave(&current->delays->lock, flags);
+ *total += ns;
+ (*count)++;
+ spin_unlock_irqrestore(&current->delays->lock, flags);
+ }
+}
+
+void __delayacct_blkio_start(void)
+{
+ current->delays->blkio_start = ktime_get_ns();
+}
+
+void __delayacct_blkio_end(void)
+{
+ if (current->delays->flags & DELAYACCT_PF_SWAPIN)
+ /* Swapin block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count);
+ else /* Other block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->blkio_delay,
+ &current->delays->blkio_count);
+}
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+ cputime_t utime, stime, stimescaled, utimescaled;
+ unsigned long long t2, t3;
+ unsigned long flags, t1;
+ s64 tmp;
+
+ task_cputime(tsk, &utime, &stime);
+ tmp = (s64)d->cpu_run_real_total;
+ tmp += cputime_to_nsecs(utime + stime);
+ d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ tmp = (s64)d->cpu_scaled_run_real_total;
+ tmp += cputime_to_nsecs(utimescaled + stimescaled);
+ d->cpu_scaled_run_real_total =
+ (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
+
+ /*
+ * No locking available for sched_info (and too expensive to add one)
+ * Mitigate by taking snapshot of values
+ */
+ t1 = tsk->sched_info.pcount;
+ t2 = tsk->sched_info.run_delay;
+ t3 = tsk_seruntime(tsk);
+
+ d->cpu_count += t1;
+
+ tmp = (s64)d->cpu_delay_total + t2;
+ d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+ tmp = (s64)d->cpu_run_virtual_total + t3;
+ d->cpu_run_virtual_total =
+ (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+
+ /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+ spin_lock_irqsave(&tsk->delays->lock, flags);
+ tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+ d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+ tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+ d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+ tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
+ d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
+ d->blkio_count += tsk->delays->blkio_count;
+ d->swapin_count += tsk->delays->swapin_count;
+ d->freepages_count += tsk->delays->freepages_count;
+ spin_unlock_irqrestore(&tsk->delays->lock, flags);
+
+ return 0;
+}
+
+__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
+{
+ __u64 ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->delays->lock, flags);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay +
+ tsk->delays->swapin_delay);
+ spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ return ret;
+}
+
+void __delayacct_freepages_start(void)
+{
+ current->delays->freepages_start = ktime_get_ns();
+}
+
+void __delayacct_freepages_end(void)
+{
+ delayacct_end(&current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
+}
+
diff --git a/kernel/dma.c b/kernel/dma.c
new file mode 100644
index 000000000..6c6262f86
--- /dev/null
+++ b/kernel/dma.c
@@ -0,0 +1,160 @@
+/*
+ * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
+ *
+ * Written by Hennus Bergman, 1992.
+ *
+ * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma.
+ * In the previous version the reported device could end up being wrong,
+ * if a device requested a DMA channel that was already in use.
+ * [It also happened to remove the sizeof(char *) == sizeof(int)
+ * assumption introduced because of those /proc/dma patches. -- Hennus]
+ */
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <asm/dma.h>
+
+
+
+/* A note on resource allocation:
+ *
+ * All drivers needing DMA channels, should allocate and release them
+ * through the public routines `request_dma()' and `free_dma()'.
+ *
+ * In order to avoid problems, all processes should allocate resources in
+ * the same sequence and release them in the reverse order.
+ *
+ * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA.
+ * When releasing them, first release the DMA, then release the IRQ.
+ * If you don't, you may cause allocation requests to fail unnecessarily.
+ * This doesn't really matter now, but it will once we get real semaphores
+ * in the kernel.
+ */
+
+
+DEFINE_SPINLOCK(dma_spin_lock);
+
+/*
+ * If our port doesn't define this it has no PC like DMA
+ */
+
+#ifdef MAX_DMA_CHANNELS
+
+
+/* Channel n is busy iff dma_chan_busy[n].lock != 0.
+ * DMA0 used to be reserved for DRAM refresh, but apparently not any more...
+ * DMA4 is reserved for cascading.
+ */
+
+struct dma_chan {
+ int lock;
+ const char *device_id;
+};
+
+static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
+ [4] = { 1, "cascade" },
+};
+
+
+/**
+ * request_dma - request and reserve a system DMA channel
+ * @dmanr: DMA channel number
+ * @device_id: reserving device ID string, used in /proc/dma
+ */
+int request_dma(unsigned int dmanr, const char * device_id)
+{
+ if (dmanr >= MAX_DMA_CHANNELS)
+ return -EINVAL;
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0)
+ return -EBUSY;
+
+ dma_chan_busy[dmanr].device_id = device_id;
+
+ /* old flag was 0, now contains 1 to indicate busy */
+ return 0;
+} /* request_dma */
+
+/**
+ * free_dma - free a reserved system DMA channel
+ * @dmanr: DMA channel number
+ */
+void free_dma(unsigned int dmanr)
+{
+ if (dmanr >= MAX_DMA_CHANNELS) {
+ printk(KERN_WARNING "Trying to free DMA%d\n", dmanr);
+ return;
+ }
+
+ if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) {
+ printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr);
+ return;
+ }
+
+} /* free_dma */
+
+#else
+
+int request_dma(unsigned int dmanr, const char *device_id)
+{
+ return -EINVAL;
+}
+
+void free_dma(unsigned int dmanr)
+{
+}
+
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#ifdef MAX_DMA_CHANNELS
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) {
+ if (dma_chan_busy[i].lock) {
+ seq_printf(m, "%2d: %s\n", i,
+ dma_chan_busy[i].device_id);
+ }
+ }
+ return 0;
+}
+#else
+static int proc_dma_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "No DMA\n");
+ return 0;
+}
+#endif /* MAX_DMA_CHANNELS */
+
+static int proc_dma_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_dma_show, NULL);
+}
+
+static const struct file_operations proc_dma_operations = {
+ .open = proc_dma_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_dma_init(void)
+{
+ proc_create("dma", 0, NULL, &proc_dma_operations);
+ return 0;
+}
+
+__initcall(proc_dma_init);
+#endif
+
+EXPORT_SYMBOL(request_dma);
+EXPORT_SYMBOL(free_dma);
+EXPORT_SYMBOL(dma_spin_lock);
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000..e556751d1
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,24 @@
+#include <linux/elf.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/binfmts.h>
+
+Elf_Half __weak elf_core_extra_phdrs(void)
+{
+ return 0;
+}
+
+int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
+{
+ return 1;
+}
+
+int __weak elf_core_write_extra_data(struct coredump_params *cprm)
+{
+ return 1;
+}
+
+size_t __weak elf_core_extra_data_size(void)
+{
+ return 0;
+}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000..2925188f5
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,9 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
+endif
+
+obj-y := core.o ring_buffer.o callchain.o
+
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000..d65948725
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,209 @@
+/*
+ * Performance events callchain code, extracted from core.c:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct callchain_cpus_entries {
+ struct rcu_head rcu_head;
+ struct perf_callchain_entry *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+static struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+ struct callchain_cpus_entries *entries;
+ int cpu;
+
+ entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+
+ kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+ struct callchain_cpus_entries *entries;
+
+ entries = callchain_cpus_entries;
+ RCU_INIT_POINTER(callchain_cpus_entries, NULL);
+ call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+ int cpu;
+ int size;
+ struct callchain_cpus_entries *entries;
+
+ /*
+ * We can't use the percpu allocation API for data that can be
+ * accessed from NMI. Use a temporary manual per cpu allocation
+ * until that gets sorted out.
+ */
+ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
+
+ entries = kzalloc(size, GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+ for_each_possible_cpu(cpu) {
+ entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!entries->cpu_entries[cpu])
+ goto fail;
+ }
+
+ rcu_assign_pointer(callchain_cpus_entries, entries);
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+ kfree(entries);
+
+ return -ENOMEM;
+}
+
+int get_callchain_buffers(void)
+{
+ int err = 0;
+ int count;
+
+ mutex_lock(&callchain_mutex);
+
+ count = atomic_inc_return(&nr_callchain_events);
+ if (WARN_ON_ONCE(count < 1)) {
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if (count > 1) {
+ /* If the allocation failed, give up */
+ if (!callchain_cpus_entries)
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ err = alloc_callchain_buffers();
+exit:
+ if (err)
+ atomic_dec(&nr_callchain_events);
+
+ mutex_unlock(&callchain_mutex);
+
+ return err;
+}
+
+void put_callchain_buffers(void)
+{
+ if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+ release_callchain_buffers();
+ mutex_unlock(&callchain_mutex);
+ }
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+ int cpu;
+ struct callchain_cpus_entries *entries;
+
+ *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
+ if (*rctx == -1)
+ return NULL;
+
+ entries = rcu_dereference(callchain_cpus_entries);
+ if (!entries)
+ return NULL;
+
+ cpu = smp_processor_id();
+
+ return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+ put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
+}
+
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
+{
+ int rctx;
+ struct perf_callchain_entry *entry;
+
+ int kernel = !event->attr.exclude_callchain_kernel;
+ int user = !event->attr.exclude_callchain_user;
+
+ if (!kernel && !user)
+ return NULL;
+
+ entry = get_callchain_entry(&rctx);
+ if (rctx == -1)
+ return NULL;
+
+ if (!entry)
+ goto exit_put;
+
+ entry->nr = 0;
+
+ if (kernel && !user_mode(regs)) {
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(entry, regs);
+ }
+
+ if (user) {
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ /*
+ * Disallow cross-task user callchains.
+ */
+ if (event->ctx->task && event->ctx->task != current)
+ goto exit_put;
+
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
+ }
+
+exit_put:
+ put_callchain_entry(rctx);
+
+ return entry;
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
new file mode 100644
index 000000000..0ceb38677
--- /dev/null
+++ b/kernel/events/core.c
@@ -0,0 +1,9083 @@
+/*
+ * Performance events core code:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/idr.h>
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/tick.h>
+#include <linux/sysfs.h>
+#include <linux/dcache.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/reboot.h>
+#include <linux/vmstat.h>
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
+#include <linux/perf_event.h>
+#include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
+#include <linux/module.h>
+#include <linux/mman.h>
+#include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+#include "internal.h"
+
+#include <asm/irq_regs.h>
+
+static struct workqueue_struct *perf_wq;
+
+struct remote_function_call {
+ struct task_struct *p;
+ int (*func)(void *info);
+ void *info;
+ int ret;
+};
+
+static void remote_function(void *data)
+{
+ struct remote_function_call *tfc = data;
+ struct task_struct *p = tfc->p;
+
+ if (p) {
+ tfc->ret = -EAGAIN;
+ if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ return;
+ }
+
+ tfc->ret = tfc->func(tfc->info);
+}
+
+/**
+ * task_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ *
+ * returns: @func return value, or
+ * -ESRCH - when the process isn't running
+ * -EAGAIN - when the process moved away
+ */
+static int
+task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = p,
+ .func = func,
+ .info = info,
+ .ret = -ESRCH, /* No such (running) process */
+ };
+
+ if (task_curr(p))
+ smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+
+ return data.ret;
+}
+
+/**
+ * cpu_function_call - call a function on the cpu
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func on the remote cpu.
+ *
+ * returns: @func return value or -ENXIO when the cpu is offline
+ */
+static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = NULL,
+ .func = func,
+ .info = info,
+ .ret = -ENXIO, /* No such CPU */
+ };
+
+ smp_call_function_single(cpu, remote_function, &data, 1);
+
+ return data.ret;
+}
+
+#define EVENT_OWNER_KERNEL ((void *) -1)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+ return event->owner == EVENT_OWNER_KERNEL;
+}
+
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+ PERF_FLAG_FD_OUTPUT |\
+ PERF_FLAG_PID_CGROUP |\
+ PERF_FLAG_FD_CLOEXEC)
+
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+ (PERF_SAMPLE_BRANCH_KERNEL |\
+ PERF_SAMPLE_BRANCH_HV)
+
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x1,
+ EVENT_PINNED = 0x2,
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+struct static_key_deferred perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+
+static atomic_t nr_mmap_events __read_mostly;
+static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_task_events __read_mostly;
+static atomic_t nr_freq_events __read_mostly;
+
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
+/*
+ * perf event paranoia level:
+ * -1 - not paranoid at all
+ * 0 - disallow raw tracepoint access for unpriv
+ * 1 - disallow cpu events for unpriv
+ * 2 - disallow kernel profiling for unpriv
+ */
+int sysctl_perf_event_paranoid __read_mostly = 1;
+
+/* Minimum for 512 kiB + 1 user control page */
+int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
+
+/*
+ * max perf event sample rate
+ */
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT 25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static int perf_sample_allowed_ns __read_mostly =
+ DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
+
+void update_perf_cpu_limits(void)
+{
+ u64 tmp = perf_sample_period_ns;
+
+ tmp *= sysctl_perf_cpu_time_max_percent;
+ do_div(tmp, 100);
+ ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ update_perf_cpu_limits();
+
+ return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+static DEFINE_PER_CPU(u64, running_sample_length);
+
+static void perf_duration_warn(struct irq_work *w)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ local_samples_len = __this_cpu_read(running_sample_length);
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ printk_ratelimited(KERN_WARNING
+ "perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
+
+ if (allowed_ns == 0)
+ return;
+
+ /* decay the counter by 1 average sample */
+ local_samples_len = __this_cpu_read(running_sample_length);
+ local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+ local_samples_len += sample_len_ns;
+ __this_cpu_write(running_sample_length, local_samples_len);
+
+ /*
+ * note: this will be biased artifically low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * from having to maintain a count.
+ */
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ if (avg_local_sample_len <= allowed_ns)
+ return;
+
+ if (max_samples_per_tick <= 1)
+ return;
+
+ max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+ sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+ update_perf_cpu_limits();
+
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
+}
+
+static atomic64_t perf_event_id;
+
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
+
+void __weak perf_event_print_debug(void) { }
+
+extern __weak const char *perf_pmu_name(void)
+{
+ return "pmu";
+}
+
+static inline u64 perf_clock(void)
+{
+ return local_clock();
+}
+
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+ return event->clock();
+}
+
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx)
+ raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ raw_spin_unlock(&ctx->lock);
+ raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ /* @event doesn't care about cgroup */
+ if (!event->cgrp)
+ return true;
+
+ /* wants specific cgroup scope but @cpuctx isn't associated with any */
+ if (!cpuctx->cgrp)
+ return false;
+
+ /*
+ * Cgroup scoping is recursive. An event enabled for a cgroup is
+ * also enabled for all its descendant cgroups. If @cpuctx's
+ * cgroup is a descendant of @event's (the test covers identity
+ * case), it's a match.
+ */
+ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+ css_put(&event->cgrp->css);
+ event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ struct perf_cgroup_info *t;
+
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+ struct perf_cgroup_info *info;
+ u64 now;
+
+ now = perf_clock();
+
+ info = this_cpu_ptr(cgrp->info);
+
+ info->time += now - info->timestamp;
+ info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+ struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+ if (cgrp_out)
+ __update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+ struct perf_cgroup *cgrp;
+
+ /*
+ * ensure we access cgroup data only when needed and
+ * when we know the cgroup is pinned (css_get)
+ */
+ if (!is_cgroup_event(event))
+ return;
+
+ cgrp = perf_cgroup_from_task(current);
+ /*
+ * Do not update time when cgroup is not active
+ */
+ if (cgrp == event->cgrp)
+ __update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+ struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
+
+ /*
+ * ctx->lock held by caller
+ * ensure we do not access cgroup data
+ * unless we have the cgroup pinned (css_get)
+ */
+ if (!task || !ctx->nr_cgroups)
+ return;
+
+ cgrp = perf_cgroup_from_task(task);
+ info = this_cpu_ptr(cgrp->info);
+ info->timestamp = ctx->timestamp;
+}
+
+#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /*
+ * disable interrupts to avoid geting nr_cgroup
+ * changes via __perf_event_disable(). Also
+ * avoids preemption.
+ */
+ local_irq_save(flags);
+
+ /*
+ * we reschedule only in the presence of cgroup
+ * constrained events.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ continue; /* ensure we process each cpuctx once */
+
+ /*
+ * perf_cgroup_events says at least one
+ * context on this CPU has cgroup events.
+ *
+ * ctx->nr_cgroups reports the number of cgroup
+ * events for a context.
+ */
+ if (cpuctx->ctx.nr_cgroups > 0) {
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
+
+ if (mode & PERF_CGROUP_SWIN) {
+ WARN_ON_ONCE(cpuctx->cgrp);
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ }
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+ struct perf_cgroup *cgrp1;
+ struct perf_cgroup *cgrp2 = NULL;
+
+ /*
+ * we come here when we know perf_cgroup_events > 0
+ */
+ cgrp1 = perf_cgroup_from_task(task);
+
+ /*
+ * next is NULL when called from perf_event_enable_on_exec()
+ * that will systematically cause a cgroup_switch()
+ */
+ if (next)
+ cgrp2 = perf_cgroup_from_task(next);
+
+ /*
+ * only schedule out current cgroup events if we know
+ * that we are switching to a different cgroup. Otherwise,
+ * do no touch the cgroup events.
+ */
+ if (cgrp1 != cgrp2)
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ struct perf_cgroup *cgrp1;
+ struct perf_cgroup *cgrp2 = NULL;
+
+ /*
+ * we come here when we know perf_cgroup_events > 0
+ */
+ cgrp1 = perf_cgroup_from_task(task);
+
+ /* prev can never be NULL */
+ cgrp2 = perf_cgroup_from_task(prev);
+
+ /*
+ * only need to schedule in cgroup events if we are changing
+ * cgroup during ctxsw. Cgroup events were not scheduled
+ * out of ctxsw out if that was not the case.
+ */
+ if (cgrp1 != cgrp2)
+ perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ struct perf_cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct fd f = fdget(fd);
+ int ret = 0;
+
+ if (!f.file)
+ return -EBADF;
+
+ css = css_tryget_online_from_dir(f.file->f_path.dentry,
+ &perf_event_cgrp_subsys);
+ if (IS_ERR(css)) {
+ ret = PTR_ERR(css);
+ goto out;
+ }
+
+ cgrp = container_of(css, struct perf_cgroup, css);
+ event->cgrp = cgrp;
+
+ /*
+ * all events in a group must monitor
+ * the same cgroup because a task belongs
+ * to only one perf cgroup at a time
+ */
+ if (group_leader && group_leader->cgrp != cgrp) {
+ perf_detach_cgroup(event);
+ ret = -EINVAL;
+ }
+out:
+ fdput(f);
+ return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+ struct perf_cgroup_info *t;
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+ /*
+ * when the current task's perf cgroup does not match
+ * the event's, we need to remember to call the
+ * perf_mark_enable() function the first time a task with
+ * a matching perf cgroup is scheduled in.
+ */
+ if (is_cgroup_event(event) && !perf_cgroup_match(event))
+ event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ if (!event->cgrp_defer_enabled)
+ return;
+
+ event->cgrp_defer_enabled = 0;
+
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->cgrp_defer_enabled = 0;
+ }
+ }
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+}
+#endif
+
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disbled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+ struct perf_cpu_context *cpuctx;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ int rotations = 0;
+
+ WARN_ON(!irqs_disabled());
+
+ cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+ rotations = perf_rotate_context(cpuctx);
+
+ /*
+ * arm timer if needed
+ */
+ if (rotations) {
+ hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ ret = HRTIMER_RESTART;
+ }
+
+ return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (WARN_ON(cpu != smp_processor_id()))
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ continue;
+
+ hrtimer_cancel(&cpuctx->hrtimer);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+ int timer;
+
+ /* no multiplexing needed for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ /*
+ * check default is sane, if not set then force to
+ * default interval (1/tick)
+ */
+ timer = pmu->hrtimer_interval_ms;
+ if (timer < 1)
+ timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+ struct hrtimer *hr = &cpuctx->hrtimer;
+ struct pmu *pmu = cpuctx->ctx.pmu;
+
+ /* not for SW PMU */
+ if (pmu->task_ctx_nr == perf_sw_context)
+ return;
+
+ if (hrtimer_active(hr))
+ return;
+
+ if (!hrtimer_callback_running(hr))
+ __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+ 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
+void perf_pmu_disable(struct pmu *pmu)
+{
+ int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ if (!(*count)++)
+ pmu->pmu_disable(pmu);
+}
+
+void perf_pmu_enable(struct pmu *pmu)
+{
+ int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ if (!--(*count))
+ pmu->pmu_enable(pmu);
+}
+
+static DEFINE_PER_CPU(struct list_head, active_ctx_list);
+
+/*
+ * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
+ * perf_event_task_tick() are fully serialized because they're strictly cpu
+ * affine and perf_event_ctx{activate,deactivate} are called with IRQs
+ * disabled, while perf_event_task_tick is called from IRQ context.
+ */
+static void perf_event_ctx_activate(struct perf_event_context *ctx)
+{
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
+
+ WARN_ON(!irqs_disabled());
+
+ WARN_ON(!list_empty(&ctx->active_ctx_list));
+
+ list_add(&ctx->active_ctx_list, head);
+}
+
+static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+{
+ WARN_ON(!irqs_disabled());
+
+ WARN_ON(list_empty(&ctx->active_ctx_list));
+
+ list_del_init(&ctx->active_ctx_list);
+}
+
+static void get_ctx(struct perf_event_context *ctx)
+{
+ WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+}
+
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_event_context *ctx;
+
+ ctx = container_of(head, struct perf_event_context, rcu_head);
+ kfree(ctx->task_ctx_data);
+ kfree(ctx);
+}
+
+static void put_ctx(struct perf_event_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ if (ctx->parent_ctx)
+ put_ctx(ctx->parent_ctx);
+ if (ctx->task)
+ put_task_struct(ctx->task);
+ call_rcu(&ctx->rcu_head, free_ctx);
+ }
+}
+
+/*
+ * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
+ * perf_pmu_migrate_context() we need some magic.
+ *
+ * Those places that change perf_event::ctx will hold both
+ * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
+ *
+ * Lock ordering is by mutex address. There are two other sites where
+ * perf_event_context::mutex nests and those are:
+ *
+ * - perf_event_exit_task_context() [ child , 0 ]
+ * __perf_event_exit_task()
+ * sync_child_event()
+ * put_event() [ parent, 1 ]
+ *
+ * - perf_event_init_context() [ parent, 0 ]
+ * inherit_task_group()
+ * inherit_group()
+ * inherit_event()
+ * perf_event_alloc()
+ * perf_init_event()
+ * perf_try_init_event() [ child , 1 ]
+ *
+ * While it appears there is an obvious deadlock here -- the parent and child
+ * nesting levels are inverted between the two. This is in fact safe because
+ * life-time rules separate them. That is an exiting task cannot fork, and a
+ * spawning task cannot (yet) exit.
+ *
+ * But remember that that these are parent<->child context relations, and
+ * migration does not affect children, therefore these two orderings should not
+ * interact.
+ *
+ * The change in perf_event::ctx does not affect children (as claimed above)
+ * because the sys_perf_event_open() case will install a new event and break
+ * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
+ * concerned with cpuctx and that doesn't have children.
+ *
+ * The places that change perf_event::ctx will issue:
+ *
+ * perf_remove_from_context();
+ * synchronize_rcu();
+ * perf_install_in_context();
+ *
+ * to affect the change. The remove_from_context() + synchronize_rcu() should
+ * quiesce the event, after which we can install it in the new location. This
+ * means that only external vectors (perf_fops, prctl) can perturb the event
+ * while in transit. Therefore all such accessors should also acquire
+ * perf_event_context::mutex to serialize against this.
+ *
+ * However; because event->ctx can change while we're waiting to acquire
+ * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
+ * function.
+ *
+ * Lock order:
+ * task_struct::perf_event_mutex
+ * perf_event_context::mutex
+ * perf_event_context::lock
+ * perf_event::child_mutex;
+ * perf_event::mmap_mutex
+ * mmap_sem
+ */
+static struct perf_event_context *
+perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
+{
+ struct perf_event_context *ctx;
+
+again:
+ rcu_read_lock();
+ ctx = ACCESS_ONCE(event->ctx);
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_nested(&ctx->mutex, nesting);
+ if (event->ctx != ctx) {
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+
+ return ctx;
+}
+
+static inline struct perf_event_context *
+perf_event_ctx_lock(struct perf_event *event)
+{
+ return perf_event_ctx_lock_nested(event, 0);
+}
+
+static void perf_event_ctx_unlock(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+}
+
+/*
+ * This must be done under the ctx->lock, such as to serialize against
+ * context_equiv(), therefore we cannot call put_ctx() since that might end up
+ * calling scheduler related locks and ctx->lock nests inside those.
+ */
+static __must_check struct perf_event_context *
+unclone_ctx(struct perf_event_context *ctx)
+{
+ struct perf_event_context *parent_ctx = ctx->parent_ctx;
+
+ lockdep_assert_held(&ctx->lock);
+
+ if (parent_ctx)
+ ctx->parent_ctx = NULL;
+ ctx->generation++;
+
+ return parent_ctx;
+}
+
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_pid_nr_ns(p, event->ns);
+}
+
+/*
+ * If we inherit events we want to return the parent event id
+ * to userspace.
+ */
+static u64 primary_event_id(struct perf_event *event)
+{
+ u64 id = event->id;
+
+ if (event->parent)
+ id = event->parent->id;
+
+ return id;
+}
+
+/*
+ * Get the perf_event_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_event_context *
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+{
+ struct perf_event_context *ctx;
+
+retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_event_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more.
+ */
+ raw_spin_lock_irqsave(&ctx->lock, *flags);
+ if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
+ goto retry;
+ }
+
+ if (!atomic_inc_not_zero(&ctx->refcount)) {
+ raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ ctx = NULL;
+ }
+ }
+ rcu_read_unlock();
+ preempt_enable();
+ return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task. This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
+{
+ struct perf_event_context *ctx;
+ unsigned long flags;
+
+ ctx = perf_lock_task_context(task, ctxn, &flags);
+ if (ctx) {
+ ++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+ return ctx;
+}
+
+static void perf_unpin_context(struct perf_event_context *ctx)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ --ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+ u64 now = perf_clock();
+
+ ctx->time += now - ctx->timestamp;
+ ctx->timestamp = now;
+}
+
+static u64 perf_event_time(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time(event);
+
+ return ctx ? ctx->time : 0;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a event.
+ * The caller of this function needs to hold the ctx->lock.
+ */
+static void update_event_times(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ u64 run_end;
+
+ if (event->state < PERF_EVENT_STATE_INACTIVE ||
+ event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+ return;
+ /*
+ * in cgroup mode, time_enabled represents
+ * the time the event was enabled AND active
+ * tasks were in the monitored cgroup. This is
+ * independent of the activity of the context as
+ * there may be a mix of cgroup and non-cgroup events.
+ *
+ * That is why we treat cgroup events differently
+ * here.
+ */
+ if (is_cgroup_event(event))
+ run_end = perf_cgroup_event_time(event);
+ else if (ctx->is_active)
+ run_end = ctx->time;
+ else
+ run_end = event->tstamp_stopped;
+
+ event->total_time_enabled = run_end - event->tstamp_enabled;
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ run_end = event->tstamp_stopped;
+ else
+ run_end = perf_event_time(event);
+
+ event->total_time_running = run_end - event->tstamp_running;
+
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+ struct perf_event *event;
+
+ update_event_times(leader);
+ list_for_each_entry(event, &leader->sibling_list, group_entry)
+ update_event_times(event);
+}
+
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+ if (event->attr.pinned)
+ return &ctx->pinned_groups;
+ else
+ return &ctx->flexible_groups;
+}
+
+/*
+ * Add a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_add_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ event->attach_state |= PERF_ATTACH_CONTEXT;
+
+ /*
+ * If we're a stand alone event or group leader, we go to the context
+ * list, group events are kept attached to the group so that
+ * perf_group_detach can, at all times, locate all siblings.
+ */
+ if (event->group_leader == event) {
+ struct list_head *list;
+
+ if (is_software_event(event))
+ event->group_flags |= PERF_GROUP_SOFTWARE;
+
+ list = ctx_group_list(event, ctx);
+ list_add_tail(&event->group_entry, list);
+ }
+
+ if (is_cgroup_event(event))
+ ctx->nr_cgroups++;
+
+ list_add_rcu(&event->event_entry, &ctx->event_list);
+ ctx->nr_events++;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat++;
+
+ ctx->generation++;
+}
+
+/*
+ * Initialize event state based on the perf_event_attr::disabled.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+ event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+ PERF_EVENT_STATE_INACTIVE;
+}
+
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += event->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+ event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ perf_event__read_size(event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ size += sizeof(data->ip);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ size += sizeof(data->addr);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ size += sizeof(data->period);
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ size += sizeof(data->weight);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ size += event->read_size;
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ size += sizeof(data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ size += sizeof(data->txn);
+
+ event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ size += sizeof(data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ size += sizeof(data->time);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ size += sizeof(data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ size += sizeof(data->cpu_entry);
+
+ event->id_header_size = size;
+}
+
+static void perf_group_attach(struct perf_event *event)
+{
+ struct perf_event *group_leader = event->group_leader, *pos;
+
+ /*
+ * We can have double attach due to group movement in perf_event_open.
+ */
+ if (event->attach_state & PERF_ATTACH_GROUP)
+ return;
+
+ event->attach_state |= PERF_ATTACH_GROUP;
+
+ if (group_leader == event)
+ return;
+
+ WARN_ON_ONCE(group_leader->ctx != event->ctx);
+
+ if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+ !is_software_event(event))
+ group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
+ list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ group_leader->nr_siblings++;
+
+ perf_event__header_size(group_leader);
+
+ list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ perf_event__header_size(pos);
+}
+
+/*
+ * Remove a event from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
+static void
+list_del_event(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_CONTEXT))
+ return;
+
+ event->attach_state &= ~PERF_ATTACH_CONTEXT;
+
+ if (is_cgroup_event(event)) {
+ ctx->nr_cgroups--;
+ cpuctx = __get_cpu_context(ctx);
+ /*
+ * if there are no more cgroup events
+ * then cler cgrp to avoid stale pointer
+ * in update_cgrp_time_from_cpuctx()
+ */
+ if (!ctx->nr_cgroups)
+ cpuctx->cgrp = NULL;
+ }
+
+ ctx->nr_events--;
+ if (event->attr.inherit_stat)
+ ctx->nr_stat--;
+
+ list_del_rcu(&event->event_entry);
+
+ if (event->group_leader == event)
+ list_del_init(&event->group_entry);
+
+ update_group_times(event);
+
+ /*
+ * If event was in error state, then keep it
+ * that way, otherwise bogus counts will be
+ * returned on read(). The only way to get out
+ * of error state is by explicit re-enabling
+ * of the event
+ */
+ if (event->state > PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_OFF;
+
+ ctx->generation++;
+}
+
+static void perf_group_detach(struct perf_event *event)
+{
+ struct perf_event *sibling, *tmp;
+ struct list_head *list = NULL;
+
+ /*
+ * We can have double detach due to exit/hot-unplug + close.
+ */
+ if (!(event->attach_state & PERF_ATTACH_GROUP))
+ return;
+
+ event->attach_state &= ~PERF_ATTACH_GROUP;
+
+ /*
+ * If this is a sibling, remove it from its group.
+ */
+ if (event->group_leader != event) {
+ list_del_init(&event->group_entry);
+ event->group_leader->nr_siblings--;
+ goto out;
+ }
+
+ if (!list_empty(&event->group_entry))
+ list = &event->group_entry;
+
+ /*
+ * If this was a group event with sibling events then
+ * upgrade the siblings to singleton events by adding them
+ * to whatever list we are on.
+ */
+ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+ if (list)
+ list_move_tail(&sibling->group_entry, list);
+ sibling->group_leader = sibling;
+
+ /* Inherit group flags from the previous leader */
+ sibling->group_flags = event->group_flags;
+
+ WARN_ON_ONCE(sibling->ctx != event->ctx);
+ }
+
+out:
+ perf_event__header_size(event->group_leader);
+
+ list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ perf_event__header_size(tmp);
+}
+
+/*
+ * User event without the task.
+ */
+static bool is_orphaned_event(struct perf_event *event)
+{
+ return event && !is_kernel_event(event) && !event->owner;
+}
+
+/*
+ * Event has a parent but parent's task finished and it's
+ * alive only because of children holding refference.
+ */
+static bool is_orphaned_child(struct perf_event *event)
+{
+ return is_orphaned_event(event->parent);
+}
+
+static void orphans_remove_work(struct work_struct *work);
+
+static void schedule_orphans_remove(struct perf_event_context *ctx)
+{
+ if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
+ return;
+
+ if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
+ get_ctx(ctx);
+ ctx->orphans_remove_sched = true;
+ }
+}
+
+static int __init perf_workqueue_init(void)
+{
+ perf_wq = create_singlethread_workqueue("perf");
+ WARN(!perf_wq, "failed to create perf workqueue\n");
+ return perf_wq ? 0 : -1;
+}
+
+core_initcall(perf_workqueue_init);
+
+static inline int
+event_filter_match(struct perf_event *event)
+{
+ return (event->cpu == -1 || event->cpu == smp_processor_id())
+ && perf_cgroup_match(event);
+}
+
+static void
+event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ u64 tstamp = perf_event_time(event);
+ u64 delta;
+
+ WARN_ON_ONCE(event->ctx != ctx);
+ lockdep_assert_held(&ctx->lock);
+
+ /*
+ * An event which could not be activated because of
+ * filter mismatch still needs to have its timings
+ * maintained, otherwise bogus information is return
+ * via read() for time_enabled, time_running:
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE
+ && !event_filter_match(event)) {
+ delta = tstamp - event->tstamp_stopped;
+ event->tstamp_running += delta;
+ event->tstamp_stopped = tstamp;
+ }
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ perf_pmu_disable(event->pmu);
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu--;
+ if (!--ctx->nr_active)
+ perf_event_ctx_deactivate(ctx);
+ if (event->attr.freq && event->attr.sample_freq)
+ ctx->nr_freq--;
+ if (event->attr.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
+ perf_pmu_enable(event->pmu);
+}
+
+static void
+group_sched_out(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *event;
+ int state = group_event->state;
+
+ event_sched_out(group_event, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry)
+ event_sched_out(event, cpuctx, ctx);
+
+ if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+ cpuctx->exclusive = 0;
+}
+
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
+/*
+ * Cross CPU call to remove a performance event
+ *
+ * We disable the event on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static int __perf_remove_from_context(void *info)
+{
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ raw_spin_lock(&ctx->lock);
+ event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
+ list_del_event(event, ctx);
+ if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+ ctx->is_active = 0;
+ cpuctx->task_ctx = NULL;
+ }
+ raw_spin_unlock(&ctx->lock);
+
+ return 0;
+}
+
+
+/*
+ * Remove the event from a task's (or a CPU's) list of events.
+ *
+ * CPU events are removed with a smp call. For task events we only
+ * call when the task is on a CPU.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_event_exit_task, it's OK because the
+ * context has been detached from its task.
+ */
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
+
+ lockdep_assert_held(&ctx->mutex);
+
+ if (!task) {
+ /*
+ * Per cpu events are removed via an smp call. The removal can
+ * fail if the CPU is currently offline, but in that case we
+ * already called __perf_remove_from_context from
+ * perf_event_exit_cpu.
+ */
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
+ return;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_remove_from_context, &re))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
+ */
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ goto retry;
+ }
+
+ /*
+ * Since the task isn't running, its safe to remove the event, us
+ * holding the ctx->lock ensures the task won't get scheduled in.
+ */
+ if (detach_group)
+ perf_group_detach(event);
+ list_del_event(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to disable a performance event
+ */
+int __perf_event_disable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ /*
+ * If this is a per-task event, need to check whether this
+ * event's task is the current task on this cpu.
+ *
+ * Can trigger due to concurrent perf_event_context_sched_out()
+ * flipping contexts around.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return -EINVAL;
+
+ raw_spin_lock(&ctx->lock);
+
+ /*
+ * If the event is on, turn it off.
+ * If it is in error state, leave it in error state.
+ */
+ if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+
+ raw_spin_unlock(&ctx->lock);
+
+ return 0;
+}
+
+/*
+ * Disable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisifed when called through
+ * perf_event_for_each_child or perf_event_for_each because they
+ * hold the top-level event's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_event.
+ * When called from perf_pending_event it's OK because event->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_event_task_sched_out for this context.
+ */
+static void _perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Disable the event on the cpu that it's on
+ */
+ cpu_function_call(event->cpu, __perf_event_disable, event);
+ return;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_event_disable, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * If the event is still active, we need to retry the cross-call.
+ */
+ if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ goto retry;
+ }
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_OFF;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Strictly speaking kernel users cannot create groups and therefore this
+ * interface does not need the perf_event_ctx_lock() magic.
+ */
+void perf_event_disable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_disable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
+EXPORT_SYMBOL_GPL(perf_event_disable);
+
+static void perf_set_shadow_time(struct perf_event *event,
+ struct perf_event_context *ctx,
+ u64 tstamp)
+{
+ /*
+ * use the correct time source for the time snapshot
+ *
+ * We could get by without this by leveraging the
+ * fact that to get to this function, the caller
+ * has most likely already called update_context_time()
+ * and update_cgrp_time_xx() and thus both timestamp
+ * are identical (or very close). Given that tstamp is,
+ * already adjusted for cgroup, we could say that:
+ * tstamp - ctx->timestamp
+ * is equivalent to
+ * tstamp - cgrp->timestamp.
+ *
+ * Then, in perf_output_read(), the calculation would
+ * work with no changes because:
+ * - event is guaranteed scheduled in
+ * - no scheduled out in between
+ * - thus the timestamp would be the same
+ *
+ * But this is a bit hairy.
+ *
+ * So instead, we have an explicit cgroup call to remain
+ * within the time time source all along. We believe it
+ * is cleaner and simpler to understand.
+ */
+ if (is_cgroup_event(event))
+ perf_cgroup_set_shadow_time(event, tstamp);
+ else
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+static void perf_log_itrace_start(struct perf_event *event);
+
+static int
+event_sched_in(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ u64 tstamp = perf_event_time(event);
+ int ret = 0;
+
+ lockdep_assert_held(&ctx->lock);
+
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ event->state = PERF_EVENT_STATE_ACTIVE;
+ event->oncpu = smp_processor_id();
+
+ /*
+ * Unthrottle events, since we scheduled we might have missed several
+ * ticks already, also for a heavily scheduling task there is little
+ * guarantee it'll get a tick in a timely manner.
+ */
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
+ perf_log_throttle(event, 1);
+ event->hw.interrupts = 0;
+ }
+
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ perf_pmu_disable(event->pmu);
+
+ event->tstamp_running += tstamp - event->tstamp_stopped;
+
+ perf_set_shadow_time(event, ctx, tstamp);
+
+ perf_log_itrace_start(event);
+
+ if (event->pmu->add(event, PERF_EF_START)) {
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->oncpu = -1;
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ if (!is_software_event(event))
+ cpuctx->active_oncpu++;
+ if (!ctx->nr_active++)
+ perf_event_ctx_activate(ctx);
+ if (event->attr.freq && event->attr.sample_freq)
+ ctx->nr_freq++;
+
+ if (event->attr.exclusive)
+ cpuctx->exclusive = 1;
+
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
+out:
+ perf_pmu_enable(event->pmu);
+
+ return ret;
+}
+
+static int
+group_sched_in(struct perf_event *group_event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *event, *partial_group = NULL;
+ struct pmu *pmu = ctx->pmu;
+ u64 now = ctx->time;
+ bool simulate = false;
+
+ if (group_event->state == PERF_EVENT_STATE_OFF)
+ return 0;
+
+ pmu->start_txn(pmu);
+
+ if (event_sched_in(group_event, cpuctx, ctx)) {
+ pmu->cancel_txn(pmu);
+ perf_cpu_hrtimer_restart(cpuctx);
+ return -EAGAIN;
+ }
+
+ /*
+ * Schedule in siblings as one group (if any):
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event_sched_in(event, cpuctx, ctx)) {
+ partial_group = event;
+ goto group_error;
+ }
+ }
+
+ if (!pmu->commit_txn(pmu))
+ return 0;
+
+group_error:
+ /*
+ * Groups can be scheduled in as one unit only, so undo any
+ * partial group before returning:
+ * The events up to the failed event are scheduled out normally,
+ * tstamp_stopped will be updated.
+ *
+ * The failed events and the remaining siblings need to have
+ * their timings updated as if they had gone thru event_sched_in()
+ * and event_sched_out(). This is required to get consistent timings
+ * across the group. This also takes care of the case where the group
+ * could never be scheduled by ensuring tstamp_stopped is set to mark
+ * the time the event was actually stopped, such that time delta
+ * calculation in update_event_times() is correct.
+ */
+ list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ if (event == partial_group)
+ simulate = true;
+
+ if (simulate) {
+ event->tstamp_running += now - event->tstamp_stopped;
+ event->tstamp_stopped = now;
+ } else {
+ event_sched_out(event, cpuctx, ctx);
+ }
+ }
+ event_sched_out(group_event, cpuctx, ctx);
+
+ pmu->cancel_txn(pmu);
+
+ perf_cpu_hrtimer_restart(cpuctx);
+
+ return -EAGAIN;
+}
+
+/*
+ * Work out whether we can put this event group on the CPU now.
+ */
+static int group_can_go_on(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ int can_add_hw)
+{
+ /*
+ * Groups consisting entirely of software events can always go on.
+ */
+ if (event->group_flags & PERF_GROUP_SOFTWARE)
+ return 1;
+ /*
+ * If an exclusive group is already on, no other hardware
+ * events can go on.
+ */
+ if (cpuctx->exclusive)
+ return 0;
+ /*
+ * If this group is exclusive and there are already
+ * events on the CPU, it can't go on.
+ */
+ if (event->attr.exclusive && cpuctx->active_oncpu)
+ return 0;
+ /*
+ * Otherwise, try to add it if all previous groups were able
+ * to go on.
+ */
+ return can_add_hw;
+}
+
+static void add_event_to_ctx(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ u64 tstamp = perf_event_time(event);
+
+ list_add_event(event, ctx);
+ perf_group_attach(event);
+ event->tstamp_enabled = tstamp;
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
+}
+
+static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ struct task_struct *task)
+{
+ cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ if (ctx)
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ if (ctx)
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+}
+
+/*
+ * Cross CPU call to install and enable a performance event
+ *
+ * Must be called with ctx->mutex held
+ */
+static int __perf_install_in_context(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ struct task_struct *task = current;
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ /*
+ * If there was an active task_ctx schedule it out.
+ */
+ if (task_ctx)
+ task_ctx_sched_out(task_ctx);
+
+ /*
+ * If the context we're installing events in is not the
+ * active task_ctx, flip them.
+ */
+ if (ctx->task && task_ctx != ctx) {
+ if (task_ctx)
+ raw_spin_unlock(&task_ctx->lock);
+ raw_spin_lock(&ctx->lock);
+ task_ctx = ctx;
+ }
+
+ if (task_ctx) {
+ cpuctx->task_ctx = task_ctx;
+ task = task_ctx->task;
+ }
+
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+
+ update_context_time(ctx);
+ /*
+ * update cgrp time only if current cgrp
+ * matches event->cgrp. Must be done before
+ * calling add_event_to_ctx()
+ */
+ update_cgrp_time_from_event(event);
+
+ add_event_to_ctx(event, ctx);
+
+ /*
+ * Schedule everything back in
+ */
+ perf_event_sched_in(cpuctx, task_ctx, task);
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, task_ctx);
+
+ return 0;
+}
+
+/*
+ * Attach a performance event to a context
+ *
+ * First we add the event to the list with the hardware enable bit
+ * in event->hw_config cleared.
+ *
+ * If the event is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ */
+static void
+perf_install_in_context(struct perf_event_context *ctx,
+ struct perf_event *event,
+ int cpu)
+{
+ struct task_struct *task = ctx->task;
+
+ lockdep_assert_held(&ctx->mutex);
+
+ event->ctx = ctx;
+ if (event->cpu != -1)
+ event->cpu = cpu;
+
+ if (!task) {
+ /*
+ * Per cpu events are installed via an smp call and
+ * the install is always successful.
+ */
+ cpu_function_call(cpu, __perf_install_in_context, event);
+ return;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_install_in_context, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
+ */
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ goto retry;
+ }
+
+ /*
+ * Since the task isn't running, its safe to add the event, us holding
+ * the ctx->lock ensures the task won't get scheduled in.
+ */
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Put a event into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_event_mark_enabled(struct perf_event *event)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ }
+}
+
+/*
+ * Cross CPU call to enable a performance event
+ */
+static int __perf_event_enable(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *leader = event->group_leader;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ int err;
+
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
+ return -EINVAL;
+
+ raw_spin_lock(&ctx->lock);
+ update_context_time(ctx);
+
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto unlock;
+
+ /*
+ * set current task's cgroup time reference point
+ */
+ perf_cgroup_set_timestamp(current, ctx);
+
+ __perf_event_mark_enabled(event);
+
+ if (!event_filter_match(event)) {
+ if (is_cgroup_event(event))
+ perf_cgroup_defer_enabled(event);
+ goto unlock;
+ }
+
+ /*
+ * If the event is in a group and isn't the group leader,
+ * then don't put it on unless the group is on.
+ */
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+ goto unlock;
+
+ if (!group_can_go_on(event, cpuctx, 1)) {
+ err = -EEXIST;
+ } else {
+ if (event == leader)
+ err = group_sched_in(event, cpuctx, ctx);
+ else
+ err = event_sched_in(event, cpuctx, ctx);
+ }
+
+ if (err) {
+ /*
+ * If this event can't go on and it's part of a
+ * group, then the whole group has to come off.
+ */
+ if (leader != event) {
+ group_sched_out(leader, cpuctx, ctx);
+ perf_cpu_hrtimer_restart(cpuctx);
+ }
+ if (leader->attr.pinned) {
+ update_group_times(leader);
+ leader->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+
+unlock:
+ raw_spin_unlock(&ctx->lock);
+
+ return 0;
+}
+
+/*
+ * Enable a event.
+ *
+ * If event->ctx is a cloned context, callers must make sure that
+ * every task struct that event->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_event_for_each_child or perf_event_for_each as described
+ * for perf_event_disable.
+ */
+static void _perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Enable the event on the cpu that it's on
+ */
+ cpu_function_call(event->cpu, __perf_event_enable, event);
+ return;
+ }
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ goto out;
+
+ /*
+ * If the event is in error state, clear that first.
+ * That way, if we see the event in error state below, we
+ * know that it has gone back into error state, as distinct
+ * from the task having been scheduled away before the
+ * cross-call arrived.
+ */
+ if (event->state == PERF_EVENT_STATE_ERROR)
+ event->state = PERF_EVENT_STATE_OFF;
+
+retry:
+ if (!ctx->is_active) {
+ __perf_event_mark_enabled(event);
+ goto out;
+ }
+
+ raw_spin_unlock_irq(&ctx->lock);
+
+ if (!task_function_call(task, __perf_event_enable, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+
+ /*
+ * If the context is active and the event is still off,
+ * we need to retry the cross-call.
+ */
+ if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+ /*
+ * task could have been flipped by a concurrent
+ * perf_event_context_sched_out()
+ */
+ task = ctx->task;
+ goto retry;
+ }
+
+out:
+ raw_spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * See perf_event_disable();
+ */
+void perf_event_enable(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ _perf_event_enable(event);
+ perf_event_ctx_unlock(event, ctx);
+}
+EXPORT_SYMBOL_GPL(perf_event_enable);
+
+static int _perf_event_refresh(struct perf_event *event, int refresh)
+{
+ /*
+ * not supported on inherited events
+ */
+ if (event->attr.inherit || !is_sampling_event(event))
+ return -EINVAL;
+
+ atomic_add(refresh, &event->event_limit);
+ _perf_event_enable(event);
+
+ return 0;
+}
+
+/*
+ * See perf_event_disable()
+ */
+int perf_event_refresh(struct perf_event *event, int refresh)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_refresh(event, refresh);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(perf_event_refresh);
+
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ struct perf_event *event;
+ int is_active = ctx->is_active;
+
+ ctx->is_active &= ~event_type;
+ if (likely(!ctx->nr_events))
+ return;
+
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ if (!ctx->nr_active)
+ return;
+
+ perf_pmu_disable(ctx->pmu);
+ if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+ list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+ group_sched_out(event, cpuctx, ctx);
+ }
+
+ if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+ list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+ group_sched_out(event, cpuctx, ctx);
+ }
+ perf_pmu_enable(ctx->pmu);
+}
+
+/*
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
+ */
+static int context_equiv(struct perf_event_context *ctx1,
+ struct perf_event_context *ctx2)
+{
+ lockdep_assert_held(&ctx1->lock);
+ lockdep_assert_held(&ctx2->lock);
+
+ /* Pinning disables the swap optimization */
+ if (ctx1->pin_count || ctx2->pin_count)
+ return 0;
+
+ /* If ctx1 is the parent of ctx2 */
+ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+ return 1;
+
+ /* If ctx2 is the parent of ctx1 */
+ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+ return 1;
+
+ /*
+ * If ctx1 and ctx2 have the same parent; we flatten the parent
+ * hierarchy, see perf_event_init_context().
+ */
+ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+ ctx1->parent_gen == ctx2->parent_gen)
+ return 1;
+
+ /* Unmatched */
+ return 0;
+}
+
+static void __perf_event_sync_stat(struct perf_event *event,
+ struct perf_event *next_event)
+{
+ u64 value;
+
+ if (!event->attr.inherit_stat)
+ return;
+
+ /*
+ * Update the event value, we cannot use perf_event_read()
+ * because we're in the middle of a context switch and have IRQs
+ * disabled, which upsets smp_call_function_single(), however
+ * we know the event must be on the current CPU, therefore we
+ * don't need to use it.
+ */
+ switch (event->state) {
+ case PERF_EVENT_STATE_ACTIVE:
+ event->pmu->read(event);
+ /* fall-through */
+
+ case PERF_EVENT_STATE_INACTIVE:
+ update_event_times(event);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * In order to keep per-task stats reliable we need to flip the event
+ * values when we flip the contexts.
+ */
+ value = local64_read(&next_event->count);
+ value = local64_xchg(&event->count, value);
+ local64_set(&next_event->count, value);
+
+ swap(event->total_time_enabled, next_event->total_time_enabled);
+ swap(event->total_time_running, next_event->total_time_running);
+
+ /*
+ * Since we swizzled the values, update the user visible data too.
+ */
+ perf_event_update_userpage(event);
+ perf_event_update_userpage(next_event);
+}
+
+static void perf_event_sync_stat(struct perf_event_context *ctx,
+ struct perf_event_context *next_ctx)
+{
+ struct perf_event *event, *next_event;
+
+ if (!ctx->nr_stat)
+ return;
+
+ update_context_time(ctx);
+
+ event = list_first_entry(&ctx->event_list,
+ struct perf_event, event_entry);
+
+ next_event = list_first_entry(&next_ctx->event_list,
+ struct perf_event, event_entry);
+
+ while (&event->event_entry != &ctx->event_list &&
+ &next_event->event_entry != &next_ctx->event_list) {
+
+ __perf_event_sync_stat(event, next_event);
+
+ event = list_next_entry(event, event_entry);
+ next_event = list_next_entry(next_event, event_entry);
+ }
+}
+
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+ struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_context *next_ctx;
+ struct perf_event_context *parent, *next_parent;
+ struct perf_cpu_context *cpuctx;
+ int do_switch = 1;
+
+ if (likely(!ctx))
+ return;
+
+ cpuctx = __get_cpu_context(ctx);
+ if (!cpuctx->task_ctx)
+ return;
+
+ rcu_read_lock();
+ next_ctx = next->perf_event_ctxp[ctxn];
+ if (!next_ctx)
+ goto unlock;
+
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+ /* If neither context have a parent context; they cannot be clones. */
+ if (!parent && !next_parent)
+ goto unlock;
+
+ if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither has been
+ * uncloned in the meantime). It doesn't matter which
+ * order we take the locks because no other cpu could
+ * be trying to lock both of these tasks.
+ */
+ raw_spin_lock(&ctx->lock);
+ raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+ if (context_equiv(ctx, next_ctx)) {
+ /*
+ * XXX do we need a memory barrier of sorts
+ * wrt to rcu_dereference() of perf_event_ctxp
+ */
+ task->perf_event_ctxp[ctxn] = next_ctx;
+ next->perf_event_ctxp[ctxn] = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+
+ swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+
+ do_switch = 0;
+
+ perf_event_sync_stat(ctx, next_ctx);
+ }
+ raw_spin_unlock(&next_ctx->lock);
+ raw_spin_unlock(&ctx->lock);
+ }
+unlock:
+ rcu_read_unlock();
+
+ if (do_switch) {
+ raw_spin_lock(&ctx->lock);
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ cpuctx->task_ctx = NULL;
+ raw_spin_unlock(&ctx->lock);
+ }
+}
+
+void perf_sched_cb_dec(struct pmu *pmu)
+{
+ this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_inc(struct pmu *pmu)
+{
+ this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ if (prev == next)
+ return;
+
+ local_irq_save(flags);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (pmu->sched_task) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+ perf_pmu_disable(pmu);
+
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+ perf_pmu_enable(pmu);
+
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ }
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+#define for_each_task_context_nr(ctxn) \
+ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void __perf_event_task_sched_out(struct task_struct *task,
+ struct task_struct *next)
+{
+ int ctxn;
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
+ for_each_task_context_nr(ctxn)
+ perf_event_context_sched_out(task, ctxn, next);
+
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch out PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_sched_out(task, next);
+}
+
+static void task_ctx_sched_out(struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+ cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type)
+{
+ ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+}
+
+static void
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_event *event;
+
+ list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
+ if (group_can_go_on(event, cpuctx, 1))
+ group_sched_in(event, cpuctx, ctx);
+
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_ERROR;
+ }
+ }
+}
+
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_event *event;
+ int can_add_hw = 1;
+
+ list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+ /* Ignore events in OFF or ERROR state */
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+ /*
+ * Listen to the 'cpu' scheduling filter constraint
+ * of events:
+ */
+ if (!event_filter_match(event))
+ continue;
+
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
+ if (group_can_go_on(event, cpuctx, can_add_hw)) {
+ if (group_sched_in(event, cpuctx, ctx))
+ can_add_hw = 0;
+ }
+ }
+}
+
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task)
+{
+ u64 now;
+ int is_active = ctx->is_active;
+
+ ctx->is_active |= event_type;
+ if (likely(!ctx->nr_events))
+ return;
+
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
+ /*
+ * First go through the list and put on any pinned groups
+ * in order to give them the best chance of going on.
+ */
+ if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+ ctx_pinned_sched_in(ctx, cpuctx);
+
+ /* Then walk through the lower prio flexible groups */
+ if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+ ctx_flexible_sched_in(ctx, cpuctx);
+}
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task)
+{
+ struct perf_event_context *ctx = &cpuctx->ctx;
+
+ ctx_sched_in(ctx, cpuctx, event_type, task);
+}
+
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *task)
+{
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = __get_cpu_context(ctx);
+ if (cpuctx->task_ctx == ctx)
+ return;
+
+ perf_ctx_lock(cpuctx, ctx);
+ perf_pmu_disable(ctx->pmu);
+ /*
+ * We want to keep the following priority order:
+ * cpu pinned (that don't need to move), task pinned,
+ * cpu flexible, task flexible.
+ */
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
+ if (ctx->nr_events)
+ cpuctx->task_ctx = ctx;
+
+ perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
+
+ perf_pmu_enable(ctx->pmu);
+ perf_ctx_unlock(cpuctx, ctx);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void __perf_event_task_sched_in(struct task_struct *prev,
+ struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ for_each_task_context_nr(ctxn) {
+ ctx = task->perf_event_ctxp[ctxn];
+ if (likely(!ctx))
+ continue;
+
+ perf_event_context_sched_in(ctx, task);
+ }
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch in PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_sched_in(prev, task);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
+}
+
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
+{
+ u64 frequency = event->attr.sample_freq;
+ u64 sec = NSEC_PER_SEC;
+ u64 divisor, dividend;
+
+ int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+ count_fls = fls64(count);
+ nsec_fls = fls64(nsec);
+ frequency_fls = fls64(frequency);
+ sec_fls = 30;
+
+ /*
+ * We got @count in @nsec, with a target of sample_freq HZ
+ * the target period becomes:
+ *
+ * @count * 10^9
+ * period = -------------------
+ * @nsec * sample_freq
+ *
+ */
+
+ /*
+ * Reduce accuracy by one bit such that @a and @b converge
+ * to a similar magnitude.
+ */
+#define REDUCE_FLS(a, b) \
+do { \
+ if (a##_fls > b##_fls) { \
+ a >>= 1; \
+ a##_fls--; \
+ } else { \
+ b >>= 1; \
+ b##_fls--; \
+ } \
+} while (0)
+
+ /*
+ * Reduce accuracy until either term fits in a u64, then proceed with
+ * the other, so that finally we can do a u64/u64 division.
+ */
+ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+ REDUCE_FLS(nsec, frequency);
+ REDUCE_FLS(sec, count);
+ }
+
+ if (count_fls + sec_fls > 64) {
+ divisor = nsec * frequency;
+
+ while (count_fls + sec_fls > 64) {
+ REDUCE_FLS(count, sec);
+ divisor >>= 1;
+ }
+
+ dividend = count * sec;
+ } else {
+ dividend = count * sec;
+
+ while (nsec_fls + frequency_fls > 64) {
+ REDUCE_FLS(nsec, frequency);
+ dividend >>= 1;
+ }
+
+ divisor = nsec * frequency;
+ }
+
+ if (!divisor)
+ return dividend;
+
+ return div64_u64(dividend, divisor);
+}
+
+static DEFINE_PER_CPU(int, perf_throttled_count);
+static DEFINE_PER_CPU(u64, perf_throttled_seq);
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 period, sample_period;
+ s64 delta;
+
+ period = perf_calculate_period(event, nsec, count);
+
+ delta = (s64)(period - hwc->sample_period);
+ delta = (delta + 7) / 8; /* low pass filter */
+
+ sample_period = hwc->sample_period + delta;
+
+ if (!sample_period)
+ sample_period = 1;
+
+ hwc->sample_period = sample_period;
+
+ if (local64_read(&hwc->period_left) > 8*sample_period) {
+ if (disable)
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ local64_set(&hwc->period_left, 0);
+
+ if (disable)
+ event->pmu->start(event, PERF_EF_RELOAD);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
+ int needs_unthr)
+{
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ u64 now, period = TICK_NSEC;
+ s64 delta;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || needs_unthr))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+ perf_pmu_disable(ctx->pmu);
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ continue;
+
+ if (!event_filter_match(event))
+ continue;
+
+ perf_pmu_disable(event->pmu);
+
+ hwc = &event->hw;
+
+ if (hwc->interrupts == MAX_INTERRUPTS) {
+ hwc->interrupts = 0;
+ perf_log_throttle(event, 1);
+ event->pmu->start(event, 0);
+ }
+
+ if (!event->attr.freq || !event->attr.sample_freq)
+ goto next;
+
+ /*
+ * stop the event and update event->count
+ */
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ now = local64_read(&event->count);
+ delta = now - hwc->freq_count_stamp;
+ hwc->freq_count_stamp = now;
+
+ /*
+ * restart the event
+ * reload only if value has changed
+ * we have stopped the event so tell that
+ * to perf_adjust_period() to avoid stopping it
+ * twice.
+ */
+ if (delta > 0)
+ perf_adjust_period(event, period, delta, false);
+
+ event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+ next:
+ perf_pmu_enable(event->pmu);
+ }
+
+ perf_pmu_enable(ctx->pmu);
+ raw_spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's events:
+ */
+static void rotate_ctx(struct perf_event_context *ctx)
+{
+ /*
+ * Rotate the first entry last of non-pinned groups. Rotation might be
+ * disabled by the inheritance code.
+ */
+ if (!ctx->rotate_disable)
+ list_rotate_left(&ctx->flexible_groups);
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
+{
+ struct perf_event_context *ctx = NULL;
+ int rotate = 0;
+
+ if (cpuctx->ctx.nr_events) {
+ if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+ rotate = 1;
+ }
+
+ ctx = cpuctx->task_ctx;
+ if (ctx && ctx->nr_events) {
+ if (ctx->nr_events != ctx->nr_active)
+ rotate = 1;
+ }
+
+ if (!rotate)
+ goto done;
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (ctx)
+ ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+
+ rotate_ctx(&cpuctx->ctx);
+ if (ctx)
+ rotate_ctx(ctx);
+
+ perf_event_sched_in(cpuctx, ctx, current);
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+done:
+
+ return rotate;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (atomic_read(&nr_freq_events) ||
+ __this_cpu_read(perf_throttled_count))
+ return false;
+ else
+ return true;
+}
+#endif
+
+void perf_event_task_tick(void)
+{
+ struct list_head *head = this_cpu_ptr(&active_ctx_list);
+ struct perf_event_context *ctx, *tmp;
+ int throttled;
+
+ WARN_ON(!irqs_disabled());
+
+ __this_cpu_inc(perf_throttled_seq);
+ throttled = __this_cpu_xchg(perf_throttled_count, 0);
+
+ list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
+ perf_adjust_freq_unthr_context(ctx, throttled);
+}
+
+static int event_enable_on_exec(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ if (!event->attr.enable_on_exec)
+ return 0;
+
+ event->attr.enable_on_exec = 0;
+ if (event->state >= PERF_EVENT_STATE_INACTIVE)
+ return 0;
+
+ __perf_event_mark_enabled(event);
+
+ return 1;
+}
+
+/*
+ * Enable all of a task's events that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
+{
+ struct perf_event_context *clone_ctx = NULL;
+ struct perf_event *event;
+ unsigned long flags;
+ int enabled = 0;
+ int ret;
+
+ local_irq_save(flags);
+ if (!ctx || !ctx->nr_events)
+ goto out;
+
+ /*
+ * We must ctxsw out cgroup events to avoid conflict
+ * when invoking perf_task_event_sched_in() later on
+ * in this function. Otherwise we end up trying to
+ * ctxswin cgroup events which are already scheduled
+ * in.
+ */
+ perf_cgroup_sched_out(current, NULL);
+
+ raw_spin_lock(&ctx->lock);
+ task_ctx_sched_out(ctx);
+
+ list_for_each_entry(event, &ctx->event_list, event_entry) {
+ ret = event_enable_on_exec(event, ctx);
+ if (ret)
+ enabled = 1;
+ }
+
+ /*
+ * Unclone this context if we enabled any event.
+ */
+ if (enabled)
+ clone_ctx = unclone_ctx(ctx);
+
+ raw_spin_unlock(&ctx->lock);
+
+ /*
+ * Also calls ctxswin for cgroup events, if any:
+ */
+ perf_event_context_sched_in(ctx, ctx->task);
+out:
+ local_irq_restore(flags);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
+}
+
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Cross CPU call to read the hardware event
+ */
+static void __perf_event_read(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * event->count would have been updated to a recent sample
+ * when the event was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ raw_spin_lock(&ctx->lock);
+ if (ctx->is_active) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+ update_event_times(event);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+ raw_spin_unlock(&ctx->lock);
+}
+
+static inline u64 perf_event_count(struct perf_event *event)
+{
+ if (event->pmu->count)
+ return event->pmu->count(event);
+
+ return __perf_event_count(event);
+}
+
+static u64 perf_event_read(struct perf_event *event)
+{
+ /*
+ * If event is enabled and currently active on a CPU, update the
+ * value in the event structure:
+ */
+ if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ smp_call_function_single(event->oncpu,
+ __perf_event_read, event, 1);
+ } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ struct perf_event_context *ctx = event->ctx;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * may read while context is not active
+ * (e.g., thread is blocked), in that case
+ * we cannot update context time
+ */
+ if (ctx->is_active) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+ update_event_times(event);
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+
+ return perf_event_count(event);
+}
+
+/*
+ * Initialize the perf_event context in a task_struct:
+ */
+static void __perf_event_init_context(struct perf_event_context *ctx)
+{
+ raw_spin_lock_init(&ctx->lock);
+ mutex_init(&ctx->mutex);
+ INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pinned_groups);
+ INIT_LIST_HEAD(&ctx->flexible_groups);
+ INIT_LIST_HEAD(&ctx->event_list);
+ atomic_set(&ctx->refcount, 1);
+ INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
+}
+
+static struct perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+
+ ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ __perf_event_init_context(ctx);
+ if (task) {
+ ctx->task = task;
+ get_task_struct(task);
+ }
+ ctx->pmu = pmu;
+
+ return ctx;
+}
+
+static struct task_struct *
+find_lively_task_by_vpid(pid_t vpid)
+{
+ struct task_struct *task;
+ int err;
+
+ rcu_read_lock();
+ if (!vpid)
+ task = current;
+ else
+ task = find_task_by_vpid(vpid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ /* Reuse ptrace permission checks for now. */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto errout;
+
+ return task;
+errout:
+ put_task_struct(task);
+ return ERR_PTR(err);
+
+}
+
+/*
+ * Returns a matching context with refcount and pincount.
+ */
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, struct task_struct *task,
+ struct perf_event *event)
+{
+ struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_cpu_context *cpuctx;
+ void *task_ctx_data = NULL;
+ unsigned long flags;
+ int ctxn, err;
+ int cpu = event->cpu;
+
+ if (!task) {
+ /* Must be root to operate on a CPU event: */
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EACCES);
+
+ /*
+ * We could be clever and allow to attach a event to an
+ * offline CPU and activate it when the CPU comes up, but
+ * that's for later.
+ */
+ if (!cpu_online(cpu))
+ return ERR_PTR(-ENODEV);
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+ get_ctx(ctx);
+ ++ctx->pin_count;
+
+ return ctx;
+ }
+
+ err = -EINVAL;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto errout;
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+ if (!task_ctx_data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ }
+
+retry:
+ ctx = perf_lock_task_context(task, ctxn, &flags);
+ if (ctx) {
+ clone_ctx = unclone_ctx(ctx);
+ ++ctx->pin_count;
+
+ if (task_ctx_data && !ctx->task_ctx_data) {
+ ctx->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ }
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
+ } else {
+ ctx = alloc_perf_context(pmu, task);
+ err = -ENOMEM;
+ if (!ctx)
+ goto errout;
+
+ if (task_ctx_data) {
+ ctx->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ }
+
+ err = 0;
+ mutex_lock(&task->perf_event_mutex);
+ /*
+ * If it has already passed perf_event_exit_task().
+ * we must see PF_EXITING, it takes this mutex too.
+ */
+ if (task->flags & PF_EXITING)
+ err = -ESRCH;
+ else if (task->perf_event_ctxp[ctxn])
+ err = -EAGAIN;
+ else {
+ get_ctx(ctx);
+ ++ctx->pin_count;
+ rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ }
+ mutex_unlock(&task->perf_event_mutex);
+
+ if (unlikely(err)) {
+ put_ctx(ctx);
+
+ if (err == -EAGAIN)
+ goto retry;
+ goto errout;
+ }
+ }
+
+ kfree(task_ctx_data);
+ return ctx;
+
+errout:
+ kfree(task_ctx_data);
+ return ERR_PTR(err);
+}
+
+static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
+
+static void free_event_rcu(struct rcu_head *head)
+{
+ struct perf_event *event;
+
+ event = container_of(head, struct perf_event, rcu_head);
+ if (event->ns)
+ put_pid_ns(event->ns);
+ perf_event_free_filter(event);
+ kfree(event);
+}
+
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
+
+static void unaccount_event_cpu(struct perf_event *event, int cpu)
+{
+ if (event->parent)
+ return;
+
+ if (is_cgroup_event(event))
+ atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+}
+
+static void unaccount_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_dec(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_dec(&nr_comm_events);
+ if (event->attr.task)
+ atomic_dec(&nr_task_events);
+ if (event->attr.freq)
+ atomic_dec(&nr_freq_events);
+ if (is_cgroup_event(event))
+ static_key_slow_dec_deferred(&perf_sched_events);
+ if (has_branch_stack(event))
+ static_key_slow_dec_deferred(&perf_sched_events);
+
+ unaccount_event_cpu(event, event->cpu);
+}
+
+/*
+ * The following implement mutual exclusion of events on "exclusive" pmus
+ * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
+ * at a time, so we disallow creating events that might conflict, namely:
+ *
+ * 1) cpu-wide events in the presence of per-task events,
+ * 2) per-task events in the presence of cpu-wide events,
+ * 3) two matching events on the same context.
+ *
+ * The former two cases are handled in the allocation path (perf_event_alloc(),
+ * __free_event()), the latter -- before the first perf_install_in_context().
+ */
+static int exclusive_event_init(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return 0;
+
+ /*
+ * Prevent co-existence of per-task and cpu-wide events on the
+ * same exclusive pmu.
+ *
+ * Negative pmu::exclusive_cnt means there are cpu-wide
+ * events on this "exclusive" pmu, positive means there are
+ * per-task events.
+ *
+ * Since this is called in perf_event_alloc() path, event::ctx
+ * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
+ * to mean "per-task event", because unlike other attach states it
+ * never gets cleared.
+ */
+ if (event->attach_state & PERF_ATTACH_TASK) {
+ if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
+ return -EBUSY;
+ } else {
+ if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void exclusive_event_destroy(struct perf_event *event)
+{
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return;
+
+ /* see comment in exclusive_event_init() */
+ if (event->attach_state & PERF_ATTACH_TASK)
+ atomic_dec(&pmu->exclusive_cnt);
+ else
+ atomic_inc(&pmu->exclusive_cnt);
+}
+
+static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
+{
+ if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+ (e1->cpu == e2->cpu ||
+ e1->cpu == -1 ||
+ e2->cpu == -1))
+ return true;
+ return false;
+}
+
+/* Called under the same ctx::mutex as perf_install_in_context() */
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *iter_event;
+ struct pmu *pmu = event->pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ return true;
+
+ list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
+ if (exclusive_event_match(iter_event, event))
+ return false;
+ }
+
+ return true;
+}
+
+static void __free_event(struct perf_event *event)
+{
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
+}
+
+static void _free_event(struct perf_event *event)
+{
+ irq_work_sync(&event->pending);
+
+ unaccount_event(event);
+
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
+ __free_event(event);
+}
+
+/*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+ */
+static void free_event(struct perf_event *event)
+{
+ if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
+ /* leak to avoid use-after-free */
+ return;
+ }
+
+ _free_event(event);
+}
+
+/*
+ * Remove user event from the owner task.
+ */
+static void perf_remove_from_owner(struct perf_event *event)
+{
+ struct task_struct *owner;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(event->owner);
+ /*
+ * Matches the smp_wmb() in perf_event_exit_task(). If we observe
+ * !owner it means the list deletion is complete and we can indeed
+ * free this event, otherwise we need to serialize on
+ * owner->perf_event_mutex.
+ */
+ smp_read_barrier_depends();
+ if (owner) {
+ /*
+ * Since delayed_put_task_struct() also drops the last
+ * task reference we can safely take a new reference
+ * while holding the rcu_read_lock().
+ */
+ get_task_struct(owner);
+ }
+ rcu_read_unlock();
+
+ if (owner) {
+ /*
+ * If we're here through perf_event_exit_task() we're already
+ * holding ctx->mutex which would be an inversion wrt. the
+ * normal lock order.
+ *
+ * However we can safely take this lock because its the child
+ * ctx->mutex.
+ */
+ mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
+
+ /*
+ * We have to re-check the event->owner field, if it is cleared
+ * we raced with perf_event_exit_task(), acquiring the mutex
+ * ensured they're done, and we can proceed with freeing the
+ * event.
+ */
+ if (event->owner)
+ list_del_init(&event->owner_entry);
+ mutex_unlock(&owner->perf_event_mutex);
+ put_task_struct(owner);
+ }
+}
+
+static void put_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
+
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);
+
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ perf_remove_from_context(event, true);
+ perf_event_ctx_unlock(event, ctx);
+
+ _free_event(event);
+}
+
+int perf_event_release_kernel(struct perf_event *event)
+{
+ put_event(event);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+ put_event(file->private_data);
+ return 0;
+}
+
+/*
+ * Remove all orphanes events from the context.
+ */
+static void orphans_remove_work(struct work_struct *work)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event, *tmp;
+
+ ctx = container_of(work, struct perf_event_context,
+ orphans_remove.work);
+
+ mutex_lock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+ struct perf_event *parent_event = event->parent;
+
+ if (!is_orphaned_child(event))
+ continue;
+
+ perf_remove_from_context(event, true);
+
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ free_event(event);
+ put_event(parent_event);
+ }
+
+ raw_spin_lock_irq(&ctx->lock);
+ ctx->orphans_remove_sched = false;
+ raw_spin_unlock_irq(&ctx->lock);
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+}
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+ struct perf_event *child;
+ u64 total = 0;
+
+ *enabled = 0;
+ *running = 0;
+
+ mutex_lock(&event->child_mutex);
+ total += perf_event_read(event);
+ *enabled += event->total_time_enabled +
+ atomic64_read(&event->child_total_time_enabled);
+ *running += event->total_time_running +
+ atomic64_read(&event->child_total_time_running);
+
+ list_for_each_entry(child, &event->child_list, child_list) {
+ total += perf_event_read(child);
+ *enabled += child->total_time_enabled;
+ *running += child->total_time_running;
+ }
+ mutex_unlock(&event->child_mutex);
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(perf_event_read_value);
+
+static int perf_event_read_group(struct perf_event *event,
+ u64 read_format, char __user *buf)
+{
+ struct perf_event *leader = event->group_leader, *sub;
+ struct perf_event_context *ctx = leader->ctx;
+ int n = 0, size = 0, ret;
+ u64 count, enabled, running;
+ u64 values[5];
+
+ lockdep_assert_held(&ctx->mutex);
+
+ count = perf_event_read_value(leader, &enabled, &running);
+
+ values[n++] = 1 + leader->nr_siblings;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
+ values[n++] = count;
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(leader);
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, size))
+ return -EFAULT;
+
+ ret = size;
+
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ n = 0;
+
+ values[n++] = perf_event_read_value(sub, &enabled, &running);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+
+ size = n * sizeof(u64);
+
+ if (copy_to_user(buf + ret, values, size)) {
+ return -EFAULT;
+ }
+
+ ret += size;
+ }
+
+ return ret;
+}
+
+static int perf_event_read_one(struct perf_event *event,
+ u64 read_format, char __user *buf)
+{
+ u64 enabled, running;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_event_read_value(event, &enabled, &running);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(event);
+
+ if (copy_to_user(buf, values, n * sizeof(u64)))
+ return -EFAULT;
+
+ return n * sizeof(u64);
+}
+
+static bool is_event_hup(struct perf_event *event)
+{
+ bool no_children;
+
+ if (event->state != PERF_EVENT_STATE_EXIT)
+ return false;
+
+ mutex_lock(&event->child_mutex);
+ no_children = list_empty(&event->child_list);
+ mutex_unlock(&event->child_mutex);
+ return no_children;
+}
+
+/*
+ * Read the performance event - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+{
+ u64 read_format = event->attr.read_format;
+ int ret;
+
+ /*
+ * Return end-of-file for a read on a event that is in
+ * error state (i.e. because it was pinned but it couldn't be
+ * scheduled on to the CPU at some point).
+ */
+ if (event->state == PERF_EVENT_STATE_ERROR)
+ return 0;
+
+ if (count < event->read_size)
+ return -ENOSPC;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+ if (read_format & PERF_FORMAT_GROUP)
+ ret = perf_event_read_group(event, read_format, buf);
+ else
+ ret = perf_event_read_one(event, read_format, buf);
+
+ return ret;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = perf_read_hw(event, buf, count);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+ struct perf_event *event = file->private_data;
+ struct ring_buffer *rb;
+ unsigned int events = POLLHUP;
+
+ poll_wait(file, &event->waitq, wait);
+
+ if (is_event_hup(event))
+ return events;
+
+ /*
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
+ */
+ mutex_lock(&event->mmap_mutex);
+ rb = event->rb;
+ if (rb)
+ events = atomic_xchg(&rb->poll, 0);
+ mutex_unlock(&event->mmap_mutex);
+ return events;
+}
+
+static void _perf_event_reset(struct perf_event *event)
+{
+ (void)perf_event_read(event);
+ local64_set(&event->count, 0);
+ perf_event_update_userpage(event);
+}
+
+/*
+ * Holding the top-level event's child_mutex means that any
+ * descendant process that has inherited this event will block
+ * in sync_child_event if it goes to exit, thus satisfying the
+ * task existence requirements of perf_event_enable/disable.
+ */
+static void perf_event_for_each_child(struct perf_event *event,
+ void (*func)(struct perf_event *))
+{
+ struct perf_event *child;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+
+ mutex_lock(&event->child_mutex);
+ func(event);
+ list_for_each_entry(child, &event->child_list, child_list)
+ func(child);
+ mutex_unlock(&event->child_mutex);
+}
+
+static void perf_event_for_each(struct perf_event *event,
+ void (*func)(struct perf_event *))
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *sibling;
+
+ lockdep_assert_held(&ctx->mutex);
+
+ event = event->group_leader;
+
+ perf_event_for_each_child(event, func);
+ list_for_each_entry(sibling, &event->sibling_list, group_entry)
+ perf_event_for_each_child(sibling, func);
+}
+
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int ret = 0, active;
+ u64 value;
+
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ if (copy_from_user(&value, arg, sizeof(value)))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->attr.freq) {
+ if (value > sysctl_perf_event_sample_rate) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ event->attr.sample_freq = value;
+ } else {
+ event->attr.sample_period = value;
+ event->hw.sample_period = value;
+ }
+
+ active = (event->state == PERF_EVENT_STATE_ACTIVE);
+ if (active) {
+ perf_pmu_disable(ctx->pmu);
+ event->pmu->stop(event, PERF_EF_UPDATE);
+ }
+
+ local64_set(&event->hw.period_left, 0);
+
+ if (active) {
+ event->pmu->start(event, PERF_EF_RELOAD);
+ perf_pmu_enable(ctx->pmu);
+ }
+
+unlock:
+ raw_spin_unlock_irq(&ctx->lock);
+
+ return ret;
+}
+
+static const struct file_operations perf_fops;
+
+static inline int perf_fget_light(int fd, struct fd *p)
+{
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ if (f.file->f_op != &perf_fops) {
+ fdput(f);
+ return -EBADF;
+ }
+ *p = f;
+ return 0;
+}
+
+static int perf_event_set_output(struct perf_event *event,
+ struct perf_event *output_event);
+static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+
+static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
+{
+ void (*func)(struct perf_event *);
+ u32 flags = arg;
+
+ switch (cmd) {
+ case PERF_EVENT_IOC_ENABLE:
+ func = _perf_event_enable;
+ break;
+ case PERF_EVENT_IOC_DISABLE:
+ func = _perf_event_disable;
+ break;
+ case PERF_EVENT_IOC_RESET:
+ func = _perf_event_reset;
+ break;
+
+ case PERF_EVENT_IOC_REFRESH:
+ return _perf_event_refresh(event, arg);
+
+ case PERF_EVENT_IOC_PERIOD:
+ return perf_event_period(event, (u64 __user *)arg);
+
+ case PERF_EVENT_IOC_ID:
+ {
+ u64 id = primary_event_id(event);
+
+ if (copy_to_user((void __user *)arg, &id, sizeof(id)))
+ return -EFAULT;
+ return 0;
+ }
+
+ case PERF_EVENT_IOC_SET_OUTPUT:
+ {
+ int ret;
+ if (arg != -1) {
+ struct perf_event *output_event;
+ struct fd output;
+ ret = perf_fget_light(arg, &output);
+ if (ret)
+ return ret;
+ output_event = output.file->private_data;
+ ret = perf_event_set_output(event, output_event);
+ fdput(output);
+ } else {
+ ret = perf_event_set_output(event, NULL);
+ }
+ return ret;
+ }
+
+ case PERF_EVENT_IOC_SET_FILTER:
+ return perf_event_set_filter(event, (void __user *)arg);
+
+ case PERF_EVENT_IOC_SET_BPF:
+ return perf_event_set_bpf_prog(event, arg);
+
+ default:
+ return -ENOTTY;
+ }
+
+ if (flags & PERF_IOC_FLAG_GROUP)
+ perf_event_for_each(event, func);
+ else
+ perf_event_for_each_child(event, func);
+
+ return 0;
+}
+
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_event *event = file->private_data;
+ struct perf_event_context *ctx;
+ long ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_ioctl(event, cmd, arg);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long perf_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
+ case _IOC_NR(PERF_EVENT_IOC_ID):
+ /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
+ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
+ cmd &= ~IOCSIZE_MASK;
+ cmd |= sizeof(void *) << IOCSIZE_SHIFT;
+ }
+ break;
+ }
+ return perf_ioctl(file, cmd, arg);
+}
+#else
+# define perf_compat_ioctl NULL
+#endif
+
+int perf_event_task_enable(void)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event;
+
+ mutex_lock(&current->perf_event_mutex);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_enable);
+ perf_event_ctx_unlock(event, ctx);
+ }
+ mutex_unlock(&current->perf_event_mutex);
+
+ return 0;
+}
+
+int perf_event_task_disable(void)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event;
+
+ mutex_lock(&current->perf_event_mutex);
+ list_for_each_entry(event, &current->perf_event_list, owner_entry) {
+ ctx = perf_event_ctx_lock(event);
+ perf_event_for_each_child(event, _perf_event_disable);
+ perf_event_ctx_unlock(event, ctx);
+ }
+ mutex_unlock(&current->perf_event_mutex);
+
+ return 0;
+}
+
+static int perf_event_index(struct perf_event *event)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ return event->pmu->event_idx(event);
+}
+
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = event->shadow_ctx_time + *now;
+ *enabled = ctx_time - event->tstamp_enabled;
+ *running = ctx_time - event->tstamp_running;
+}
+
+static void perf_event_init_userpage(struct perf_event *event)
+{
+ struct perf_event_mmap_page *userpg;
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ userpg = rb->user_page;
+
+ /* Allow new userspace to detect that bit 0 is deprecated */
+ userpg->cap_bit0_is_deprecated = 1;
+ userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
+ userpg->data_offset = PAGE_SIZE;
+ userpg->data_size = perf_data_size(rb);
+
+unlock:
+ rcu_read_unlock();
+}
+
+void __weak arch_perf_update_userpage(
+ struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_event_update_userpage(struct perf_event *event)
+{
+ struct perf_event_mmap_page *userpg;
+ struct ring_buffer *rb;
+ u64 enabled, running, now;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ /*
+ * compute total_time_enabled, total_time_running
+ * based on snapshot values taken when the event
+ * was last scheduled in.
+ *
+ * we cannot simply called update_context_time()
+ * because of locking issue as we can be called in
+ * NMI context
+ */
+ calc_timer_values(event, &now, &enabled, &running);
+
+ userpg = rb->user_page;
+ /*
+ * Disable preemption so as to not let the corresponding user-space
+ * spin too long if we get preempted.
+ */
+ preempt_disable();
+ ++userpg->lock;
+ barrier();
+ userpg->index = perf_event_index(event);
+ userpg->offset = perf_event_count(event);
+ if (userpg->index)
+ userpg->offset -= local64_read(&event->hw.prev_count);
+
+ userpg->time_enabled = enabled +
+ atomic64_read(&event->child_total_time_enabled);
+
+ userpg->time_running = running +
+ atomic64_read(&event->child_total_time_running);
+
+ arch_perf_update_userpage(event, userpg, now);
+
+ barrier();
+ ++userpg->lock;
+ preempt_enable();
+unlock:
+ rcu_read_unlock();
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct perf_event *event = vma->vm_file->private_data;
+ struct ring_buffer *rb;
+ int ret = VM_FAULT_SIGBUS;
+
+ if (vmf->flags & FAULT_FLAG_MKWRITE) {
+ if (vmf->pgoff == 0)
+ ret = 0;
+ return ret;
+ }
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
+ goto unlock;
+
+ vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
+ if (!vmf->page)
+ goto unlock;
+
+ get_page(vmf->page);
+ vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->index = vmf->pgoff;
+
+ ret = 0;
+unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb)
+{
+ struct ring_buffer *old_rb = NULL;
+ unsigned long flags;
+
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
+
+ old_rb = event->rb;
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
+ }
+
+ if (rb) {
+ if (event->rcu_pending) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
+
+ rcu_assign_pointer(event->rb, rb);
+
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
+}
+
+static void ring_buffer_wakeup(struct perf_event *event)
+{
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
+ rcu_read_unlock();
+}
+
+static void rb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct ring_buffer *rb;
+
+ rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+ rb_free(rb);
+}
+
+struct ring_buffer *ring_buffer_get(struct perf_event *event)
+{
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (rb) {
+ if (!atomic_inc_not_zero(&rb->refcount))
+ rb = NULL;
+ }
+ rcu_read_unlock();
+
+ return rb;
+}
+
+void ring_buffer_put(struct ring_buffer *rb)
+{
+ if (!atomic_dec_and_test(&rb->refcount))
+ return;
+
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
+
+ call_rcu(&rb->rcu_head, rb_free_rcu);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+ struct perf_event *event = vma->vm_file->private_data;
+
+ atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
+
+ if (vma->vm_pgoff)
+ atomic_inc(&event->rb->aux_mmap_count);
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+}
+
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+ struct perf_event *event = vma->vm_file->private_data;
+
+ struct ring_buffer *rb = ring_buffer_get(event);
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ if (event->pmu->event_unmapped)
+ event->pmu->event_unmapped(event);
+
+ /*
+ * rb->aux_mmap_count will always drop before rb->mmap_count and
+ * event->mmap_count, so it is ok to use event->mmap_mutex to
+ * serialize with perf_mmap here.
+ */
+ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+ rb_free_aux(rb);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ goto out_put;
+
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
+
+ mutex_unlock(&event->mmap_mutex);
+ put_event(event);
+
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
+ }
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+out_put:
+ ring_buffer_put(rb); /* could be last */
+}
+
+static const struct vm_operations_struct perf_mmap_vmops = {
+ .open = perf_mmap_open,
+ .close = perf_mmap_close, /* non mergable */
+ .fault = perf_mmap_fault,
+ .page_mkwrite = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_event *event = file->private_data;
+ unsigned long user_locked, user_lock_limit;
+ struct user_struct *user = current_user();
+ unsigned long locked, lock_limit;
+ struct ring_buffer *rb = NULL;
+ unsigned long vma_size;
+ unsigned long nr_pages;
+ long user_extra = 0, extra = 0;
+ int ret = 0, flags = 0;
+
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same rb.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma_size = vma->vm_end - vma->vm_start;
+
+ if (vma->vm_pgoff == 0) {
+ nr_pages = (vma_size / PAGE_SIZE) - 1;
+ } else {
+ /*
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped, all subsequent mappings should have the same size
+ * and offset. Must be above the normal perf buffer.
+ */
+ u64 aux_offset, aux_size;
+
+ if (!event->rb)
+ return -EINVAL;
+
+ nr_pages = vma_size / PAGE_SIZE;
+
+ mutex_lock(&event->mmap_mutex);
+ ret = -EINVAL;
+
+ rb = event->rb;
+ if (!rb)
+ goto aux_unlock;
+
+ aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
+ aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ goto aux_unlock;
+
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ goto aux_unlock;
+
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ goto aux_unlock;
+
+ if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
+ goto aux_unlock;
+
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ goto aux_unlock;
+
+ if (!is_power_of_2(nr_pages))
+ goto aux_unlock;
+
+ if (!atomic_inc_not_zero(&rb->mmap_count))
+ goto aux_unlock;
+
+ if (rb_has_aux(rb)) {
+ atomic_inc(&rb->aux_mmap_count);
+ ret = 0;
+ goto unlock;
+ }
+
+ atomic_set(&rb->aux_mmap_count, 1);
+ user_extra = nr_pages;
+
+ goto accounting;
+ }
+
+ /*
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
+ */
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ return -EINVAL;
+
+ if (vma_size != PAGE_SIZE * (1 + nr_pages))
+ return -EINVAL;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
+ mutex_lock(&event->mmap_mutex);
+ if (event->rb) {
+ if (event->rb->nr_pages != nr_pages) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
+ goto unlock;
+ }
+
+ user_extra = nr_pages + 1;
+
+accounting:
+ user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+
+ /*
+ * Increase the limit linearly with more CPUs:
+ */
+ user_lock_limit *= num_online_cpus();
+
+ user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+
+ if (user_locked > user_lock_limit)
+ extra = user_locked - user_lock_limit;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = vma->vm_mm->pinned_vm + extra;
+
+ if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ !capable(CAP_IPC_LOCK)) {
+ ret = -EPERM;
+ goto unlock;
+ }
+
+ WARN_ON(!rb && event->rb);
+
+ if (vma->vm_flags & VM_WRITE)
+ flags |= RING_BUFFER_WRITABLE;
+
+ if (!rb) {
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, flags);
+
+ if (!rb) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
+
+ ring_buffer_attach(event, rb);
+
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
+ } else {
+ ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
+ event->attr.aux_watermark, flags);
+ if (!ret)
+ rb->aux_mmap_locked = extra;
+ }
+
+unlock:
+ if (!ret) {
+ atomic_long_add(user_extra, &user->locked_vm);
+ vma->vm_mm->pinned_vm += extra;
+
+ atomic_inc(&event->mmap_count);
+ } else if (rb) {
+ atomic_dec(&rb->mmap_count);
+ }
+aux_unlock:
+ mutex_unlock(&event->mmap_mutex);
+
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vma->vm_ops = &perf_mmap_vmops;
+
+ if (event->pmu->event_mapped)
+ event->pmu->event_mapped(event);
+
+ return ret;
+}
+
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+ struct inode *inode = file_inode(filp);
+ struct perf_event *event = filp->private_data;
+ int retval;
+
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &event->fasync);
+ mutex_unlock(&inode->i_mutex);
+
+ if (retval < 0)
+ return retval;
+
+ return 0;
+}
+
+static const struct file_operations perf_fops = {
+ .llseek = no_llseek,
+ .release = perf_release,
+ .read = perf_read,
+ .poll = perf_poll,
+ .unlocked_ioctl = perf_ioctl,
+ .compat_ioctl = perf_compat_ioctl,
+ .mmap = perf_mmap,
+ .fasync = perf_fasync,
+};
+
+/*
+ * Perf event wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_event_wakeup(struct perf_event *event)
+{
+ ring_buffer_wakeup(event);
+
+ if (event->pending_kill) {
+ kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+ event->pending_kill = 0;
+ }
+}
+
+static void perf_pending_event(struct irq_work *entry)
+{
+ struct perf_event *event = container_of(entry,
+ struct perf_event, pending);
+ int rctx;
+
+ rctx = perf_swevent_get_recursion_context();
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ __perf_event_disable(event);
+ }
+
+ if (event->pending_wakeup) {
+ event->pending_wakeup = 0;
+ perf_event_wakeup(event);
+ }
+
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+}
+
+/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = cbs;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+ perf_guest_cbs = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 mask)
+{
+ int bit;
+
+ for_each_set_bit(bit, (const unsigned long *) &mask,
+ sizeof(mask) * BITS_PER_BYTE) {
+ u64 val;
+
+ val = perf_reg_value(regs, bit);
+ perf_output_put(handle, val);
+ }
+}
+
+static void perf_sample_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ if (user_mode(regs)) {
+ regs_user->abi = perf_reg_abi(current);
+ regs_user->regs = regs;
+ } else if (current->mm) {
+ perf_get_regs_user(regs_user, regs, regs_user_copy);
+ } else {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
+ }
+}
+
+static void perf_sample_regs_intr(struct perf_regs *regs_intr,
+ struct pt_regs *regs)
+{
+ regs_intr->regs = regs;
+ regs_intr->abi = perf_reg_abi(current);
+}
+
+
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take stack vma map and limit this more
+ * precisly, but there's no way to get it safely under interrupt,
+ * so using TASK_SIZE as limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+ unsigned long addr = perf_user_stack_pointer(regs);
+
+ if (!addr || addr >= TASK_SIZE)
+ return 0;
+
+ return TASK_SIZE - addr;
+}
+
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+ struct pt_regs *regs)
+{
+ u64 task_size;
+
+ /* No regs, no stack pointer, no dump. */
+ if (!regs)
+ return 0;
+
+ /*
+ * Check if we fit in with the requested stack size into the:
+ * - TASK_SIZE
+ * If we don't, we limit the size to the TASK_SIZE.
+ *
+ * - remaining sample size
+ * If we don't, we customize the stack size to
+ * fit in to the remaining sample size.
+ */
+
+ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+ stack_size = min(stack_size, (u16) task_size);
+
+ /* Current header size plus static size and dynamic size. */
+ header_size += 2 * sizeof(u64);
+
+ /* Do we fit in with the current stack dump size? */
+ if ((u16) (header_size + stack_size) < header_size) {
+ /*
+ * If we overflow the maximum size for the sample,
+ * we customize the stack dump size to fit in.
+ */
+ stack_size = USHRT_MAX - header_size - sizeof(u64);
+ stack_size = round_up(stack_size, sizeof(u64));
+ }
+
+ return stack_size;
+}
+
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+ struct pt_regs *regs)
+{
+ /* Case of a kernel thread, nothing to dump */
+ if (!regs) {
+ u64 size = 0;
+ perf_output_put(handle, size);
+ } else {
+ unsigned long sp;
+ unsigned int rem;
+ u64 dyn_size;
+
+ /*
+ * We dump:
+ * static size
+ * - the size requested by user or the best one we can fit
+ * in to the sample max size
+ * data
+ * - user stack dump data
+ * dynamic size
+ * - the actual dumped size
+ */
+
+ /* Static size. */
+ perf_output_put(handle, dump_size);
+
+ /* Data. */
+ sp = perf_user_stack_pointer(regs);
+ rem = __output_copy_user(handle, (void *) sp, dump_size);
+ dyn_size = dump_size - rem;
+
+ perf_output_skip(handle, rem);
+
+ /* Dynamic size. */
+ perf_output_put(handle, dyn_size);
+ }
+}
+
+static void __perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ data->type = sample_type;
+ header->size += event->id_header_size;
+
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ data->tid_entry.pid = perf_event_pid(event, current);
+ data->tid_entry.tid = perf_event_tid(event, current);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ data->time = perf_event_clock(event);
+
+ if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data->id = primary_event_id(event);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ data->stream_id = event->id;
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ data->cpu_entry.cpu = raw_smp_processor_id();
+ data->cpu_entry.reserved = 0;
+ }
+}
+
+void perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ if (event->attr.sample_id_all)
+ __perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ u64 sample_type = data->type;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(handle, data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
+}
+
+void perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample)
+{
+ if (event->attr.sample_id_all)
+ __perf_event__output_id_sample(handle, sample);
+}
+
+static void perf_output_read_one(struct perf_output_handle *handle,
+ struct perf_event *event,
+ u64 enabled, u64 running)
+{
+ u64 read_format = event->attr.read_format;
+ u64 values[4];
+ int n = 0;
+
+ values[n++] = perf_event_count(event);
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] = enabled +
+ atomic64_read(&event->child_total_time_enabled);
+ }
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] = running +
+ atomic64_read(&event->child_total_time_running);
+ }
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(event);
+
+ __output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+ struct perf_event *event,
+ u64 enabled, u64 running)
+{
+ struct perf_event *leader = event->group_leader, *sub;
+ u64 read_format = event->attr.read_format;
+ u64 values[5];
+ int n = 0;
+
+ values[n++] = 1 + leader->nr_siblings;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = enabled;
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = running;
+
+ if (leader != event)
+ leader->pmu->read(leader);
+
+ values[n++] = perf_event_count(leader);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(leader);
+
+ __output_copy(handle, values, n * sizeof(u64));
+
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ n = 0;
+
+ if ((sub != event) &&
+ (sub->state == PERF_EVENT_STATE_ACTIVE))
+ sub->pmu->read(sub);
+
+ values[n++] = perf_event_count(sub);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+
+ __output_copy(handle, values, n * sizeof(u64));
+ }
+}
+
+#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
+ PERF_FORMAT_TOTAL_TIME_RUNNING)
+
+static void perf_output_read(struct perf_output_handle *handle,
+ struct perf_event *event)
+{
+ u64 enabled = 0, running = 0, now;
+ u64 read_format = event->attr.read_format;
+
+ /*
+ * compute total_time_enabled, total_time_running
+ * based on snapshot values taken when the event
+ * was last scheduled in.
+ *
+ * we cannot simply called update_context_time()
+ * because of locking issue as we are called in
+ * NMI context
+ */
+ if (read_format & PERF_FORMAT_TOTAL_TIMES)
+ calc_timer_values(event, &now, &enabled, &running);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP)
+ perf_output_read_group(handle, event, enabled, running);
+ else
+ perf_output_read_one(handle, event, enabled, running);
+}
+
+void perf_output_sample(struct perf_output_handle *handle,
+ struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ u64 sample_type = data->type;
+
+ perf_output_put(handle, *header);
+
+ if (sample_type & PERF_SAMPLE_IDENTIFIER)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ perf_output_put(handle, data->ip);
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ perf_output_put(handle, data->addr);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(handle, data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ perf_output_put(handle, data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ perf_output_read(handle, event);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ if (data->callchain) {
+ int size = 1;
+
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ size *= sizeof(u64);
+
+ __output_copy(handle, data->callchain, size);
+ } else {
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ if (data->raw) {
+ perf_output_put(handle, data->raw->size);
+ __output_copy(handle, data->raw->data,
+ data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(handle, raw);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ if (data->br_stack) {
+ size_t size;
+
+ size = data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+
+ perf_output_put(handle, data->br_stack->nr);
+ perf_output_copy(handle, data->br_stack->entries, size);
+ } else {
+ /*
+ * we always store at least the value of nr
+ */
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ u64 abi = data->regs_user.abi;
+
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_user;
+ perf_output_sample_regs(handle,
+ data->regs_user.regs,
+ mask);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ perf_output_sample_ustack(handle,
+ data->stack_user_size,
+ data->regs_user.regs);
+ }
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ perf_output_put(handle, data->weight);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ perf_output_put(handle, data->data_src.val);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ perf_output_put(handle, data->txn);
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ u64 abi = data->regs_intr.abi;
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ perf_output_sample_regs(handle,
+ data->regs_intr.regs,
+ mask);
+ }
+ }
+
+ if (!event->attr.watermark) {
+ int wakeup_events = event->attr.wakeup_events;
+
+ if (wakeup_events) {
+ struct ring_buffer *rb = handle->rb;
+ int events = local_inc_return(&rb->events);
+
+ if (events >= wakeup_events) {
+ local_sub(wakeup_events, &rb->events);
+ local_inc(&rb->wakeup);
+ }
+ }
+ }
+}
+
+void perf_prepare_sample(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = sizeof(*header) + event->header_size;
+
+ header->misc = 0;
+ header->misc |= perf_misc_flags(regs);
+
+ __perf_event_header__init_id(header, data, event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ data->ip = perf_instruction_pointer(regs);
+
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+ int size = 1;
+
+ data->callchain = perf_callchain(event, regs);
+
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ header->size += size * sizeof(u64);
+ }
+
+ if (sample_type & PERF_SAMPLE_RAW) {
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ int size = sizeof(u64); /* nr */
+ if (data->br_stack) {
+ size += data->br_stack->nr
+ * sizeof(struct perf_branch_entry);
+ }
+ header->size += size;
+ }
+
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ perf_sample_regs_user(&data->regs_user, regs,
+ &data->regs_user_copy);
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ if (data->regs_user.regs) {
+ u64 mask = event->attr.sample_regs_user;
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ /*
+ * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * processed as the last one or have additional check added
+ * in case new sample type is added, because we could eat
+ * up the rest of the sample size.
+ */
+ u16 stack_size = event->attr.sample_stack_user;
+ u16 size = sizeof(u64);
+
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ data->regs_user.regs);
+
+ /*
+ * If there is something to dump, add space for the dump
+ * itself and for the field that tells the dynamic size,
+ * which is how many have been actually dumped.
+ */
+ if (stack_size)
+ size += sizeof(u64) + stack_size;
+
+ data->stack_user_size = stack_size;
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_intr(&data->regs_intr, regs);
+
+ if (data->regs_intr.regs) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
+}
+
+static void perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+
+ /* protect the callchain buffers */
+ rcu_read_lock();
+
+ perf_prepare_sample(&header, data, event, regs);
+
+ if (perf_output_begin(&handle, event, header.size))
+ goto exit;
+
+ perf_output_sample(&handle, &header, data, event);
+
+ perf_output_end(&handle);
+
+exit:
+ rcu_read_unlock();
+}
+
+/*
+ * read event_id
+ */
+
+struct perf_read_event {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+};
+
+static void
+perf_event_read_event(struct perf_event *event,
+ struct task_struct *task)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_read_event read_event = {
+ .header = {
+ .type = PERF_RECORD_READ,
+ .misc = 0,
+ .size = sizeof(read_event) + event->read_size,
+ },
+ .pid = perf_event_pid(event, task),
+ .tid = perf_event_tid(event, task),
+ };
+ int ret;
+
+ perf_event_header__init_id(&read_event.header, &sample, event);
+ ret = perf_output_begin(&handle, event, read_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, read_event);
+ perf_output_read(&handle, event);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+ perf_event_aux_output_cb output,
+ void *data)
+{
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
+}
+
+static void
+perf_event_aux(perf_event_aux_output_cb output, void *data,
+ struct perf_event_context *task_ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int ctxn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ goto next;
+ perf_event_aux_ctx(&cpuctx->ctx, output, data);
+ if (task_ctx)
+ goto next;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto next;
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_aux_ctx(ctx, output, data);
+next:
+ put_cpu_ptr(pmu->pmu_cpu_context);
+ }
+
+ if (task_ctx) {
+ preempt_disable();
+ perf_event_aux_ctx(task_ctx, output, data);
+ preempt_enable();
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
+ */
+
+struct perf_task_event {
+ struct task_struct *task;
+ struct perf_event_context *task_ctx;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 ppid;
+ u32 tid;
+ u32 ptid;
+ u64 time;
+ } event_id;
+};
+
+static int perf_event_task_match(struct perf_event *event)
+{
+ return event->attr.comm || event->attr.mmap ||
+ event->attr.mmap2 || event->attr.mmap_data ||
+ event->attr.task;
+}
+
+static void perf_event_task_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_task_event *task_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct task_struct *task = task_event->task;
+ int ret, size = task_event->event_id.header.size;
+
+ if (!perf_event_task_match(event))
+ return;
+
+ perf_event_header__init_id(&task_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ task_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ task_event->event_id.pid = perf_event_pid(event, task);
+ task_event->event_id.ppid = perf_event_pid(event, current);
+
+ task_event->event_id.tid = perf_event_tid(event, task);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+
+ task_event->event_id.time = perf_event_clock(event);
+
+ perf_output_put(&handle, task_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ task_event->event_id.header.size = size;
+}
+
+static void perf_event_task(struct task_struct *task,
+ struct perf_event_context *task_ctx,
+ int new)
+{
+ struct perf_task_event task_event;
+
+ if (!atomic_read(&nr_comm_events) &&
+ !atomic_read(&nr_mmap_events) &&
+ !atomic_read(&nr_task_events))
+ return;
+
+ task_event = (struct perf_task_event){
+ .task = task,
+ .task_ctx = task_ctx,
+ .event_id = {
+ .header = {
+ .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
+ .misc = 0,
+ .size = sizeof(task_event.event_id),
+ },
+ /* .pid */
+ /* .ppid */
+ /* .tid */
+ /* .ptid */
+ /* .time */
+ },
+ };
+
+ perf_event_aux(perf_event_task_output,
+ &task_event,
+ task_ctx);
+}
+
+void perf_event_fork(struct task_struct *task)
+{
+ perf_event_task(task, NULL, 1);
+}
+
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+ struct task_struct *task;
+ char *comm;
+ int comm_size;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ } event_id;
+};
+
+static int perf_event_comm_match(struct perf_event *event)
+{
+ return event->attr.comm;
+}
+
+static void perf_event_comm_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_comm_event *comm_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int size = comm_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_comm_match(event))
+ return;
+
+ perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ comm_event->event_id.header.size);
+
+ if (ret)
+ goto out;
+
+ comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
+ comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
+
+ perf_output_put(&handle, comm_event->event_id);
+ __output_copy(&handle, comm_event->comm,
+ comm_event->comm_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ comm_event->event_id.header.size = size;
+}
+
+static void perf_event_comm_event(struct perf_comm_event *comm_event)
+{
+ char comm[TASK_COMM_LEN];
+ unsigned int size;
+
+ memset(comm, 0, sizeof(comm));
+ strlcpy(comm, comm_event->task->comm, sizeof(comm));
+ size = ALIGN(strlen(comm)+1, sizeof(u64));
+
+ comm_event->comm = comm;
+ comm_event->comm_size = size;
+
+ comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
+
+ perf_event_aux(perf_event_comm_output,
+ comm_event,
+ NULL);
+}
+
+void perf_event_comm(struct task_struct *task, bool exec)
+{
+ struct perf_comm_event comm_event;
+
+ if (!atomic_read(&nr_comm_events))
+ return;
+
+ comm_event = (struct perf_comm_event){
+ .task = task,
+ /* .comm */
+ /* .comm_size */
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_COMM,
+ .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ },
+ };
+
+ perf_event_comm_event(&comm_event);
+}
+
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+ struct vm_area_struct *vma;
+
+ const char *file_name;
+ int file_size;
+ int maj, min;
+ u64 ino;
+ u64 ino_generation;
+ u32 prot, flags;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 start;
+ u64 len;
+ u64 pgoff;
+ } event_id;
+};
+
+static int perf_event_mmap_match(struct perf_event *event,
+ void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ struct vm_area_struct *vma = mmap_event->vma;
+ int executable = vma->vm_flags & VM_EXEC;
+
+ return (!executable && event->attr.mmap_data) ||
+ (executable && (event->attr.mmap || event->attr.mmap2));
+}
+
+static void perf_event_mmap_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_mmap_event *mmap_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int size = mmap_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_mmap_match(event, data))
+ return;
+
+ if (event->attr.mmap2) {
+ mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
+ mmap_event->event_id.header.size += sizeof(mmap_event->maj);
+ mmap_event->event_id.header.size += sizeof(mmap_event->min);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino);
+ mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+ mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+ mmap_event->event_id.header.size += sizeof(mmap_event->flags);
+ }
+
+ perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ mmap_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ mmap_event->event_id.pid = perf_event_pid(event, current);
+ mmap_event->event_id.tid = perf_event_tid(event, current);
+
+ perf_output_put(&handle, mmap_event->event_id);
+
+ if (event->attr.mmap2) {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ perf_output_put(&handle, mmap_event->prot);
+ perf_output_put(&handle, mmap_event->flags);
+ }
+
+ __output_copy(&handle, mmap_event->file_name,
+ mmap_event->file_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ mmap_event->event_id.header.size = size;
+}
+
+static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
+{
+ struct vm_area_struct *vma = mmap_event->vma;
+ struct file *file = vma->vm_file;
+ int maj = 0, min = 0;
+ u64 ino = 0, gen = 0;
+ u32 prot = 0, flags = 0;
+ unsigned int size;
+ char tmp[16];
+ char *buf = NULL;
+ char *name;
+
+ if (file) {
+ struct inode *inode;
+ dev_t dev;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ name = "//enomem";
+ goto cpy_name;
+ }
+ /*
+ * d_path() works from the end of the rb backwards, so we
+ * need to add enough zero bytes after the string to handle
+ * the 64bit alignment we do later.
+ */
+ name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+ if (IS_ERR(name)) {
+ name = "//toolong";
+ goto cpy_name;
+ }
+ inode = file_inode(vma->vm_file);
+ dev = inode->i_sb->s_dev;
+ ino = inode->i_ino;
+ gen = inode->i_generation;
+ maj = MAJOR(dev);
+ min = MINOR(dev);
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
+ goto got_name;
+ } else {
+ if (vma->vm_ops && vma->vm_ops->name) {
+ name = (char *) vma->vm_ops->name(vma);
+ if (name)
+ goto cpy_name;
+ }
+
+ name = (char *)arch_vma_name(vma);
+ if (name)
+ goto cpy_name;
+
+ if (vma->vm_start <= vma->vm_mm->start_brk &&
+ vma->vm_end >= vma->vm_mm->brk) {
+ name = "[heap]";
+ goto cpy_name;
+ }
+ if (vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack) {
+ name = "[stack]";
+ goto cpy_name;
+ }
+
+ name = "//anon";
+ goto cpy_name;
+ }
+
+cpy_name:
+ strlcpy(tmp, name, sizeof(tmp));
+ name = tmp;
+got_name:
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(name)+1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ name[size++] = '\0';
+
+ mmap_event->file_name = name;
+ mmap_event->file_size = size;
+ mmap_event->maj = maj;
+ mmap_event->min = min;
+ mmap_event->ino = ino;
+ mmap_event->ino_generation = gen;
+ mmap_event->prot = prot;
+ mmap_event->flags = flags;
+
+ if (!(vma->vm_flags & VM_EXEC))
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
+
+ mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+
+ perf_event_aux(perf_event_mmap_output,
+ mmap_event,
+ NULL);
+
+ kfree(buf);
+}
+
+void perf_event_mmap(struct vm_area_struct *vma)
+{
+ struct perf_mmap_event mmap_event;
+
+ if (!atomic_read(&nr_mmap_events))
+ return;
+
+ mmap_event = (struct perf_mmap_event){
+ .vma = vma,
+ /* .file_name */
+ /* .file_size */
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_MMAP,
+ .misc = PERF_RECORD_MISC_USER,
+ /* .size */
+ },
+ /* .pid */
+ /* .tid */
+ .start = vma->vm_start,
+ .len = vma->vm_end - vma->vm_start,
+ .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
+ },
+ /* .maj (attr_mmap2 only) */
+ /* .min (attr_mmap2 only) */
+ /* .ino (attr_mmap2 only) */
+ /* .ino_generation (attr_mmap2 only) */
+ /* .prot (attr_mmap2 only) */
+ /* .flags (attr_mmap2 only) */
+ };
+
+ perf_event_mmap_event(&mmap_event);
+}
+
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+ unsigned long size, u64 flags)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u64 offset;
+ u64 size;
+ u64 flags;
+ } rec = {
+ .header = {
+ .type = PERF_RECORD_AUX,
+ .misc = 0,
+ .size = sizeof(rec),
+ },
+ .offset = head,
+ .size = size,
+ .flags = flags,
+ };
+ int ret;
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_event *event, int enable)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 time;
+ u64 id;
+ u64 stream_id;
+ } throttle_event = {
+ .header = {
+ .type = PERF_RECORD_THROTTLE,
+ .misc = 0,
+ .size = sizeof(throttle_event),
+ },
+ .time = perf_event_clock(event),
+ .id = primary_event_id(event),
+ .stream_id = event->id,
+ };
+
+ if (enable)
+ throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
+
+ perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ throttle_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, throttle_event);
+ perf_event__output_id_sample(event, &handle, &sample);
+ perf_output_end(&handle);
+}
+
+static void perf_log_itrace_start(struct perf_event *event)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ } rec;
+ int ret;
+
+ if (event->parent)
+ event = event->parent;
+
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
+ event->hw.itrace_started)
+ return;
+
+ event->hw.itrace_started = 1;
+
+ rec.header.type = PERF_RECORD_ITRACE_START;
+ rec.header.misc = 0;
+ rec.header.size = sizeof(rec);
+ rec.pid = perf_event_pid(event, current);
+ rec.tid = perf_event_tid(event, current);
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 seq;
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ seq = __this_cpu_read(perf_throttled_seq);
+ if (seq != hwc->interrupts_seq) {
+ hwc->interrupts_seq = seq;
+ hwc->interrupts = 1;
+ } else {
+ hwc->interrupts++;
+ if (unlikely(throttle
+ && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
+ tick_nohz_full_kick();
+ ret = 1;
+ }
+ }
+
+ if (event->attr.freq) {
+ u64 now = perf_clock();
+ s64 delta = now - hwc->freq_time_stamp;
+
+ hwc->freq_time_stamp = now;
+
+ if (delta > 0 && delta < 2*TICK_NSEC)
+ perf_adjust_period(event, delta, hwc->last_period, true);
+ }
+
+ /*
+ * XXX event_limit might not quite work as expected on inherited
+ * events
+ */
+
+ event->pending_kill = POLL_IN;
+ if (events && atomic_dec_and_test(&event->event_limit)) {
+ ret = 1;
+ event->pending_kill = POLL_HUP;
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending);
+ }
+
+ if (event->overflow_handler)
+ event->overflow_handler(event, data, regs);
+ else
+ perf_event_output(event, data, regs);
+
+ if (event->fasync && event->pending_kill) {
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending);
+ }
+
+ return ret;
+}
+
+int perf_event_overflow(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return __perf_event_overflow(event, 1, data, regs);
+}
+
+/*
+ * Generic software event infrastructure
+ */
+
+struct swevent_htable {
+ struct swevent_hlist *swevent_hlist;
+ struct mutex hlist_mutex;
+ int hlist_refcount;
+
+ /* Recursion avoidance in each contexts */
+ int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
+/*
+ * We directly increment event->count and keep a second value in
+ * event->hw.period_left to count intervals. This period event
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+u64 perf_swevent_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 period = hwc->last_period;
+ u64 nr, offset;
+ s64 old, val;
+
+ hwc->last_period = hwc->sample_period;
+
+again:
+ old = val = local64_read(&hwc->period_left);
+ if (val < 0)
+ return 0;
+
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ if (local64_cmpxchg(&hwc->period_left, old, val) != old)
+ goto again;
+
+ return nr;
+}
+
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int throttle = 0;
+
+ if (!overflow)
+ overflow = perf_swevent_set_period(event);
+
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ return;
+
+ for (; overflow; overflow--) {
+ if (__perf_event_overflow(event, throttle,
+ data, regs)) {
+ /*
+ * We inhibit the overflow from happening when
+ * hwc->interrupts == MAX_INTERRUPTS.
+ */
+ break;
+ }
+ throttle = 1;
+ }
+}
+
+static void perf_swevent_event(struct perf_event *event, u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ local64_add(nr, &event->count);
+
+ if (!regs)
+ return;
+
+ if (!is_sampling_event(event))
+ return;
+
+ if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+ data->period = nr;
+ return perf_swevent_overflow(event, 1, data, regs);
+ } else
+ data->period = event->hw.last_period;
+
+ if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+ return perf_swevent_overflow(event, 1, data, regs);
+
+ if (local64_add_negative(nr, &hwc->period_left))
+ return;
+
+ perf_swevent_overflow(event, 0, data, regs);
+}
+
+static int perf_exclude_event(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 1;
+
+ if (regs) {
+ if (event->attr.exclude_user && user_mode(regs))
+ return 1;
+
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int perf_swevent_match(struct perf_event *event,
+ enum perf_type_id type,
+ u32 event_id,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->attr.type != type)
+ return 0;
+
+ if (event->attr.config != event_id)
+ return 0;
+
+ if (perf_exclude_event(event, regs))
+ return 0;
+
+ return 1;
+}
+
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+ u64 val = event_id | (type << 32);
+
+ return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static inline struct hlist_head *
+__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
+{
+ u64 hash = swevent_hash(type, event_id);
+
+ return &hlist->heads[hash];
+}
+
+/* For the read side: events when they trigger */
+static inline struct hlist_head *
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
+{
+ struct swevent_hlist *hlist;
+
+ hlist = rcu_dereference(swhash->swevent_hlist);
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+/* For the event head insertion and removal in the hlist */
+static inline struct hlist_head *
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+{
+ struct swevent_hlist *hlist;
+ u32 event_id = event->attr.config;
+ u64 type = event->attr.type;
+
+ /*
+ * Event scheduling is always serialized against hlist allocation
+ * and release. Which makes the protected version suitable here.
+ * The context lock guarantees that.
+ */
+ hlist = rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&event->ctx->lock));
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+ u64 nr,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ struct perf_event *event;
+ struct hlist_head *head;
+
+ rcu_read_lock();
+ head = find_swevent_head_rcu(swhash, type, event_id);
+ if (!head)
+ goto end;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_swevent_match(event, type, event_id, data, regs))
+ perf_swevent_event(event, nr, data, regs);
+ }
+end:
+ rcu_read_unlock();
+}
+
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
+int perf_swevent_get_recursion_context(void)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+ return get_recursion_context(swhash->recursion);
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+inline void perf_swevent_put_recursion_context(int rctx)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+
+ put_recursion_context(swhash->recursion, rctx);
+}
+
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ struct perf_sample_data data;
+
+ if (WARN_ON_ONCE(!regs))
+ return;
+
+ perf_sample_data_init(&data, addr, 0);
+ do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+ int rctx;
+
+ preempt_disable_notrace();
+ rctx = perf_swevent_get_recursion_context();
+ if (unlikely(rctx < 0))
+ goto fail;
+
+ ___perf_sw_event(event_id, nr, regs, addr);
+
+ perf_swevent_put_recursion_context(rctx);
+fail:
+ preempt_enable_notrace();
+}
+
+static void perf_swevent_read(struct perf_event *event)
+{
+}
+
+static int perf_swevent_add(struct perf_event *event, int flags)
+{
+ struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
+ struct hw_perf_event *hwc = &event->hw;
+ struct hlist_head *head;
+
+ if (is_sampling_event(event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(event);
+ }
+
+ hwc->state = !(flags & PERF_EF_START);
+
+ head = find_swevent_head(swhash, event);
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
+ return -EINVAL;
+ }
+
+ hlist_add_head_rcu(&event->hlist_entry, head);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void perf_swevent_del(struct perf_event *event, int flags)
+{
+ hlist_del_rcu(&event->hlist_entry);
+}
+
+static void perf_swevent_start(struct perf_event *event, int flags)
+{
+ event->hw.state = 0;
+}
+
+static void perf_swevent_stop(struct perf_event *event, int flags)
+{
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct swevent_htable *swhash)
+{
+ return rcu_dereference_protected(swhash->swevent_hlist,
+ lockdep_is_held(&swhash->hlist_mutex));
+}
+
+static void swevent_hlist_release(struct swevent_htable *swhash)
+{
+ struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
+
+ if (!hlist)
+ return;
+
+ RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
+ kfree_rcu(hlist, rcu_head);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!--swhash->hlist_refcount)
+ swevent_hlist_release(swhash);
+
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ int err = 0;
+
+ mutex_lock(&swhash->hlist_mutex);
+
+ if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+ if (!hlist) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ swhash->hlist_refcount++;
+exit:
+ mutex_unlock(&swhash->hlist_mutex);
+
+ return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+ int err;
+ int cpu, failed_cpu;
+
+ get_online_cpus();
+ for_each_possible_cpu(cpu) {
+ err = swevent_hlist_get_cpu(event, cpu);
+ if (err) {
+ failed_cpu = cpu;
+ goto fail;
+ }
+ }
+ put_online_cpus();
+
+ return 0;
+fail:
+ for_each_possible_cpu(cpu) {
+ if (cpu == failed_cpu)
+ break;
+ swevent_hlist_put_cpu(event, cpu);
+ }
+
+ put_online_cpus();
+ return err;
+}
+
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ WARN_ON(event->parent);
+
+ static_key_slow_dec(&perf_swevent_enabled[event_id]);
+ swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ switch (event_id) {
+ case PERF_COUNT_SW_CPU_CLOCK:
+ case PERF_COUNT_SW_TASK_CLOCK:
+ return -ENOENT;
+
+ default:
+ break;
+ }
+
+ if (event_id >= PERF_COUNT_SW_MAX)
+ return -ENOENT;
+
+ if (!event->parent) {
+ int err;
+
+ err = swevent_hlist_get(event);
+ if (err)
+ return err;
+
+ static_key_slow_inc(&perf_swevent_enabled[event_id]);
+ event->destroy = sw_perf_event_destroy;
+ }
+
+ return 0;
+}
+
+static struct pmu perf_swevent = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = perf_swevent_init,
+ .add = perf_swevent_add,
+ .del = perf_swevent_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
+#ifdef CONFIG_EVENT_TRACING
+
+static int perf_tp_filter_match(struct perf_event *event,
+ struct perf_sample_data *data)
+{
+ void *record = data->raw->data;
+
+ if (likely(!event->filter) || filter_match_preds(event->filter, record))
+ return 1;
+ return 0;
+}
+
+static int perf_tp_event_match(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
+ /*
+ * All tracepoints are from kernel-space.
+ */
+ if (event->attr.exclude_kernel)
+ return 0;
+
+ if (!perf_tp_filter_match(event, data))
+ return 0;
+
+ return 1;
+}
+
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+ struct pt_regs *regs, struct hlist_head *head, int rctx,
+ struct task_struct *task)
+{
+ struct perf_sample_data data;
+ struct perf_event *event;
+
+ struct perf_raw_record raw = {
+ .size = entry_size,
+ .data = record,
+ };
+
+ perf_sample_data_init(&data, addr, 0);
+ data.raw = &raw;
+
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+
+ /*
+ * If we got specified a target task, also iterate its context and
+ * deliver this event there too.
+ */
+ if (task && task != current) {
+ struct perf_event_context *ctx;
+ struct trace_entry *entry = record;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ if (!ctx)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+unlock:
+ rcu_read_unlock();
+ }
+
+ perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ char *filter_str;
+ int ret;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+ ftrace_profile_free_filter(event);
+}
+
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ struct bpf_prog *prog;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+
+ if (event->tp_event->prog)
+ return -EEXIST;
+
+ if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+ /* bpf programs can only be attached to kprobes */
+ return -EINVAL;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type != BPF_PROG_TYPE_KPROBE) {
+ /* valid fd, but invalid bpf program type */
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ event->tp_event->prog = prog;
+
+ return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog *prog;
+
+ if (!event->tp_event)
+ return;
+
+ prog = event->tp_event->prog;
+ if (prog) {
+ event->tp_event->prog = NULL;
+ bpf_prog_put(prog);
+ }
+}
+
+#else
+
+static inline void perf_tp_register(void)
+{
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ return -ENOENT;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+ return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
+#endif /* CONFIG_EVENT_TRACING */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+ struct perf_sample_data sample;
+ struct pt_regs *regs = data;
+
+ perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
+
+ if (!bp->hw.state && !perf_exclude_event(bp, regs))
+ perf_swevent_event(bp, 1, &sample, regs);
+}
+#endif
+
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct pt_regs *regs;
+ struct perf_event *event;
+ u64 period;
+
+ event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
+ event->pmu->read(event);
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ regs = get_irq_regs();
+
+ if (regs && !perf_exclude_event(event, regs)) {
+ if (!(event->attr.exclude_idle && is_idle_task(current)))
+ if (__perf_event_overflow(event, 1, &data, regs))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, event->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+ return ret;
+}
+
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 period;
+
+ if (!is_sampling_event(event))
+ return;
+
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
+
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (is_sampling_event(event)) {
+ ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+ local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+ hrtimer_cancel(&hwc->hrtimer);
+ }
+}
+
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_sampling_event(event))
+ return;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
+ event->attr.freq = 0;
+ }
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
+{
+ s64 prev;
+ u64 now;
+
+ now = local_clock();
+ prev = local64_xchg(&event->hw.prev_count, now);
+ local64_add(now - prev, &event->count);
+}
+
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, local_clock());
+ perf_swevent_start_hrtimer(event);
+}
+
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ cpu_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+ cpu_clock_event_stop(event, flags);
+}
+
+static void cpu_clock_event_read(struct perf_event *event)
+{
+ cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_cpu_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = cpu_clock_event_init,
+ .add = cpu_clock_event_add,
+ .del = cpu_clock_event_del,
+ .start = cpu_clock_event_start,
+ .stop = cpu_clock_event_stop,
+ .read = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+ u64 prev;
+ s64 delta;
+
+ prev = local64_xchg(&event->hw.prev_count, now);
+ delta = now - prev;
+ local64_add(delta, &event->count);
+}
+
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+ local64_set(&event->hw.prev_count, event->ctx->time);
+ perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+ perf_swevent_cancel_hrtimer(event);
+ task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ task_clock_event_start(event, flags);
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+ task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+ u64 now = perf_clock();
+ u64 delta = now - event->ctx->timestamp;
+ u64 time = event->ctx->time + delta;
+
+ task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for software events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ perf_swevent_init_hrtimer(event);
+
+ return 0;
+}
+
+static struct pmu perf_task_clock = {
+ .task_ctx_nr = perf_sw_context,
+
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
+ .event_init = task_clock_event_init,
+ .add = task_clock_event_add,
+ .del = task_clock_event_del,
+ .start = task_clock_event_start,
+ .stop = task_clock_event_stop,
+ .read = task_clock_event_read,
+};
+
+static void perf_pmu_nop_void(struct pmu *pmu)
+{
+}
+
+static int perf_pmu_nop_int(struct pmu *pmu)
+{
+ return 0;
+}
+
+static void perf_pmu_start_txn(struct pmu *pmu)
+{
+ perf_pmu_disable(pmu);
+}
+
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+ perf_pmu_enable(pmu);
+ return 0;
+}
+
+static void perf_pmu_cancel_txn(struct pmu *pmu)
+{
+ perf_pmu_enable(pmu);
+}
+
+static int perf_event_idx_default(struct perf_event *event)
+{
+ return 0;
+}
+
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
+{
+ struct pmu *pmu;
+
+ if (ctxn < 0)
+ return NULL;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (pmu->task_ctx_nr == ctxn)
+ return pmu->pmu_cpu_context;
+ }
+
+ return NULL;
+}
+
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
+ }
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+ struct pmu *i;
+
+ mutex_lock(&pmus_lock);
+ /*
+ * Like a real lame refcount.
+ */
+ list_for_each_entry(i, &pmus, entry) {
+ if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+ update_pmu_context(i, pmu);
+ goto out;
+ }
+ }
+
+ free_percpu(pmu->pmu_cpu_context);
+out:
+ mutex_unlock(&pmus_lock);
+}
+static struct idr pmu_idr;
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+static DEVICE_ATTR_RO(type);
+
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ int timer, cpu, ret;
+
+ ret = kstrtoint(buf, 0, &timer);
+ if (ret)
+ return ret;
+
+ if (timer < 1)
+ return -EINVAL;
+
+ /* same value, noting to do */
+ if (timer == pmu->hrtimer_interval_ms)
+ return count;
+
+ pmu->hrtimer_interval_ms = timer;
+
+ /* update all cpuctx for this PMU */
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+ if (hrtimer_active(&cpuctx->hrtimer))
+ hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ }
+
+ return count;
+}
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+
+static struct attribute *pmu_dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_perf_event_mux_interval_ms.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(pmu_dev);
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+ .name = "event_source",
+ .dev_groups = pmu_dev_groups,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+ int ret = -ENOMEM;
+
+ pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!pmu->dev)
+ goto out;
+
+ pmu->dev->groups = pmu->attr_groups;
+ device_initialize(pmu->dev);
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
+ dev_set_drvdata(pmu->dev, pmu);
+ pmu->dev->bus = &pmu_bus;
+ pmu->dev->release = pmu_dev_release;
+ ret = device_add(pmu->dev);
+ if (ret)
+ goto free_dev;
+
+out:
+ return ret;
+
+free_dev:
+ put_device(pmu->dev);
+ goto out;
+}
+
+static struct lock_class_key cpuctx_mutex;
+static struct lock_class_key cpuctx_lock;
+
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
+{
+ int cpu, ret;
+
+ mutex_lock(&pmus_lock);
+ ret = -ENOMEM;
+ pmu->pmu_disable_count = alloc_percpu(int);
+ if (!pmu->pmu_disable_count)
+ goto unlock;
+
+ pmu->type = -1;
+ if (!name)
+ goto skip_type;
+ pmu->name = name;
+
+ if (type < 0) {
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
+ goto free_pdc;
+ }
+ }
+ pmu->type = type;
+
+ if (pmu_bus_running) {
+ ret = pmu_dev_alloc(pmu);
+ if (ret)
+ goto free_idr;
+ }
+
+skip_type:
+ pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
+ if (pmu->pmu_cpu_context)
+ goto got_cpu_context;
+
+ ret = -ENOMEM;
+ pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
+ if (!pmu->pmu_cpu_context)
+ goto free_dev;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_context *cpuctx;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->ctx.pmu = pmu;
+
+ __perf_cpu_hrtimer_init(cpuctx, cpu);
+
+ cpuctx->unique_pmu = pmu;
+ }
+
+got_cpu_context:
+ if (!pmu->start_txn) {
+ if (pmu->pmu_enable) {
+ /*
+ * If we have pmu_enable/pmu_disable calls, install
+ * transaction stubs that use that to try and batch
+ * hardware accesses.
+ */
+ pmu->start_txn = perf_pmu_start_txn;
+ pmu->commit_txn = perf_pmu_commit_txn;
+ pmu->cancel_txn = perf_pmu_cancel_txn;
+ } else {
+ pmu->start_txn = perf_pmu_nop_void;
+ pmu->commit_txn = perf_pmu_nop_int;
+ pmu->cancel_txn = perf_pmu_nop_void;
+ }
+ }
+
+ if (!pmu->pmu_enable) {
+ pmu->pmu_enable = perf_pmu_nop_void;
+ pmu->pmu_disable = perf_pmu_nop_void;
+ }
+
+ if (!pmu->event_idx)
+ pmu->event_idx = perf_event_idx_default;
+
+ list_add_rcu(&pmu->entry, &pmus);
+ atomic_set(&pmu->exclusive_cnt, 0);
+ ret = 0;
+unlock:
+ mutex_unlock(&pmus_lock);
+
+ return ret;
+
+free_dev:
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+
+free_idr:
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+
+free_pdc:
+ free_percpu(pmu->pmu_disable_count);
+ goto unlock;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
+
+void perf_pmu_unregister(struct pmu *pmu)
+{
+ mutex_lock(&pmus_lock);
+ list_del_rcu(&pmu->entry);
+ mutex_unlock(&pmus_lock);
+
+ /*
+ * We dereference the pmu list under both SRCU and regular RCU, so
+ * synchronize against both of those.
+ */
+ synchronize_srcu(&pmus_srcu);
+ synchronize_rcu();
+
+ free_percpu(pmu->pmu_disable_count);
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ free_pmu_context(pmu);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+
+static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx = NULL;
+ int ret;
+
+ if (!try_module_get(pmu->module))
+ return -ENODEV;
+
+ if (event->group_leader != event) {
+ /*
+ * This ctx->mutex can nest when we're called through
+ * inheritance. See the perf_event_ctx_lock_nested() comment.
+ */
+ ctx = perf_event_ctx_lock_nested(event->group_leader,
+ SINGLE_DEPTH_NESTING);
+ BUG_ON(!ctx);
+ }
+
+ event->pmu = pmu;
+ ret = pmu->event_init(event);
+
+ if (ctx)
+ perf_event_ctx_unlock(event->group_leader, ctx);
+
+ if (ret)
+ module_put(pmu->module);
+
+ return ret;
+}
+
+struct pmu *perf_init_event(struct perf_event *event)
+{
+ struct pmu *pmu = NULL;
+ int idx;
+ int ret;
+
+ idx = srcu_read_lock(&pmus_srcu);
+
+ rcu_read_lock();
+ pmu = idr_find(&pmu_idr, event->attr.type);
+ rcu_read_unlock();
+ if (pmu) {
+ ret = perf_try_init_event(pmu, event);
+ if (ret)
+ pmu = ERR_PTR(ret);
+ goto unlock;
+ }
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ ret = perf_try_init_event(pmu, event);
+ if (!ret)
+ goto unlock;
+
+ if (ret != -ENOENT) {
+ pmu = ERR_PTR(ret);
+ goto unlock;
+ }
+ }
+ pmu = ERR_PTR(-ENOENT);
+unlock:
+ srcu_read_unlock(&pmus_srcu, idx);
+
+ return pmu;
+}
+
+static void account_event_cpu(struct perf_event *event, int cpu)
+{
+ if (event->parent)
+ return;
+
+ if (is_cgroup_event(event))
+ atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+}
+
+static void account_event(struct perf_event *event)
+{
+ if (event->parent)
+ return;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ static_key_slow_inc(&perf_sched_events.key);
+ if (event->attr.mmap || event->attr.mmap_data)
+ atomic_inc(&nr_mmap_events);
+ if (event->attr.comm)
+ atomic_inc(&nr_comm_events);
+ if (event->attr.task)
+ atomic_inc(&nr_task_events);
+ if (event->attr.freq) {
+ if (atomic_inc_return(&nr_freq_events) == 1)
+ tick_nohz_full_kick_all();
+ }
+ if (has_branch_stack(event))
+ static_key_slow_inc(&perf_sched_events.key);
+ if (is_cgroup_event(event))
+ static_key_slow_inc(&perf_sched_events.key);
+
+ account_event_cpu(event, event->cpu);
+}
+
+/*
+ * Allocate and initialize a event structure
+ */
+static struct perf_event *
+perf_event_alloc(struct perf_event_attr *attr, int cpu,
+ struct task_struct *task,
+ struct perf_event *group_leader,
+ struct perf_event *parent_event,
+ perf_overflow_handler_t overflow_handler,
+ void *context, int cgroup_fd)
+{
+ struct pmu *pmu;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ long err = -EINVAL;
+
+ if ((unsigned)cpu >= nr_cpu_ids) {
+ if (!task || cpu != -1)
+ return ERR_PTR(-EINVAL);
+ }
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Single events are their own group leaders, with an
+ * empty sibling list:
+ */
+ if (!group_leader)
+ group_leader = event;
+
+ mutex_init(&event->child_mutex);
+ INIT_LIST_HEAD(&event->child_list);
+
+ INIT_LIST_HEAD(&event->group_entry);
+ INIT_LIST_HEAD(&event->event_entry);
+ INIT_LIST_HEAD(&event->sibling_list);
+ INIT_LIST_HEAD(&event->rb_entry);
+ INIT_LIST_HEAD(&event->active_entry);
+ INIT_HLIST_NODE(&event->hlist_entry);
+
+
+ init_waitqueue_head(&event->waitq);
+ init_irq_work(&event->pending, perf_pending_event);
+
+ mutex_init(&event->mmap_mutex);
+
+ atomic_long_set(&event->refcount, 1);
+ event->cpu = cpu;
+ event->attr = *attr;
+ event->group_leader = group_leader;
+ event->pmu = NULL;
+ event->oncpu = -1;
+
+ event->parent = parent_event;
+
+ event->ns = get_pid_ns(task_active_pid_ns(current));
+ event->id = atomic64_inc_return(&perf_event_id);
+
+ event->state = PERF_EVENT_STATE_INACTIVE;
+
+ if (task) {
+ event->attach_state = PERF_ATTACH_TASK;
+ /*
+ * XXX pmu::event_init needs to know what task to account to
+ * and we cannot use the ctx information because we need the
+ * pmu before we get a ctx.
+ */
+ event->hw.target = task;
+ }
+
+ event->clock = &local_clock;
+ if (parent_event)
+ event->clock = parent_event->clock;
+
+ if (!overflow_handler && parent_event) {
+ overflow_handler = parent_event->overflow_handler;
+ context = parent_event->overflow_handler_context;
+ }
+
+ event->overflow_handler = overflow_handler;
+ event->overflow_handler_context = context;
+
+ perf_event__state_init(event);
+
+ pmu = NULL;
+
+ hwc = &event->hw;
+ hwc->sample_period = attr->sample_period;
+ if (attr->freq && attr->sample_freq)
+ hwc->sample_period = 1;
+ hwc->last_period = hwc->sample_period;
+
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ /*
+ * we currently do not support PERF_FORMAT_GROUP on inherited events
+ */
+ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ goto err_ns;
+
+ if (!has_branch_stack(event))
+ event->attr.branch_sample_type = 0;
+
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ goto err_ns;
+ }
+
+ pmu = perf_init_event(event);
+ if (!pmu)
+ goto err_ns;
+ else if (IS_ERR(pmu)) {
+ err = PTR_ERR(pmu);
+ goto err_ns;
+ }
+
+ err = exclusive_event_init(event);
+ if (err)
+ goto err_pmu;
+
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+ err = get_callchain_buffers();
+ if (err)
+ goto err_per_task;
+ }
+ }
+
+ return event;
+
+err_per_task:
+ exclusive_event_destroy(event);
+
+err_pmu:
+ if (event->destroy)
+ event->destroy(event);
+ module_put(pmu->module);
+err_ns:
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+ if (event->ns)
+ put_pid_ns(event->ns);
+ kfree(event);
+
+ return ERR_PTR(err);
+}
+
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+ struct perf_event_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = PERF_ATTR_SIZE_VER0;
+
+ if (size < PERF_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ if (attr->__reserved_1)
+ return -EINVAL;
+
+ if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
+ return -EINVAL;
+
+ if (attr->read_format & ~(PERF_FORMAT_MAX-1))
+ return -EINVAL;
+
+ if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ u64 mask = attr->branch_sample_type;
+
+ /* only using defined bits */
+ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+ return -EINVAL;
+
+ /* at least one branch bit must be set */
+ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+ return -EINVAL;
+
+ /* propagate priv level, when not set for branch */
+ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+ /* exclude_kernel checked on syscall entry */
+ if (!attr->exclude_kernel)
+ mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ if (!attr->exclude_user)
+ mask |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!attr->exclude_hv)
+ mask |= PERF_SAMPLE_BRANCH_HV;
+ /*
+ * adjust user setting (for HW filter setup)
+ */
+ attr->branch_sample_type = mask;
+ }
+ /* privileged levels capture (kernel, hv): check permissions */
+ if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+ && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+ ret = perf_reg_validate(attr->sample_regs_user);
+ if (ret)
+ return ret;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+ if (!arch_perf_have_user_stack_dump())
+ return -ENOSYS;
+
+ /*
+ * We have __u32 type for the size, but so far
+ * we can only use __u16 as maximum due to the
+ * __u16 sample size limit.
+ */
+ if (attr->sample_stack_user >= USHRT_MAX)
+ ret = -EINVAL;
+ else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+ ret = -EINVAL;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
+ ret = perf_reg_validate(attr->sample_regs_intr);
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
+
+static int
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
+{
+ struct ring_buffer *rb = NULL;
+ int ret = -EINVAL;
+
+ if (!output_event)
+ goto set;
+
+ /* don't allow circular references */
+ if (event == output_event)
+ goto out;
+
+ /*
+ * Don't allow cross-cpu buffers
+ */
+ if (output_event->cpu != event->cpu)
+ goto out;
+
+ /*
+ * If its not a per-cpu rb, it must be the same task.
+ */
+ if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ goto out;
+
+ /*
+ * Mixing clocks in the same buffer is trouble you don't need.
+ */
+ if (output_event->clock != event->clock)
+ goto out;
+
+ /*
+ * If both events generate aux data, they must be on the same PMU
+ */
+ if (has_aux(event) && has_aux(output_event) &&
+ event->pmu != output_event->pmu)
+ goto out;
+
+set:
+ mutex_lock(&event->mmap_mutex);
+ /* Can't redirect output if we've got an active mmap() */
+ if (atomic_read(&event->mmap_count))
+ goto unlock;
+
+ if (output_event) {
+ /* get the rb we want to redirect to */
+ rb = ring_buffer_get(output_event);
+ if (!rb)
+ goto unlock;
+ }
+
+ ring_buffer_attach(event, rb);
+
+ ret = 0;
+unlock:
+ mutex_unlock(&event->mmap_mutex);
+
+out:
+ return ret;
+}
+
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+ bool nmi_safe = false;
+
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ event->clock = &ktime_get_mono_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_MONOTONIC_RAW:
+ event->clock = &ktime_get_raw_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_REALTIME:
+ event->clock = &ktime_get_real_ns;
+ break;
+
+ case CLOCK_BOOTTIME:
+ event->clock = &ktime_get_boot_ns;
+ break;
+
+ case CLOCK_TAI:
+ event->clock = &ktime_get_tai_ns;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * sys_perf_event_open - open a performance event, associate it to a task/cpu
+ *
+ * @attr_uptr: event_id type attributes for monitoring/sampling
+ * @pid: target pid
+ * @cpu: target cpu
+ * @group_fd: group leader event fd
+ */
+SYSCALL_DEFINE5(perf_event_open,
+ struct perf_event_attr __user *, attr_uptr,
+ pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+ struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event *event, *sibling;
+ struct perf_event_attr attr;
+ struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct file *event_file = NULL;
+ struct fd group = {NULL, 0};
+ struct task_struct *task = NULL;
+ struct pmu *pmu;
+ int event_fd;
+ int move_group = 0;
+ int err;
+ int f_flags = O_RDWR;
+ int cgroup_fd = -1;
+
+ /* for future expandability... */
+ if (flags & ~PERF_FLAG_ALL)
+ return -EINVAL;
+
+ err = perf_copy_attr(attr_uptr, &attr);
+ if (err)
+ return err;
+
+ if (!attr.exclude_kernel) {
+ if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ if (attr.freq) {
+ if (attr.sample_freq > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
+ }
+
+ /*
+ * In cgroup mode, the pid argument is used to pass the fd
+ * opened to the cgroup directory in cgroupfs. The cpu argument
+ * designates the cpu on which to monitor threads from that
+ * cgroup.
+ */
+ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+ return -EINVAL;
+
+ if (flags & PERF_FLAG_FD_CLOEXEC)
+ f_flags |= O_CLOEXEC;
+
+ event_fd = get_unused_fd_flags(f_flags);
+ if (event_fd < 0)
+ return event_fd;
+
+ if (group_fd != -1) {
+ err = perf_fget_light(group_fd, &group);
+ if (err)
+ goto err_fd;
+ group_leader = group.file->private_data;
+ if (flags & PERF_FLAG_FD_OUTPUT)
+ output_event = group_leader;
+ if (flags & PERF_FLAG_FD_NO_GROUP)
+ group_leader = NULL;
+ }
+
+ if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
+ task = find_lively_task_by_vpid(pid);
+ if (IS_ERR(task)) {
+ err = PTR_ERR(task);
+ goto err_group_fd;
+ }
+ }
+
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr.inherit) {
+ err = -EINVAL;
+ goto err_task;
+ }
+
+ get_online_cpus();
+
+ if (flags & PERF_FLAG_PID_CGROUP)
+ cgroup_fd = pid;
+
+ event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
+ NULL, NULL, cgroup_fd);
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
+ goto err_cpus;
+ }
+
+ if (is_sampling_event(event)) {
+ if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
+ err = -ENOTSUPP;
+ goto err_alloc;
+ }
+ }
+
+ account_event(event);
+
+ /*
+ * Special case software events and allow them to be part of
+ * any hardware group.
+ */
+ pmu = event->pmu;
+
+ if (attr.use_clockid) {
+ err = perf_event_set_clock(event, attr.clockid);
+ if (err)
+ goto err_alloc;
+ }
+
+ if (group_leader &&
+ (is_software_event(event) != is_software_event(group_leader))) {
+ if (is_software_event(event)) {
+ /*
+ * If event and group_leader are not both a software
+ * event, and event is, then group leader is not.
+ *
+ * Allow the addition of software events to !software
+ * groups, this is safe because software events never
+ * fail to schedule.
+ */
+ pmu = group_leader->pmu;
+ } else if (is_software_event(group_leader) &&
+ (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
+ }
+ }
+
+ /*
+ * Get the target context (task or percpu):
+ */
+ ctx = find_get_context(pmu, task, event);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto err_alloc;
+ }
+
+ if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
+ err = -EBUSY;
+ goto err_context;
+ }
+
+ if (task) {
+ put_task_struct(task);
+ task = NULL;
+ }
+
+ /*
+ * Look up the group leader (we will attach this event to it):
+ */
+ if (group_leader) {
+ err = -EINVAL;
+
+ /*
+ * Do not allow a recursive hierarchy (this new sibling
+ * becoming part of another group-sibling):
+ */
+ if (group_leader->group_leader != group_leader)
+ goto err_context;
+
+ /* All events in a group should have the same clock */
+ if (group_leader->clock != event->clock)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different
+ * task or CPU context:
+ */
+ if (move_group) {
+ /*
+ * Make sure we're both on the same task, or both
+ * per-cpu events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
+ */
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
+ } else {
+ if (group_leader->ctx != ctx)
+ goto err_context;
+ }
+
+ /*
+ * Only a group leader can be exclusive or pinned
+ */
+ if (attr.exclusive || attr.pinned)
+ goto err_context;
+ }
+
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
+ }
+
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
+ f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ goto err_context;
+ }
+
+ if (move_group) {
+ gctx = group_leader->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ perf_remove_from_context(group_leader, false);
+
+ list_for_each_entry(sibling, &group_leader->sibling_list,
+ group_entry) {
+ perf_remove_from_context(sibling, false);
+ put_ctx(gctx);
+ }
+ } else {
+ mutex_lock(&ctx->mutex);
+ }
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+
+ if (move_group) {
+ /*
+ * Wait for everybody to stop referencing the events through
+ * the old lists, before installing it on new lists.
+ */
+ synchronize_rcu();
+
+ /*
+ * Install the group siblings before the group leader.
+ *
+ * Because a group leader will try and install the entire group
+ * (through the sibling list, which is still in-tact), we can
+ * end up with siblings installed in the wrong context.
+ *
+ * By installing siblings first we NO-OP because they're not
+ * reachable through the group lists.
+ */
+ list_for_each_entry(sibling, &group_leader->sibling_list,
+ group_entry) {
+ perf_event__state_init(sibling);
+ perf_install_in_context(ctx, sibling, sibling->cpu);
+ get_ctx(ctx);
+ }
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
+ perf_install_in_context(ctx, group_leader, group_leader->cpu);
+ get_ctx(ctx);
+ }
+
+ if (!exclusive_event_installable(event, ctx)) {
+ err = -EBUSY;
+ mutex_unlock(&ctx->mutex);
+ fput(event_file);
+ goto err_context;
+ }
+
+ perf_install_in_context(ctx, event, event->cpu);
+ perf_unpin_context(ctx);
+
+ if (move_group) {
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ }
+ mutex_unlock(&ctx->mutex);
+
+ put_online_cpus();
+
+ event->owner = current;
+
+ mutex_lock(&current->perf_event_mutex);
+ list_add_tail(&event->owner_entry, &current->perf_event_list);
+ mutex_unlock(&current->perf_event_mutex);
+
+ /*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
+ /*
+ * Drop the reference on the group_event after placing the
+ * new event on the sibling_list. This ensures destruction
+ * of the group leader will find the pointer to itself in
+ * perf_group_detach().
+ */
+ fdput(group);
+ fd_install(event_fd, event_file);
+ return event_fd;
+
+err_context:
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
+err_alloc:
+ free_event(event);
+err_cpus:
+ put_online_cpus();
+err_task:
+ if (task)
+ put_task_struct(task);
+err_group_fd:
+ fdput(group);
+err_fd:
+ put_unused_fd(event_fd);
+ return err;
+}
+
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu in which the counter is bound
+ * @task: task to profile (NULL for percpu)
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+ struct task_struct *task,
+ perf_overflow_handler_t overflow_handler,
+ void *context)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event;
+ int err;
+
+ /*
+ * Get the target context (task or percpu):
+ */
+
+ event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+ overflow_handler, context, -1);
+ if (IS_ERR(event)) {
+ err = PTR_ERR(event);
+ goto err;
+ }
+
+ /* Mark owner so we could distinguish it from user events. */
+ event->owner = EVENT_OWNER_KERNEL;
+
+ account_event(event);
+
+ ctx = find_get_context(event->pmu, task, event);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto err_free;
+ }
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ mutex_lock(&ctx->mutex);
+ if (!exclusive_event_installable(event, ctx)) {
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
+ err = -EBUSY;
+ goto err_free;
+ }
+
+ perf_install_in_context(ctx, event, cpu);
+ perf_unpin_context(ctx);
+ mutex_unlock(&ctx->mutex);
+
+ return event;
+
+err_free:
+ free_event(event);
+err:
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx;
+ struct perf_event_context *dst_ctx;
+ struct perf_event *event, *tmp;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
+ event_entry) {
+ perf_remove_from_context(event, false);
+ unaccount_event_cpu(event, src_cpu);
+ put_ctx(src_ctx);
+ list_add(&event->migrate_entry, &events);
+ }
+
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ /*
+ * Re-instate events in 2 passes.
+ *
+ * Skip over group leaders and only install siblings on this first
+ * pass, siblings will not get enabled without a leader, however a
+ * leader will enable its siblings, even if those are still on the old
+ * context.
+ */
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ if (event->group_leader == event)
+ continue;
+
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+
+ /*
+ * Once all the siblings are setup properly, install the group leaders
+ * to make it go.
+ */
+ list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_del(&event->migrate_entry);
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, dst_cpu);
+ perf_install_in_context(dst_ctx, event, dst_cpu);
+ get_ctx(dst_ctx);
+ }
+ mutex_unlock(&dst_ctx->mutex);
+ mutex_unlock(&src_ctx->mutex);
+}
+EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
+
+static void sync_child_event(struct perf_event *child_event,
+ struct task_struct *child)
+{
+ struct perf_event *parent_event = child_event->parent;
+ u64 child_val;
+
+ if (child_event->attr.inherit_stat)
+ perf_event_read_event(child_event, child);
+
+ child_val = perf_event_count(child_event);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_event->child_count);
+ atomic64_add(child_event->total_time_enabled,
+ &parent_event->child_total_time_enabled);
+ atomic64_add(child_event->total_time_running,
+ &parent_event->child_total_time_running);
+
+ /*
+ * Remove this event from the parent's list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Make sure user/parent get notified, that we just
+ * lost one event.
+ */
+ perf_event_wakeup(parent_event);
+
+ /*
+ * Release the parent event, if this was the last
+ * reference to it.
+ */
+ put_event(parent_event);
+}
+
+static void
+__perf_event_exit_task(struct perf_event *child_event,
+ struct perf_event_context *child_ctx,
+ struct task_struct *child)
+{
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
+
+ /*
+ * It can happen that the parent exits first, and has events
+ * that are still around due to the child reference. These
+ * events need to be zapped.
+ */
+ if (child_event->parent) {
+ sync_child_event(child_event, child);
+ free_event(child_event);
+ } else {
+ child_event->state = PERF_EVENT_STATE_EXIT;
+ perf_event_wakeup(child_event);
+ }
+}
+
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+{
+ struct perf_event *child_event, *next;
+ struct perf_event_context *child_ctx, *clone_ctx = NULL;
+ unsigned long flags;
+
+ if (likely(!child->perf_event_ctxp[ctxn])) {
+ perf_event_task(child, NULL, 0);
+ return;
+ }
+
+ local_irq_save(flags);
+ /*
+ * We can't reschedule here because interrupts are disabled,
+ * and either child is current or it is a task that can't be
+ * scheduled, so we are now safe from rescheduling changing
+ * our context.
+ */
+ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+
+ /*
+ * Take the context lock here so that if find_get_context is
+ * reading child->perf_event_ctxp, we wait until it has
+ * incremented the context's refcount before we do put_ctx below.
+ */
+ raw_spin_lock(&child_ctx->lock);
+ task_ctx_sched_out(child_ctx);
+ child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * If this context is a clone; unclone it so it can't get
+ * swapped to another process while we're removing all
+ * the events from it.
+ */
+ clone_ctx = unclone_ctx(child_ctx);
+ update_context_time(child_ctx);
+ raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
+
+ /*
+ * Report the task dead after unscheduling the events so that we
+ * won't get any samples after PERF_RECORD_EXIT. We can however still
+ * get a few PERF_RECORD_READ events.
+ */
+ perf_event_task(child, child_ctx, 0);
+
+ /*
+ * We can recurse on the same lock type through:
+ *
+ * __perf_event_exit_task()
+ * sync_child_event()
+ * put_event()
+ * mutex_lock(&ctx->mutex)
+ *
+ * But since its the parent context it won't be the same instance.
+ */
+ mutex_lock(&child_ctx->mutex);
+
+ list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
+ __perf_event_exit_task(child_event, child_ctx, child);
+
+ mutex_unlock(&child_ctx->mutex);
+
+ put_ctx(child_ctx);
+}
+
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+ struct perf_event *event, *tmp;
+ int ctxn;
+
+ mutex_lock(&child->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ owner_entry) {
+ list_del_init(&event->owner_entry);
+
+ /*
+ * Ensure the list deletion is visible before we clear
+ * the owner, closes a race against perf_release() where
+ * we need to serialize on the owner->perf_event_mutex.
+ */
+ smp_wmb();
+ event->owner = NULL;
+ }
+ mutex_unlock(&child->perf_event_mutex);
+
+ for_each_task_context_nr(ctxn)
+ perf_event_exit_task_context(child, ctxn);
+}
+
+static void perf_free_event(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *parent = event->parent;
+
+ if (WARN_ON_ONCE(!parent))
+ return;
+
+ mutex_lock(&parent->child_mutex);
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent->child_mutex);
+
+ put_event(parent);
+
+ raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
+ list_del_event(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+ free_event(event);
+}
+
+/*
+ * Free an unexposed, unused context as created by inheritance by
+ * perf_event_init_task below, used by fork() in case of fail.
+ *
+ * Not all locks are strictly required, but take them anyway to be nice and
+ * help out with the lockdep assertions.
+ */
+void perf_event_free_task(struct task_struct *task)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event, *tmp;
+ int ctxn;
+
+ for_each_task_context_nr(ctxn) {
+ ctx = task->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ mutex_lock(&ctx->mutex);
+again:
+ list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
+ group_entry)
+ perf_free_event(event, ctx);
+
+ list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+ group_entry)
+ perf_free_event(event, ctx);
+
+ if (!list_empty(&ctx->pinned_groups) ||
+ !list_empty(&ctx->flexible_groups))
+ goto again;
+
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+ }
+}
+
+void perf_event_delayed_put(struct task_struct *task)
+{
+ int ctxn;
+
+ for_each_task_context_nr(ctxn)
+ WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+}
+
+/*
+ * inherit a event from parent task to child task:
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+ struct task_struct *parent,
+ struct perf_event_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_event *group_leader,
+ struct perf_event_context *child_ctx)
+{
+ enum perf_event_active_state parent_state = parent_event->state;
+ struct perf_event *child_event;
+ unsigned long flags;
+
+ /*
+ * Instead of creating recursive hierarchies of events,
+ * we link inherited events back to the original parent,
+ * which has a filp for sure, which we use as the reference
+ * count:
+ */
+ if (parent_event->parent)
+ parent_event = parent_event->parent;
+
+ child_event = perf_event_alloc(&parent_event->attr,
+ parent_event->cpu,
+ child,
+ group_leader, parent_event,
+ NULL, NULL, -1);
+ if (IS_ERR(child_event))
+ return child_event;
+
+ if (is_orphaned_event(parent_event) ||
+ !atomic_long_inc_not_zero(&parent_event->refcount)) {
+ free_event(child_event);
+ return NULL;
+ }
+
+ get_ctx(child_ctx);
+
+ /*
+ * Make the child state follow the state of the parent event,
+ * not its attr.disabled bit. We hold the parent's mutex,
+ * so we won't race with perf_event_{en, dis}able_family.
+ */
+ if (parent_state >= PERF_EVENT_STATE_INACTIVE)
+ child_event->state = PERF_EVENT_STATE_INACTIVE;
+ else
+ child_event->state = PERF_EVENT_STATE_OFF;
+
+ if (parent_event->attr.freq) {
+ u64 sample_period = parent_event->hw.sample_period;
+ struct hw_perf_event *hwc = &child_event->hw;
+
+ hwc->sample_period = sample_period;
+ hwc->last_period = sample_period;
+
+ local64_set(&hwc->period_left, sample_period);
+ }
+
+ child_event->ctx = child_ctx;
+ child_event->overflow_handler = parent_event->overflow_handler;
+ child_event->overflow_handler_context
+ = parent_event->overflow_handler_context;
+
+ /*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(child_event);
+ perf_event__id_header_size(child_event);
+
+ /*
+ * Link it up in the child's context:
+ */
+ raw_spin_lock_irqsave(&child_ctx->lock, flags);
+ add_event_to_ctx(child_event, child_ctx);
+ raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+ /*
+ * Link this into the parent event's child list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_add_tail(&child_event->child_list, &parent_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ return child_event;
+}
+
+static int inherit_group(struct perf_event *parent_event,
+ struct task_struct *parent,
+ struct perf_event_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_event_context *child_ctx)
+{
+ struct perf_event *leader;
+ struct perf_event *sub;
+ struct perf_event *child_ctr;
+
+ leader = inherit_event(parent_event, parent, parent_ctx,
+ child, NULL, child_ctx);
+ if (IS_ERR(leader))
+ return PTR_ERR(leader);
+ list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+ child_ctr = inherit_event(sub, parent, parent_ctx,
+ child, leader, child_ctx);
+ if (IS_ERR(child_ctr))
+ return PTR_ERR(child_ctr);
+ }
+ return 0;
+}
+
+static int
+inherit_task_group(struct perf_event *event, struct task_struct *parent,
+ struct perf_event_context *parent_ctx,
+ struct task_struct *child, int ctxn,
+ int *inherited_all)
+{
+ int ret;
+ struct perf_event_context *child_ctx;
+
+ if (!event->attr.inherit) {
+ *inherited_all = 0;
+ return 0;
+ }
+
+ child_ctx = child->perf_event_ctxp[ctxn];
+ if (!child_ctx) {
+ /*
+ * This is executed from the parent task context, so
+ * inherit events that have been marked for cloning.
+ * First allocate and initialize a context for the
+ * child.
+ */
+
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ if (!child_ctx)
+ return -ENOMEM;
+
+ child->perf_event_ctxp[ctxn] = child_ctx;
+ }
+
+ ret = inherit_group(event, parent, parent_ctx,
+ child, child_ctx);
+
+ if (ret)
+ *inherited_all = 0;
+
+ return ret;
+}
+
+/*
+ * Initialize the perf_event context in task_struct
+ */
+static int perf_event_init_context(struct task_struct *child, int ctxn)
+{
+ struct perf_event_context *child_ctx, *parent_ctx;
+ struct perf_event_context *cloned_ctx;
+ struct perf_event *event;
+ struct task_struct *parent = current;
+ int inherited_all = 1;
+ unsigned long flags;
+ int ret = 0;
+
+ if (likely(!parent->perf_event_ctxp[ctxn]))
+ return 0;
+
+ /*
+ * If the parent's context is a clone, pin it so it won't get
+ * swapped under us.
+ */
+ parent_ctx = perf_pin_task_context(parent, ctxn);
+ if (!parent_ctx)
+ return 0;
+
+ /*
+ * No need to check if parent_ctx != NULL here; since we saw
+ * it non-NULL earlier, the only reason for it to become NULL
+ * is if we exit, and since we're currently in the middle of
+ * a fork we can't be exiting at the same time.
+ */
+
+ /*
+ * Lock the parent list. No need to lock the child - not PID
+ * hashed yet and not running, so nobody can access it.
+ */
+ mutex_lock(&parent_ctx->mutex);
+
+ /*
+ * We dont have to disable NMIs - we are only looking at
+ * the list, not manipulating it:
+ */
+ list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+ ret = inherit_task_group(event, parent, parent_ctx,
+ child, ctxn, &inherited_all);
+ if (ret)
+ break;
+ }
+
+ /*
+ * We can't hold ctx->lock when iterating the ->flexible_group list due
+ * to allocations, but we need to prevent rotation because
+ * rotate_ctx() will change the list from interrupt context.
+ */
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 1;
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
+ list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+ ret = inherit_task_group(event, parent, parent_ctx,
+ child, ctxn, &inherited_all);
+ if (ret)
+ break;
+ }
+
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 0;
+
+ child_ctx = child->perf_event_ctxp[ctxn];
+
+ if (child_ctx && inherited_all) {
+ /*
+ * Mark the child context as a clone of the parent
+ * context, or of whatever the parent is a clone of.
+ *
+ * Note that if the parent is a clone, the holding of
+ * parent_ctx->lock avoids it from being uncloned.
+ */
+ cloned_ctx = parent_ctx->parent_ctx;
+ if (cloned_ctx) {
+ child_ctx->parent_ctx = cloned_ctx;
+ child_ctx->parent_gen = parent_ctx->parent_gen;
+ } else {
+ child_ctx->parent_ctx = parent_ctx;
+ child_ctx->parent_gen = parent_ctx->generation;
+ }
+ get_ctx(child_ctx->parent_ctx);
+ }
+
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+ mutex_unlock(&parent_ctx->mutex);
+
+ perf_unpin_context(parent_ctx);
+ put_ctx(parent_ctx);
+
+ return ret;
+}
+
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+ int ctxn, ret;
+
+ memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ mutex_init(&child->perf_event_mutex);
+ INIT_LIST_HEAD(&child->perf_event_list);
+
+ for_each_task_context_nr(ctxn) {
+ ret = perf_event_init_context(child, ctxn);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void __init perf_event_init_all_cpus(void)
+{
+ struct swevent_htable *swhash;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ swhash = &per_cpu(swevent_htable, cpu);
+ mutex_init(&swhash->hlist_mutex);
+ INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+ }
+}
+
+static void perf_event_init_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
+ if (swhash->hlist_refcount > 0) {
+ struct swevent_hlist *hlist;
+
+ hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+ WARN_ON(!hlist);
+ rcu_assign_pointer(swhash->swevent_hlist, hlist);
+ }
+ mutex_unlock(&swhash->hlist_mutex);
+}
+
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+static void __perf_event_exit_context(void *__info)
+{
+ struct remove_event re = { .detach_group = true };
+ struct perf_event_context *ctx = __info;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
+ rcu_read_unlock();
+}
+
+static void perf_event_exit_cpu_context(int cpu)
+{
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int idx;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ mutex_unlock(&ctx->mutex);
+ }
+ srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static void perf_event_exit_cpu(int cpu)
+{
+ struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+ perf_event_exit_cpu_context(cpu);
+
+ mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
+ swevent_hlist_release(swhash);
+ mutex_unlock(&swhash->hlist_mutex);
+}
+#else
+static inline void perf_event_exit_cpu(int cpu) { }
+#endif
+
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ perf_event_exit_cpu(cpu);
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+ .notifier_call = perf_reboot,
+ .priority = INT_MIN,
+};
+
+static int
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_UP_PREPARE:
+ case CPU_DOWN_FAILED:
+ perf_event_init_cpu(cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_PREPARE:
+ perf_event_exit_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+void __init perf_event_init(void)
+{
+ int ret;
+
+ idr_init(&pmu_idr);
+
+ perf_event_init_all_cpus();
+ init_srcu_struct(&pmus_srcu);
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, NULL, -1);
+ perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_tp_register();
+ perf_cpu_notifier(perf_cpu_notify);
+ register_reboot_notifier(&perf_reboot_notifier);
+
+ ret = init_hw_breakpoint();
+ WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&perf_sched_events, HZ);
+
+ /*
+ * Build time assertion that we keep the data_head at the intended
+ * location. IOW, validation we got the __reserved[] size right.
+ */
+ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+ != 1024);
+}
+
+ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s\n", pmu_attr->event_str);
+
+ return 0;
+}
+
+static int __init perf_event_sysfs_init(void)
+{
+ struct pmu *pmu;
+ int ret;
+
+ mutex_lock(&pmus_lock);
+
+ ret = bus_register(&pmu_bus);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (!pmu->name || pmu->type < 0)
+ continue;
+
+ ret = pmu_dev_alloc(pmu);
+ WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+ }
+ pmu_bus_running = 1;
+ ret = 0;
+
+unlock:
+ mutex_unlock(&pmus_lock);
+
+ return ret;
+}
+device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *
+perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct perf_cgroup *jc;
+
+ jc = kzalloc(sizeof(*jc), GFP_KERNEL);
+ if (!jc)
+ return ERR_PTR(-ENOMEM);
+
+ jc->info = alloc_percpu(struct perf_cgroup_info);
+ if (!jc->info) {
+ kfree(jc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &jc->css;
+}
+
+static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
+
+ free_percpu(jc->info);
+ kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+ struct task_struct *task = info;
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ return 0;
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ task_function_call(task, __perf_cgroup_move, task);
+}
+
+struct cgroup_subsys perf_event_cgrp_subsys = {
+ .css_alloc = perf_cgroup_css_alloc,
+ .css_free = perf_cgroup_css_free,
+ .exit = perf_cgroup_exit,
+ .attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
new file mode 100644
index 000000000..92ce5f4cc
--- /dev/null
+++ b/kernel/events/hw_breakpoint.c
@@ -0,0 +1,655 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+
+#include <linux/hw_breakpoint.h>
+/*
+ * Constraints data
+ */
+struct bp_cpuinfo {
+ /* Number of pinned cpu breakpoints in a cpu */
+ unsigned int cpu_pinned;
+ /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+ unsigned int *tsk_pinned;
+ /* Number of non-pinned cpu/task breakpoints in a cpu */
+ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+};
+
+static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
+static int nr_slots[TYPE_MAX];
+
+static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
+{
+ return per_cpu_ptr(bp_cpuinfo + type, cpu);
+}
+
+/* Keep track of the breakpoints attached to tasks */
+static LIST_HEAD(bp_task_head);
+
+static int constraints_initialized;
+
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+ unsigned int pinned;
+ unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+__weak int hw_breakpoint_weight(struct perf_event *bp)
+{
+ return 1;
+}
+
+static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+{
+ if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+ return TYPE_DATA;
+
+ return TYPE_INST;
+}
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * have in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
+{
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+ int i;
+
+ for (i = nr_slots[type] - 1; i >= 0; i--) {
+ if (tsk_pinned[i] > 0)
+ return i + 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Count the number of breakpoints of the same type and same task.
+ * The given event must be not on the list.
+ */
+static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
+{
+ struct task_struct *tsk = bp->hw.target;
+ struct perf_event *iter;
+ int count = 0;
+
+ list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
+ if (iter->hw.target == tsk &&
+ find_slot_idx(iter) == type &&
+ (iter->cpu < 0 || cpu == iter->cpu))
+ count += hw_breakpoint_weight(iter);
+ }
+
+ return count;
+}
+
+static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
+{
+ if (bp->cpu >= 0)
+ return cpumask_of(bp->cpu);
+ return cpu_possible_mask;
+}
+
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
+ enum bp_type_idx type)
+{
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
+
+ for_each_cpu(cpu, cpumask) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+ int nr;
+
+ nr = info->cpu_pinned;
+ if (!bp->hw.target)
+ nr += max_task_bp_pinned(cpu, type);
+ else
+ nr += task_bp_pinned(cpu, bp, type);
+
+ if (nr > slots->pinned)
+ slots->pinned = nr;
+
+ nr = info->flexible;
+ if (nr > slots->flexible)
+ slots->flexible = nr;
+ }
+}
+
+/*
+ * For now, continue to consider flexible as pinned, until we can
+ * ensure no flexible event can ever be scheduled before a pinned event
+ * in a same cpu.
+ */
+static void
+fetch_this_slot(struct bp_busy_slots *slots, int weight)
+{
+ slots->pinned += weight;
+}
+
+/*
+ * Add a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
+ enum bp_type_idx type, int weight)
+{
+ unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+ int old_idx, new_idx;
+
+ old_idx = task_bp_pinned(cpu, bp, type) - 1;
+ new_idx = old_idx + weight;
+
+ if (old_idx >= 0)
+ tsk_pinned[old_idx]--;
+ if (new_idx >= 0)
+ tsk_pinned[new_idx]++;
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
+ int weight)
+{
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int cpu;
+
+ if (!enable)
+ weight = -weight;
+
+ /* Pinned counter cpu profiling */
+ if (!bp->hw.target) {
+ get_bp_info(bp->cpu, type)->cpu_pinned += weight;
+ return;
+ }
+
+ /* Pinned counter task profiling */
+ for_each_cpu(cpu, cpumask)
+ toggle_bp_task_slot(bp, cpu, type, weight);
+
+ if (enable)
+ list_add_tail(&bp->hw.bp_list, &bp_task_head);
+ else
+ list_del(&bp->hw.bp_list);
+}
+
+/*
+ * Function to perform processor-specific cleanup during unregistration
+ */
+__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
+{
+ /*
+ * A weak stub function here for those archs that don't define
+ * it inside arch/.../kernel/hw_breakpoint.c
+ */
+}
+
+/*
+ * Contraints to check before allowing this new breakpoint counter:
+ *
+ * == Non-pinned counter == (Considered as pinned for now)
+ *
+ * - If attached to a single cpu, check:
+ *
+ * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
+ *
+ * -> If there are already non-pinned counters in this cpu, it means
+ * there is already a free slot for them.
+ * Otherwise, we check that the maximum number of per task
+ * breakpoints (for this cpu) plus the number of per cpu breakpoint
+ * (for this cpu) doesn't cover every registers.
+ *
+ * - If attached to every cpus, check:
+ *
+ * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
+ *
+ * -> This is roughly the same, except we check the number of per cpu
+ * bp for every cpu and we keep the max one. Same for the per tasks
+ * breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ * - If attached to a single cpu, check:
+ *
+ * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
+ * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
+ *
+ * -> Same checks as before. But now the info->flexible, if any, must keep
+ * one register at least (or they will never be fed).
+ *
+ * - If attached to every cpus, check:
+ *
+ * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
+ * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
+ */
+static int __reserve_bp_slot(struct perf_event *bp)
+{
+ struct bp_busy_slots slots = {0};
+ enum bp_type_idx type;
+ int weight;
+
+ /* We couldn't initialize breakpoint constraints on boot */
+ if (!constraints_initialized)
+ return -ENOMEM;
+
+ /* Basic checks */
+ if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
+ bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+ return -EINVAL;
+
+ type = find_slot_idx(bp);
+ weight = hw_breakpoint_weight(bp);
+
+ fetch_bp_busy_slots(&slots, bp, type);
+ /*
+ * Simulate the addition of this breakpoint to the constraints
+ * and see the result.
+ */
+ fetch_this_slot(&slots, weight);
+
+ /* Flexible counters need to keep at least one slot */
+ if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ return -ENOSPC;
+
+ toggle_bp_slot(bp, true, type, weight);
+
+ return 0;
+}
+
+int reserve_bp_slot(struct perf_event *bp)
+{
+ int ret;
+
+ mutex_lock(&nr_bp_mutex);
+
+ ret = __reserve_bp_slot(bp);
+
+ mutex_unlock(&nr_bp_mutex);
+
+ return ret;
+}
+
+static void __release_bp_slot(struct perf_event *bp)
+{
+ enum bp_type_idx type;
+ int weight;
+
+ type = find_slot_idx(bp);
+ weight = hw_breakpoint_weight(bp);
+ toggle_bp_slot(bp, false, type, weight);
+}
+
+void release_bp_slot(struct perf_event *bp)
+{
+ mutex_lock(&nr_bp_mutex);
+
+ arch_unregister_hw_breakpoint(bp);
+ __release_bp_slot(bp);
+
+ mutex_unlock(&nr_bp_mutex);
+}
+
+/*
+ * Allow the kernel debugger to reserve breakpoint slots without
+ * taking a lock using the dbg_* variant of for the reserve and
+ * release breakpoint slots.
+ */
+int dbg_reserve_bp_slot(struct perf_event *bp)
+{
+ if (mutex_is_locked(&nr_bp_mutex))
+ return -1;
+
+ return __reserve_bp_slot(bp);
+}
+
+int dbg_release_bp_slot(struct perf_event *bp)
+{
+ if (mutex_is_locked(&nr_bp_mutex))
+ return -1;
+
+ __release_bp_slot(bp);
+
+ return 0;
+}
+
+static int validate_hw_breakpoint(struct perf_event *bp)
+{
+ int ret;
+
+ ret = arch_validate_hwbkpt_settings(bp);
+ if (ret)
+ return ret;
+
+ if (arch_check_bp_in_kernelspace(bp)) {
+ if (bp->attr.exclude_kernel)
+ return -EINVAL;
+ /*
+ * Don't let unprivileged users set a breakpoint in the trap
+ * path to avoid trap recursion attacks.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+ int ret;
+
+ ret = reserve_bp_slot(bp);
+ if (ret)
+ return ret;
+
+ ret = validate_hw_breakpoint(bp);
+
+ /* if arch_validate_hwbkpt_settings() fails then release bp slot */
+ if (ret)
+ release_bp_slot(bp);
+
+ return ret;
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+ perf_overflow_handler_t triggered,
+ void *context,
+ struct task_struct *tsk)
+{
+ return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
+ context);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @attr: new breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
+{
+ u64 old_addr = bp->attr.bp_addr;
+ u64 old_len = bp->attr.bp_len;
+ int old_type = bp->attr.bp_type;
+ int err = 0;
+
+ /*
+ * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+ * will not be possible to raise IPIs that invoke __perf_event_disable.
+ * So call the function directly after making sure we are targeting the
+ * current task.
+ */
+ if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+ __perf_event_disable(bp);
+ else
+ perf_event_disable(bp);
+
+ bp->attr.bp_addr = attr->bp_addr;
+ bp->attr.bp_type = attr->bp_type;
+ bp->attr.bp_len = attr->bp_len;
+
+ if (attr->disabled)
+ goto end;
+
+ err = validate_hw_breakpoint(bp);
+ if (!err)
+ perf_event_enable(bp);
+
+ if (err) {
+ bp->attr.bp_addr = old_addr;
+ bp->attr.bp_type = old_type;
+ bp->attr.bp_len = old_len;
+ if (!bp->attr.disabled)
+ perf_event_enable(bp);
+
+ return err;
+ }
+
+end:
+ bp->attr.disabled = attr->disabled;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+ if (!bp)
+ return;
+ perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ *
+ * @return a set of per_cpu pointers to perf events
+ */
+struct perf_event * __percpu *
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+ perf_overflow_handler_t triggered,
+ void *context)
+{
+ struct perf_event * __percpu *cpu_events, *bp;
+ long err = 0;
+ int cpu;
+
+ cpu_events = alloc_percpu(typeof(*cpu_events));
+ if (!cpu_events)
+ return (void __percpu __force *)ERR_PTR(-ENOMEM);
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ bp = perf_event_create_kernel_counter(attr, cpu, NULL,
+ triggered, context);
+ if (IS_ERR(bp)) {
+ err = PTR_ERR(bp);
+ break;
+ }
+
+ per_cpu(*cpu_events, cpu) = bp;
+ }
+ put_online_cpus();
+
+ if (likely(!err))
+ return cpu_events;
+
+ unregister_wide_hw_breakpoint(cpu_events);
+ return (void __percpu __force *)ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
+
+ free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+ .notifier_call = hw_breakpoint_exceptions_notify,
+ /* we need to be notified first */
+ .priority = 0x7fffffff
+};
+
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+ release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+ int err;
+
+ if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for breakpoint events
+ */
+ if (has_branch_stack(bp))
+ return -EOPNOTSUPP;
+
+ err = register_perf_hw_breakpoint(bp);
+ if (err)
+ return err;
+
+ bp->destroy = bp_perf_event_destroy;
+
+ return 0;
+}
+
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+ if (!(flags & PERF_EF_START))
+ bp->hw.state = PERF_HES_STOPPED;
+
+ if (is_sampling_event(bp)) {
+ bp->hw.last_period = bp->hw.sample_period;
+ perf_swevent_set_period(bp);
+ }
+
+ return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+ arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+ bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+ bp->hw.state = PERF_HES_STOPPED;
+}
+
+static struct pmu perf_breakpoint = {
+ .task_ctx_nr = perf_sw_context, /* could eventually get its own */
+
+ .event_init = hw_breakpoint_event_init,
+ .add = hw_breakpoint_add,
+ .del = hw_breakpoint_del,
+ .start = hw_breakpoint_start,
+ .stop = hw_breakpoint_stop,
+ .read = hw_breakpoint_pmu_read,
+};
+
+int __init init_hw_breakpoint(void)
+{
+ int cpu, err_cpu;
+ int i;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ nr_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
+ GFP_KERNEL);
+ if (!info->tsk_pinned)
+ goto err_alloc;
+ }
+ }
+
+ constraints_initialized = 1;
+
+ perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
+
+ return register_die_notifier(&hw_breakpoint_exceptions_nb);
+
+ err_alloc:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ kfree(get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+
+ return -ENOMEM;
+}
+
+
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000..9f6ce9ba4
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,231 @@
+#ifndef _KERNEL_EVENTS_INTERNAL_H
+#define _KERNEL_EVENTS_INTERNAL_H
+
+#include <linux/hardirq.h>
+#include <linux/uaccess.h>
+
+/* Buffer handling */
+
+#define RING_BUFFER_WRITABLE 0x01
+
+struct ring_buffer {
+ atomic_t refcount;
+ struct rcu_head rcu_head;
+#ifdef CONFIG_PERF_USE_VMALLOC
+ struct work_struct work;
+ int page_order; /* allocation order */
+#endif
+ int nr_pages; /* nr of data pages */
+ int overwrite; /* can overwrite itself */
+
+ atomic_t poll; /* POLL_ for wakeups */
+
+ local_t head; /* write position */
+ local_t nest; /* nested writers */
+ local_t events; /* event limit */
+ local_t wakeup; /* wakeup stamp */
+ local_t lost; /* nr records lost */
+
+ long watermark; /* wakeup watermark */
+ long aux_watermark;
+ /* poll crap */
+ spinlock_t event_lock;
+ struct list_head event_list;
+
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
+ /* AUX area */
+ local_t aux_head;
+ local_t aux_nest;
+ local_t aux_wakeup;
+ unsigned long aux_pgoff;
+ int aux_nr_pages;
+ int aux_overwrite;
+ atomic_t aux_mmap_count;
+ unsigned long aux_mmap_locked;
+ void (*free_aux)(void *);
+ atomic_t aux_refcount;
+ void **aux_pages;
+ void *aux_priv;
+
+ struct perf_event_mmap_page *user_page;
+ void *data_pages[0];
+};
+
+extern void rb_free(struct ring_buffer *rb);
+extern struct ring_buffer *
+rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+extern void perf_event_wakeup(struct perf_event *event);
+extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+ pgoff_t pgoff, int nr_pages, long watermark, int flags);
+extern void rb_free_aux(struct ring_buffer *rb);
+extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct ring_buffer *rb);
+
+static inline bool rb_has_aux(struct ring_buffer *rb)
+{
+ return !!rb->aux_nr_pages;
+}
+
+void perf_event_aux_event(struct perf_event *event, unsigned long head,
+ unsigned long size, u64 flags);
+
+extern void
+perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event);
+extern void
+perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample);
+
+extern struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
+
+#ifdef CONFIG_PERF_USE_VMALLOC
+/*
+ * Back perf_mmap() with vmalloc memory.
+ *
+ * Required for architectures that have d-cache aliasing issues.
+ */
+
+static inline int page_order(struct ring_buffer *rb)
+{
+ return rb->page_order;
+}
+
+#else
+
+static inline int page_order(struct ring_buffer *rb)
+{
+ return 0;
+}
+#endif
+
+static inline unsigned long perf_data_size(struct ring_buffer *rb)
+{
+ return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
+}
+
+static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+{
+ return rb->aux_nr_pages << PAGE_SHIFT;
+}
+
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+{ \
+ unsigned long size, written; \
+ \
+ do { \
+ size = min(handle->size, len); \
+ written = memcpy_func(handle->addr, buf, size); \
+ written = size - written; \
+ \
+ len -= written; \
+ handle->addr += written; \
+ buf += written; \
+ handle->size -= written; \
+ if (!handle->size) { \
+ struct ring_buffer *rb = handle->rb; \
+ \
+ handle->page++; \
+ handle->page &= rb->nr_pages - 1; \
+ handle->addr = rb->data_pages[handle->page]; \
+ handle->size = PAGE_SIZE << page_order(rb); \
+ } \
+ } while (len && written == size); \
+ \
+ return len; \
+}
+
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
+{
+ memcpy(dst, src, n);
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+ return 0;
+}
+
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
+
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
+{
+ unsigned long ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, n);
+ pagefault_enable();
+
+ return ret;
+}
+#endif
+
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
+
+/* Callchain handling */
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
+
+static inline int get_recursion_context(int *recursion)
+{
+ int rctx;
+
+ if (in_nmi())
+ rctx = 3;
+ else if (in_irq())
+ rctx = 2;
+ else if (in_softirq())
+ rctx = 1;
+ else
+ rctx = 0;
+
+ if (recursion[rctx])
+ return -1;
+
+ recursion[rctx]++;
+ barrier();
+
+ return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+ barrier();
+ recursion[rctx]--;
+}
+
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return true;
+}
+
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return false;
+}
+
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+
+#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000..725c41608
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,751 @@
+/*
+ * Performance events ring-buffer code:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/circ_buf.h>
+#include <linux/poll.h>
+
+#include "internal.h"
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+ atomic_set(&handle->rb->poll, POLLIN);
+
+ handle->event->pending_wakeup = 1;
+ irq_work_queue(&handle->event->pending);
+}
+
+/*
+ * We need to ensure a later event_id doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_get_handle(struct perf_output_handle *handle)
+{
+ struct ring_buffer *rb = handle->rb;
+
+ preempt_disable();
+ local_inc(&rb->nest);
+ handle->wakeup = local_read(&rb->wakeup);
+}
+
+static void perf_output_put_handle(struct perf_output_handle *handle)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long head;
+
+again:
+ head = local_read(&rb->head);
+
+ /*
+ * IRQ/NMI can happen here, which means we can miss a head update.
+ */
+
+ if (!local_dec_and_test(&rb->nest))
+ goto out;
+
+ /*
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ * kernel user
+ *
+ * if (LOAD ->data_tail) { LOAD ->data_head
+ * (A) smp_rmb() (C)
+ * STORE $data LOAD $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->data_head STORE ->data_tail
+ * }
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * In our case (A) is a control dependency that separates the load of
+ * the ->data_tail and the stores of $data. In case ->data_tail
+ * indicates there is no room in the buffer to store $data we do not.
+ *
+ * D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ *
+ * See perf_output_begin().
+ */
+ smp_wmb(); /* B, matches C */
+ rb->user_page->data_head = head;
+
+ /*
+ * Now check if we missed an update -- rely on previous implied
+ * compiler barriers to force a re-read.
+ */
+ if (unlikely(head != local_read(&rb->head))) {
+ local_inc(&rb->nest);
+ goto again;
+ }
+
+ if (handle->wakeup != local_read(&rb->wakeup))
+ perf_output_wakeup(handle);
+
+out:
+ preempt_enable();
+}
+
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size)
+{
+ struct ring_buffer *rb;
+ unsigned long tail, offset, head;
+ int have_lost, page_shift;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ } lost_event;
+
+ rcu_read_lock();
+ /*
+ * For inherited events we send all the output towards the parent.
+ */
+ if (event->parent)
+ event = event->parent;
+
+ rb = rcu_dereference(event->rb);
+ if (unlikely(!rb))
+ goto out;
+
+ if (unlikely(!rb->nr_pages))
+ goto out;
+
+ handle->rb = rb;
+ handle->event = event;
+
+ have_lost = local_read(&rb->lost);
+ if (unlikely(have_lost)) {
+ size += sizeof(lost_event);
+ if (event->attr.sample_id_all)
+ size += event->id_header_size;
+ }
+
+ perf_output_get_handle(handle);
+
+ do {
+ tail = ACCESS_ONCE(rb->user_page->data_tail);
+ offset = head = local_read(&rb->head);
+ if (!rb->overwrite &&
+ unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
+ goto fail;
+
+ /*
+ * The above forms a control dependency barrier separating the
+ * @tail load above from the data stores below. Since the @tail
+ * load is required to compute the branch to fail below.
+ *
+ * A, matches D; the full memory barrier userspace SHOULD issue
+ * after reading the data and before storing the new tail
+ * position.
+ *
+ * See perf_output_put_handle().
+ */
+
+ head += size;
+ } while (local_cmpxchg(&rb->head, offset, head) != offset);
+
+ /*
+ * We rely on the implied barrier() by local_cmpxchg() to ensure
+ * none of the data stores below can be lifted up by the compiler.
+ */
+
+ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
+ local_add(rb->watermark, &rb->wakeup);
+
+ page_shift = PAGE_SHIFT + page_order(rb);
+
+ handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+ offset &= (1UL << page_shift) - 1;
+ handle->addr = rb->data_pages[handle->page] + offset;
+ handle->size = (1UL << page_shift) - offset;
+
+ if (unlikely(have_lost)) {
+ struct perf_sample_data sample_data;
+
+ lost_event.header.size = sizeof(lost_event);
+ lost_event.header.type = PERF_RECORD_LOST;
+ lost_event.header.misc = 0;
+ lost_event.id = event->id;
+ lost_event.lost = local_xchg(&rb->lost, 0);
+
+ perf_event_header__init_id(&lost_event.header,
+ &sample_data, event);
+ perf_output_put(handle, lost_event);
+ perf_event__output_id_sample(event, handle, &sample_data);
+ }
+
+ return 0;
+
+fail:
+ local_inc(&rb->lost);
+ perf_output_put_handle(handle);
+out:
+ rcu_read_unlock();
+
+ return -ENOSPC;
+}
+
+unsigned int perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
+{
+ return __output_copy(handle, buf, len);
+}
+
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+ unsigned int len)
+{
+ return __output_skip(handle, NULL, len);
+}
+
+void perf_output_end(struct perf_output_handle *handle)
+{
+ perf_output_put_handle(handle);
+ rcu_read_unlock();
+}
+
+static void
+ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+{
+ long max_size = perf_data_size(rb);
+
+ if (watermark)
+ rb->watermark = min(max_size, watermark);
+
+ if (!rb->watermark)
+ rb->watermark = max_size / 2;
+
+ if (flags & RING_BUFFER_WRITABLE)
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
+
+ atomic_set(&rb->refcount, 1);
+
+ INIT_LIST_HEAD(&rb->event_list);
+ spin_lock_init(&rb->event_lock);
+}
+
+/*
+ * This is called before hardware starts writing to the AUX area to
+ * obtain an output handle and make sure there's room in the buffer.
+ * When the capture completes, call perf_aux_output_end() to commit
+ * the recorded data to the buffer.
+ *
+ * The ordering is similar to that of perf_output_{begin,end}, with
+ * the exception of (B), which should be taken care of by the pmu
+ * driver, since ordering rules will differ depending on hardware.
+ */
+void *perf_aux_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event)
+{
+ struct perf_event *output_event = event;
+ unsigned long aux_head, aux_tail;
+ struct ring_buffer *rb;
+
+ if (output_event->parent)
+ output_event = output_event->parent;
+
+ /*
+ * Since this will typically be open across pmu::add/pmu::del, we
+ * grab ring_buffer's refcount instead of holding rcu read lock
+ * to make sure it doesn't disappear under us.
+ */
+ rb = ring_buffer_get(output_event);
+ if (!rb)
+ return NULL;
+
+ if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+ goto err;
+
+ /*
+ * Nesting is not supported for AUX area, make sure nested
+ * writers are caught early
+ */
+ if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+ goto err_put;
+
+ aux_head = local_read(&rb->aux_head);
+
+ handle->rb = rb;
+ handle->event = event;
+ handle->head = aux_head;
+ handle->size = 0;
+
+ /*
+ * In overwrite mode, AUX data stores do not depend on aux_tail,
+ * therefore (A) control dependency barrier does not exist. The
+ * (B) <-> (C) ordering is still observed by the pmu driver.
+ */
+ if (!rb->aux_overwrite) {
+ aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+ handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ if (aux_head - aux_tail < perf_aux_size(rb))
+ handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
+
+ /*
+ * handle->size computation depends on aux_tail load; this forms a
+ * control dependency barrier separating aux_tail load from aux data
+ * store that will be enabled on successful return
+ */
+ if (!handle->size) { /* A, matches D */
+ event->pending_disable = 1;
+ perf_output_wakeup(handle);
+ local_set(&rb->aux_nest, 0);
+ goto err_put;
+ }
+ }
+
+ return handle->rb->aux_priv;
+
+err_put:
+ rb_free_aux(rb);
+
+err:
+ ring_buffer_put(rb);
+ handle->event = NULL;
+
+ return NULL;
+}
+
+/*
+ * Commit the data written by hardware into the ring buffer by adjusting
+ * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
+ * pmu driver's responsibility to observe ordering rules of the hardware,
+ * so that all the data is externally visible before this is called.
+ */
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
+ bool truncated)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long aux_head;
+ u64 flags = 0;
+
+ if (truncated)
+ flags |= PERF_AUX_FLAG_TRUNCATED;
+
+ /* in overwrite mode, driver provides aux_head via handle */
+ if (rb->aux_overwrite) {
+ flags |= PERF_AUX_FLAG_OVERWRITE;
+
+ aux_head = handle->head;
+ local_set(&rb->aux_head, aux_head);
+ } else {
+ aux_head = local_read(&rb->aux_head);
+ local_add(size, &rb->aux_head);
+ }
+
+ if (size || flags) {
+ /*
+ * Only send RECORD_AUX if we have something useful to communicate
+ */
+
+ perf_event_aux_event(handle->event, aux_head, size, flags);
+ }
+
+ aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+
+ if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ perf_output_wakeup(handle);
+ local_add(rb->aux_watermark, &rb->aux_wakeup);
+ }
+ handle->event = NULL;
+
+ local_set(&rb->aux_nest, 0);
+ rb_free_aux(rb);
+ ring_buffer_put(rb);
+}
+
+/*
+ * Skip over a given number of bytes in the AUX buffer, due to, for example,
+ * hardware's alignment constraints.
+ */
+int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long aux_head;
+
+ if (size > handle->size)
+ return -ENOSPC;
+
+ local_add(size, &rb->aux_head);
+
+ aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+ if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ perf_output_wakeup(handle);
+ local_add(rb->aux_watermark, &rb->aux_wakeup);
+ handle->wakeup = local_read(&rb->aux_wakeup) +
+ rb->aux_watermark;
+ }
+
+ handle->head = aux_head;
+ handle->size -= size;
+
+ return 0;
+}
+
+void *perf_get_aux(struct perf_output_handle *handle)
+{
+ /* this is only valid between perf_aux_output_begin and *_end */
+ if (!handle->event)
+ return NULL;
+
+ return handle->rb->aux_priv;
+}
+
+#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
+
+static struct page *rb_alloc_aux_page(int node, int order)
+{
+ struct page *page;
+
+ if (order > MAX_ORDER)
+ order = MAX_ORDER;
+
+ do {
+ page = alloc_pages_node(node, PERF_AUX_GFP, order);
+ } while (!page && order--);
+
+ if (page && order) {
+ /*
+ * Communicate the allocation size to the driver
+ */
+ split_page(page, order);
+ SetPagePrivate(page);
+ set_page_private(page, order);
+ }
+
+ return page;
+}
+
+static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+{
+ struct page *page = virt_to_page(rb->aux_pages[idx]);
+
+ ClearPagePrivate(page);
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+ pgoff_t pgoff, int nr_pages, long watermark, int flags)
+{
+ bool overwrite = !(flags & RING_BUFFER_WRITABLE);
+ int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
+ int ret = -ENOMEM, max_order = 0;
+
+ if (!has_aux(event))
+ return -ENOTSUPP;
+
+ if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
+ /*
+ * We need to start with the max_order that fits in nr_pages,
+ * not the other way around, hence ilog2() and not get_order.
+ */
+ max_order = ilog2(nr_pages);
+
+ /*
+ * PMU requests more than one contiguous chunks of memory
+ * for SW double buffering
+ */
+ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
+ !overwrite) {
+ if (!max_order)
+ return -EINVAL;
+
+ max_order--;
+ }
+ }
+
+ rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+ if (!rb->aux_pages)
+ return -ENOMEM;
+
+ rb->free_aux = event->pmu->free_aux;
+ for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
+ struct page *page;
+ int last, order;
+
+ order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
+ page = rb_alloc_aux_page(node, order);
+ if (!page)
+ goto out;
+
+ for (last = rb->aux_nr_pages + (1 << page_private(page));
+ last > rb->aux_nr_pages; rb->aux_nr_pages++)
+ rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
+ }
+
+ /*
+ * In overwrite mode, PMUs that don't support SG may not handle more
+ * than one contiguous allocation, since they rely on PMI to do double
+ * buffering. In this case, the entire buffer has to be one contiguous
+ * chunk.
+ */
+ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
+ overwrite) {
+ struct page *page = virt_to_page(rb->aux_pages[0]);
+
+ if (page_private(page) != max_order)
+ goto out;
+ }
+
+ rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ overwrite);
+ if (!rb->aux_priv)
+ goto out;
+
+ ret = 0;
+
+ /*
+ * aux_pages (and pmu driver's private data, aux_priv) will be
+ * referenced in both producer's and consumer's contexts, thus
+ * we keep a refcount here to make sure either of the two can
+ * reference them safely.
+ */
+ atomic_set(&rb->aux_refcount, 1);
+
+ rb->aux_overwrite = overwrite;
+ rb->aux_watermark = watermark;
+
+ if (!rb->aux_watermark && !rb->aux_overwrite)
+ rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
+
+out:
+ if (!ret)
+ rb->aux_pgoff = pgoff;
+ else
+ rb_free_aux(rb);
+
+ return ret;
+}
+
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+ int pg;
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+}
+
+void rb_free_aux(struct ring_buffer *rb)
+{
+ if (atomic_dec_and_test(&rb->aux_refcount))
+ __rb_free_aux(rb);
+}
+
+#ifndef CONFIG_PERF_USE_VMALLOC
+
+/*
+ * Back perf_mmap() with regular GFP_KERNEL-0 pages.
+ */
+
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ if (pgoff > rb->nr_pages)
+ return NULL;
+
+ if (pgoff == 0)
+ return virt_to_page(rb->user_page);
+
+ return virt_to_page(rb->data_pages[pgoff - 1]);
+}
+
+static void *perf_mmap_alloc_page(int cpu)
+{
+ struct page *page;
+ int node;
+
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct ring_buffer *rb;
+ unsigned long size;
+ int i;
+
+ size = sizeof(struct ring_buffer);
+ size += nr_pages * sizeof(void *);
+
+ rb = kzalloc(size, GFP_KERNEL);
+ if (!rb)
+ goto fail;
+
+ rb->user_page = perf_mmap_alloc_page(cpu);
+ if (!rb->user_page)
+ goto fail_user_page;
+
+ for (i = 0; i < nr_pages; i++) {
+ rb->data_pages[i] = perf_mmap_alloc_page(cpu);
+ if (!rb->data_pages[i])
+ goto fail_data_pages;
+ }
+
+ rb->nr_pages = nr_pages;
+
+ ring_buffer_init(rb, watermark, flags);
+
+ return rb;
+
+fail_data_pages:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)rb->data_pages[i]);
+
+ free_page((unsigned long)rb->user_page);
+
+fail_user_page:
+ kfree(rb);
+
+fail:
+ return NULL;
+}
+
+static void perf_mmap_free_page(unsigned long addr)
+{
+ struct page *page = virt_to_page((void *)addr);
+
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+ int i;
+
+ perf_mmap_free_page((unsigned long)rb->user_page);
+ for (i = 0; i < rb->nr_pages; i++)
+ perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+ kfree(rb);
+}
+
+#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
+
+static struct page *
+__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ /* The '>' counts in the user page. */
+ if (pgoff > data_page_nr(rb))
+ return NULL;
+
+ return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
+}
+
+static void perf_mmap_unmark_page(void *addr)
+{
+ struct page *page = vmalloc_to_page(addr);
+
+ page->mapping = NULL;
+}
+
+static void rb_free_work(struct work_struct *work)
+{
+ struct ring_buffer *rb;
+ void *base;
+ int i, nr;
+
+ rb = container_of(work, struct ring_buffer, work);
+ nr = data_page_nr(rb);
+
+ base = rb->user_page;
+ /* The '<=' counts in the user page. */
+ for (i = 0; i <= nr; i++)
+ perf_mmap_unmark_page(base + (i * PAGE_SIZE));
+
+ vfree(base);
+ kfree(rb);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+ schedule_work(&rb->work);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct ring_buffer *rb;
+ unsigned long size;
+ void *all_buf;
+
+ size = sizeof(struct ring_buffer);
+ size += sizeof(void *);
+
+ rb = kzalloc(size, GFP_KERNEL);
+ if (!rb)
+ goto fail;
+
+ INIT_WORK(&rb->work, rb_free_work);
+
+ all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
+ if (!all_buf)
+ goto fail_all_buf;
+
+ rb->user_page = all_buf;
+ rb->data_pages[0] = all_buf + PAGE_SIZE;
+ rb->page_order = ilog2(nr_pages);
+ rb->nr_pages = !!nr_pages;
+
+ ring_buffer_init(rb, watermark, flags);
+
+ return rb;
+
+fail_all_buf:
+ kfree(rb);
+
+fail:
+ return NULL;
+}
+
+#endif
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ if (rb->aux_nr_pages) {
+ /* above AUX space */
+ if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
+ return NULL;
+
+ /* AUX space */
+ if (pgoff >= rb->aux_pgoff)
+ return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+ }
+
+ return __perf_mmap_to_page(rb, pgoff);
+}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
new file mode 100644
index 000000000..cb346f26a
--- /dev/null
+++ b/kernel/events/uprobes.c
@@ -0,0 +1,1993 @@
+/*
+ * User-space Probes (UProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2012
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h> /* read_mapping_page */
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/rmap.h> /* anon_vma_prepare */
+#include <linux/mmu_notifier.h> /* set_pte_at_notify */
+#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/ptrace.h> /* user_enable_single_step */
+#include <linux/kdebug.h> /* notifier mechanism */
+#include "../../mm/internal.h" /* munlock_vma_page */
+#include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
+
+#include <linux/uprobes.h>
+
+#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
+#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
+
+static struct rb_root uprobes_tree = RB_ROOT;
+/*
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time. Probably a fine grained per inode count is better?
+ */
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
+
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+
+#define UPROBES_HASH_SZ 13
+/* serialize uprobe->pending_list */
+static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
+#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+
+static struct percpu_rw_semaphore dup_mmap_sem;
+
+/* Have a copy of original instruction */
+#define UPROBE_COPY_INSN 0
+
+struct uprobe {
+ struct rb_node rb_node; /* node in the rb tree */
+ atomic_t ref;
+ struct rw_semaphore register_rwsem;
+ struct rw_semaphore consumer_rwsem;
+ struct list_head pending_list;
+ struct uprobe_consumer *consumers;
+ struct inode *inode; /* Also hold a ref to inode */
+ loff_t offset;
+ unsigned long flags;
+
+ /*
+ * The generic code assumes that it has two members of unknown type
+ * owned by the arch-specific code:
+ *
+ * insn - copy_insn() saves the original instruction here for
+ * arch_uprobe_analyze_insn().
+ *
+ * ixol - potentially modified instruction to execute out of
+ * line, copied to xol_area by xol_get_insn_slot().
+ */
+ struct arch_uprobe arch;
+};
+
+struct return_instance {
+ struct uprobe *uprobe;
+ unsigned long func;
+ unsigned long orig_ret_vaddr; /* original return address */
+ bool chained; /* true, if instance is nested */
+
+ struct return_instance *next; /* keep as stack */
+};
+
+/*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
+ * On a breakpoint hit, thread contests for a slot. It frees the
+ * slot after singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct page *page;
+
+ /*
+ * We keep the vma's vm_start rather than a pointer to the vma
+ * itself. The probed process or a naughty kernel module could make
+ * the vma go away, and we must handle that reasonably gracefully.
+ */
+ unsigned long vaddr; /* Page(s) of instruction slots */
+};
+
+/*
+ * valid_vma: Verify if the specified vma is an executable vma
+ * Relax restrictions while unregistering: vm_flags might have
+ * changed after breakpoint was inserted.
+ * - is_register: indicates if we are in register context.
+ * - Return 1 if the specified virtual address is in an
+ * executable vma.
+ */
+static bool valid_vma(struct vm_area_struct *vma, bool is_register)
+{
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+
+ if (is_register)
+ flags |= VM_WRITE;
+
+ return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
+}
+
+static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
+{
+ return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+}
+
+static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
+{
+ return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
+}
+
+/**
+ * __replace_page - replace page in vma by new page.
+ * based on replace_page in mm/ksm.c
+ *
+ * @vma: vma that holds the pte pointing to page
+ * @addr: address the old @page is mapped at
+ * @page: the cowed page we are replacing by kpage
+ * @kpage: the modified page we replace page by
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int err;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = addr;
+ const unsigned long mmun_end = addr + PAGE_SIZE;
+ struct mem_cgroup *memcg;
+
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ if (err)
+ return err;
+
+ /* For try_to_free_swap() and munlock_vma_page() below */
+ lock_page(page);
+
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ err = -EAGAIN;
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
+ if (!ptep)
+ goto unlock;
+
+ get_page(kpage);
+ page_add_new_anon_rmap(kpage, vma, addr);
+ mem_cgroup_commit_charge(kpage, memcg, false);
+ lru_cache_add_active_or_unevictable(kpage, vma);
+
+ if (!PageAnon(page)) {
+ dec_mm_counter(mm, MM_FILEPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
+ }
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+ page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ pte_unmap_unlock(ptep, ptl);
+
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_page(page);
+ put_page(page);
+
+ err = 0;
+ unlock:
+ mem_cgroup_cancel_charge(kpage, memcg);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ unlock_page(page);
+ return err;
+}
+
+/**
+ * is_swbp_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_swbp_insn
+ * Returns true if @insn is a breakpoint instruction.
+ */
+bool __weak is_swbp_insn(uprobe_opcode_t *insn)
+{
+ return *insn == UPROBE_SWBP_INSN;
+}
+
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+ return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
+ kunmap_atomic(kaddr);
+}
+
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+{
+ uprobe_opcode_t old_opcode;
+ bool is_swbp;
+
+ /*
+ * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+ * We do not check if it is any other 'trap variant' which could
+ * be conditional trap instruction such as the one powerpc supports.
+ *
+ * The logic is that we do not care if the underlying instruction
+ * is a trap variant; uprobes always wins over any other (gdb)
+ * breakpoint.
+ */
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ is_swbp = is_swbp_insn(&old_opcode);
+
+ if (is_swbp_insn(new_opcode)) {
+ if (is_swbp) /* register: already installed? */
+ return 0;
+ } else {
+ if (!is_swbp) /* unregister: was it changed by us? */
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * NOTE:
+ * Expect the breakpoint instruction to be the smallest size instruction for
+ * the architecture. If an arch has variable length instruction and the
+ * breakpoint instruction is not of the smallest length instruction
+ * supported by that architecture then we need to modify is_trap_at_addr and
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
+ *
+ * uprobe_write_opcode - write the opcode at a given virtual address.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @vaddr.
+ *
+ * Called with mm->mmap_sem held for write.
+ * Return 0 (success) or a negative errno.
+ */
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
+ uprobe_opcode_t opcode)
+{
+ struct page *old_page, *new_page;
+ struct vm_area_struct *vma;
+ int ret;
+
+retry:
+ /* Read the page with vaddr into memory */
+ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ if (ret <= 0)
+ return ret;
+
+ ret = verify_opcode(old_page, vaddr, &opcode);
+ if (ret <= 0)
+ goto put_old;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto put_old;
+
+ ret = -ENOMEM;
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ if (!new_page)
+ goto put_old;
+
+ __SetPageUptodate(new_page);
+ copy_highpage(new_page, old_page);
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ ret = __replace_page(vma, vaddr, old_page, new_page);
+ page_cache_release(new_page);
+put_old:
+ put_page(old_page);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+
+/**
+ * set_swbp - store breakpoint at a given address.
+ * @auprobe: arch specific probepoint information.
+ * @mm: the probed process address space.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, store the breakpoint instruction at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+}
+
+/**
+ * set_orig_insn - Restore the original instruction.
+ * @mm: the probed process address space.
+ * @auprobe: arch specific probepoint information.
+ * @vaddr: the virtual address to insert the opcode.
+ *
+ * For mm @mm, restore the original opcode (opcode) at @vaddr.
+ * Return 0 (success) or a negative errno.
+ */
+int __weak
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
+}
+
+static int match_uprobe(struct uprobe *l, struct uprobe *r)
+{
+ if (l->inode < r->inode)
+ return -1;
+
+ if (l->inode > r->inode)
+ return 1;
+
+ if (l->offset < r->offset)
+ return -1;
+
+ if (l->offset > r->offset)
+ return 1;
+
+ return 0;
+}
+
+static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe u = { .inode = inode, .offset = offset };
+ struct rb_node *n = uprobes_tree.rb_node;
+ struct uprobe *uprobe;
+ int match;
+
+ while (n) {
+ uprobe = rb_entry(n, struct uprobe, rb_node);
+ match = match_uprobe(&u, uprobe);
+ if (!match) {
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+ }
+
+ if (match < 0)
+ n = n->rb_left;
+ else
+ n = n->rb_right;
+ }
+ return NULL;
+}
+
+/*
+ * Find a uprobe corresponding to a given inode:offset
+ * Acquires uprobes_treelock
+ */
+static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe;
+
+ spin_lock(&uprobes_treelock);
+ uprobe = __find_uprobe(inode, offset);
+ spin_unlock(&uprobes_treelock);
+
+ return uprobe;
+}
+
+static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
+{
+ struct rb_node **p = &uprobes_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct uprobe *u;
+ int match;
+
+ while (*p) {
+ parent = *p;
+ u = rb_entry(parent, struct uprobe, rb_node);
+ match = match_uprobe(uprobe, u);
+ if (!match) {
+ atomic_inc(&u->ref);
+ return u;
+ }
+
+ if (match < 0)
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+
+ }
+
+ u = NULL;
+ rb_link_node(&uprobe->rb_node, parent, p);
+ rb_insert_color(&uprobe->rb_node, &uprobes_tree);
+ /* get access + creation ref */
+ atomic_set(&uprobe->ref, 2);
+
+ return u;
+}
+
+/*
+ * Acquire uprobes_treelock.
+ * Matching uprobe already exists in rbtree;
+ * increment (access refcount) and return the matching uprobe.
+ *
+ * No matching uprobe; insert the uprobe in rb_tree;
+ * get a double refcount (access + creation) and return NULL.
+ */
+static struct uprobe *insert_uprobe(struct uprobe *uprobe)
+{
+ struct uprobe *u;
+
+ spin_lock(&uprobes_treelock);
+ u = __insert_uprobe(uprobe);
+ spin_unlock(&uprobes_treelock);
+
+ return u;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+{
+ struct uprobe *uprobe, *cur_uprobe;
+
+ uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
+ if (!uprobe)
+ return NULL;
+
+ uprobe->inode = igrab(inode);
+ uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
+ init_rwsem(&uprobe->consumer_rwsem);
+
+ /* add to uprobes_tree, sorted on inode:offset */
+ cur_uprobe = insert_uprobe(uprobe);
+ /* a uprobe exists for this inode:offset combination */
+ if (cur_uprobe) {
+ kfree(uprobe);
+ uprobe = cur_uprobe;
+ iput(inode);
+ }
+
+ return uprobe;
+}
+
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ down_write(&uprobe->consumer_rwsem);
+ uc->next = uprobe->consumers;
+ uprobe->consumers = uc;
+ up_write(&uprobe->consumer_rwsem);
+}
+
+/*
+ * For uprobe @uprobe, delete the consumer @uc.
+ * Return true if the @uc is deleted successfully
+ * or return false.
+ */
+static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ struct uprobe_consumer **con;
+ bool ret = false;
+
+ down_write(&uprobe->consumer_rwsem);
+ for (con = &uprobe->consumers; *con; con = &(*con)->next) {
+ if (*con == uc) {
+ *con = uc->next;
+ ret = true;
+ break;
+ }
+ }
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+ void *insn, int nbytes, loff_t offset)
+{
+ struct page *page;
+ /*
+ * Ensure that the page that has the original instruction is populated
+ * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * see uprobe_register().
+ */
+ if (mapping->a_ops->readpage)
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ else
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ copy_from_page(page, offset, insn, nbytes);
+ page_cache_release(page);
+
+ return 0;
+}
+
+static int copy_insn(struct uprobe *uprobe, struct file *filp)
+{
+ struct address_space *mapping = uprobe->inode->i_mapping;
+ loff_t offs = uprobe->offset;
+ void *insn = &uprobe->arch.insn;
+ int size = sizeof(uprobe->arch.insn);
+ int len, err = -EIO;
+
+ /* Copy only available bytes, -EIO if nothing was read */
+ do {
+ if (offs >= i_size_read(uprobe->inode))
+ break;
+
+ len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+ err = __copy_insn(mapping, filp, insn, len, offs);
+ if (err)
+ break;
+
+ insn += len;
+ offs += len;
+ size -= len;
+ } while (size);
+
+ return err;
+}
+
+static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
+ struct mm_struct *mm, unsigned long vaddr)
+{
+ int ret = 0;
+
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ return ret;
+
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
+ if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
+ goto out;
+
+ ret = copy_insn(uprobe, file);
+ if (ret)
+ goto out;
+
+ ret = -ENOTSUPP;
+ if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
+ goto out;
+
+ ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
+ if (ret)
+ goto out;
+
+ /* uprobe_write_opcode() assumes we don't cross page boundary */
+ BUG_ON((uprobe->offset & ~PAGE_MASK) +
+ UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
+
+ smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ set_bit(UPROBE_COPY_INSN, &uprobe->flags);
+
+ out:
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static int
+install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long vaddr)
+{
+ bool first_uprobe;
+ int ret;
+
+ ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
+ if (ret)
+ return ret;
+
+ /*
+ * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
+ * the task can hit this breakpoint right after __replace_page().
+ */
+ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ if (first_uprobe)
+ set_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ ret = set_swbp(&uprobe->arch, mm, vaddr);
+ if (!ret)
+ clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ else if (first_uprobe)
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+
+ return ret;
+}
+
+static int
+remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+ set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ return set_orig_insn(&uprobe->arch, mm, vaddr);
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+/*
+ * There could be threads that have already hit the breakpoint. They
+ * will recheck the current insn and restart if find_uprobe() fails.
+ * See find_active_uprobe().
+ */
+static void delete_uprobe(struct uprobe *uprobe)
+{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
+ spin_lock(&uprobes_treelock);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
+ iput(uprobe->inode);
+ put_uprobe(uprobe);
+}
+
+struct map_info {
+ struct map_info *next;
+ struct mm_struct *mm;
+ unsigned long vaddr;
+};
+
+static inline struct map_info *free_map_info(struct map_info *info)
+{
+ struct map_info *next = info->next;
+ kfree(info);
+ return next;
+}
+
+static struct map_info *
+build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
+{
+ unsigned long pgoff = offset >> PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct map_info *curr = NULL;
+ struct map_info *prev = NULL;
+ struct map_info *info;
+ int more = 0;
+
+ again:
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (!valid_vma(vma, is_register))
+ continue;
+
+ if (!prev && !more) {
+ /*
+ * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
+ * reclaim. This is optimistic, no harm done if it fails.
+ */
+ prev = kmalloc(sizeof(struct map_info),
+ GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (prev)
+ prev->next = NULL;
+ }
+ if (!prev) {
+ more++;
+ continue;
+ }
+
+ if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ continue;
+
+ info = prev;
+ prev = prev->next;
+ info->next = curr;
+ curr = info;
+
+ info->mm = vma->vm_mm;
+ info->vaddr = offset_to_vaddr(vma, offset);
+ }
+ i_mmap_unlock_read(mapping);
+
+ if (!more)
+ goto out;
+
+ prev = curr;
+ while (curr) {
+ mmput(curr->mm);
+ curr = curr->next;
+ }
+
+ do {
+ info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+ if (!info) {
+ curr = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ info->next = prev;
+ prev = info;
+ } while (--more);
+
+ goto again;
+ out:
+ while (prev)
+ prev = free_map_info(prev);
+ return curr;
+}
+
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
+{
+ bool is_register = !!new;
+ struct map_info *info;
+ int err = 0;
+
+ percpu_down_write(&dup_mmap_sem);
+ info = build_map_info(uprobe->inode->i_mapping,
+ uprobe->offset, is_register);
+ if (IS_ERR(info)) {
+ err = PTR_ERR(info);
+ goto out;
+ }
+
+ while (info) {
+ struct mm_struct *mm = info->mm;
+ struct vm_area_struct *vma;
+
+ if (err && is_register)
+ goto free;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, info->vaddr);
+ if (!vma || !valid_vma(vma, is_register) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ goto unlock;
+
+ if (vma->vm_start > info->vaddr ||
+ vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
+ goto unlock;
+
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
+
+ unlock:
+ up_write(&mm->mmap_sem);
+ free:
+ mmput(mm);
+ info = free_map_info(info);
+ }
+ out:
+ percpu_up_write(&dup_mmap_sem);
+ return err;
+}
+
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
+}
+
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+{
+ int err;
+
+ if (WARN_ON(!consumer_del(uprobe, uc)))
+ return;
+
+ err = register_for_each_vma(uprobe, NULL);
+ /* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
+}
+
+/*
+ * uprobe_register - register a probe
+ * @inode: the file in which the probe has to be placed.
+ * @offset: offset from the start of the file.
+ * @uc: information on howto handle the probe..
+ *
+ * Apart from the access refcount, uprobe_register() takes a creation
+ * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
+ * inserted into the rbtree (i.e first consumer for a @inode:@offset
+ * tuple). Creation refcount stops uprobe_unregister from freeing the
+ * @uprobe even before the register operation is complete. Creation
+ * refcount is released when the last @uc for the @uprobe
+ * unregisters.
+ *
+ * Return errno if it cannot successully install probes
+ * else return 0 (success)
+ */
+int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+ int ret;
+
+ /* Uprobe must have at least one set consumer */
+ if (!uc->handler && !uc->ret_handler)
+ return -EINVAL;
+
+ /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ return -EIO;
+ /* Racy, just to catch the obvious mistakes */
+ if (offset > i_size_read(inode))
+ return -EINVAL;
+
+ retry:
+ uprobe = alloc_uprobe(inode, offset);
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
+ }
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+
+ return ret;
+}
+
+/*
+ * uprobe_unregister - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+{
+ struct uprobe *uprobe;
+
+ uprobe = find_uprobe(inode, offset);
+ if (WARN_ON(!uprobe))
+ return;
+
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
+
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
+ }
+ up_read(&mm->mmap_sem);
+
+ return err;
+}
+
+static struct rb_node *
+find_node_in_range(struct inode *inode, loff_t min, loff_t max)
+{
+ struct rb_node *n = uprobes_tree.rb_node;
+
+ while (n) {
+ struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
+
+ if (inode < u->inode) {
+ n = n->rb_left;
+ } else if (inode > u->inode) {
+ n = n->rb_right;
+ } else {
+ if (max < u->offset)
+ n = n->rb_left;
+ else if (min > u->offset)
+ n = n->rb_right;
+ else
+ break;
+ }
+ }
+
+ return n;
+}
+
+/*
+ * For a given range in vma, build a list of probes that need to be inserted.
+ */
+static void build_probe_list(struct inode *inode,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct list_head *head)
+{
+ loff_t min, max;
+ struct rb_node *n, *t;
+ struct uprobe *u;
+
+ INIT_LIST_HEAD(head);
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ if (n) {
+ for (t = n; t; t = rb_prev(t)) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset < min)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ for (t = n; (t = rb_next(t)); ) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset > max)
+ break;
+ list_add(&u->pending_list, head);
+ atomic_inc(&u->ref);
+ }
+ }
+ spin_unlock(&uprobes_treelock);
+}
+
+/*
+ * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
+ *
+ * Currently we ignore all errors and always return 0, the callers
+ * can't handle the failure anyway.
+ */
+int uprobe_mmap(struct vm_area_struct *vma)
+{
+ struct list_head tmp_list;
+ struct uprobe *uprobe, *u;
+ struct inode *inode;
+
+ if (no_uprobe_events() || !valid_vma(vma, true))
+ return 0;
+
+ inode = file_inode(vma->vm_file);
+ if (!inode)
+ return 0;
+
+ mutex_lock(uprobes_mmap_hash(inode));
+ build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
+ /*
+ * We can race with uprobe_unregister(), this uprobe can be already
+ * removed. But in this case filter_chain() must return false, all
+ * consumers have gone away.
+ */
+ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
+ install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ }
+ put_uprobe(uprobe);
+ }
+ mutex_unlock(uprobes_mmap_hash(inode));
+
+ return 0;
+}
+
+static bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ loff_t min, max;
+ struct inode *inode;
+ struct rb_node *n;
+
+ inode = file_inode(vma->vm_file);
+
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ spin_unlock(&uprobes_treelock);
+
+ return !!n;
+}
+
+/*
+ * Called in context of a munmap of a vma.
+ */
+void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ if (no_uprobe_events() || !valid_vma(vma, false))
+ return;
+
+ if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
+ return;
+
+ if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
+ test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ return;
+
+ if (vma_has_uprobes(vma, start, end))
+ set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+}
+
+/* Slot allocation for XOL */
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
+{
+ int ret = -EALREADY;
+
+ down_write(&mm->mmap_sem);
+ if (mm->uprobes_state.xol_area)
+ goto fail;
+
+ if (!area->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (area->vaddr & ~PAGE_MASK) {
+ ret = area->vaddr;
+ goto fail;
+ }
+ }
+
+ ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
+ if (ret)
+ goto fail;
+
+ smp_wmb(); /* pairs with get_xol_area() */
+ mm->uprobes_state.xol_area = area;
+ fail:
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+static struct xol_area *__create_xol_area(unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct xol_area *area;
+
+ area = kmalloc(sizeof(*area), GFP_KERNEL);
+ if (unlikely(!area))
+ goto out;
+
+ area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+ if (!area->bitmap)
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
+
+ area->vaddr = vaddr;
+ init_waitqueue_head(&area->wq);
+ /* Reserve the 1st slot for get_trampoline_vaddr() */
+ set_bit(0, area->bitmap);
+ atomic_set(&area->slot_count, 1);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+
+ if (!xol_add_vma(mm, area))
+ return area;
+
+ __free_page(area->page);
+ free_bitmap:
+ kfree(area->bitmap);
+ free_area:
+ kfree(area);
+ out:
+ return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ if (!mm->uprobes_state.xol_area)
+ __create_xol_area(0);
+
+ area = mm->uprobes_state.xol_area;
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
+}
+
+/*
+ * uprobe_clear_state - Free the area allocated for slots.
+ */
+void uprobe_clear_state(struct mm_struct *mm)
+{
+ struct xol_area *area = mm->uprobes_state.xol_area;
+
+ if (!area)
+ return;
+
+ put_page(area->page);
+ kfree(area->bitmap);
+ kfree(area);
+}
+
+void uprobe_start_dup_mmap(void)
+{
+ percpu_down_read(&dup_mmap_sem);
+}
+
+void uprobe_end_dup_mmap(void)
+{
+ percpu_up_read(&dup_mmap_sem);
+}
+
+void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ newmm->uprobes_state.xol_area = NULL;
+
+ if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
+ set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
+ set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ }
+}
+
+/*
+ * - search for a free slot.
+ */
+static unsigned long xol_take_insn_slot(struct xol_area *area)
+{
+ unsigned long slot_addr;
+ int slot_nr;
+
+ do {
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ break;
+
+ slot_nr = UINSNS_PER_PAGE;
+ continue;
+ }
+ wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
+ } while (slot_nr >= UINSNS_PER_PAGE);
+
+ slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
+ atomic_inc(&area->slot_count);
+
+ return slot_addr;
+}
+
+/*
+ * xol_get_insn_slot - allocate a slot for xol.
+ * Returns the allocated slot address or 0.
+ */
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+{
+ struct xol_area *area;
+ unsigned long xol_vaddr;
+
+ area = get_xol_area();
+ if (!area)
+ return 0;
+
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
+ return 0;
+
+ arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
+
+ return xol_vaddr;
+}
+
+/*
+ * xol_free_insn_slot - If slot was earlier allocated by
+ * @xol_get_insn_slot(), make the slot available for
+ * subsequent requests.
+ */
+static void xol_free_insn_slot(struct task_struct *tsk)
+{
+ struct xol_area *area;
+ unsigned long vma_end;
+ unsigned long slot_addr;
+
+ if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ return;
+
+ slot_addr = tsk->utask->xol_vaddr;
+ if (unlikely(!slot_addr))
+ return;
+
+ area = tsk->mm->uprobes_state.xol_area;
+ vma_end = area->vaddr + PAGE_SIZE;
+ if (area->vaddr <= slot_addr && slot_addr < vma_end) {
+ unsigned long offset;
+ int slot_nr;
+
+ offset = slot_addr - area->vaddr;
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ if (slot_nr >= UINSNS_PER_PAGE)
+ return;
+
+ clear_bit(slot_nr, area->bitmap);
+ atomic_dec(&area->slot_count);
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
+
+ tsk->utask->xol_vaddr = 0;
+ }
+}
+
+void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
+ void *src, unsigned long len)
+{
+ /* Initialize the slot */
+ copy_to_page(page, vaddr, src, len);
+
+ /*
+ * We probably need flush_icache_user_range() but it needs vma.
+ * This should work on most of architectures by default. If
+ * architecture needs to do something different it can define
+ * its own version of the function.
+ */
+ flush_dcache_page(page);
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+ return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
+}
+
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
+/*
+ * Called with no locks held.
+ * Called in context of a exiting or a exec-ing thread.
+ */
+void uprobe_free_utask(struct task_struct *t)
+{
+ struct uprobe_task *utask = t->utask;
+ struct return_instance *ri, *tmp;
+
+ if (!utask)
+ return;
+
+ if (utask->active_uprobe)
+ put_uprobe(utask->active_uprobe);
+
+ ri = utask->return_instances;
+ while (ri) {
+ tmp = ri;
+ ri = ri->next;
+
+ put_uprobe(tmp->uprobe);
+ kfree(tmp);
+ }
+
+ xol_free_insn_slot(t);
+ kfree(utask);
+ t->utask = NULL;
+}
+
+/*
+ * Allocate a uprobe_task object for the task if if necessary.
+ * Called when the thread hits a breakpoint.
+ *
+ * Returns:
+ * - pointer to new uprobe_task on success
+ * - NULL otherwise
+ */
+static struct uprobe_task *get_utask(void)
+{
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
+}
+
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+ struct uprobe_task *n_utask;
+ struct return_instance **p, *o, *n;
+
+ n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ if (!n_utask)
+ return -ENOMEM;
+ t->utask = n_utask;
+
+ p = &n_utask->return_instances;
+ for (o = o_utask->return_instances; o; o = o->next) {
+ n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!n)
+ return -ENOMEM;
+
+ *n = *o;
+ atomic_inc(&n->uprobe->ref);
+ n->next = NULL;
+
+ *p = n;
+ p = &n->next;
+ n_utask->depth++;
+ }
+
+ return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n",
+ current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+ if (current->flags & PF_EXITING)
+ return;
+
+ if (!__create_xol_area(current->utask->dup_xol_addr))
+ uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+ struct uprobe_task *utask = current->utask;
+ struct mm_struct *mm = current->mm;
+ struct xol_area *area;
+
+ t->utask = NULL;
+
+ if (!utask || !utask->return_instances)
+ return;
+
+ if (mm == t->mm && !(flags & CLONE_VFORK))
+ return;
+
+ if (dup_utask(t, utask))
+ return uprobe_warn(t, "dup ret instances");
+
+ /* The task can fork() after dup_xol_work() fails */
+ area = mm->uprobes_state.xol_area;
+ if (!area)
+ return uprobe_warn(t, "dup xol area");
+
+ if (mm == t->mm)
+ return;
+
+ t->utask->dup_xol_addr = area->vaddr;
+ init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+ task_work_add(t, &t->utask->dup_xol_work, true);
+}
+
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+ struct xol_area *area;
+ unsigned long trampoline_vaddr = -1;
+
+ area = current->mm->uprobes_state.xol_area;
+ smp_read_barrier_depends();
+ if (area)
+ trampoline_vaddr = area->vaddr;
+
+ return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct return_instance *ri;
+ struct uprobe_task *utask;
+ unsigned long orig_ret_vaddr, trampoline_vaddr;
+ bool chained = false;
+
+ if (!get_xol_area())
+ return;
+
+ utask = get_utask();
+ if (!utask)
+ return;
+
+ if (utask->depth >= MAX_URETPROBE_DEPTH) {
+ printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+ " nestedness limit pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ return;
+ }
+
+ ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!ri)
+ goto fail;
+
+ trampoline_vaddr = get_trampoline_vaddr();
+ orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+ if (orig_ret_vaddr == -1)
+ goto fail;
+
+ /*
+ * We don't want to keep trampoline address in stack, rather keep the
+ * original return address of first caller thru all the consequent
+ * instances. This also makes breakpoint unwrapping easier.
+ */
+ if (orig_ret_vaddr == trampoline_vaddr) {
+ if (!utask->return_instances) {
+ /*
+ * This situation is not possible. Likely we have an
+ * attack from user-space.
+ */
+ pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ goto fail;
+ }
+
+ chained = true;
+ orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+ }
+
+ atomic_inc(&uprobe->ref);
+ ri->uprobe = uprobe;
+ ri->func = instruction_pointer(regs);
+ ri->orig_ret_vaddr = orig_ret_vaddr;
+ ri->chained = chained;
+
+ utask->depth++;
+
+ /* add instance to the stack */
+ ri->next = utask->return_instances;
+ utask->return_instances = ri;
+
+ return;
+
+ fail:
+ kfree(ri);
+}
+
+/* Prepare to single-step probed instruction out of line. */
+static int
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
+
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
+
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
+}
+
+/*
+ * If we are singlestepping, then ensure this thread is not connected to
+ * non-fatal signals until completion of singlestep. When xol insn itself
+ * triggers the signal, restart the original insn even if the task is
+ * already SIGKILL'ed (since coredump should report the correct ip). This
+ * is even more important if the task has a handler for SIGSEGV/etc, The
+ * _same_ instruction should be repeated again after return from the signal
+ * handler, and SSTEP can never finish in this case.
+ */
+bool uprobe_deny_signal(void)
+{
+ struct task_struct *t = current;
+ struct uprobe_task *utask = t->utask;
+
+ if (likely(!utask || !utask->active_uprobe))
+ return false;
+
+ WARN_ON_ONCE(utask->state != UTASK_SSTEP);
+
+ if (signal_pending(t)) {
+ spin_lock_irq(&t->sighand->siglock);
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ spin_unlock_irq(&t->sighand->siglock);
+
+ if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
+ utask->state = UTASK_SSTEP_TRAPPED;
+ set_tsk_thread_flag(t, TIF_UPROBE);
+ }
+ }
+
+ return true;
+}
+
+static void mmf_recalc_uprobes(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!valid_vma(vma, false))
+ continue;
+ /*
+ * This is not strictly accurate, we can race with
+ * uprobe_unregister() and see the already removed
+ * uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
+ */
+ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
+ return;
+ }
+
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+}
+
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
+{
+ struct page *page;
+ uprobe_opcode_t opcode;
+ int result;
+
+ pagefault_disable();
+ result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
+ sizeof(opcode));
+ pagefault_enable();
+
+ if (likely(result == 0))
+ goto out;
+
+ result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ if (result < 0)
+ return result;
+
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ put_page(page);
+ out:
+ /* This needs to return true for any variant of the trap insn */
+ return is_trap_insn(&opcode);
+}
+
+static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, bp_vaddr);
+ if (vma && vma->vm_start <= bp_vaddr) {
+ if (valid_vma(vma, false)) {
+ struct inode *inode = file_inode(vma->vm_file);
+ loff_t offset = vaddr_to_offset(vma, bp_vaddr);
+
+ uprobe = find_uprobe(inode, offset);
+ }
+
+ if (!uprobe)
+ *is_swbp = is_trap_at_addr(mm, bp_vaddr);
+ } else {
+ *is_swbp = -EFAULT;
+ }
+
+ if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ mmf_recalc_uprobes(mm);
+ up_read(&mm->mmap_sem);
+
+ return uprobe;
+}
+
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+ bool need_prep = false; /* prepare return uprobe, when needed */
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = 0;
+
+ if (uc->handler) {
+ rc = uc->handler(uc, regs);
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ }
+
+ if (uc->ret_handler)
+ need_prep = true;
+
+ remove &= rc;
+ }
+
+ if (need_prep && !remove)
+ prepare_uretprobe(uprobe, regs); /* put bp at return */
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+ struct uprobe *uprobe = ri->uprobe;
+ struct uprobe_consumer *uc;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (uc->ret_handler)
+ uc->ret_handler(uc, ri->func, regs);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *tmp;
+ bool chained;
+
+ utask = current->utask;
+ if (!utask)
+ return false;
+
+ ri = utask->return_instances;
+ if (!ri)
+ return false;
+
+ /*
+ * TODO: we should throw out return_instance's invalidated by
+ * longjmp(), currently we assume that the probed function always
+ * returns.
+ */
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+ for (;;) {
+ handle_uretprobe_chain(ri, regs);
+
+ chained = ri->chained;
+ put_uprobe(ri->uprobe);
+
+ tmp = ri;
+ ri = ri->next;
+ kfree(tmp);
+ utask->depth--;
+
+ if (!chained)
+ break;
+ BUG_ON(!ri);
+ }
+
+ utask->return_instances = ri;
+
+ return true;
+}
+
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+ return false;
+}
+
+/*
+ * Run handler and ask thread to singlestep.
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
+ */
+static void handle_swbp(struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ unsigned long bp_vaddr;
+ int uninitialized_var(is_swbp);
+
+ bp_vaddr = uprobe_get_swbp_addr(regs);
+ if (bp_vaddr == get_trampoline_vaddr()) {
+ if (handle_trampoline(regs))
+ return;
+
+ pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ }
+
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (!uprobe) {
+ if (is_swbp > 0) {
+ /* No matching uprobe; signal SIGTRAP. */
+ send_sig(SIGTRAP, current, 0);
+ } else {
+ /*
+ * Either we raced with uprobe_unregister() or we can't
+ * access this memory. The latter is only possible if
+ * another thread plays with our ->mm. In both cases
+ * we can simply restart. If this vma was unmapped we
+ * can pretend this insn was not executed yet and get
+ * the (correct) SIGSEGV after restart.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
+ }
+ return;
+ }
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
+ /*
+ * TODO: move copy_insn/etc into _register and remove this hack.
+ * After we hit the bp, _unregister + _register can install the
+ * new and not-yet-analyzed uprobe at the same address, restart.
+ */
+ smp_rmb(); /* pairs with wmb() in install_breakpoint() */
+ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
+ goto out;
+
+ /* Tracing handlers use ->utask to communicate with fetch methods */
+ if (!get_utask())
+ goto out;
+
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ goto out;
+
+ handler_chain(uprobe, regs);
+
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
+ goto out;
+
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
+ return;
+
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+out:
+ put_uprobe(uprobe);
+}
+
+/*
+ * Perform required fix-ups and disable singlestep.
+ * Allow pending signals to take effect.
+ */
+static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
+{
+ struct uprobe *uprobe;
+ int err = 0;
+
+ uprobe = utask->active_uprobe;
+ if (utask->state == UTASK_SSTEP_ACK)
+ err = arch_uprobe_post_xol(&uprobe->arch, regs);
+ else if (utask->state == UTASK_SSTEP_TRAPPED)
+ arch_uprobe_abort_xol(&uprobe->arch, regs);
+ else
+ WARN_ON_ONCE(1);
+
+ put_uprobe(uprobe);
+ utask->active_uprobe = NULL;
+ utask->state = UTASK_RUNNING;
+ xol_free_insn_slot(current);
+
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* see uprobe_deny_signal() */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (unlikely(err)) {
+ uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ }
+}
+
+/*
+ * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
+ * allows the thread to return from interrupt. After that handle_swbp()
+ * sets utask->active_uprobe.
+ *
+ * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
+ * and allows the thread to return from interrupt.
+ *
+ * While returning to userspace, thread notices the TIF_UPROBE flag and calls
+ * uprobe_notify_resume().
+ */
+void uprobe_notify_resume(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+
+ clear_thread_flag(TIF_UPROBE);
+
+ utask = current->utask;
+ if (utask && utask->active_uprobe)
+ handle_singlestep(utask, regs);
+ else
+ handle_swbp(regs);
+}
+
+/*
+ * uprobe_pre_sstep_notifier gets called from interrupt context as part of
+ * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
+ */
+int uprobe_pre_sstep_notifier(struct pt_regs *regs)
+{
+ if (!current->mm)
+ return 0;
+
+ if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ (!current->utask || !current->utask->return_instances))
+ return 0;
+
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+/*
+ * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
+ * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
+ */
+int uprobe_post_sstep_notifier(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (!current->mm || !utask || !utask->active_uprobe)
+ /* task is currently not uprobed */
+ return 0;
+
+ utask->state = UTASK_SSTEP_ACK;
+ set_thread_flag(TIF_UPROBE);
+ return 1;
+}
+
+static struct notifier_block uprobe_exception_nb = {
+ .notifier_call = arch_uprobe_exception_notify,
+ .priority = INT_MAX-1, /* notified after kprobes, kgdb */
+};
+
+static int __init init_uprobes(void)
+{
+ int i;
+
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
+ mutex_init(&uprobes_mmap_mutex[i]);
+
+ if (percpu_init_rwsem(&dup_mmap_sem))
+ return -ENOMEM;
+
+ return register_die_notifier(&uprobe_exception_nb);
+}
+__initcall(init_uprobes);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
new file mode 100644
index 000000000..6873bb3e6
--- /dev/null
+++ b/kernel/exec_domain.c
@@ -0,0 +1,58 @@
+/*
+ * Handling of different ABIs (personalities).
+ *
+ * We group personalities into execution domains which have their
+ * own handlers for kernel entry points, signal mapping, etc...
+ *
+ * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/personality.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/sysctl.h>
+#include <linux/types.h>
+#include <linux/fs_struct.h>
+
+#ifdef CONFIG_PROC_FS
+static int execdomains_proc_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "0-0\tLinux \t[kernel]\n");
+ return 0;
+}
+
+static int execdomains_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, execdomains_proc_show, NULL);
+}
+
+static const struct file_operations execdomains_proc_fops = {
+ .open = execdomains_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_execdomains_init(void)
+{
+ proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+ return 0;
+}
+module_init(proc_execdomains_init);
+#endif
+
+SYSCALL_DEFINE1(personality, unsigned int, personality)
+{
+ unsigned int old = current->personality;
+
+ if (personality != 0xffffffff)
+ set_personality(personality);
+
+ return old;
+}
diff --git a/kernel/exit.c b/kernel/exit.c
new file mode 100644
index 000000000..490a707c7
--- /dev/null
+++ b/kernel/exit.c
@@ -0,0 +1,1632 @@
+/*
+ * linux/kernel/exit.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/tty.h>
+#include <linux/iocontext.h>
+#include <linux/key.h>
+#include <linux/security.h>
+#include <linux/cpu.h>
+#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/freezer.h>
+#include <linux/binfmts.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/ptrace.h>
+#include <linux/profile.h>
+#include <linux/mount.h>
+#include <linux/proc_fs.h>
+#include <linux/kthread.h>
+#include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
+#include <linux/cgroup.h>
+#include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <linux/posix-timers.h>
+#include <linux/cn_proc.h>
+#include <linux/mutex.h>
+#include <linux/futex.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/audit.h> /* for audit_free() */
+#include <linux/resource.h>
+#include <linux/blkdev.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/tracehook.h>
+#include <linux/fs_struct.h>
+#include <linux/init_task.h>
+#include <linux/perf_event.h>
+#include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/oom.h>
+#include <linux/writeback.h>
+#include <linux/shm.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/pgtable.h>
+#include <asm/mmu_context.h>
+
+static void exit_mm(struct task_struct *tsk);
+
+static void __unhash_process(struct task_struct *p, bool group_dead)
+{
+ nr_threads--;
+ detach_pid(p, PIDTYPE_PID);
+ if (group_dead) {
+ detach_pid(p, PIDTYPE_PGID);
+ detach_pid(p, PIDTYPE_SID);
+
+ list_del_rcu(&p->tasks);
+ list_del_init(&p->sibling);
+ __this_cpu_dec(process_counts);
+ }
+ list_del_rcu(&p->thread_group);
+ list_del_rcu(&p->thread_node);
+}
+
+/*
+ * This function expects the tasklist_lock write-locked.
+ */
+static void __exit_signal(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ bool group_dead = thread_group_leader(tsk);
+ struct sighand_struct *sighand;
+ struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
+
+ sighand = rcu_dereference_check(tsk->sighand,
+ lockdep_tasklist_lock_is_held());
+ spin_lock(&sighand->siglock);
+
+ posix_cpu_timers_exit(tsk);
+ if (group_dead) {
+ posix_cpu_timers_exit_group(tsk);
+ tty = sig->tty;
+ sig->tty = NULL;
+ } else {
+ /*
+ * This can only happen if the caller is de_thread().
+ * FIXME: this is the temporary hack, we should teach
+ * posix-cpu-timers to handle this case correctly.
+ */
+ if (unlikely(has_group_leader_pid(tsk)))
+ posix_cpu_timers_exit_group(tsk);
+
+ /*
+ * If there is any task waiting for the group exit
+ * then notify it:
+ */
+ if (sig->notify_count > 0 && !--sig->notify_count)
+ wake_up_process(sig->group_exit_task);
+
+ if (tsk == sig->curr_target)
+ sig->curr_target = next_thread(tsk);
+ }
+
+ /*
+ * Accumulate here the counters for all threads as they die. We could
+ * skip the group leader because it is the last user of signal_struct,
+ * but we want to avoid the race with thread_group_cputime() which can
+ * see the empty ->thread_head list.
+ */
+ task_cputime(tsk, &utime, &stime);
+ write_seqlock(&sig->stats_lock);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
+ sig->min_flt += tsk->min_flt;
+ sig->maj_flt += tsk->maj_flt;
+ sig->nvcsw += tsk->nvcsw;
+ sig->nivcsw += tsk->nivcsw;
+ sig->inblock += task_io_get_inblock(tsk);
+ sig->oublock += task_io_get_oublock(tsk);
+ task_io_accounting_add(&sig->ioac, &tsk->ioac);
+ sig->sum_sched_runtime += tsk_seruntime(tsk);
+ sig->nr_threads--;
+ __unhash_process(tsk, group_dead);
+ write_sequnlock(&sig->stats_lock);
+
+ /*
+ * Do this under ->siglock, we can race with another thread
+ * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
+ */
+ flush_sigqueue(&tsk->pending);
+ tsk->sighand = NULL;
+ spin_unlock(&sighand->siglock);
+
+ __cleanup_sighand(sighand);
+ clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ if (group_dead) {
+ flush_sigqueue(&sig->shared_pending);
+ tty_kref_put(tty);
+ }
+}
+
+static void delayed_put_task_struct(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ perf_event_delayed_put(tsk);
+ trace_sched_process_free(tsk);
+ put_task_struct(tsk);
+}
+
+
+void release_task(struct task_struct *p)
+{
+ struct task_struct *leader;
+ int zap_leader;
+repeat:
+ /* don't need to get the RCU readlock here - the process is dead and
+ * can't be modifying its own credentials. But shut RCU-lockdep up */
+ rcu_read_lock();
+ atomic_dec(&__task_cred(p)->user->processes);
+ rcu_read_unlock();
+
+ proc_flush_task(p);
+
+ write_lock_irq(&tasklist_lock);
+ ptrace_release_task(p);
+ __exit_signal(p);
+
+ /*
+ * If we are the last non-leader member of the thread
+ * group, and the leader is zombie, then notify the
+ * group leader's parent process. (if it wants notification.)
+ */
+ zap_leader = 0;
+ leader = p->group_leader;
+ if (leader != p && thread_group_empty(leader)
+ && leader->exit_state == EXIT_ZOMBIE) {
+ /*
+ * If we were the last child thread and the leader has
+ * exited already, and the leader's parent ignores SIGCHLD,
+ * then we are the one who should release the leader.
+ */
+ zap_leader = do_notify_parent(leader, leader->exit_signal);
+ if (zap_leader)
+ leader->exit_state = EXIT_DEAD;
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ release_thread(p);
+ call_rcu(&p->rcu, delayed_put_task_struct);
+
+ p = leader;
+ if (unlikely(zap_leader))
+ goto repeat;
+}
+
+/*
+ * Determine if a process group is "orphaned", according to the POSIX
+ * definition in 2.2.2.52. Orphaned process groups are not to be affected
+ * by terminal-generated stop signals. Newly orphaned process groups are
+ * to receive a SIGHUP and a SIGCONT.
+ *
+ * "I ask you, have you ever known what it is to be an orphan?"
+ */
+static int will_become_orphaned_pgrp(struct pid *pgrp,
+ struct task_struct *ignored_task)
+{
+ struct task_struct *p;
+
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ if ((p == ignored_task) ||
+ (p->exit_state && thread_group_empty(p)) ||
+ is_global_init(p->real_parent))
+ continue;
+
+ if (task_pgrp(p->real_parent) != pgrp &&
+ task_session(p->real_parent) == task_session(p))
+ return 0;
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+
+ return 1;
+}
+
+int is_current_pgrp_orphaned(void)
+{
+ int retval;
+
+ read_lock(&tasklist_lock);
+ retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
+ read_unlock(&tasklist_lock);
+
+ return retval;
+}
+
+static bool has_stopped_jobs(struct pid *pgrp)
+{
+ struct task_struct *p;
+
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return true;
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+
+ return false;
+}
+
+/*
+ * Check to see if any process groups have become orphaned as
+ * a result of our exiting, and if they have any stopped jobs,
+ * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void
+kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
+{
+ struct pid *pgrp = task_pgrp(tsk);
+ struct task_struct *ignored_task = tsk;
+
+ if (!parent)
+ /* exit: our father is in a different pgrp than
+ * we are and we were the only connection outside.
+ */
+ parent = tsk->real_parent;
+ else
+ /* reparent: our child is in a different pgrp than
+ * we are, and it was the only connection outside.
+ */
+ ignored_task = NULL;
+
+ if (task_pgrp(parent) != pgrp &&
+ task_session(parent) == task_session(tsk) &&
+ will_become_orphaned_pgrp(pgrp, ignored_task) &&
+ has_stopped_jobs(pgrp)) {
+ __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+ __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
+ }
+}
+
+#ifdef CONFIG_MEMCG
+/*
+ * A task is exiting. If it owned this mm, find a new owner for the mm.
+ */
+void mm_update_next_owner(struct mm_struct *mm)
+{
+ struct task_struct *c, *g, *p = current;
+
+retry:
+ /*
+ * If the exiting or execing task is not the owner, it's
+ * someone else's problem.
+ */
+ if (mm->owner != p)
+ return;
+ /*
+ * The current owner is exiting/execing and there are no other
+ * candidates. Do not leave the mm pointing to a possibly
+ * freed task structure.
+ */
+ if (atomic_read(&mm->mm_users) <= 1) {
+ mm->owner = NULL;
+ return;
+ }
+
+ read_lock(&tasklist_lock);
+ /*
+ * Search in the children
+ */
+ list_for_each_entry(c, &p->children, sibling) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ }
+
+ /*
+ * Search in the siblings
+ */
+ list_for_each_entry(c, &p->real_parent->children, sibling) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ }
+
+ /*
+ * Search through everything else, we should not get here often.
+ */
+ for_each_process(g) {
+ if (g->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(g, c) {
+ if (c->mm == mm)
+ goto assign_new_owner;
+ if (c->mm)
+ break;
+ }
+ }
+ read_unlock(&tasklist_lock);
+ /*
+ * We found no owner yet mm_users > 1: this implies that we are
+ * most likely racing with swapoff (try_to_unuse()) or /proc or
+ * ptrace or page migration (get_task_mm()). Mark owner as NULL.
+ */
+ mm->owner = NULL;
+ return;
+
+assign_new_owner:
+ BUG_ON(c == p);
+ get_task_struct(c);
+ /*
+ * The task_lock protects c->mm from changing.
+ * We always want mm->owner->mm == mm
+ */
+ task_lock(c);
+ /*
+ * Delay read_unlock() till we have the task_lock()
+ * to ensure that c does not slip away underneath us
+ */
+ read_unlock(&tasklist_lock);
+ if (c->mm != mm) {
+ task_unlock(c);
+ put_task_struct(c);
+ goto retry;
+ }
+ mm->owner = c;
+ task_unlock(c);
+ put_task_struct(c);
+}
+#endif /* CONFIG_MEMCG */
+
+/*
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+static void exit_mm(struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+ struct core_state *core_state;
+
+ mm_release(tsk, mm);
+ if (!mm)
+ return;
+ sync_mm_rss(mm);
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold mmap_sem around checking core_state
+ * and clearing tsk->mm. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group with ->mm != NULL.
+ */
+ down_read(&mm->mmap_sem);
+ core_state = mm->core_state;
+ if (core_state) {
+ struct core_thread self;
+
+ up_read(&mm->mmap_sem);
+
+ self.task = tsk;
+ self.next = xchg(&core_state->dumper.next, &self);
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ freezable_schedule();
+ }
+ __set_task_state(tsk, TASK_RUNNING);
+ down_read(&mm->mmap_sem);
+ }
+ atomic_inc(&mm->mm_count);
+ BUG_ON(mm != tsk->active_mm);
+ /* more a memory barrier than a real lock */
+ task_lock(tsk);
+ tsk->mm = NULL;
+ up_read(&mm->mmap_sem);
+ enter_lazy_tlb(mm, current);
+ task_unlock(tsk);
+ mm_update_next_owner(mm);
+ mmput(mm);
+ if (test_thread_flag(TIF_MEMDIE))
+ unmark_oom_victim();
+}
+
+static struct task_struct *find_alive_thread(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ if (!(t->flags & PF_EXITING))
+ return t;
+ }
+ return NULL;
+}
+
+static struct task_struct *find_child_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(father);
+ struct task_struct *reaper = pid_ns->child_reaper;
+
+ if (likely(reaper != father))
+ return reaper;
+
+ reaper = find_alive_thread(father);
+ if (reaper) {
+ pid_ns->child_reaper = reaper;
+ return reaper;
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?: father->exit_code);
+ }
+ zap_pid_ns_processes(pid_ns);
+ write_lock_irq(&tasklist_lock);
+
+ return father;
+}
+
+/*
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ * child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
+ */
+static struct task_struct *find_new_reaper(struct task_struct *father,
+ struct task_struct *child_reaper)
+{
+ struct task_struct *thread, *reaper;
+
+ thread = find_alive_thread(father);
+ if (thread)
+ return thread;
+
+ if (father->signal->has_child_subreaper) {
+ /*
+ * Find the first ->is_child_subreaper ancestor in our pid_ns.
+ * We start from father to ensure we can not look into another
+ * namespace, this is safe because all its threads are dead.
+ */
+ for (reaper = father;
+ !same_thread_group(reaper, child_reaper);
+ reaper = reaper->real_parent) {
+ /* call_usermodehelper() descendants need this check */
+ if (reaper == &init_task)
+ break;
+ if (!reaper->signal->is_child_subreaper)
+ continue;
+ thread = find_alive_thread(reaper);
+ if (thread)
+ return thread;
+ }
+ }
+
+ return child_reaper;
+}
+
+/*
+* Any that need to be release_task'd are put on the @dead list.
+ */
+static void reparent_leader(struct task_struct *father, struct task_struct *p,
+ struct list_head *dead)
+{
+ if (unlikely(p->exit_state == EXIT_DEAD))
+ return;
+
+ /* We don't want people slaying init. */
+ p->exit_signal = SIGCHLD;
+
+ /* If it has exited notify the new parent about this child's death. */
+ if (!p->ptrace &&
+ p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+ if (do_notify_parent(p, p->exit_signal)) {
+ p->exit_state = EXIT_DEAD;
+ list_add(&p->ptrace_entry, dead);
+ }
+ }
+
+ kill_orphaned_pgrp(p, father);
+}
+
+/*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void forget_original_parent(struct task_struct *father,
+ struct list_head *dead)
+{
+ struct task_struct *p, *t, *reaper;
+
+ if (unlikely(!list_empty(&father->ptraced)))
+ exit_ptrace(father, dead);
+
+ /* Can drop and reacquire tasklist_lock */
+ reaper = find_child_reaper(father);
+ if (list_empty(&father->children))
+ return;
+
+ reaper = find_new_reaper(father, reaper);
+ list_for_each_entry(p, &father->children, sibling) {
+ for_each_thread(p, t) {
+ t->real_parent = reaper;
+ BUG_ON((!t->ptrace) != (t->parent == father));
+ if (likely(!t->ptrace))
+ t->parent = t->real_parent;
+ if (t->pdeath_signal)
+ group_send_sig_info(t->pdeath_signal,
+ SEND_SIG_NOINFO, t);
+ }
+ /*
+ * If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (!same_thread_group(reaper, father))
+ reparent_leader(father, p, dead);
+ }
+ list_splice_tail_init(&father->children, &reaper->children);
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ */
+static void exit_notify(struct task_struct *tsk, int group_dead)
+{
+ bool autoreap;
+ struct task_struct *p, *n;
+ LIST_HEAD(dead);
+
+ write_lock_irq(&tasklist_lock);
+ forget_original_parent(tsk, &dead);
+
+ if (group_dead)
+ kill_orphaned_pgrp(tsk->group_leader, NULL);
+
+ if (unlikely(tsk->ptrace)) {
+ int sig = thread_group_leader(tsk) &&
+ thread_group_empty(tsk) &&
+ !ptrace_reparented(tsk) ?
+ tsk->exit_signal : SIGCHLD;
+ autoreap = do_notify_parent(tsk, sig);
+ } else if (thread_group_leader(tsk)) {
+ autoreap = thread_group_empty(tsk) &&
+ do_notify_parent(tsk, tsk->exit_signal);
+ } else {
+ autoreap = true;
+ }
+
+ tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
+ if (tsk->exit_state == EXIT_DEAD)
+ list_add(&tsk->ptrace_entry, &dead);
+
+ /* mt-exec, de_thread() is waiting for group leader */
+ if (unlikely(tsk->signal->notify_count < 0))
+ wake_up_process(tsk->signal->group_exit_task);
+ write_unlock_irq(&tasklist_lock);
+
+ list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
+}
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static void check_stack_usage(void)
+{
+ static DEFINE_SPINLOCK(low_water_lock);
+ static int lowest_to_date = THREAD_SIZE;
+ unsigned long free;
+
+ free = stack_not_used(current);
+
+ if (free >= lowest_to_date)
+ return;
+
+ spin_lock(&low_water_lock);
+ if (free < lowest_to_date) {
+ pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
+ lowest_to_date = free;
+ }
+ spin_unlock(&low_water_lock);
+}
+#else
+static inline void check_stack_usage(void) {}
+#endif
+
+void do_exit(long code)
+{
+ struct task_struct *tsk = current;
+ int group_dead;
+ TASKS_RCU(int tasks_rcu_i);
+
+ profile_task_exit(tsk);
+
+ WARN_ON(blk_needs_flush_plug(tsk));
+
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
+
+ /*
+ * If do_exit is called because this processes oopsed, it's possible
+ * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
+ * continuing. Amongst other possible reasons, this is to prevent
+ * mm_release()->clear_child_tid() from writing to a user-controlled
+ * kernel address.
+ */
+ set_fs(USER_DS);
+
+ ptrace_event(PTRACE_EVENT_EXIT, code);
+
+ validate_creds_for_do_exit(tsk);
+
+ /*
+ * We're taking recursive faults here in do_exit. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+ /*
+ * We can do this unlocked here. The futex code uses
+ * this flag just to verify whether the pi state
+ * cleanup has been done or not. In the worst case it
+ * loops once more. We pretend that the cleanup was
+ * done as there is no way to return. Either the
+ * OWNER_DIED bit is set by now or we push the blocked
+ * task into the wait for ever nirwana as well.
+ */
+ tsk->flags |= PF_EXITPIDONE;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ }
+
+ exit_signals(tsk); /* sets PF_EXITING */
+ /*
+ * tsk->flags are checked in the futex code to protect against
+ * an exiting task cleaning up the robust pi futexes.
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
+ if (unlikely(in_atomic()))
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+
+ acct_update_integrals(tsk);
+ /* sync mm's RSS info before statistics gathering */
+ if (tsk->mm)
+ sync_mm_rss(tsk->mm);
+ group_dead = atomic_dec_and_test(&tsk->signal->live);
+ if (group_dead) {
+ hrtimer_cancel(&tsk->signal->real_timer);
+ exit_itimers(tsk->signal);
+ if (tsk->mm)
+ setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
+ }
+ acct_collect(code, group_dead);
+ if (group_dead)
+ tty_audit_exit();
+ audit_free(tsk);
+
+ tsk->exit_code = code;
+ taskstats_exit(tsk, group_dead);
+
+ exit_mm(tsk);
+
+ if (group_dead)
+ acct_process();
+ trace_sched_process_exit(tsk);
+
+ exit_sem(tsk);
+ exit_shm(tsk);
+ exit_files(tsk);
+ exit_fs(tsk);
+ if (group_dead)
+ disassociate_ctty(1);
+ exit_task_namespaces(tsk);
+ exit_task_work(tsk);
+ exit_thread();
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ *
+ * because of cgroup mode, must be called before cgroup_exit()
+ */
+ perf_event_exit_task(tsk);
+
+ cgroup_exit(tsk);
+
+ /*
+ * FIXME: do that only when needed, using sched_exit tracepoint
+ */
+ flush_ptrace_hw_breakpoint(tsk);
+
+ TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
+ exit_notify(tsk, group_dead);
+ proc_exit_connector(tsk);
+#ifdef CONFIG_NUMA
+ task_lock(tsk);
+ mpol_put(tsk->mempolicy);
+ tsk->mempolicy = NULL;
+ task_unlock(tsk);
+#endif
+#ifdef CONFIG_FUTEX
+ if (unlikely(current->pi_state_cache))
+ kfree(current->pi_state_cache);
+#endif
+ /*
+ * Make sure we are holding no locks:
+ */
+ debug_check_no_locks_held();
+ /*
+ * We can do this unlocked here. The futex code uses this flag
+ * just to verify whether the pi state cleanup has been done
+ * or not. In the worst case it loops once more.
+ */
+ tsk->flags |= PF_EXITPIDONE;
+
+ if (tsk->io_context)
+ exit_io_context(tsk);
+
+ if (tsk->splice_pipe)
+ free_pipe_info(tsk->splice_pipe);
+
+ if (tsk->task_frag.page)
+ put_page(tsk->task_frag.page);
+
+ validate_creds_for_do_exit(tsk);
+
+ check_stack_usage();
+ preempt_disable();
+ if (tsk->nr_dirtied)
+ __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
+ exit_rcu();
+ TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
+ /* causes final put_task_struct in finish_task_switch(). */
+ tsk->state = TASK_DEAD;
+ tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
+ schedule();
+ BUG();
+ /* Avoid "noreturn function does return". */
+ for (;;)
+ cpu_relax(); /* For when BUG is null */
+}
+EXPORT_SYMBOL_GPL(do_exit);
+
+void complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ do_exit(code);
+}
+EXPORT_SYMBOL(complete_and_exit);
+
+SYSCALL_DEFINE1(exit, int, error_code)
+{
+ do_exit((error_code&0xff)<<8);
+}
+
+/*
+ * Take down every thread in the group. This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ */
+void
+do_group_exit(int exit_code)
+{
+ struct signal_struct *sig = current->signal;
+
+ BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+ if (signal_group_exit(sig))
+ exit_code = sig->group_exit_code;
+ else if (!thread_group_empty(current)) {
+ struct sighand_struct *const sighand = current->sighand;
+
+ spin_lock_irq(&sighand->siglock);
+ if (signal_group_exit(sig))
+ /* Another thread got here before we took the lock. */
+ exit_code = sig->group_exit_code;
+ else {
+ sig->group_exit_code = exit_code;
+ sig->flags = SIGNAL_GROUP_EXIT;
+ zap_other_threads(current);
+ }
+ spin_unlock_irq(&sighand->siglock);
+ }
+
+ do_exit(exit_code);
+ /* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+SYSCALL_DEFINE1(exit_group, int, error_code)
+{
+ do_group_exit((error_code & 0xff) << 8);
+ /* NOTREACHED */
+ return 0;
+}
+
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct siginfo __user *wo_info;
+ int __user *wo_stat;
+ struct rusage __user *wo_rusage;
+
+ wait_queue_t child_wait;
+ int notask_error;
+};
+
+static inline
+struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
+{
+ if (type != PIDTYPE_PID)
+ task = task->group_leader;
+ return task->pids[type].pid;
+}
+
+static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+{
+ return wo->wo_type == PIDTYPE_MAX ||
+ task_pid_type(p, wo->wo_type) == wo->wo_pid;
+}
+
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+{
+ if (!eligible_pid(wo, p))
+ return 0;
+ /* Wait for all children (clone and not) if __WALL is set;
+ * otherwise, wait for clone children *only* if __WCLONE is
+ * set; otherwise, wait for non-clone children *only*. (Note:
+ * A "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD.) */
+ if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
+ && !(wo->wo_flags & __WALL))
+ return 0;
+
+ return 1;
+}
+
+static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
+ pid_t pid, uid_t uid, int why, int status)
+{
+ struct siginfo __user *infop;
+ int retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+
+ put_task_struct(p);
+ infop = wo->wo_info;
+ if (infop) {
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
+ if (!retval)
+ retval = pid;
+ return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
+{
+ int state, retval, status;
+ pid_t pid = task_pid_vnr(p);
+ uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
+ struct siginfo __user *infop;
+
+ if (!likely(wo->wo_flags & WEXITED))
+ return 0;
+
+ if (unlikely(wo->wo_flags & WNOWAIT)) {
+ int exit_code = p->exit_code;
+ int why;
+
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ if ((exit_code & 0x7f) == 0) {
+ why = CLD_EXITED;
+ status = exit_code >> 8;
+ } else {
+ why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ status = exit_code & 0x7f;
+ }
+ return wait_noreap_copyout(wo, p, pid, uid, why, status);
+ }
+ /*
+ * Move the task's state to DEAD/TRACE, only one thread can do this.
+ */
+ state = (ptrace_reparented(p) && thread_group_leader(p)) ?
+ EXIT_TRACE : EXIT_DEAD;
+ if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
+ return 0;
+ /*
+ * We own this thread, nobody else can reap it.
+ */
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ /*
+ * Check thread_group_leader() to exclude the traced sub-threads.
+ */
+ if (state == EXIT_DEAD && thread_group_leader(p)) {
+ struct signal_struct *sig = p->signal;
+ struct signal_struct *psig = current->signal;
+ unsigned long maxrss;
+ cputime_t tgutime, tgstime;
+
+ /*
+ * The resource counters for the group leader are in its
+ * own task_struct. Those for dead threads in the group
+ * are in its signal_struct, as are those for the child
+ * processes it has previously reaped. All these
+ * accumulate in the parent's signal_struct c* fields.
+ *
+ * We don't bother to take a lock here to protect these
+ * p->signal fields because the whole thread group is dead
+ * and nobody can change them.
+ *
+ * psig->stats_lock also protects us from our sub-theads
+ * which can reap other children at the same time. Until
+ * we change k_getrusage()-like users to rely on this lock
+ * we have to take ->siglock as well.
+ *
+ * We use thread_group_cputime_adjusted() to get times for
+ * the thread group, which consolidates times for all threads
+ * in the group including the group leader.
+ */
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ spin_lock_irq(&current->sighand->siglock);
+ write_seqlock(&psig->stats_lock);
+ psig->cutime += tgutime + sig->cutime;
+ psig->cstime += tgstime + sig->cstime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
+ psig->cmin_flt +=
+ p->min_flt + sig->min_flt + sig->cmin_flt;
+ psig->cmaj_flt +=
+ p->maj_flt + sig->maj_flt + sig->cmaj_flt;
+ psig->cnvcsw +=
+ p->nvcsw + sig->nvcsw + sig->cnvcsw;
+ psig->cnivcsw +=
+ p->nivcsw + sig->nivcsw + sig->cnivcsw;
+ psig->cinblock +=
+ task_io_get_inblock(p) +
+ sig->inblock + sig->cinblock;
+ psig->coublock +=
+ task_io_get_oublock(p) +
+ sig->oublock + sig->coublock;
+ maxrss = max(sig->maxrss, sig->cmaxrss);
+ if (psig->cmaxrss < maxrss)
+ psig->cmaxrss = maxrss;
+ task_io_accounting_add(&psig->ioac, &p->ioac);
+ task_io_accounting_add(&psig->ioac, &sig->ioac);
+ write_sequnlock(&psig->stats_lock);
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
+ if (!retval && wo->wo_stat)
+ retval = put_user(status, wo->wo_stat);
+
+ infop = wo->wo_info;
+ if (!retval && infop)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval && infop)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval && infop) {
+ int why;
+
+ if ((status & 0x7f) == 0) {
+ why = CLD_EXITED;
+ status >>= 8;
+ } else {
+ why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ status &= 0x7f;
+ }
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
+ if (!retval && infop)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval && infop)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = pid;
+
+ if (state == EXIT_TRACE) {
+ write_lock_irq(&tasklist_lock);
+ /* We dropped tasklist, ptracer could die and untrace */
+ ptrace_unlink(p);
+
+ /* If parent wants a zombie, don't release it now */
+ state = EXIT_ZOMBIE;
+ if (do_notify_parent(p, p->exit_signal))
+ state = EXIT_DEAD;
+ p->exit_state = state;
+ write_unlock_irq(&tasklist_lock);
+ }
+ if (state == EXIT_DEAD)
+ release_task(p);
+
+ return retval;
+}
+
+static int *task_stopped_code(struct task_struct *p, bool ptrace)
+{
+ if (ptrace) {
+ if (task_is_stopped_or_traced(p) &&
+ !(p->jobctl & JOBCTL_LISTENING))
+ return &p->exit_code;
+ } else {
+ if (p->signal->flags & SIGNAL_STOP_STOPPED)
+ return &p->signal->group_exit_code;
+ }
+ return NULL;
+}
+
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero. Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue. Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
+ */
+static int wait_task_stopped(struct wait_opts *wo,
+ int ptrace, struct task_struct *p)
+{
+ struct siginfo __user *infop;
+ int retval, exit_code, *p_code, why;
+ uid_t uid = 0; /* unneeded, required by compiler */
+ pid_t pid;
+
+ /*
+ * Traditionally we see ptrace'd stopped tasks regardless of options.
+ */
+ if (!ptrace && !(wo->wo_flags & WUNTRACED))
+ return 0;
+
+ if (!task_stopped_code(p, ptrace))
+ return 0;
+
+ exit_code = 0;
+ spin_lock_irq(&p->sighand->siglock);
+
+ p_code = task_stopped_code(p, ptrace);
+ if (unlikely(!p_code))
+ goto unlock_sig;
+
+ exit_code = *p_code;
+ if (!exit_code)
+ goto unlock_sig;
+
+ if (!unlikely(wo->wo_flags & WNOWAIT))
+ *p_code = 0;
+
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
+unlock_sig:
+ spin_unlock_irq(&p->sighand->siglock);
+ if (!exit_code)
+ return 0;
+
+ /*
+ * Now we are pretty sure this task is interesting.
+ * Make sure it doesn't get reaped out from under us while we
+ * give up the lock and then examine it below. We don't want to
+ * keep holding onto the tasklist_lock while we call getrusage and
+ * possibly take page faults for user memory.
+ */
+ get_task_struct(p);
+ pid = task_pid_vnr(p);
+ why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ if (unlikely(wo->wo_flags & WNOWAIT))
+ return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
+
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ if (!retval && wo->wo_stat)
+ retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
+
+ infop = wo->wo_info;
+ if (!retval && infop)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval && infop)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval && infop)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval && infop)
+ retval = put_user(exit_code, &infop->si_status);
+ if (!retval && infop)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval && infop)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = pid;
+ put_task_struct(p);
+
+ BUG_ON(!retval);
+ return retval;
+}
+
+/*
+ * Handle do_wait work for one task in a live, non-stopped state.
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
+{
+ int retval;
+ pid_t pid;
+ uid_t uid;
+
+ if (!unlikely(wo->wo_flags & WCONTINUED))
+ return 0;
+
+ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
+ return 0;
+
+ spin_lock_irq(&p->sighand->siglock);
+ /* Re-check with the lock held. */
+ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
+ spin_unlock_irq(&p->sighand->siglock);
+ return 0;
+ }
+ if (!unlikely(wo->wo_flags & WNOWAIT))
+ p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+ uid = from_kuid_munged(current_user_ns(), task_uid(p));
+ spin_unlock_irq(&p->sighand->siglock);
+
+ pid = task_pid_vnr(p);
+ get_task_struct(p);
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ if (!wo->wo_info) {
+ retval = wo->wo_rusage
+ ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ put_task_struct(p);
+ if (!retval && wo->wo_stat)
+ retval = put_user(0xffff, wo->wo_stat);
+ if (!retval)
+ retval = pid;
+ } else {
+ retval = wait_noreap_copyout(wo, p, pid, uid,
+ CLD_CONTINUED, SIGCONT);
+ BUG_ON(retval == 0);
+ }
+
+ return retval;
+}
+
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then ->notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+ struct task_struct *p)
+{
+ /*
+ * We can race with wait_task_zombie() from another thread.
+ * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
+ * can't confuse the checks below.
+ */
+ int exit_state = ACCESS_ONCE(p->exit_state);
+ int ret;
+
+ if (unlikely(exit_state == EXIT_DEAD))
+ return 0;
+
+ ret = eligible_child(wo, p);
+ if (!ret)
+ return ret;
+
+ ret = security_task_wait(p);
+ if (unlikely(ret < 0)) {
+ /*
+ * If we have not yet seen any eligible child,
+ * then let this error code replace -ECHILD.
+ * A permission error will give the user a clue
+ * to look for security policy problems, rather
+ * than for mysterious wait bugs.
+ */
+ if (wo->notask_error)
+ wo->notask_error = ret;
+ return 0;
+ }
+
+ if (unlikely(exit_state == EXIT_TRACE)) {
+ /*
+ * ptrace == 0 means we are the natural parent. In this case
+ * we should clear notask_error, debugger will notify us.
+ */
+ if (likely(!ptrace))
+ wo->notask_error = 0;
+ return 0;
+ }
+
+ if (likely(!ptrace) && unlikely(p->ptrace)) {
+ /*
+ * If it is traced by its real parent's group, just pretend
+ * the caller is ptrace_do_wait() and reap this child if it
+ * is zombie.
+ *
+ * This also hides group stop state from real parent; otherwise
+ * a single stop can be reported twice as group and ptrace stop.
+ * If a ptracer wants to distinguish these two events for its
+ * own children it should create a separate process which takes
+ * the role of real parent.
+ */
+ if (!ptrace_reparented(p))
+ ptrace = 1;
+ }
+
+ /* slay zombie? */
+ if (exit_state == EXIT_ZOMBIE) {
+ /* we don't reap group leaders with subthreads */
+ if (!delay_group_leader(p)) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the
+ * real parent when the ptracer detaches.
+ */
+ if (unlikely(ptrace) || likely(!p->ptrace))
+ return wait_task_zombie(wo, p);
+ }
+
+ /*
+ * Allow access to stopped/continued state via zombie by
+ * falling through. Clearing of notask_error is complex.
+ *
+ * When !@ptrace:
+ *
+ * If WEXITED is set, notask_error should naturally be
+ * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
+ * so, if there are live subthreads, there are events to
+ * wait for. If all subthreads are dead, it's still safe
+ * to clear - this function will be called again in finite
+ * amount time once all the subthreads are released and
+ * will then return without clearing.
+ *
+ * When @ptrace:
+ *
+ * Stopped state is per-task and thus can't change once the
+ * target task dies. Only continued and exited can happen.
+ * Clear notask_error if WCONTINUED | WEXITED.
+ */
+ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+ wo->notask_error = 0;
+ } else {
+ /*
+ * @p is alive and it's gonna stop, continue or exit, so
+ * there always is something to wait for.
+ */
+ wo->notask_error = 0;
+ }
+
+ /*
+ * Wait for stopped. Depending on @ptrace, different stopped state
+ * is used and the two don't interact with each other.
+ */
+ ret = wait_task_stopped(wo, ptrace, p);
+ if (ret)
+ return ret;
+
+ /*
+ * Wait for continued. There's only one continued state and the
+ * ptracer can consume it which can confuse the real parent. Don't
+ * use WCONTINUED from ptracer. You don't need or want it.
+ */
+ return wait_task_continued(wo, p);
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * ->notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
+{
+ struct task_struct *p;
+
+ list_for_each_entry(p, &tsk->children, sibling) {
+ int ret = wait_consider_task(wo, 0, p);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
+{
+ struct task_struct *p;
+
+ list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+ int ret = wait_consider_task(wo, 1, p);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int child_wait_callback(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct wait_opts *wo = container_of(wait, struct wait_opts,
+ child_wait);
+ struct task_struct *p = key;
+
+ if (!eligible_pid(wo, p))
+ return 0;
+
+ if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
+ return 0;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+
+void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
+{
+ __wake_up_sync_key(&parent->signal->wait_chldexit,
+ TASK_INTERRUPTIBLE, 1, p);
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+ struct task_struct *tsk;
+ int retval;
+
+ trace_sched_process_wait(wo->wo_pid);
+
+ init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ wo->child_wait.private = current;
+ add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+repeat:
+ /*
+ * If there is nothing that can match our critiera just get out.
+ * We will clear ->notask_error to zero if we see any child that
+ * might later match our criteria, even if we are not able to reap
+ * it yet.
+ */
+ wo->notask_error = -ECHILD;
+ if ((wo->wo_type < PIDTYPE_MAX) &&
+ (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+ goto notask;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ read_lock(&tasklist_lock);
+ tsk = current;
+ do {
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
+ goto end;
+
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ goto end;
+
+ if (wo->wo_flags & __WNOTHREAD)
+ break;
+ } while_each_thread(current, tsk);
+ read_unlock(&tasklist_lock);
+
+notask:
+ retval = wo->notask_error;
+ if (!retval && !(wo->wo_flags & WNOHANG)) {
+ retval = -ERESTARTSYS;
+ if (!signal_pending(current)) {
+ schedule();
+ goto repeat;
+ }
+ }
+end:
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+ return retval;
+}
+
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
+{
+ struct wait_opts wo;
+ struct pid *pid = NULL;
+ enum pid_type type;
+ long ret;
+
+ if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+ return -EINVAL;
+ if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+ return -EINVAL;
+
+ switch (which) {
+ case P_ALL:
+ type = PIDTYPE_MAX;
+ break;
+ case P_PID:
+ type = PIDTYPE_PID;
+ if (upid <= 0)
+ return -EINVAL;
+ break;
+ case P_PGID:
+ type = PIDTYPE_PGID;
+ if (upid <= 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (type < PIDTYPE_MAX)
+ pid = find_get_pid(upid);
+
+ wo.wo_type = type;
+ wo.wo_pid = pid;
+ wo.wo_flags = options;
+ wo.wo_info = infop;
+ wo.wo_stat = NULL;
+ wo.wo_rusage = ru;
+ ret = do_wait(&wo);
+
+ if (ret > 0) {
+ ret = 0;
+ } else if (infop) {
+ /*
+ * For a WNOHANG return, clear out all the fields
+ * we would set so the user can easily tell the
+ * difference.
+ */
+ if (!ret)
+ ret = put_user(0, &infop->si_signo);
+ if (!ret)
+ ret = put_user(0, &infop->si_errno);
+ if (!ret)
+ ret = put_user(0, &infop->si_code);
+ if (!ret)
+ ret = put_user(0, &infop->si_pid);
+ if (!ret)
+ ret = put_user(0, &infop->si_uid);
+ if (!ret)
+ ret = put_user(0, &infop->si_status);
+ }
+
+ put_pid(pid);
+ return ret;
+}
+
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+ int, options, struct rusage __user *, ru)
+{
+ struct wait_opts wo;
+ struct pid *pid = NULL;
+ enum pid_type type;
+ long ret;
+
+ if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
+ return -EINVAL;
+
+ if (upid == -1)
+ type = PIDTYPE_MAX;
+ else if (upid < 0) {
+ type = PIDTYPE_PGID;
+ pid = find_get_pid(-upid);
+ } else if (upid == 0) {
+ type = PIDTYPE_PGID;
+ pid = get_task_pid(current, PIDTYPE_PGID);
+ } else /* upid > 0 */ {
+ type = PIDTYPE_PID;
+ pid = find_get_pid(upid);
+ }
+
+ wo.wo_type = type;
+ wo.wo_pid = pid;
+ wo.wo_flags = options | WEXITED;
+ wo.wo_info = NULL;
+ wo.wo_stat = stat_addr;
+ wo.wo_rusage = ru;
+ ret = do_wait(&wo);
+ put_pid(pid);
+
+ return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
+{
+ return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
new file mode 100644
index 000000000..c98f92627
--- /dev/null
+++ b/kernel/extable.c
@@ -0,0 +1,144 @@
+/* Rewritten by Rusty Russell, on the backs of many others...
+ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/ftrace.h>
+#include <linux/memory.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+
+/*
+ * mutex protecting text section modification (dynamic code patching).
+ * some users need to sleep (allocating memory...) while they hold this lock.
+ *
+ * NOT exported to modules - patching kernel text is a really delicate matter.
+ */
+DEFINE_MUTEX(text_mutex);
+
+extern struct exception_table_entry __start___ex_table[];
+extern struct exception_table_entry __stop___ex_table[];
+
+/* Cleared by build time tools if the table is already sorted. */
+u32 __initdata __visible main_extable_sort_needed = 1;
+
+/* Sort the kernel's built-in exception table */
+void __init sort_main_extable(void)
+{
+ if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+ pr_notice("Sorting __ex_table...\n");
+ sort_extable(__start___ex_table, __stop___ex_table);
+ }
+}
+
+/* Given an address, look for it in the exception tables. */
+const struct exception_table_entry *search_exception_tables(unsigned long addr)
+{
+ const struct exception_table_entry *e;
+
+ e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+ if (!e)
+ e = search_module_extables(addr);
+ return e;
+}
+
+static inline int init_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sinittext &&
+ addr < (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
+int core_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext &&
+ addr < (unsigned long)_etext)
+ return 1;
+
+ if (system_state == SYSTEM_BOOTING &&
+ init_kernel_text(addr))
+ return 1;
+ return 0;
+}
+
+/**
+ * core_kernel_data - tell if addr points to kernel data
+ * @addr: address to test
+ *
+ * Returns true if @addr passed in is from the core kernel data
+ * section.
+ *
+ * Note: On some archs it may return true for core RODATA, and false
+ * for others. But will always be true for core RW data.
+ */
+int core_kernel_data(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sdata &&
+ addr < (unsigned long)_edata)
+ return 1;
+ return 0;
+}
+
+int __kernel_text_address(unsigned long addr)
+{
+ if (core_kernel_text(addr))
+ return 1;
+ if (is_module_text_address(addr))
+ return 1;
+ if (is_ftrace_trampoline(addr))
+ return 1;
+ /*
+ * There might be init symbols in saved stacktraces.
+ * Give those symbols a chance to be printed in
+ * backtraces (such as lockdep traces).
+ *
+ * Since we are after the module-symbols check, there's
+ * no danger of address overlap:
+ */
+ if (init_kernel_text(addr))
+ return 1;
+ return 0;
+}
+
+int kernel_text_address(unsigned long addr)
+{
+ if (core_kernel_text(addr))
+ return 1;
+ if (is_module_text_address(addr))
+ return 1;
+ return is_ftrace_trampoline(addr);
+}
+
+/*
+ * On some architectures (PPC64, IA64) function pointers
+ * are actually only tokens to some data that then holds the
+ * real function address. As a result, to find if a function
+ * pointer is part of the kernel text, we need to do some
+ * special dereferencing first.
+ */
+int func_ptr_is_kernel_text(void *ptr)
+{
+ unsigned long addr;
+ addr = (unsigned long) dereference_function_descriptor(ptr);
+ if (core_kernel_text(addr))
+ return 1;
+ return is_module_text_address(addr);
+}
diff --git a/kernel/fork.c b/kernel/fork.c
new file mode 100644
index 000000000..e37f372d3
--- /dev/null
+++ b/kernel/fork.c
@@ -0,0 +1,2075 @@
+/*
+ * linux/kernel/fork.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * 'fork.c' contains the help-routines for the 'fork' system call
+ * (see also entry.S and others).
+ * Fork is rather simple, once you get the hang of it, but the memory
+ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
+ */
+
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/completion.h>
+#include <linux/personality.h>
+#include <linux/mempolicy.h>
+#include <linux/sem.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/iocontext.h>
+#include <linux/key.h>
+#include <linux/binfmts.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+#include <linux/nsproxy.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/cgroup.h>
+#include <linux/security.h>
+#include <linux/hugetlb.h>
+#include <linux/seccomp.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/jiffies.h>
+#include <linux/futex.h>
+#include <linux/compat.h>
+#include <linux/kthread.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/rcupdate.h>
+#include <linux/ptrace.h>
+#include <linux/mount.h>
+#include <linux/audit.h>
+#include <linux/memcontrol.h>
+#include <linux/ftrace.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/rmap.h>
+#include <linux/ksm.h>
+#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/freezer.h>
+#include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
+#include <linux/random.h>
+#include <linux/tty.h>
+#include <linux/blkdev.h>
+#include <linux/fs_struct.h>
+#include <linux/magic.h>
+#include <linux/perf_event.h>
+#include <linux/posix-timers.h>
+#include <linux/user-return-notifier.h>
+#include <linux/oom.h>
+#include <linux/khugepaged.h>
+#include <linux/signalfd.h>
+#include <linux/uprobes.h>
+#include <linux/aio.h>
+#include <linux/compiler.h>
+#include <linux/sysctl.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include <trace/events/sched.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
+/*
+ * Minimum number of threads to boot the kernel
+ */
+#define MIN_THREADS 20
+
+/*
+ * Maximum number of threads
+ */
+#define MAX_THREADS FUTEX_TID_MASK
+
+/*
+ * Protected counters by write_lock_irq(&tasklist_lock)
+ */
+unsigned long total_forks; /* Handle normal Linux uptimes. */
+int nr_threads; /* The idle threads do not count.. */
+
+int max_threads; /* tunable limit on nr_threads */
+
+DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+
+#ifdef CONFIG_PROVE_RCU
+int lockdep_tasklist_lock_is_held(void)
+{
+ return lockdep_is_held(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+int nr_processes(void)
+{
+ int cpu;
+ int total = 0;
+
+ for_each_possible_cpu(cpu)
+ total += per_cpu(process_counts, cpu);
+
+ return total;
+}
+
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
+
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
+static struct kmem_cache *task_struct_cachep;
+
+static inline struct task_struct *alloc_task_struct_node(int node)
+{
+ return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node);
+}
+
+static inline void free_task_struct(struct task_struct *tsk)
+{
+ kmem_cache_free(task_struct_cachep, tsk);
+}
+#endif
+
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
+
+#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+# if THREAD_SIZE >= PAGE_SIZE
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
+
+ return page ? page_address(page) : NULL;
+}
+
+static inline void free_thread_info(struct thread_info *ti)
+{
+ free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+# else
+static struct kmem_cache *thread_info_cache;
+
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
+{
+ return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+}
+
+static void free_thread_info(struct thread_info *ti)
+{
+ kmem_cache_free(thread_info_cache, ti);
+}
+
+void thread_info_cache_init(void)
+{
+ thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ THREAD_SIZE, 0, NULL);
+ BUG_ON(thread_info_cache == NULL);
+}
+# endif
+#endif
+
+/* SLAB cache for signal_struct structures (tsk->signal) */
+static struct kmem_cache *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+struct kmem_cache *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+struct kmem_cache *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+struct kmem_cache *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+struct kmem_cache *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static struct kmem_cache *mm_cachep;
+
+static void account_kernel_stack(struct thread_info *ti, int account)
+{
+ struct zone *zone = page_zone(virt_to_page(ti));
+
+ mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+}
+
+void free_task(struct task_struct *tsk)
+{
+ account_kernel_stack(tsk->stack, -1);
+ arch_release_thread_info(tsk->stack);
+ free_thread_info(tsk->stack);
+ rt_mutex_debug_task_free(tsk);
+ ftrace_graph_exit_task(tsk);
+ put_seccomp_filter(tsk);
+ arch_release_task_struct(tsk);
+ free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+ taskstats_tgid_free(sig);
+ sched_autogroup_exit(sig);
+ kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+ if (atomic_dec_and_test(&sig->sigcnt))
+ free_signal_struct(sig);
+}
+
+void __put_task_struct(struct task_struct *tsk)
+{
+ WARN_ON(!tsk->exit_state);
+ WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(tsk == current);
+
+ task_numa_free(tsk);
+ security_task_free(tsk);
+ exit_creds(tsk);
+ delayacct_tsk_free(tsk);
+ put_signal_struct(tsk->signal);
+
+ if (!profile_handoff_task(tsk))
+ free_task(tsk);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct);
+
+void __init __weak arch_task_cache_init(void) { }
+
+/*
+ * set_max_threads
+ */
+static void set_max_threads(unsigned int max_threads_suggested)
+{
+ u64 threads;
+
+ /*
+ * The number of threads shall be limited such that the thread
+ * structures may only consume a small part of the available memory.
+ */
+ if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+ threads = MAX_THREADS;
+ else
+ threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ (u64) THREAD_SIZE * 8UL);
+
+ if (threads > max_threads_suggested)
+ threads = max_threads_suggested;
+
+ max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
+}
+
+void __init fork_init(void)
+{
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
+#endif
+ /* create a slab on which task_structs can be allocated */
+ task_struct_cachep =
+ kmem_cache_create("task_struct", sizeof(struct task_struct),
+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+#endif
+
+ /* do the arch specific task caches init */
+ arch_task_cache_init();
+
+ set_max_threads(MAX_THREADS);
+
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+ init_task.signal->rlim[RLIMIT_SIGPENDING] =
+ init_task.signal->rlim[RLIMIT_NPROC];
+}
+
+int __weak arch_dup_task_struct(struct task_struct *dst,
+ struct task_struct *src)
+{
+ *dst = *src;
+ return 0;
+}
+
+void set_task_stack_end_magic(struct task_struct *tsk)
+{
+ unsigned long *stackend;
+
+ stackend = end_of_stack(tsk);
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
+}
+
+static struct task_struct *dup_task_struct(struct task_struct *orig)
+{
+ struct task_struct *tsk;
+ struct thread_info *ti;
+ int node = tsk_fork_get_node(orig);
+ int err;
+
+ tsk = alloc_task_struct_node(node);
+ if (!tsk)
+ return NULL;
+
+ ti = alloc_thread_info_node(tsk, node);
+ if (!ti)
+ goto free_tsk;
+
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
+ goto free_ti;
+
+ tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the sighand lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
+
+ setup_thread_stack(tsk, orig);
+ clear_user_return_notifier(tsk);
+ clear_tsk_need_resched(tsk);
+ set_task_stack_end_magic(tsk);
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ tsk->stack_canary = get_random_int();
+#endif
+
+ /*
+ * One for us, one for whoever does the "release_task()" (usually
+ * parent)
+ */
+ atomic_set(&tsk->usage, 2);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ tsk->btrace_seq = 0;
+#endif
+ tsk->splice_pipe = NULL;
+ tsk->task_frag.page = NULL;
+
+ account_kernel_stack(ti, 1);
+
+ return tsk;
+
+free_ti:
+ free_thread_info(ti);
+free_tsk:
+ free_task_struct(tsk);
+ return NULL;
+}
+
+#ifdef CONFIG_MMU
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+ struct rb_node **rb_link, *rb_parent;
+ int retval;
+ unsigned long charge;
+
+ uprobe_start_dup_mmap();
+ down_write(&oldmm->mmap_sem);
+ flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+ /* No ordering required: file already has been exposed. */
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+ mm->total_vm = oldmm->total_vm;
+ mm->shared_vm = oldmm->shared_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
+ rb_link = &mm->mm_rb.rb_node;
+ rb_parent = NULL;
+ pprev = &mm->mmap;
+ retval = ksm_fork(mm, oldmm);
+ if (retval)
+ goto out;
+ retval = khugepaged_fork(mm, oldmm);
+ if (retval)
+ goto out;
+
+ prev = NULL;
+ for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+ struct file *file;
+
+ if (mpnt->vm_flags & VM_DONTCOPY) {
+ vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+ -vma_pages(mpnt));
+ continue;
+ }
+ charge = 0;
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned long len = vma_pages(mpnt);
+
+ if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+ goto fail_nomem;
+ charge = len;
+ }
+ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ if (!tmp)
+ goto fail_nomem;
+ *tmp = *mpnt;
+ INIT_LIST_HEAD(&tmp->anon_vma_chain);
+ retval = vma_dup_policy(mpnt, tmp);
+ if (retval)
+ goto fail_nomem_policy;
+ tmp->vm_mm = mm;
+ if (anon_vma_fork(tmp, mpnt))
+ goto fail_nomem_anon_vma_fork;
+ tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_next = tmp->vm_prev = NULL;
+ file = tmp->vm_file;
+ if (file) {
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
+
+ vma_get_file(tmp);
+ if (tmp->vm_flags & VM_DENYWRITE)
+ atomic_dec(&inode->i_writecount);
+ i_mmap_lock_write(mapping);
+ if (tmp->vm_flags & VM_SHARED)
+ atomic_inc(&mapping->i_mmap_writable);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
+ vma_interval_tree_insert_after(tmp, mpnt,
+ &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+ i_mmap_unlock_write(mapping);
+ }
+
+ /*
+ * Clear hugetlb-related page reserves for children. This only
+ * affects MAP_PRIVATE mappings. Faults generated by the child
+ * are not guaranteed to succeed, even if read-only
+ */
+ if (is_vm_hugetlb_page(tmp))
+ reset_vma_resv_huge_pages(tmp);
+
+ /*
+ * Link in the new vma and copy the page table entries.
+ */
+ *pprev = tmp;
+ pprev = &tmp->vm_next;
+ tmp->vm_prev = prev;
+ prev = tmp;
+
+ __vma_link_rb(mm, tmp, rb_link, rb_parent);
+ rb_link = &tmp->vm_rb.rb_right;
+ rb_parent = &tmp->vm_rb;
+ uksm_vma_add_new(tmp);
+ mm->map_count++;
+ retval = copy_page_range(mm, oldmm, mpnt);
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+
+ if (retval)
+ goto out;
+ }
+ /* a new mm has just been created */
+ arch_dup_mmap(oldmm, mm);
+ retval = 0;
+out:
+ up_write(&mm->mmap_sem);
+ flush_tlb_mm(oldmm);
+ up_write(&oldmm->mmap_sem);
+ uprobe_end_dup_mmap();
+ return retval;
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+ kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+ mm->pgd = pgd_alloc(mm);
+ if (unlikely(!mm->pgd))
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+ pgd_free(mm, mm->pgd);
+}
+#else
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ down_write(&oldmm->mmap_sem);
+ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+ up_write(&oldmm->mmap_sem);
+ return 0;
+}
+#define mm_alloc_pgd(mm) (0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+
+#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+
+static int __init coredump_filter_setup(char *s)
+{
+ default_dump_filter =
+ (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
+ MMF_DUMP_FILTER_MASK;
+ return 1;
+}
+
+__setup("coredump_filter=", coredump_filter_setup);
+
+#include <linux/init_task.h>
+
+static void mm_init_aio(struct mm_struct *mm)
+{
+#ifdef CONFIG_AIO
+ spin_lock_init(&mm->ioctx_lock);
+ mm->ioctx_table = NULL;
+#endif
+}
+
+static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ mm->owner = p;
+#endif
+}
+
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+{
+ mm->mmap = NULL;
+ mm->mm_rb = RB_ROOT;
+ mm->vmacache_seqnum = 0;
+ atomic_set(&mm->mm_users, 1);
+ atomic_set(&mm->mm_count, 1);
+ init_rwsem(&mm->mmap_sem);
+ INIT_LIST_HEAD(&mm->mmlist);
+ mm->core_state = NULL;
+ atomic_long_set(&mm->nr_ptes, 0);
+ mm_nr_pmds_init(mm);
+ mm->map_count = 0;
+ mm->locked_vm = 0;
+ mm->pinned_vm = 0;
+ memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
+ spin_lock_init(&mm->page_table_lock);
+ mm_init_cpumask(mm);
+ mm_init_aio(mm);
+ mm_init_owner(mm, p);
+ mmu_notifier_mm_init(mm);
+ clear_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+#endif
+
+ if (current->mm) {
+ mm->flags = current->mm->flags & MMF_INIT_MASK;
+ mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+ } else {
+ mm->flags = default_dump_filter;
+ mm->def_flags = 0;
+ }
+
+ if (mm_alloc_pgd(mm))
+ goto fail_nopgd;
+
+ if (init_new_context(p, mm))
+ goto fail_nocontext;
+
+ return mm;
+
+fail_nocontext:
+ mm_free_pgd(mm);
+fail_nopgd:
+ free_mm(mm);
+ return NULL;
+}
+
+static void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ printk(KERN_ALERT "BUG: Bad rss-counter state "
+ "mm:%p idx:%d val:%ld\n", mm, i, x);
+ }
+
+ if (atomic_long_read(&mm->nr_ptes))
+ pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
+ atomic_long_read(&mm->nr_ptes));
+ if (mm_nr_pmds(mm))
+ pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
+ mm_nr_pmds(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+/*
+ * Allocate and initialize an mm_struct.
+ */
+struct mm_struct *mm_alloc(void)
+{
+ struct mm_struct *mm;
+
+ mm = allocate_mm();
+ if (!mm)
+ return NULL;
+
+ memset(mm, 0, sizeof(*mm));
+ return mm_init(mm, current);
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void __mmdrop(struct mm_struct *mm)
+{
+ BUG_ON(mm == &init_mm);
+ mm_free_pgd(mm);
+ destroy_context(mm);
+ mmu_notifier_mm_destroy(mm);
+ check_mm(mm);
+ free_mm(mm);
+}
+EXPORT_SYMBOL_GPL(__mmdrop);
+
+/*
+ * Decrement the use count and release all resources for an mm.
+ */
+void mmput(struct mm_struct *mm)
+{
+ might_sleep();
+
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ uprobe_clear_state(mm);
+ exit_aio(mm);
+ ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
+ exit_mmap(mm);
+ set_mm_exe_file(mm, NULL);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+ mmdrop(mm);
+ }
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+/**
+ * set_mm_exe_file - change a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
+ *
+ * Main users are mmput() and sys_execve(). Callers prevent concurrent
+ * invocations: in mmput() nobody alive left, in execve task is single
+ * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
+ * mm->exe_file, but does so without using set_mm_exe_file() in order
+ * to do avoid the need for any locks.
+ */
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ struct file *old_exe_file;
+
+ /*
+ * It is safe to dereference the exe_file without RCU as
+ * this function is only called if nobody else can access
+ * this mm -- see comment above for justification.
+ */
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+
+ if (new_exe_file)
+ get_file(new_exe_file);
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ if (old_exe_file)
+ fput(old_exe_file);
+}
+
+/**
+ * get_mm_exe_file - acquire a reference to the mm's executable file
+ *
+ * Returns %NULL if mm has no associated executable file.
+ * User must release file via fput().
+ */
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+ struct file *exe_file;
+
+ rcu_read_lock();
+ exe_file = rcu_dereference(mm->exe_file);
+ if (exe_file && !get_file_rcu(exe_file))
+ exe_file = NULL;
+ rcu_read_unlock();
+ return exe_file;
+}
+EXPORT_SYMBOL(get_mm_exe_file);
+
+/**
+ * get_task_mm - acquire a reference to the task's mm
+ *
+ * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
+ * this kernel workthread has transiently adopted a user mm with use_mm,
+ * to do its AIO) is not set and if so returns a reference to it, after
+ * bumping up the use count. User must release the mm via mmput()
+ * after use. Typically used by /proc and ptrace.
+ */
+struct mm_struct *get_task_mm(struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm) {
+ if (task->flags & PF_KTHREAD)
+ mm = NULL;
+ else
+ atomic_inc(&mm->mm_users);
+ }
+ task_unlock(task);
+ return mm;
+}
+EXPORT_SYMBOL_GPL(get_task_mm);
+
+struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
+{
+ struct mm_struct *mm;
+ int err;
+
+ err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (err)
+ return ERR_PTR(err);
+
+ mm = get_task_mm(task);
+ if (mm && mm != current->mm &&
+ !ptrace_may_access(task, mode)) {
+ mmput(mm);
+ mm = ERR_PTR(-EACCES);
+ }
+ mutex_unlock(&task->signal->cred_guard_mutex);
+
+ return mm;
+}
+
+static void complete_vfork_done(struct task_struct *tsk)
+{
+ struct completion *vfork;
+
+ task_lock(tsk);
+ vfork = tsk->vfork_done;
+ if (likely(vfork)) {
+ tsk->vfork_done = NULL;
+ complete(vfork);
+ }
+ task_unlock(tsk);
+}
+
+static int wait_for_vfork_done(struct task_struct *child,
+ struct completion *vfork)
+{
+ int killed;
+
+ freezer_do_not_count();
+ killed = wait_for_completion_killable(vfork);
+ freezer_count();
+
+ if (killed) {
+ task_lock(child);
+ child->vfork_done = NULL;
+ task_unlock(child);
+ }
+
+ put_task_struct(child);
+ return killed;
+}
+
+/* Please note the differences between mmput and mm_release.
+ * mmput is called whenever we stop holding onto a mm_struct,
+ * error success whatever.
+ *
+ * mm_release is called after a mm_struct has been removed
+ * from the current process.
+ *
+ * This difference is important for error handling, when we
+ * only half set up a mm_struct for a new process and need to restore
+ * the old one. Because we mmput the new mm_struct before
+ * restoring the old one. . .
+ * Eric Biederman 10 January 1998
+ */
+void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ /* Get rid of any futexes when releasing the mm */
+#ifdef CONFIG_FUTEX
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+#endif
+
+ uprobe_free_utask(tsk);
+
+ /* Get rid of any cached register state */
+ deactivate_mm(tsk, mm);
+
+ /*
+ * If we're exiting normally, clear a user-space tid field if
+ * requested. We leave this alone when dying by signal, to leave
+ * the value intact in a core dump, and to save the unnecessary
+ * trouble, say, a killed vfork parent shouldn't touch this mm.
+ * Userland only wants this done for a sys_exit.
+ */
+ if (tsk->clear_child_tid) {
+ if (!(tsk->flags & PF_SIGNALED) &&
+ atomic_read(&mm->mm_users) > 1) {
+ /*
+ * We don't check the error code - if userspace has
+ * not set up a proper pointer then tough luck.
+ */
+ put_user(0, tsk->clear_child_tid);
+ sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+ 1, NULL, NULL, 0);
+ }
+ tsk->clear_child_tid = NULL;
+ }
+
+ /*
+ * All done, finally we can wake up parent and return this mm to him.
+ * Also kthread_stop() uses this completion for synchronization.
+ */
+ if (tsk->vfork_done)
+ complete_vfork_done(tsk);
+}
+
+/*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
+ */
+static struct mm_struct *dup_mm(struct task_struct *tsk)
+{
+ struct mm_struct *mm, *oldmm = current->mm;
+ int err;
+
+ mm = allocate_mm();
+ if (!mm)
+ goto fail_nomem;
+
+ memcpy(mm, oldmm, sizeof(*mm));
+
+ if (!mm_init(mm, tsk))
+ goto fail_nomem;
+
+ err = dup_mmap(mm, oldmm);
+ if (err)
+ goto free_pt;
+
+ mm->hiwater_rss = get_mm_rss(mm);
+ mm->hiwater_vm = mm->total_vm;
+
+ if (mm->binfmt && !try_module_get(mm->binfmt->module))
+ goto free_pt;
+
+ return mm;
+
+free_pt:
+ /* don't put binfmt in mmput, we haven't got module yet */
+ mm->binfmt = NULL;
+ mmput(mm);
+
+fail_nomem:
+ return NULL;
+}
+
+static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct mm_struct *mm, *oldmm;
+ int retval;
+
+ tsk->min_flt = tsk->maj_flt = 0;
+ tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+ tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
+
+ tsk->mm = NULL;
+ tsk->active_mm = NULL;
+
+ /*
+ * Are we cloning a kernel thread?
+ *
+ * We need to steal a active VM for that..
+ */
+ oldmm = current->mm;
+ if (!oldmm)
+ return 0;
+
+ /* initialize the new vmacache entries */
+ vmacache_flush(tsk);
+
+ if (clone_flags & CLONE_VM) {
+ atomic_inc(&oldmm->mm_users);
+ mm = oldmm;
+ goto good_mm;
+ }
+
+ retval = -ENOMEM;
+ mm = dup_mm(tsk);
+ if (!mm)
+ goto fail_nomem;
+
+good_mm:
+ tsk->mm = mm;
+ tsk->active_mm = mm;
+ return 0;
+
+fail_nomem:
+ return retval;
+}
+
+static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct fs_struct *fs = current->fs;
+ if (clone_flags & CLONE_FS) {
+ /* tsk->fs is already what we want */
+ spin_lock(&fs->lock);
+ if (fs->in_exec) {
+ spin_unlock(&fs->lock);
+ return -EAGAIN;
+ }
+ fs->users++;
+ spin_unlock(&fs->lock);
+ return 0;
+ }
+ tsk->fs = copy_fs_struct(fs);
+ if (!tsk->fs)
+ return -ENOMEM;
+ return 0;
+}
+
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct files_struct *oldf, *newf;
+ int error = 0;
+
+ /*
+ * A background process may not have any files ...
+ */
+ oldf = current->files;
+ if (!oldf)
+ goto out;
+
+ if (clone_flags & CLONE_FILES) {
+ atomic_inc(&oldf->count);
+ goto out;
+ }
+
+ newf = dup_fd(oldf, &error);
+ if (!newf)
+ goto out;
+
+ tsk->files = newf;
+ error = 0;
+out:
+ return error;
+}
+
+static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
+{
+#ifdef CONFIG_BLOCK
+ struct io_context *ioc = current->io_context;
+ struct io_context *new_ioc;
+
+ if (!ioc)
+ return 0;
+ /*
+ * Share io context with parent, if CLONE_IO is set
+ */
+ if (clone_flags & CLONE_IO) {
+ ioc_task_link(ioc);
+ tsk->io_context = ioc;
+ } else if (ioprio_valid(ioc->ioprio)) {
+ new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!new_ioc))
+ return -ENOMEM;
+
+ new_ioc->ioprio = ioc->ioprio;
+ put_io_context(new_ioc);
+ }
+#endif
+ return 0;
+}
+
+static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct sighand_struct *sig;
+
+ if (clone_flags & CLONE_SIGHAND) {
+ atomic_inc(&current->sighand->count);
+ return 0;
+ }
+ sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+ rcu_assign_pointer(tsk->sighand, sig);
+ if (!sig)
+ return -ENOMEM;
+ atomic_set(&sig->count, 1);
+ memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+ return 0;
+}
+
+void __cleanup_sighand(struct sighand_struct *sighand)
+{
+ if (atomic_dec_and_test(&sighand->count)) {
+ signalfd_cleanup(sighand);
+ /*
+ * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * without an RCU grace period, see __lock_task_sighand().
+ */
+ kmem_cache_free(sighand_cachep, sighand);
+ }
+}
+
+/*
+ * Initialize POSIX timer handling for a thread group.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+ unsigned long cpu_limit;
+
+ /* Thread group counters. */
+ thread_group_cputime_init(sig);
+
+ cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ if (cpu_limit != RLIM_INFINITY) {
+ sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
+ sig->cputimer.running = 1;
+ }
+
+ /* The timer lists. */
+ INIT_LIST_HEAD(&sig->cpu_timers[0]);
+ INIT_LIST_HEAD(&sig->cpu_timers[1]);
+ INIT_LIST_HEAD(&sig->cpu_timers[2]);
+}
+
+static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct signal_struct *sig;
+
+ if (clone_flags & CLONE_THREAD)
+ return 0;
+
+ sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
+ tsk->signal = sig;
+ if (!sig)
+ return -ENOMEM;
+
+ sig->nr_threads = 1;
+ atomic_set(&sig->live, 1);
+ atomic_set(&sig->sigcnt, 1);
+
+ /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
+ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
+ tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
+
+ init_waitqueue_head(&sig->wait_chldexit);
+ sig->curr_target = tsk;
+ init_sigpending(&sig->shared_pending);
+ INIT_LIST_HEAD(&sig->posix_timers);
+ seqlock_init(&sig->stats_lock);
+
+ hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ sig->real_timer.function = it_real_fn;
+
+ task_lock(current->group_leader);
+ memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+ task_unlock(current->group_leader);
+
+ posix_cpu_timers_init_group(sig);
+
+ tty_audit_fork(sig);
+ sched_autogroup_fork(sig);
+
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->group_rwsem);
+#endif
+
+ sig->oom_score_adj = current->signal->oom_score_adj;
+ sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+
+ sig->has_child_subreaper = current->signal->has_child_subreaper ||
+ current->signal->is_child_subreaper;
+
+ mutex_init(&sig->cred_guard_mutex);
+
+ return 0;
+}
+
+static void copy_seccomp(struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+ /*
+ * Must be called with sighand->lock held, which is common to
+ * all threads in the group. Holding cred_guard_mutex is not
+ * needed because this new task is not yet running and cannot
+ * be racing exec.
+ */
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Ref-count the new filter user, and assign it. */
+ get_seccomp_filter(current);
+ p->seccomp = current->seccomp;
+
+ /*
+ * Explicitly enable no_new_privs here in case it got set
+ * between the task_struct being duplicated and holding the
+ * sighand lock. The seccomp state and nnp must be in sync.
+ */
+ if (task_no_new_privs(current))
+ task_set_no_new_privs(p);
+
+ /*
+ * If the parent gained a seccomp mode after copying thread
+ * flags and between before we held the sighand lock, we have
+ * to manually enable the seccomp thread flag here.
+ */
+ if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+ set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
+}
+
+SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
+{
+ current->clear_child_tid = tidptr;
+
+ return task_pid_vnr(current);
+}
+
+static void rt_mutex_init_task(struct task_struct *p)
+{
+ raw_spin_lock_init(&p->pi_lock);
+#ifdef CONFIG_RT_MUTEXES
+ p->pi_waiters = RB_ROOT;
+ p->pi_waiters_leftmost = NULL;
+ p->pi_blocked_on = NULL;
+#endif
+}
+
+/*
+ * Initialize POSIX timer handling for a single task.
+ */
+static void posix_cpu_timers_init(struct task_struct *tsk)
+{
+ tsk->cputime_expires.prof_exp = 0;
+ tsk->cputime_expires.virt_exp = 0;
+ tsk->cputime_expires.sched_exp = 0;
+ INIT_LIST_HEAD(&tsk->cpu_timers[0]);
+ INIT_LIST_HEAD(&tsk->cpu_timers[1]);
+ INIT_LIST_HEAD(&tsk->cpu_timers[2]);
+}
+
+static inline void
+init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
+{
+ task->pids[type].pid = pid;
+}
+
+/*
+ * This creates a new process as a copy of the old one,
+ * but does not actually start it yet.
+ *
+ * It copies the registers, and all the appropriate
+ * parts of the process environment (as per the clone
+ * flags). The actual kick-off is left to the caller.
+ */
+static struct task_struct *copy_process(unsigned long clone_flags,
+ unsigned long stack_start,
+ unsigned long stack_size,
+ int __user *child_tidptr,
+ struct pid *pid,
+ int trace)
+{
+ int retval;
+ struct task_struct *p;
+
+ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Thread groups must share signals as well, and detached threads
+ * can only be started up within the thread group.
+ */
+ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Shared signal handlers imply shared VM. By way of the above,
+ * thread groups also imply shared VM. Blocking this case allows
+ * for various simplifications in other code.
+ */
+ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Siblings of global init remain as zombies on exit since they are
+ * not reaped by their parent (swapper). To solve this and to avoid
+ * multi-rooted process trees, prevent global and container-inits
+ * from creating siblings.
+ */
+ if ((clone_flags & CLONE_PARENT) &&
+ current->signal->flags & SIGNAL_UNKILLABLE)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * If the new process will be in a different pid or user namespace
+ * do not allow it to share a thread group or signal handlers or
+ * parent with the forking task.
+ */
+ if (clone_flags & CLONE_SIGHAND) {
+ if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
+ (task_active_pid_ns(current) !=
+ current->nsproxy->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
+ retval = security_task_create(clone_flags);
+ if (retval)
+ goto fork_out;
+
+ retval = -ENOMEM;
+ p = dup_task_struct(current);
+ if (!p)
+ goto fork_out;
+
+ ftrace_graph_init_task(p);
+
+ rt_mutex_init_task(p);
+
+#ifdef CONFIG_PROVE_LOCKING
+ DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
+ DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
+#endif
+ retval = -EAGAIN;
+ if (atomic_read(&p->real_cred->user->processes) >=
+ task_rlimit(p, RLIMIT_NPROC)) {
+ if (p->real_cred->user != INIT_USER &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
+ goto bad_fork_free;
+ }
+ current->flags &= ~PF_NPROC_EXCEEDED;
+
+ retval = copy_creds(p, clone_flags);
+ if (retval < 0)
+ goto bad_fork_free;
+
+ /*
+ * If multiple threads are within copy_process(), then this check
+ * triggers too late. This doesn't hurt, the check is only there
+ * to stop root fork bombs.
+ */
+ retval = -EAGAIN;
+ if (nr_threads >= max_threads)
+ goto bad_fork_cleanup_count;
+
+ delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags |= PF_FORKNOEXEC;
+ INIT_LIST_HEAD(&p->children);
+ INIT_LIST_HEAD(&p->sibling);
+ rcu_copy_process(p);
+ p->vfork_done = NULL;
+ spin_lock_init(&p->alloc_lock);
+
+ init_sigpending(&p->pending);
+
+ p->utime = p->stime = p->gtime = 0;
+ p->utimescaled = p->stimescaled = 0;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ p->prev_cputime.utime = p->prev_cputime.stime = 0;
+#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_init(&p->vtime_seqlock);
+ p->vtime_snap = 0;
+ p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
+
+#if defined(SPLIT_RSS_COUNTING)
+ memset(&p->rss_stat, 0, sizeof(p->rss_stat));
+#endif
+
+ p->default_timer_slack_ns = current->timer_slack_ns;
+
+ task_io_accounting_init(&p->ioac);
+ acct_clear_integrals(p);
+
+ posix_cpu_timers_init(p);
+
+ p->start_time = ktime_get_ns();
+ p->real_start_time = ktime_get_boot_ns();
+ p->io_context = NULL;
+ p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_begin(current);
+ cgroup_fork(p);
+#ifdef CONFIG_NUMA
+ p->mempolicy = mpol_dup(p->mempolicy);
+ if (IS_ERR(p->mempolicy)) {
+ retval = PTR_ERR(p->mempolicy);
+ p->mempolicy = NULL;
+ goto bad_fork_cleanup_threadgroup_lock;
+ }
+#endif
+#ifdef CONFIG_CPUSETS
+ p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
+ p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+ seqcount_init(&p->mems_allowed_seq);
+#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+ p->irq_events = 0;
+ p->hardirqs_enabled = 0;
+ p->hardirq_enable_ip = 0;
+ p->hardirq_enable_event = 0;
+ p->hardirq_disable_ip = _THIS_IP_;
+ p->hardirq_disable_event = 0;
+ p->softirqs_enabled = 1;
+ p->softirq_enable_ip = _THIS_IP_;
+ p->softirq_enable_event = 0;
+ p->softirq_disable_ip = 0;
+ p->softirq_disable_event = 0;
+ p->hardirq_context = 0;
+ p->softirq_context = 0;
+#endif
+#ifdef CONFIG_LOCKDEP
+ p->lockdep_depth = 0; /* no locks held yet */
+ p->curr_chain_key = 0;
+ p->lockdep_recursion = 0;
+#endif
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ p->blocked_on = NULL; /* not blocked yet */
+#endif
+#ifdef CONFIG_BCACHE
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
+
+ /* Perform scheduler related setup. Assign this task to a CPU. */
+ retval = sched_fork(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
+
+ retval = perf_event_init_task(p);
+ if (retval)
+ goto bad_fork_cleanup_policy;
+ retval = audit_alloc(p);
+ if (retval)
+ goto bad_fork_cleanup_perf;
+ /* copy all the process information */
+ shm_init_task(p);
+ retval = copy_semundo(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_audit;
+ retval = copy_files(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_semundo;
+ retval = copy_fs(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_files;
+ retval = copy_sighand(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_fs;
+ retval = copy_signal(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_sighand;
+ retval = copy_mm(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_signal;
+ retval = copy_namespaces(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_mm;
+ retval = copy_io(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_namespaces;
+ retval = copy_thread(clone_flags, stack_start, stack_size, p);
+ if (retval)
+ goto bad_fork_cleanup_io;
+
+ if (pid != &init_struct_pid) {
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+ if (IS_ERR(pid)) {
+ retval = PTR_ERR(pid);
+ goto bad_fork_cleanup_io;
+ }
+ }
+
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ /*
+ * Clear TID on mm_release()?
+ */
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+#ifdef CONFIG_BLOCK
+ p->plug = NULL;
+#endif
+#ifdef CONFIG_FUTEX
+ p->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+ p->compat_robust_list = NULL;
+#endif
+ INIT_LIST_HEAD(&p->pi_state_list);
+ p->pi_state_cache = NULL;
+#endif
+ /*
+ * sigaltstack should be cleared when sharing the same VM
+ */
+ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
+ p->sas_ss_sp = p->sas_ss_size = 0;
+
+ /*
+ * Syscall tracing and stepping should be turned off in the
+ * child regardless of CLONE_PTRACE.
+ */
+ user_disable_single_step(p);
+ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+ clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+#endif
+ clear_all_latency_tracing(p);
+
+ /* ok, now we should be set up.. */
+ p->pid = pid_nr(pid);
+ if (clone_flags & CLONE_THREAD) {
+ p->exit_signal = -1;
+ p->group_leader = current->group_leader;
+ p->tgid = current->tgid;
+ } else {
+ if (clone_flags & CLONE_PARENT)
+ p->exit_signal = current->group_leader->exit_signal;
+ else
+ p->exit_signal = (clone_flags & CSIGNAL);
+ p->group_leader = p;
+ p->tgid = p->pid;
+ }
+
+ p->nr_dirtied = 0;
+ p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+ p->dirty_paused_when = 0;
+
+ p->pdeath_signal = 0;
+ INIT_LIST_HEAD(&p->thread_group);
+ p->task_works = NULL;
+
+ /*
+ * Make it visible to the rest of the system, but dont wake it up yet.
+ * Need tasklist lock for parent etc handling!
+ */
+ write_lock_irq(&tasklist_lock);
+
+ /* CLONE_PARENT re-uses the old parent */
+ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
+ p->real_parent = current->real_parent;
+ p->parent_exec_id = current->parent_exec_id;
+ } else {
+ p->real_parent = current;
+ p->parent_exec_id = current->self_exec_id;
+ }
+
+ spin_lock(&current->sighand->siglock);
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
+ /*
+ * Process group and session signals need to be delivered to just the
+ * parent before the fork or both the parent and the child after the
+ * fork. Restart if a signal comes in before we add the new process to
+ * it's process group.
+ * A fatal signal pending means that current will exit, so the new
+ * thread can't slip out of an OOM kill (or normal SIGKILL).
+ */
+ recalc_sigpending();
+ if (signal_pending(current)) {
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
+ retval = -ERESTARTNOINTR;
+ goto bad_fork_free_pid;
+ }
+
+ if (likely(p->pid)) {
+ ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
+
+ init_task_pid(p, PIDTYPE_PID, pid);
+ if (thread_group_leader(p)) {
+ init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
+ init_task_pid(p, PIDTYPE_SID, task_session(current));
+
+ if (is_child_reaper(pid)) {
+ ns_of_pid(pid)->child_reaper = p;
+ p->signal->flags |= SIGNAL_UNKILLABLE;
+ }
+
+ p->signal->leader_pid = pid;
+ p->signal->tty = tty_kref_get(current->signal->tty);
+ list_add_tail(&p->sibling, &p->real_parent->children);
+ list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ attach_pid(p, PIDTYPE_PGID);
+ attach_pid(p, PIDTYPE_SID);
+ __this_cpu_inc(process_counts);
+ } else {
+ current->signal->nr_threads++;
+ atomic_inc(&current->signal->live);
+ atomic_inc(&current->signal->sigcnt);
+ list_add_tail_rcu(&p->thread_group,
+ &p->group_leader->thread_group);
+ list_add_tail_rcu(&p->thread_node,
+ &p->signal->thread_head);
+ }
+ attach_pid(p, PIDTYPE_PID);
+ nr_threads++;
+ }
+
+ total_forks++;
+ spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
+ write_unlock_irq(&tasklist_lock);
+
+ proc_fork_connector(p);
+ cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_end(current);
+ perf_event_fork(p);
+
+ trace_task_newtask(p, clone_flags);
+ uprobe_copy_process(p, clone_flags);
+
+ return p;
+
+bad_fork_free_pid:
+ if (pid != &init_struct_pid)
+ free_pid(pid);
+bad_fork_cleanup_io:
+ if (p->io_context)
+ exit_io_context(p);
+bad_fork_cleanup_namespaces:
+ exit_task_namespaces(p);
+bad_fork_cleanup_mm:
+ if (p->mm)
+ mmput(p->mm);
+bad_fork_cleanup_signal:
+ if (!(clone_flags & CLONE_THREAD))
+ free_signal_struct(p->signal);
+bad_fork_cleanup_sighand:
+ __cleanup_sighand(p->sighand);
+bad_fork_cleanup_fs:
+ exit_fs(p); /* blocking */
+bad_fork_cleanup_files:
+ exit_files(p); /* blocking */
+bad_fork_cleanup_semundo:
+ exit_sem(p);
+bad_fork_cleanup_audit:
+ audit_free(p);
+bad_fork_cleanup_perf:
+ perf_event_free_task(p);
+bad_fork_cleanup_policy:
+#ifdef CONFIG_NUMA
+ mpol_put(p->mempolicy);
+bad_fork_cleanup_threadgroup_lock:
+#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_end(current);
+ delayacct_tsk_free(p);
+bad_fork_cleanup_count:
+ atomic_dec(&p->cred->user->processes);
+ exit_creds(p);
+bad_fork_free:
+ free_task(p);
+fork_out:
+ return ERR_PTR(retval);
+}
+
+static inline void init_idle_pids(struct pid_link *links)
+{
+ enum pid_type type;
+
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+ INIT_HLIST_NODE(&links[type].node); /* not really needed */
+ links[type].pid = &init_struct_pid;
+ }
+}
+
+struct task_struct *fork_idle(int cpu)
+{
+ struct task_struct *task;
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+ if (!IS_ERR(task)) {
+ init_idle_pids(task->pids);
+ init_idle(task, cpu);
+ }
+
+ return task;
+}
+
+/*
+ * Ok, this is the main fork-routine.
+ *
+ * It copies the process, and if successful kick-starts
+ * it and waits for it to finish using the VM if required.
+ */
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ struct task_struct *p;
+ int trace = 0;
+ long nr;
+
+ /*
+ * Determine whether and which event to report to ptracer. When
+ * called from kernel_thread or CLONE_UNTRACED is explicitly
+ * requested, no event is reported; otherwise, report if the event
+ * for the type of forking is enabled.
+ */
+ if (!(clone_flags & CLONE_UNTRACED)) {
+ if (clone_flags & CLONE_VFORK)
+ trace = PTRACE_EVENT_VFORK;
+ else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ trace = PTRACE_EVENT_CLONE;
+ else
+ trace = PTRACE_EVENT_FORK;
+
+ if (likely(!ptrace_event_enabled(current, trace)))
+ trace = 0;
+ }
+
+ p = copy_process(clone_flags, stack_start, stack_size,
+ child_tidptr, NULL, trace);
+ /*
+ * Do this prior waking up the new thread - the thread pointer
+ * might get invalid after that point, if the thread exits quickly.
+ */
+ if (!IS_ERR(p)) {
+ struct completion vfork;
+ struct pid *pid;
+
+ trace_sched_process_fork(current, p);
+
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
+
+ if (clone_flags & CLONE_PARENT_SETTID)
+ put_user(nr, parent_tidptr);
+
+ if (clone_flags & CLONE_VFORK) {
+ p->vfork_done = &vfork;
+ init_completion(&vfork);
+ get_task_struct(p);
+ }
+
+ wake_up_new_task(p);
+
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event_pid(trace, pid);
+
+ if (clone_flags & CLONE_VFORK) {
+ if (!wait_for_vfork_done(p, &vfork))
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
+ }
+
+ put_pid(pid);
+ } else {
+ nr = PTR_ERR(p);
+ }
+ return nr;
+}
+
+/*
+ * Create a kernel thread.
+ */
+pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL);
+}
+
+#ifdef __ARCH_WANT_SYS_FORK
+SYSCALL_DEFINE0(fork)
+{
+#ifdef CONFIG_MMU
+ return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+#else
+ /* can not support in nommu mode */
+ return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_VFORK
+SYSCALL_DEFINE0(vfork)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+#ifdef CONFIG_CLONE_BACKWARDS
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int, tls_val,
+ int __user *, child_tidptr)
+#elif defined(CONFIG_CLONE_BACKWARDS2)
+SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#elif defined(CONFIG_CLONE_BACKWARDS3)
+SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int, stack_size,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#else
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#endif
+{
+ return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+}
+#endif
+
+#ifndef ARCH_MIN_MMSTRUCT_ALIGN
+#define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
+static void sighand_ctor(void *data)
+{
+ struct sighand_struct *sighand = data;
+
+ spin_lock_init(&sighand->siglock);
+ init_waitqueue_head(&sighand->signalfd_wqh);
+}
+
+void __init proc_caches_init(void)
+{
+ sighand_cachep = kmem_cache_create("sighand_cache",
+ sizeof(struct sighand_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+ SLAB_NOTRACK, sighand_ctor);
+ signal_cachep = kmem_cache_create("signal_cache",
+ sizeof(struct signal_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ files_cachep = kmem_cache_create("files_cache",
+ sizeof(struct files_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ fs_cachep = kmem_cache_create("fs_cache",
+ sizeof(struct fs_struct), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ /*
+ * FIXME! The "sizeof(struct mm_struct)" currently includes the
+ * whole struct cpumask for the OFFSTACK case. We could change
+ * this to *only* allocate as much of it as required by the
+ * maximum number of CPU's we can ever have. The cpumask_allocation
+ * is at the end of the structure, exactly for that reason.
+ */
+ mm_cachep = kmem_cache_create("mm_struct",
+ sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ mmap_init();
+ nsproxy_cache_init();
+}
+
+/*
+ * Check constraints on flags passed to the unshare system call.
+ */
+static int check_unshare_flags(unsigned long unshare_flags)
+{
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWUSER|CLONE_NEWPID))
+ return -EINVAL;
+ /*
+ * Not implemented, but pretend it works if there is nothing to
+ * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+ * needs to unshare vm.
+ */
+ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+ /* FIXME: get_task_mm() increments ->mm_users */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Unshare the filesystem structure if it is being shared
+ */
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+ struct fs_struct *fs = current->fs;
+
+ if (!(unshare_flags & CLONE_FS) || !fs)
+ return 0;
+
+ /* don't need lock here; in the worst case we'll do useless copy */
+ if (fs->users == 1)
+ return 0;
+
+ *new_fsp = copy_fs_struct(fs);
+ if (!*new_fsp)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Unshare file descriptor table if it is being shared
+ */
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+{
+ struct files_struct *fd = current->files;
+ int error = 0;
+
+ if ((unshare_flags & CLONE_FILES) &&
+ (fd && atomic_read(&fd->count) > 1)) {
+ *new_fdp = dup_fd(fd, &error);
+ if (!*new_fdp)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone. copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+{
+ struct fs_struct *fs, *new_fs = NULL;
+ struct files_struct *fd, *new_fd = NULL;
+ struct cred *new_cred = NULL;
+ struct nsproxy *new_nsproxy = NULL;
+ int do_sysvsem = 0;
+ int err;
+
+ /*
+ * If unsharing a user namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWUSER)
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
+ /*
+ * If unsharing a thread from a thread group, must also unshare vm.
+ */
+ if (unshare_flags & CLONE_THREAD)
+ unshare_flags |= CLONE_VM;
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (unshare_flags & CLONE_VM)
+ unshare_flags |= CLONE_SIGHAND;
+ /*
+ * If unsharing namespace, must also unshare filesystem information.
+ */
+ if (unshare_flags & CLONE_NEWNS)
+ unshare_flags |= CLONE_FS;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto bad_unshare_out;
+ /*
+ * CLONE_NEWIPC must also detach from the undolist: after switching
+ * to a new ipc namespace, the semaphore arrays from the old
+ * namespace are unreachable.
+ */
+ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
+ do_sysvsem = 1;
+ err = unshare_fs(unshare_flags, &new_fs);
+ if (err)
+ goto bad_unshare_out;
+ err = unshare_fd(unshare_flags, &new_fd);
+ if (err)
+ goto bad_unshare_cleanup_fs;
+ err = unshare_userns(unshare_flags, &new_cred);
+ if (err)
+ goto bad_unshare_cleanup_fd;
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_cred, new_fs);
+ if (err)
+ goto bad_unshare_cleanup_cred;
+
+ if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
+ if (do_sysvsem) {
+ /*
+ * CLONE_SYSVSEM is equivalent to sys_exit().
+ */
+ exit_sem(current);
+ }
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }
+
+ if (new_nsproxy)
+ switch_task_namespaces(current, new_nsproxy);
+
+ task_lock(current);
+
+ if (new_fs) {
+ fs = current->fs;
+ spin_lock(&fs->lock);
+ current->fs = new_fs;
+ if (--fs->users)
+ new_fs = NULL;
+ else
+ new_fs = fs;
+ spin_unlock(&fs->lock);
+ }
+
+ if (new_fd) {
+ fd = current->files;
+ current->files = new_fd;
+ new_fd = fd;
+ }
+
+ task_unlock(current);
+
+ if (new_cred) {
+ /* Install the new user namespace */
+ commit_creds(new_cred);
+ new_cred = NULL;
+ }
+ }
+
+bad_unshare_cleanup_cred:
+ if (new_cred)
+ put_cred(new_cred);
+bad_unshare_cleanup_fd:
+ if (new_fd)
+ put_files_struct(new_fd);
+
+bad_unshare_cleanup_fs:
+ if (new_fs)
+ free_fs_struct(new_fs);
+
+bad_unshare_out:
+ return err;
+}
+
+/*
+ * Helper to unshare the files of the current task.
+ * We don't want to expose copy_files internals to
+ * the exec layer of the kernel.
+ */
+
+int unshare_files(struct files_struct **displaced)
+{
+ struct task_struct *task = current;
+ struct files_struct *copy = NULL;
+ int error;
+
+ error = unshare_fd(CLONE_FILES, &copy);
+ if (error || !copy) {
+ *displaced = NULL;
+ return error;
+ }
+ *displaced = task->files;
+ task_lock(task);
+ task->files = copy;
+ task_unlock(task);
+ return 0;
+}
+
+int sysctl_max_threads(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int ret;
+ int threads = max_threads;
+ int min = MIN_THREADS;
+ int max = MAX_THREADS;
+
+ t = *table;
+ t.data = &threads;
+ t.extra1 = &min;
+ t.extra2 = &max;
+
+ ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ set_max_threads(threads);
+
+ return 0;
+}
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 000000000..a8900a3bc
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,179 @@
+/*
+ * kernel/freezer.c - Function to freeze a process
+ *
+ * Originally from kernel/power/process.c
+ */
+
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+
+/* total number of freezing conditions in effect */
+atomic_t system_freezing_cnt = ATOMIC_INIT(0);
+EXPORT_SYMBOL(system_freezing_cnt);
+
+/* indicate whether PM freezing is in effect, protected by pm_mutex */
+bool pm_freezing;
+bool pm_nosig_freezing;
+
+/*
+ * Temporary export for the deadlock workaround in ata_scsi_hotplug().
+ * Remove once the hack becomes unnecessary.
+ */
+EXPORT_SYMBOL_GPL(pm_freezing);
+
+/* protects freezing and frozen transitions */
+static DEFINE_SPINLOCK(freezer_lock);
+
+/**
+ * freezing_slow_path - slow path for testing whether a task needs to be frozen
+ * @p: task to be tested
+ *
+ * This function is called by freezing() if system_freezing_cnt isn't zero
+ * and tests whether @p needs to enter and stay in frozen state. Can be
+ * called under any context. The freezers are responsible for ensuring the
+ * target tasks see the updated state.
+ */
+bool freezing_slow_path(struct task_struct *p)
+{
+ if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
+ return false;
+
+ if (test_thread_flag(TIF_MEMDIE))
+ return false;
+
+ if (pm_nosig_freezing || cgroup_freezing(p))
+ return true;
+
+ if (pm_freezing && !(p->flags & PF_KTHREAD))
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(freezing_slow_path);
+
+/* Refrigerator is place where frozen processes are stored :-). */
+bool __refrigerator(bool check_kthr_stop)
+{
+ /* Hmm, should we be allowed to suspend when there are realtime
+ processes around? */
+ bool was_frozen = false;
+ long save = current->state;
+
+ pr_debug("%s entered refrigerator\n", current->comm);
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ spin_lock_irq(&freezer_lock);
+ current->flags |= PF_FROZEN;
+ if (!freezing(current) ||
+ (check_kthr_stop && kthread_should_stop()))
+ current->flags &= ~PF_FROZEN;
+ spin_unlock_irq(&freezer_lock);
+
+ if (!(current->flags & PF_FROZEN))
+ break;
+ was_frozen = true;
+ schedule();
+ }
+
+ pr_debug("%s left refrigerator\n", current->comm);
+
+ /*
+ * Restore saved task state before returning. The mb'd version
+ * needs to be used; otherwise, it might silently break
+ * synchronization which depends on ordered task state change.
+ */
+ set_current_state(save);
+
+ return was_frozen;
+}
+EXPORT_SYMBOL(__refrigerator);
+
+static void fake_signal_wake_up(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (lock_task_sighand(p, &flags)) {
+ signal_wake_up(p, 0);
+ unlock_task_sighand(p, &flags);
+ }
+}
+
+/**
+ * freeze_task - send a freeze request to given task
+ * @p: task to send the request to
+ *
+ * If @p is freezing, the freeze request is sent either by sending a fake
+ * signal (if it's not a kernel thread) or waking it up (if it's a kernel
+ * thread).
+ *
+ * RETURNS:
+ * %false, if @p is not freezing or already frozen; %true, otherwise
+ */
+bool freeze_task(struct task_struct *p)
+{
+ unsigned long flags;
+
+ /*
+ * This check can race with freezer_do_not_count, but worst case that
+ * will result in an extra wakeup being sent to the task. It does not
+ * race with freezer_count(), the barriers in freezer_count() and
+ * freezer_should_skip() ensure that either freezer_count() sees
+ * freezing == true in try_to_freeze() and freezes, or
+ * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
+ * normally.
+ */
+ if (freezer_should_skip(p))
+ return false;
+
+ spin_lock_irqsave(&freezer_lock, flags);
+ if (!freezing(p) || frozen(p)) {
+ spin_unlock_irqrestore(&freezer_lock, flags);
+ return false;
+ }
+
+ if (!(p->flags & PF_KTHREAD))
+ fake_signal_wake_up(p);
+ else
+ wake_up_state(p, TASK_INTERRUPTIBLE);
+
+ spin_unlock_irqrestore(&freezer_lock, flags);
+ return true;
+}
+
+void __thaw_task(struct task_struct *p)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&freezer_lock, flags);
+ if (frozen(p))
+ wake_up_process(p);
+ spin_unlock_irqrestore(&freezer_lock, flags);
+}
+
+/**
+ * set_freezable - make %current freezable
+ *
+ * Mark %current freezable and enter refrigerator if necessary.
+ */
+bool set_freezable(void)
+{
+ might_sleep();
+
+ /*
+ * Modify flags while holding freezer_lock. This ensures the
+ * freezer notices that we aren't frozen yet or the freezing
+ * condition is visible to try_to_freeze() below.
+ */
+ spin_lock_irq(&freezer_lock);
+ current->flags &= ~PF_NOFREEZE;
+ spin_unlock_irq(&freezer_lock);
+
+ return try_to_freeze();
+}
+EXPORT_SYMBOL(set_freezable);
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 000000000..2579e407f
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,3053 @@
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/futex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <linux/export.h>
+#include <linux/magic.h>
+#include <linux/pid.h>
+#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
+#include <linux/freezer.h>
+#include <linux/bootmem.h>
+
+#include <asm/futex.h>
+
+#include "locking/rtmutex_common.h"
+
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `-------> mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers for both
+ * shared and private futexes in get_futex_key_refs().
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
+
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+int __read_mostly futex_cmpxchg_enabled;
+#endif
+
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED 0x01
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
+
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex pi_mutex;
+
+ struct task_struct *owner;
+ atomic_t refcount;
+
+ union futex_key key;
+};
+
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
+ * @task: the task waiting on the futex
+ * @lock_ptr: the hash bucket lock
+ * @key: the key the futex is hashed on
+ * @pi_state: optional priority inheritance state
+ * @rt_waiter: rt_waiter storage for use with requeue_pi
+ * @requeue_pi_key: the requeue_pi target futex key
+ * @bitset: bitset for the optional bitmasked wakeup
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * the second.
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See unqueue_me_pi().
+ */
+struct futex_q {
+ struct plist_node list;
+
+ struct task_struct *task;
+ spinlock_t *lock_ptr;
+ union futex_key key;
+ struct futex_pi_state *pi_state;
+ struct rt_mutex_waiter *rt_waiter;
+ union futex_key *requeue_pi_key;
+ u32 bitset;
+};
+
+static const struct futex_q futex_q_init = {
+ /* list gets initialized in queue_me()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
+/*
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location. Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
+ */
+struct futex_hash_bucket {
+ atomic_t waiters;
+ spinlock_t lock;
+ struct plist_head chain;
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
+
+static inline void futex_get_mm(union futex_key *key)
+{
+ atomic_inc(&key->private.mm->mm_count);
+ /*
+ * Ensure futex_get_mm() implies a full barrier such that
+ * get_futex_key() implies a full barrier. This is relied upon
+ * as full barrier (B), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
+
+/*
+ * We hash on the keys returned from get_futex_key (see below).
+ */
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
+{
+ u32 hash = jhash2((u32*)&key->both.word,
+ (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ key->both.offset);
+ return &futex_queues[hash & (futex_hashsize - 1)];
+}
+
+/*
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int match_futex(union futex_key *key1, union futex_key *key2)
+{
+ return (key1 && key2
+ && key1->both.word == key2->both.word
+ && key1->both.ptr == key2->both.ptr
+ && key1->both.offset == key2->both.offset);
+}
+
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ */
+static void get_futex_key_refs(union futex_key *key)
+{
+ if (!key->both.ptr)
+ return;
+
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ ihold(key->shared.inode); /* implies MB (B) */
+ break;
+ case FUT_OFF_MMSHARED:
+ futex_get_mm(key); /* implies MB (B) */
+ break;
+ default:
+ /*
+ * Private futexes do not hold reference on an inode or
+ * mm, therefore the only purpose of calling get_futex_key_refs
+ * is because we need the barrier for the lockless waiter check.
+ */
+ smp_mb(); /* explicit MB (B) */
+ }
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held. This is
+ * a no-op for private futexes, see comment in the get
+ * counterpart.
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+ if (!key->both.ptr) {
+ /* If we're here then we tried to put a key we failed to get */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ iput(key->shared.inode);
+ break;
+ case FUT_OFF_MMSHARED:
+ mmdrop(key->private.mm);
+ break;
+ }
+}
+
+/**
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr: virtual address of the futex
+ * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: VERIFY_READ,
+ * VERIFY_WRITE)
+ *
+ * Return: a negative error code or 0
+ *
+ * The key words are stored in *key on success.
+ *
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
+ * offset_within_page). For private mappings, it's (uaddr, current->mm).
+ * We can usually work out the index without swapping in the page.
+ *
+ * lock_page() might sleep, the caller should not hold a spinlock.
+ */
+static int
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+{
+ unsigned long address = (unsigned long)uaddr;
+ struct mm_struct *mm = current->mm;
+ struct page *page, *page_head;
+ int err, ro = 0;
+
+ /*
+ * The futex address must be "naturally" aligned.
+ */
+ key->both.offset = address % PAGE_SIZE;
+ if (unlikely((address % sizeof(u32)) != 0))
+ return -EINVAL;
+ address -= key->both.offset;
+
+ if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ return -EFAULT;
+
+ /*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ key->private.mm = mm;
+ key->private.address = address;
+ get_futex_key_refs(key); /* implies MB (B) */
+ return 0;
+ }
+
+again:
+ err = get_user_pages_fast(address, 1, 1, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == VERIFY_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
+ if (err < 0)
+ return err;
+ else
+ err = 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ page_head = page;
+ if (unlikely(PageTail(page))) {
+ put_page(page);
+ /* serialize against __split_huge_page_splitting() */
+ local_irq_disable();
+ if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
+ page_head = compound_head(page);
+ /*
+ * page_head is valid pointer but we must pin
+ * it before taking the PG_lock and/or
+ * PG_compound_lock. The moment we re-enable
+ * irqs __split_huge_page_splitting() can
+ * return and the head page can be freed from
+ * under us. We can't take the PG_lock and/or
+ * PG_compound_lock on a page that could be
+ * freed from under us.
+ */
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ goto again;
+ }
+ }
+#else
+ page_head = compound_head(page);
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+#endif
+
+ lock_page(page_head);
+
+ /*
+ * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the page lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_complete_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for page_head->mapping.
+ */
+ if (!page_head->mapping) {
+ int shmem_swizzled = PageSwapCache(page_head);
+ unlock_page(page_head);
+ put_page(page_head);
+ if (shmem_swizzled)
+ goto again;
+ return -EFAULT;
+ }
+
+ /*
+ * Private mappings are handled in a simple way.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process.
+ */
+ if (PageAnon(page_head)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
+ key->private.mm = mm;
+ key->private.address = address;
+ } else {
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+ key->shared.inode = page_head->mapping->host;
+ key->shared.pgoff = basepage_index(page);
+ }
+
+ get_futex_key_refs(key); /* implies MB (B) */
+
+out:
+ unlock_page(page_head);
+ put_page(page_head);
+ return err;
+}
+
+static inline void put_futex_key(union futex_key *key)
+{
+ drop_futex_key_refs(key);
+}
+
+/**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non-destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ down_read(&mm->mmap_sem);
+ ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE);
+ up_read(&mm->mmap_sem);
+
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb: the hash bucket the futex_q's reside in
+ * @key: the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+ union futex_key *key)
+{
+ struct futex_q *this;
+
+ plist_for_each_entry(this, &hb->chain, list) {
+ if (match_futex(&this->key, key))
+ return this;
+ }
+ return NULL;
+}
+
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+static int get_futex_value_locked(u32 *dest, u32 __user *from)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+ pi_state->key = FUTEX_KEY_INIT;
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+/*
+ * Must be called with the hb lock held.
+ */
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!pi_state)
+ return;
+
+ if (!atomic_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ list_del_init(&pi_state->list);
+ raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ }
+
+ if (current->pi_state_cache)
+ kfree(pi_state);
+ else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ atomic_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (p)
+ get_task_struct(p);
+
+ rcu_read_unlock();
+
+ return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
+ union futex_key key = FUTEX_KEY_INIT;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+ * versus waiters unqueueing themselves:
+ */
+ raw_spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ hb = hash_futex(&key);
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+
+ raw_spin_lock_irq(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ spin_unlock(&hb->lock);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ spin_unlock(&hb->lock);
+
+ raw_spin_lock_irq(&curr->pi_lock);
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+}
+
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
+ */
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
+ */
+ if (pid)
+ return -EINVAL;
+ /*
+ * Take a ref on the state and return success. [4]
+ */
+ goto out_state;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_state;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ return -EINVAL;
+ }
+
+ /*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+out_state:
+ atomic_inc(&pi_state->refcount);
+ *ps = pi_state;
+ return 0;
+}
+
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 uval, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct futex_pi_state *pi_state;
+ struct task_struct *p;
+
+ /*
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when TID = 0 [1]
+ */
+ if (!pid)
+ return -ESRCH;
+ p = futex_find_get_task(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
+ /*
+ * We need to look at the task state flags to figure out,
+ * whether the task is exiting. To protect against the do_exit
+ * change of the task flags, we do this protected by
+ * p->pi_lock:
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->flags & PF_EXITING)) {
+ /*
+ * The task is on the way out. When PF_EXITPIDONE is
+ * set, we know that the task has finished the
+ * cleanup:
+ */
+ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+ raw_spin_unlock_irq(&p->pi_lock);
+ put_task_struct(p);
+ return ret;
+ }
+
+ /*
+ * No existing pi state. First waiter. [2]
+ */
+ pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make @p
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ *ps = pi_state;
+
+ return 0;
+}
+
+static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+{
+ struct futex_q *match = futex_top_waiter(hb, key);
+
+ /*
+ * If there is a waiter on that futex, validate it and
+ * attach to the pi_state when the validation succeeds.
+ */
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
+
+ /*
+ * We are the first waiter - try to look up the owner based on
+ * @uval and attach to it.
+ */
+ return attach_to_pi_owner(uval, key, ps);
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ u32 uninitialized_var(curval);
+
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+ return -EFAULT;
+
+ /*If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
+/**
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task, int set_waiters)
+{
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *match;
+ int ret;
+
+ /*
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
+ */
+ if (get_futex_value_locked(&uval, uaddr))
+ return -EFAULT;
+
+ /*
+ * Detect deadlocks.
+ */
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
+ return -EDEADLK;
+
+ /*
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
+ */
+ match = futex_top_waiter(hb, key);
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set or called from
+ * requeue_cmp_pi or for whatever reason something took the
+ * syscall.
+ */
+ if (!(uval & FUTEX_TID_MASK)) {
+ /*
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
+ */
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
+
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ /* If the take over worked, return 1 */
+ return ret < 0 ? ret : 1;
+ }
+
+ /*
+ * First waiter. Set the waiters bit before attaching ourself to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
+ */
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+ /*
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
+ */
+ return attach_to_pi_owner(uval, key, ps);
+}
+
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+ || WARN_ON(plist_node_empty(&q->list)))
+ return;
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed.
+ */
+static void wake_futex(struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
+ /*
+ * We set q->lock_ptr = NULL _before_ we wake up the task. If
+ * a non-futex wake up happens on another CPU then the task
+ * might exit and p would dereference a non-existing task
+ * struct. Prevent this by holding a reference on p across the
+ * wake up.
+ */
+ get_task_struct(p);
+
+ __unqueue_futex(q);
+ /*
+ * The waiting task can free the futex_q as soon as
+ * q->lock_ptr = NULL is written, without taking any locks. A
+ * memory barrier is required here to prevent the following
+ * store to lock_ptr from getting ahead of the plist_del.
+ */
+ smp_wmb();
+ q->lock_ptr = NULL;
+
+ wake_up_state(p, TASK_NORMAL);
+ put_task_struct(p);
+}
+
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+ struct task_struct *new_owner;
+ struct futex_pi_state *pi_state = this->pi_state;
+ u32 uninitialized_var(curval), newval;
+ int ret = 0;
+
+ if (!pi_state)
+ return -EINVAL;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ return -EINVAL;
+
+ raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+ /*
+ * It is possible that the next waiter (the one that brought
+ * this owner to the kernel) timed out and is no longer
+ * waiting on the lock.
+ */
+ if (!new_owner)
+ new_owner = this->task;
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always
+ * kept enabled while there is PI state around. We cleanup the
+ * owner died bit, because we are the owner.
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ ret = -EFAULT;
+ else if (curval != uval)
+ ret = -EINVAL;
+ if (ret) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
+ }
+
+ raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+
+ raw_spin_lock_irq(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ raw_spin_unlock_irq(&new_owner->pi_lock);
+
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ rt_mutex_unlock(&pi_state->pi_mutex);
+
+ return 0;
+}
+
+/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 <= hb2) {
+ spin_lock(&hb1->lock);
+ if (hb1 < hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+ } else { /* hb1 > hb2 */
+ spin_lock(&hb2->lock);
+ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+ struct futex_hash_bucket *hb;
+ struct futex_q *this, *next;
+ union futex_key key = FUTEX_KEY_INIT;
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb = hash_futex(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!hb_waiters_pending(hb))
+ goto out_put_key;
+
+ spin_lock(&hb->lock);
+
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
+ if (match_futex (&this->key, &key)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Check if one of the bits is set in both bitsets */
+ if (!(this->bitset & bitset))
+ continue;
+
+ wake_futex(this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ spin_unlock(&hb->lock);
+out_put_key:
+ put_futex_key(&key);
+out:
+ return ret;
+}
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+static int
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+ int ret, op_ret;
+
+retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+ goto out_put_key1;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+retry_private:
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+
+ double_unlock_hb(hb1, hb2);
+
+#ifndef CONFIG_MMU
+ /*
+ * we don't get EFAULT from MMU faults if we don't have an MMU,
+ * but we might get them from range checking
+ */
+ ret = op_ret;
+ goto out_put_keys;
+#endif
+
+ if (unlikely(op_ret != -EFAULT)) {
+ ret = op_ret;
+ goto out_put_keys;
+ }
+
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ goto out_put_keys;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ put_futex_key(&key2);
+ put_futex_key(&key1);
+ goto retry;
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (match_futex (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ wake_futex(this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (match_futex (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ wake_futex(this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+out_put_keys:
+ put_futex_key(&key2);
+out_put_key1:
+ put_futex_key(&key1);
+out:
+ return ret;
+}
+
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q: the futex_q to requeue
+ * @hb1: the source hash_bucket
+ * @hb2: the target hash_bucket
+ * @key2: the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(&hb1->chain != &hb2->chain)) {
+ plist_del(&q->list, &hb1->chain);
+ hb_waiters_dec(hb1);
+ plist_add(&q->list, &hb2->chain);
+ hb_waiters_inc(hb2);
+ q->lock_ptr = &hb2->lock;
+ }
+ get_futex_key_refs(key2);
+ q->key = *key2;
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q: the futex_q
+ * @key: the key of the requeue target futex
+ * @hb: the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal. Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later. Must be called
+ * with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
+{
+ get_futex_key_refs(key);
+ q->key = *key;
+
+ __unqueue_futex(q);
+
+ WARN_ON(!q->rt_waiter);
+ q->rt_waiter = NULL;
+
+ q->lock_ptr = &hb->lock;
+
+ wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+ struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2,
+ union futex_key *key1, union futex_key *key2,
+ struct futex_pi_state **ps, int set_waiters)
+{
+ struct futex_q *top_waiter = NULL;
+ u32 curval;
+ int ret, vpid;
+
+ if (get_futex_value_locked(&curval, pifutex))
+ return -EFAULT;
+
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
+ top_waiter = futex_top_waiter(hb1, key1);
+
+ /* There are no waiters, nothing for us to do. */
+ if (!top_waiter)
+ return 0;
+
+ /* Ensure we requeue to the expected futex. */
+ if (!match_futex(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
+ /*
+ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
+ * the contended case or if set_waiters is 1. The pi_state is returned
+ * in ps in contended cases.
+ */
+ vpid = task_pid_vnr(top_waiter->task);
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ set_waiters);
+ if (ret == 1) {
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+ return vpid;
+ }
+ return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1: source futex user address
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2: target futex user address
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
+ * <0 - on error
+ */
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ int drop_count = 0, task_count = 0, ret;
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct futex_q *this, *next;
+
+ if (requeue_pi) {
+ /*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+ /*
+ * requeue_pi must wake as many tasks as it can, up to nr_wake
+ * + nr_requeue, since it acquires the rt_mutex prior to
+ * returning to userspace, so as to not leave the rt_mutex with
+ * waiters and no owner. However, second and third wake-ups
+ * cannot be predicted as they involve race conditions with the
+ * first wake and a fault while looking up the pi_state. Both
+ * pthread_cond_signal() and pthread_cond_broadcast() should
+ * use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+ }
+
+retry:
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+ requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+ if (unlikely(ret != 0))
+ goto out_put_key1;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && match_futex(&key1, &key2)) {
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+retry_private:
+ hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = get_futex_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
+
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ goto out_put_keys;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ put_futex_key(&key2);
+ put_futex_key(&key1);
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or is
+ * waiting on it. If the former, then the pi_state will not
+ * exist yet, look it up one more time to ensure we have a
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
+ */
+ if (ret > 0) {
+ WARN_ON(pi_state);
+ drop_count++;
+ task_count++;
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ }
+
+ switch (ret) {
+ case 0:
+ break;
+ case -EFAULT:
+ free_pi_state(pi_state);
+ pi_state = NULL;
+ double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ goto out;
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - Owner is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
+ */
+ free_pi_state(pi_state);
+ pi_state = NULL;
+ double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!match_futex(&this->key, &key1))
+ continue;
+
+ /*
+ * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Wake nr_wake waiters. For requeue_pi, if we acquired the
+ * lock, we already woke the top_waiter. If not, it will be
+ * woken by futex_unlock_pi().
+ */
+ if (++task_count <= nr_wake && !requeue_pi) {
+ wake_futex(this);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ */
+ if (requeue_pi) {
+ /* Prepare the waiter to take the rt_mutex. */
+ atomic_inc(&pi_state->refcount);
+ this->pi_state = pi_state;
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+ if (ret == 1) {
+ /* We got the lock. */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ drop_count++;
+ continue;
+ } else if (ret) {
+ /* -EDEADLK */
+ this->pi_state = NULL;
+ free_pi_state(pi_state);
+ goto out_unlock;
+ }
+ }
+ requeue_futex(this, hb1, hb2, &key2);
+ drop_count++;
+ }
+
+out_unlock:
+ free_pi_state(pi_state);
+ double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
+
+ /*
+ * drop_futex_key_refs() must be called outside the spinlocks. During
+ * the requeue we moved futex_q's from the hash bucket at key1 to the
+ * one at key2 and updated their key pointer. We no longer need to
+ * hold the references to key1.
+ */
+ while (--drop_count >= 0)
+ drop_futex_key_refs(&key1);
+
+out_put_keys:
+ put_futex_key(&key2);
+out_put_key1:
+ put_futex_key(&key1);
+out:
+ return ret ? ret : task_count;
+}
+
+/* The key must be already stored in q->key. */
+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+ __acquires(&hb->lock)
+{
+ struct futex_hash_bucket *hb;
+
+ hb = hash_futex(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all queue_lock()
+ * users end up calling queue_me(). Similarly, for housekeeping,
+ * decrement the counter at queue_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ hb_waiters_inc(hb);
+
+ q->lock_ptr = &hb->lock;
+
+ spin_lock(&hb->lock); /* implies MB (A) */
+ return hb;
+}
+
+static inline void
+queue_unlock(struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ spin_unlock(&hb->lock);
+ hb_waiters_dec(hb);
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
+ spin_unlock(&hb->lock);
+}
+
+/**
+ * unqueue_me() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
+ * be paired with exactly one earlier call to queue_me().
+ *
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
+ * 0 - if the futex_q was already removed by the waking thread
+ */
+static int unqueue_me(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+ int ret = 0;
+
+ /* In the common case we don't take the spinlock, which is nice. */
+retry:
+ lock_ptr = q->lock_ptr;
+ barrier();
+ if (lock_ptr != NULL) {
+ spin_lock(lock_ptr);
+ /*
+ * q->lock_ptr can change between reading it and
+ * spin_lock(), causing us to take the wrong lock. This
+ * corrects the race condition.
+ *
+ * Reasoning goes like this: if we have the wrong lock,
+ * q->lock_ptr must have changed (maybe several times)
+ * between reading it and the spin_lock(). It can
+ * change again after the spin_lock() but only if it was
+ * already changed before the spin_lock(). It cannot,
+ * however, change back to the original value. Therefore
+ * we can detect whether we acquired the correct lock.
+ */
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+ __unqueue_futex(q);
+
+ BUG_ON(q->pi_state);
+
+ spin_unlock(lock_ptr);
+ ret = 1;
+ }
+
+ drop_futex_key_refs(&q->key);
+ return ret;
+}
+
+/*
+ * PI futexes can not be requeued and must remove themself from the
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q)
+ __releases(q->lock_ptr)
+{
+ __unqueue_futex(q);
+
+ BUG_ON(!q->pi_state);
+ free_pi_state(q->pi_state);
+ q->pi_state = NULL;
+
+ spin_unlock(q->lock_ptr);
+}
+
+/*
+ * Fixup the pi_state owner with the new owner.
+ *
+ * Must be called with hash bucket lock held and mm->sem held for non
+ * private futexes.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *newowner)
+{
+ u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner = pi_state->owner;
+ u32 uval, uninitialized_var(curval), newval;
+ int ret;
+
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ /*
+ * We are here either because we stole the rtmutex from the
+ * previous highest priority waiter or we are the highest priority
+ * waiter but failed to get the rtmutex the first time.
+ * We have to replace the newowner TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the pi_state
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
+ *
+ * Modifying pi_state _before_ the user space value would
+ * leave the pi_state in an inconsistent state when we fault
+ * here, because we need to drop the hash bucket lock to
+ * handle the fault. This might be observed in the PID check
+ * in lookup_pi_state.
+ */
+retry:
+ if (get_futex_value_locked(&uval, uaddr))
+ goto handle_fault;
+
+ while (1) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ goto handle_fault;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
+ if (pi_state->owner != NULL) {
+ raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ }
+
+ pi_state->owner = newowner;
+
+ raw_spin_lock_irq(&newowner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &newowner->pi_state_list);
+ raw_spin_unlock_irq(&newowner->pi_lock);
+ return 0;
+
+ /*
+ * To handle the page fault we need to drop the hash bucket
+ * lock here. That gives the other task (either the highest priority
+ * waiter itself or the task which stole the rtmutex) the
+ * chance to try the fixup of the pi_state. So once we are
+ * back from handling the fault we need to check the pi_state
+ * after reacquiring the hash bucket lock and before trying to
+ * do another fixup. When the fixup has been done already we
+ * simply return.
+ */
+handle_fault:
+ spin_unlock(q->lock_ptr);
+
+ ret = fault_in_user_writeable(uaddr);
+
+ spin_lock(q->lock_ptr);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return 0;
+
+ if (ret)
+ return ret;
+
+ goto retry;
+}
+
+static long futex_wait_restart(struct restart_block *restart);
+
+/**
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr: user address of the futex
+ * @q: futex_q (contains pi_state and access to the rt_mutex)
+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+{
+ struct task_struct *owner;
+ int ret = 0;
+
+ if (locked) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
+ */
+ if (q->pi_state->owner != current)
+ ret = fixup_pi_state_owner(uaddr, q, current);
+ goto out;
+ }
+
+ /*
+ * Catch the rare case, where the lock was released when we were on the
+ * way back before we locked the hash bucket.
+ */
+ if (q->pi_state->owner == current) {
+ /*
+ * Try to get the rt_mutex now. This might fail as some other
+ * task acquired the rt_mutex after we removed ourself from the
+ * rt_mutex waiters list.
+ */
+ if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+ locked = 1;
+ goto out;
+ }
+
+ /*
+ * pi_state is incorrect, some other task did a lock steal and
+ * we returned due to timeout or signal without taking the
+ * rt_mutex. Too late.
+ */
+ raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
+ owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+ if (!owner)
+ owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+ raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ ret = fixup_pi_state_owner(uaddr, q, owner);
+ goto out;
+ }
+
+ /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner of the rt_mutex.
+ */
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+ "pi-state %p\n", ret,
+ q->pi_state->pi_mutex.owner,
+ q->pi_state->owner);
+
+out:
+ return ret ? ret : locked;
+}
+
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb: the futex hash bucket, must be locked by the caller
+ * @q: the futex_q to queue up on
+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using set_mb() and
+ * queue_me() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ queue_me(q, hb);
+
+ /* Arm the timer */
+ if (timeout) {
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
+
+ /*
+ * If we have been removed from the hash list, then another task
+ * has tried to wake us, and we can skip the call to schedule().
+ */
+ if (likely(!plist_node_empty(&q->list))) {
+ /*
+ * If the timer has already expired, current will already be
+ * flagged for rescheduling. Only call schedule if there
+ * is no timeout, or if it has yet to expire.
+ */
+ if (!timeout || timeout->task)
+ freezable_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+ * @val: the expected value
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @q: the associated futex_q
+ * @hb: storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
+ * compare it with the expected value. Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, struct futex_hash_bucket **hb)
+{
+ u32 uval;
+ int ret;
+
+ /*
+ * Access the page AFTER the hash-bucket is locked.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
+ */
+retry:
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ *hb = queue_lock(q);
+
+ ret = get_futex_value_locked(&uval, uaddr);
+
+ if (ret) {
+ queue_unlock(*hb);
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ put_futex_key(&q->key);
+ goto retry;
+ }
+
+ if (uval != val) {
+ queue_unlock(*hb);
+ ret = -EWOULDBLOCK;
+ }
+
+out:
+ if (ret)
+ put_futex_key(&q->key);
+ return ret;
+}
+
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct restart_block *restart;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+ q.bitset = bitset;
+
+ if (abs_time) {
+ to = &timeout;
+
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+ current->timer_slack_ns);
+ }
+
+retry:
+ /*
+ * Prepare to wait on uaddr. On success, holds hb lock and increments
+ * q.key refs.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ goto out;
+
+ /* queue_me and wait for wakeup, timeout, or a signal. */
+ futex_wait_queue_me(hb, &q, to);
+
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ ret = 0;
+ /* unqueue_me() drops q.key ref */
+ if (!unqueue_me(&q))
+ goto out;
+ ret = -ETIMEDOUT;
+ if (to && !to->task)
+ goto out;
+
+ /*
+ * We expect signal_pending(current), but we might be the
+ * victim of a spurious wakeup as well.
+ */
+ if (!signal_pending(current))
+ goto retry;
+
+ ret = -ERESTARTSYS;
+ if (!abs_time)
+ goto out;
+
+ restart = &current->restart_block;
+ restart->fn = futex_wait_restart;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = abs_time->tv64;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ ret = -ERESTART_RESTARTBLOCK;
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = restart->futex.uaddr;
+ ktime_t t, *tp = NULL;
+
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+ t.tv64 = restart->futex.time;
+ tp = &t;
+ }
+ restart->fn = do_no_restart_syscall;
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
+}
+
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
+ ktime_t *time, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ if (time) {
+ to = &timeout;
+ hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires(&to->timer, *time);
+ }
+
+retry:
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+retry_private:
+ hb = queue_lock(&q);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+ if (unlikely(ret)) {
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - Task is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
+ */
+ queue_unlock(hb);
+ put_futex_key(&q.key);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
+ }
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ queue_me(&q, hb);
+
+ WARN_ON(!q.pi_state);
+ /*
+ * Block on the PI mutex:
+ */
+ if (!trylock) {
+ ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+ } else {
+ ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ }
+
+ spin_lock(q.lock_ptr);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_owner() returned an error, proprogate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ /*
+ * If fixup_owner() faulted and was unable to handle the fault, unlock
+ * it and return the fault to userspace.
+ */
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
+ rt_mutex_unlock(&q.pi_state->pi_mutex);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+
+ goto out_put_key;
+
+out_unlock_put_key:
+ queue_unlock(hb);
+
+out_put_key:
+ put_futex_key(&q.key);
+out:
+ if (to)
+ destroy_hrtimer_on_stack(&to->timer);
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+
+uaddr_faulted:
+ queue_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out_put_key;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ put_futex_key(&q.key);
+ goto retry;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+{
+ u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
+ union futex_key key = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb;
+ struct futex_q *match;
+ int ret;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != vpid)
+ return -EPERM;
+
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
+ if (ret)
+ return ret;
+
+ hb = hash_futex(&key);
+ spin_lock(&hb->lock);
+
+ /*
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
+ */
+ match = futex_top_waiter(hb, &key);
+ if (match) {
+ ret = wake_futex_pi(uaddr, uval, match);
+ /*
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ goto out_unlock;
+ }
+
+ /*
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ goto pi_faulted;
+
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
+out_unlock:
+ spin_unlock(&hb->lock);
+ put_futex_key(&key);
+ return ret;
+
+pi_faulted:
+ spin_unlock(&hb->lock);
+ put_futex_key(&key);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+ goto retry;
+
+ return ret;
+}
+
+/**
+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * @hb: the hash_bucket futex_q was original enqueued on
+ * @q: the futex_q woken while waiting to be requeued
+ * @key2: the futex_key of the requeue target futex
+ * @timeout: the timeout associated with the wait (NULL if none)
+ *
+ * Detect if the task was woken on the initial futex as opposed to the requeue
+ * target futex. If so, determine if it was a timeout or a signal that caused
+ * the wakeup and return the appropriate error code to the caller. Must be
+ * called with the hb lock held.
+ *
+ * Return:
+ * 0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+ struct futex_q *q, union futex_key *key2,
+ struct hrtimer_sleeper *timeout)
+{
+ int ret = 0;
+
+ /*
+ * With the hb lock held, we avoid races while we process the wakeup.
+ * We only need to hold hb (and not hb2) to ensure atomicity as the
+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+ * It can't be requeued from uaddr2 to something else since we don't
+ * support a PI aware source futex for requeue.
+ */
+ if (!match_futex(&q->key, key2)) {
+ WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
+
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -ERESTARTNOINTR;
+ }
+ return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr: the futex we initially wait on (non-pi)
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
+ * the same type, no requeueing from private to shared, etc.
+ * @val: the expected value of uaddr
+ * @abs_time: absolute timeout
+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
+ * @uaddr2: the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following--
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
+ *
+ * If 3, cleanup and return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Return:
+ * 0 - On success;
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ u32 val, ktime_t *abs_time, u32 bitset,
+ u32 __user *uaddr2)
+{
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct rt_mutex *pi_mutex = NULL;
+ struct futex_hash_bucket *hb;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
+ if (!bitset)
+ return -EINVAL;
+
+ if (abs_time) {
+ to = &timeout;
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+ current->timer_slack_ns);
+ }
+
+ /*
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+ debug_rt_mutex_init_waiter(&rt_waiter);
+ RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+ RB_CLEAR_NODE(&rt_waiter.tree_entry);
+ rt_waiter.task = NULL;
+
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
+ /*
+ * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+ * count.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
+ if (ret)
+ goto out_key2;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (match_futex(&q.key, &key2)) {
+ queue_unlock(hb);
+ ret = -EINVAL;
+ goto out_put_keys;
+ }
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ futex_wait_queue_me(hb, &q, to);
+
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+ spin_unlock(&hb->lock);
+ if (ret)
+ goto out_put_keys;
+
+ /*
+ * In order for us to be here, we know our q.key == key2, and since
+ * we took the hb->lock above, we also know that futex_requeue() has
+ * completed and we no longer have to concern ourselves with a wakeup
+ * race with the atomic proxy lock acquisition by the requeue code. The
+ * futex_requeue dropped our key1 reference and incremented our key2
+ * reference count.
+ */
+
+ /* Check if the requeue code acquired the second futex for us. */
+ if (!q.rt_waiter) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
+ spin_unlock(q.lock_ptr);
+ }
+ } else {
+ /*
+ * We have been woken up by futex_unlock_pi(), a timeout, or a
+ * signal. futex_unlock_pi() will not destroy the lock_ptr nor
+ * the pi_state.
+ */
+ WARN_ON(!q.pi_state);
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
+ debug_rt_mutex_free_waiter(&rt_waiter);
+
+ spin_lock(q.lock_ptr);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_owner(uaddr2, &q, !ret);
+ /*
+ * If fixup_owner() returned an error, proprogate that. If it
+ * acquired the lock, clear -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ /* Unqueue and drop the lock. */
+ unqueue_me_pi(&q);
+ }
+
+ /*
+ * If fixup_pi_state_owner() faulted and was unable to handle the
+ * fault, unlock the rt_mutex and return the fault to userspace.
+ */
+ if (ret == -EFAULT) {
+ if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
+ rt_mutex_unlock(pi_mutex);
+ } else if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart by calling
+ * futex_lock_pi() directly. We could restart this syscall, but
+ * it would detect that the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart and return
+ * -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+
+out_put_keys:
+ put_futex_key(&q.key);
+out_key2:
+ put_futex_key(&key2);
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list() - Set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+ size_t, len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ /*
+ * The kernel knows only one size for now:
+ */
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->robust_list = head;
+
+ return 0;
+}
+
+/**
+ * sys_get_robust_list() - Get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+SYSCALL_DEFINE3(get_robust_list, int, pid,
+ struct robust_list_head __user * __user *, head_ptr,
+ size_t __user *, len_ptr)
+{
+ struct robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(head, head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+{
+ u32 uval, uninitialized_var(nval), mval;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+ if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+ }
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS))
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ }
+ return 0;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user * __user *head,
+ unsigned int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long __user *)head))
+ return -EFAULT;
+
+ *entry = (void __user *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void exit_robust_list(struct task_struct *curr)
+{
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
+ unsigned long futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending)
+ if (handle_futex_death((void __user *)entry + futex_offset,
+ curr, pi))
+ return;
+ if (rc)
+ return;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+
+ if (pending)
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip);
+}
+
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+{
+ int cmd = op & FUTEX_CMD_MASK;
+ unsigned int flags = 0;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_UNLOCK_PI:
+ case FUTEX_TRYLOCK_PI:
+ case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_CMP_REQUEUE_PI:
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_WAIT:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ case FUTEX_WAIT_BITSET:
+ return futex_wait(uaddr, flags, val, timeout, val3);
+ case FUTEX_WAKE:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ case FUTEX_WAKE_BITSET:
+ return futex_wake(uaddr, flags, val, val3);
+ case FUTEX_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+ case FUTEX_CMP_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+ case FUTEX_WAKE_OP:
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
+ case FUTEX_LOCK_PI:
+ return futex_lock_pi(uaddr, flags, timeout, 0);
+ case FUTEX_UNLOCK_PI:
+ return futex_unlock_pi(uaddr, flags);
+ case FUTEX_TRYLOCK_PI:
+ return futex_lock_pi(uaddr, flags, NULL, 1);
+ case FUTEX_WAIT_REQUEUE_PI:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
+ case FUTEX_CMP_REQUEUE_PI:
+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+ }
+ return -ENOSYS;
+}
+
+
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
+ u32 val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
+ return -EFAULT;
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ /*
+ * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
+ * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
+ */
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+ val2 = (u32) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+
+static void __init futex_detect_cmpxchg(void)
+{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+ u32 curval;
+
+ /*
+ * This will fail and we want it. Some arch implementations do
+ * runtime detection of the futex_atomic_cmpxchg_inatomic()
+ * functionality. We want to know that before we call in any
+ * of the complex code paths. Also we want to prevent
+ * registration of robust lists in that case. NULL is
+ * guaranteed to fault and we get -EFAULT on functional
+ * implementation, the non-functional ones will return
+ * -ENOSYS.
+ */
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
+ futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+ unsigned int futex_shift;
+ unsigned long i;
+
+#if CONFIG_BASE_SMALL
+ futex_hashsize = 16;
+#else
+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+ futex_hashsize, 0,
+ futex_hashsize < 256 ? HASH_SMALL : 0,
+ &futex_shift, NULL,
+ futex_hashsize, futex_hashsize);
+ futex_hashsize = 1UL << futex_shift;
+
+ futex_detect_cmpxchg();
+
+ for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
+ plist_head_init(&futex_queues[i].chain);
+ spin_lock_init(&futex_queues[i].lock);
+ }
+
+ return 0;
+}
+__initcall(futex_init);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 000000000..55c8c9349
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,201 @@
+/*
+ * linux/kernel/futex_compat.c
+ *
+ * Futex compatibililty routines.
+ *
+ * Copyright 2006, Red Hat, Inc., Ingo Molnar
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/nsproxy.h>
+#include <linux/futex.h>
+#include <linux/ptrace.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip);
+ }
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
+ int val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (compat_get_timespec(&ts, utime))
+ return -EFAULT;
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 000000000..c92e44855
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,82 @@
+menu "GCOV-based kernel profiling"
+
+config GCOV_KERNEL
+ bool "Enable gcov-based kernel profiling"
+ depends on DEBUG_FS
+ select CONSTRUCTORS if !UML
+ default n
+ ---help---
+ This option enables gcov-based code profiling (e.g. for code coverage
+ measurements).
+
+ If unsure, say N.
+
+ Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
+ for the entire kernel. To enable profiling for specific files or
+ directories, add a line similar to the following to the respective
+ Makefile:
+
+ For a single file (e.g. main.o):
+ GCOV_PROFILE_main.o := y
+
+ For all files in one directory:
+ GCOV_PROFILE := y
+
+ To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
+ is specified, use:
+
+ GCOV_PROFILE_main.o := n
+ and:
+ GCOV_PROFILE := n
+
+ Note that the debugfs filesystem has to be mounted to access
+ profiling data.
+
+config ARCH_HAS_GCOV_PROFILE_ALL
+ def_bool n
+
+config GCOV_PROFILE_ALL
+ bool "Profile entire Kernel"
+ depends on GCOV_KERNEL
+ depends on ARCH_HAS_GCOV_PROFILE_ALL
+ default n
+ ---help---
+ This options activates profiling for the entire kernel.
+
+ If unsure, say N.
+
+ Note that a kernel compiled with profiling flags will be significantly
+ larger and run slower. Also be sure to exclude files from profiling
+ which are not linked to the kernel image to prevent linker errors.
+
+choice
+ prompt "Specify GCOV format"
+ depends on GCOV_KERNEL
+ default GCOV_FORMAT_AUTODETECT
+ ---help---
+ The gcov format is usually determined by the GCC version, but there are
+ exceptions where format changes are integrated in lower-version GCCs.
+ In such a case use this option to adjust the format used in the kernel
+ accordingly.
+
+ If unsure, choose "Autodetect".
+
+config GCOV_FORMAT_AUTODETECT
+ bool "Autodetect"
+ ---help---
+ Select this option to use the format that corresponds to your GCC
+ version.
+
+config GCOV_FORMAT_3_4
+ bool "GCC 3.4 format"
+ ---help---
+ Select this option to use the format defined by GCC 3.4.
+
+config GCOV_FORMAT_4_7
+ bool "GCC 4.7 format"
+ ---help---
+ Select this option to use the format defined by GCC 4.7.
+
+endchoice
+
+endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 000000000..752d6486b
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,7 @@
+ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+
+obj-y := base.o fs.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
+ gcc_3_4.o, gcc_4_7.o)
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 000000000..a744098e4
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,161 @@
+/*
+ * This code maintains a list of active profiling data structures.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ * Based on the gcov-kernel patch by:
+ * Hubertus Franke <frankeh@us.ibm.com>
+ * Nigel Hinds <nhinds@us.ibm.com>
+ * Rajan Ravindran <rajancr@us.ibm.com>
+ * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ * Paul Larson
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include "gcov.h"
+
+static int gcov_events_enabled;
+static DEFINE_MUTEX(gcov_lock);
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+ static unsigned int gcov_version;
+
+ mutex_lock(&gcov_lock);
+ if (gcov_version == 0) {
+ gcov_version = gcov_info_version(info);
+ /*
+ * Printing gcc's version magic may prove useful for debugging
+ * incompatibility reports.
+ */
+ pr_info("version magic: 0x%x\n", gcov_version);
+ }
+ /*
+ * Add new profiling data structure to list and inform event
+ * listener.
+ */
+ gcov_info_link(info);
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
+/**
+ * gcov_enable_events - enable event reporting through gcov_event()
+ *
+ * Turn on reporting of profiling data load/unload-events through the
+ * gcov_event() callback. Also replay all previous events once. This function
+ * is needed because some events are potentially generated too early for the
+ * callback implementation to handle them initially.
+ */
+void gcov_enable_events(void)
+{
+ struct gcov_info *info = NULL;
+
+ mutex_lock(&gcov_lock);
+ gcov_events_enabled = 1;
+
+ /* Perform event callback for previously registered entries. */
+ while ((info = gcov_info_next(info))) {
+ gcov_event(GCOV_ADD, info);
+ cond_resched();
+ }
+
+ mutex_unlock(&gcov_lock);
+}
+
+#ifdef CONFIG_MODULES
+static inline int within(void *addr, void *start, unsigned long size)
+{
+ return ((addr >= start) && (addr < start + size));
+}
+
+/* Update list and generate events when modules are unloaded. */
+static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
+ void *data)
+{
+ struct module *mod = data;
+ struct gcov_info *info = NULL;
+ struct gcov_info *prev = NULL;
+
+ if (event != MODULE_STATE_GOING)
+ return NOTIFY_OK;
+ mutex_lock(&gcov_lock);
+
+ /* Remove entries located in module from linked list. */
+ while ((info = gcov_info_next(info))) {
+ if (within(info, mod->module_core, mod->core_size)) {
+ gcov_info_unlink(prev, info);
+ if (gcov_events_enabled)
+ gcov_event(GCOV_REMOVE, info);
+ } else
+ prev = info;
+ }
+
+ mutex_unlock(&gcov_lock);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block gcov_nb = {
+ .notifier_call = gcov_module_notifier,
+};
+
+static int __init gcov_init(void)
+{
+ return register_module_notifier(&gcov_nb);
+}
+device_initcall(gcov_init);
+#endif /* CONFIG_MODULES */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 000000000..edf67c493
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,791 @@
+/*
+ * This code exports profiling data as debugfs files to userspace.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ * Based on the gcov-kernel patch by:
+ * Hubertus Franke <frankeh@us.ibm.com>
+ * Nigel Hinds <nhinds@us.ibm.com>
+ * Rajan Ravindran <rajancr@us.ibm.com>
+ * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ * Paul Larson
+ * Yi CDL Yang
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include "gcov.h"
+
+/**
+ * struct gcov_node - represents a debugfs entry
+ * @list: list head for child node list
+ * @children: child nodes
+ * @all: list head for list of all nodes
+ * @parent: parent node
+ * @loaded_info: array of pointers to profiling data sets for loaded object
+ * files.
+ * @num_loaded: number of profiling data sets for loaded object files.
+ * @unloaded_info: accumulated copy of profiling data sets for unloaded
+ * object files. Used only when gcov_persist=1.
+ * @dentry: main debugfs entry, either a directory or data file
+ * @links: associated symbolic links
+ * @name: data file basename
+ *
+ * struct gcov_node represents an entity within the gcov/ subdirectory
+ * of debugfs. There are directory and data file nodes. The latter represent
+ * the actual synthesized data file plus any associated symbolic links which
+ * are needed by the gcov tool to work correctly.
+ */
+struct gcov_node {
+ struct list_head list;
+ struct list_head children;
+ struct list_head all;
+ struct gcov_node *parent;
+ struct gcov_info **loaded_info;
+ struct gcov_info *unloaded_info;
+ struct dentry *dentry;
+ struct dentry **links;
+ int num_loaded;
+ char name[0];
+};
+
+static const char objtree[] = OBJTREE;
+static const char srctree[] = SRCTREE;
+static struct gcov_node root_node;
+static struct dentry *reset_dentry;
+static LIST_HEAD(all_head);
+static DEFINE_MUTEX(node_lock);
+
+/* If non-zero, keep copies of profiling data for unloaded modules. */
+static int gcov_persist = 1;
+
+static int __init gcov_persist_setup(char *str)
+{
+ unsigned long val;
+
+ if (kstrtoul(str, 0, &val)) {
+ pr_warn("invalid gcov_persist parameter '%s'\n", str);
+ return 0;
+ }
+ gcov_persist = val;
+ pr_info("setting gcov_persist to %d\n", gcov_persist);
+
+ return 1;
+}
+__setup("gcov_persist=", gcov_persist_setup);
+
+/*
+ * seq_file.start() implementation for gcov data files. Note that the
+ * gcov_iterator interface is designed to be more restrictive than seq_file
+ * (no start from arbitrary position, etc.), to simplify the iterator
+ * implementation.
+ */
+static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t i;
+
+ gcov_iter_start(seq->private);
+ for (i = 0; i < *pos; i++) {
+ if (gcov_iter_next(seq->private))
+ return NULL;
+ }
+ return seq->private;
+}
+
+/* seq_file.next() implementation for gcov data files. */
+static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
+{
+ struct gcov_iterator *iter = data;
+
+ if (gcov_iter_next(iter))
+ return NULL;
+ (*pos)++;
+
+ return iter;
+}
+
+/* seq_file.show() implementation for gcov data files. */
+static int gcov_seq_show(struct seq_file *seq, void *data)
+{
+ struct gcov_iterator *iter = data;
+
+ if (gcov_iter_write(iter, seq))
+ return -EINVAL;
+ return 0;
+}
+
+static void gcov_seq_stop(struct seq_file *seq, void *data)
+{
+ /* Unused. */
+}
+
+static const struct seq_operations gcov_seq_ops = {
+ .start = gcov_seq_start,
+ .next = gcov_seq_next,
+ .show = gcov_seq_show,
+ .stop = gcov_seq_stop,
+};
+
+/*
+ * Return a profiling data set associated with the given node. This is
+ * either a data set for a loaded object file or a data set copy in case
+ * all associated object files have been unloaded.
+ */
+static struct gcov_info *get_node_info(struct gcov_node *node)
+{
+ if (node->num_loaded > 0)
+ return node->loaded_info[0];
+
+ return node->unloaded_info;
+}
+
+/*
+ * Return a newly allocated profiling data set which contains the sum of
+ * all profiling data associated with the given node.
+ */
+static struct gcov_info *get_accumulated_info(struct gcov_node *node)
+{
+ struct gcov_info *info;
+ int i = 0;
+
+ if (node->unloaded_info)
+ info = gcov_info_dup(node->unloaded_info);
+ else
+ info = gcov_info_dup(node->loaded_info[i++]);
+ if (!info)
+ return NULL;
+ for (; i < node->num_loaded; i++)
+ gcov_info_add(info, node->loaded_info[i]);
+
+ return info;
+}
+
+/*
+ * open() implementation for gcov data files. Create a copy of the profiling
+ * data set and initialize the iterator and seq_file interface.
+ */
+static int gcov_seq_open(struct inode *inode, struct file *file)
+{
+ struct gcov_node *node = inode->i_private;
+ struct gcov_iterator *iter;
+ struct seq_file *seq;
+ struct gcov_info *info;
+ int rc = -ENOMEM;
+
+ mutex_lock(&node_lock);
+ /*
+ * Read from a profiling data copy to minimize reference tracking
+ * complexity and concurrent access and to keep accumulating multiple
+ * profiling data sets associated with one node simple.
+ */
+ info = get_accumulated_info(node);
+ if (!info)
+ goto out_unlock;
+ iter = gcov_iter_new(info);
+ if (!iter)
+ goto err_free_info;
+ rc = seq_open(file, &gcov_seq_ops);
+ if (rc)
+ goto err_free_iter_info;
+ seq = file->private_data;
+ seq->private = iter;
+out_unlock:
+ mutex_unlock(&node_lock);
+ return rc;
+
+err_free_iter_info:
+ gcov_iter_free(iter);
+err_free_info:
+ gcov_info_free(info);
+ goto out_unlock;
+}
+
+/*
+ * release() implementation for gcov data files. Release resources allocated
+ * by open().
+ */
+static int gcov_seq_release(struct inode *inode, struct file *file)
+{
+ struct gcov_iterator *iter;
+ struct gcov_info *info;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ iter = seq->private;
+ info = gcov_iter_get_info(iter);
+ gcov_iter_free(iter);
+ gcov_info_free(info);
+ seq_release(inode, file);
+
+ return 0;
+}
+
+/*
+ * Find a node by the associated data file name. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *get_node_by_name(const char *name)
+{
+ struct gcov_node *node;
+ struct gcov_info *info;
+
+ list_for_each_entry(node, &all_head, all) {
+ info = get_node_info(node);
+ if (info && (strcmp(gcov_info_filename(info), name) == 0))
+ return node;
+ }
+
+ return NULL;
+}
+
+/*
+ * Reset all profiling data associated with the specified node.
+ */
+static void reset_node(struct gcov_node *node)
+{
+ int i;
+
+ if (node->unloaded_info)
+ gcov_info_reset(node->unloaded_info);
+ for (i = 0; i < node->num_loaded; i++)
+ gcov_info_reset(node->loaded_info[i]);
+}
+
+static void remove_node(struct gcov_node *node);
+
+/*
+ * write() implementation for gcov data files. Reset profiling data for the
+ * corresponding file. If all associated object files have been unloaded,
+ * remove the debug fs node as well.
+ */
+static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
+ size_t len, loff_t *pos)
+{
+ struct seq_file *seq;
+ struct gcov_info *info;
+ struct gcov_node *node;
+
+ seq = file->private_data;
+ info = gcov_iter_get_info(seq->private);
+ mutex_lock(&node_lock);
+ node = get_node_by_name(gcov_info_filename(info));
+ if (node) {
+ /* Reset counts or remove node for unloaded modules. */
+ if (node->num_loaded == 0)
+ remove_node(node);
+ else
+ reset_node(node);
+ }
+ /* Reset counts for open file. */
+ gcov_info_reset(info);
+ mutex_unlock(&node_lock);
+
+ return len;
+}
+
+/*
+ * Given a string <path> representing a file path of format:
+ * path/to/file.gcda
+ * construct and return a new string:
+ * <dir/>path/to/file.<ext>
+ */
+static char *link_target(const char *dir, const char *path, const char *ext)
+{
+ char *target;
+ char *old_ext;
+ char *copy;
+
+ copy = kstrdup(path, GFP_KERNEL);
+ if (!copy)
+ return NULL;
+ old_ext = strrchr(copy, '.');
+ if (old_ext)
+ *old_ext = '\0';
+ if (dir)
+ target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
+ else
+ target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
+ kfree(copy);
+
+ return target;
+}
+
+/*
+ * Construct a string representing the symbolic link target for the given
+ * gcov data file name and link type. Depending on the link type and the
+ * location of the data file, the link target can either point to a
+ * subdirectory of srctree, objtree or in an external location.
+ */
+static char *get_link_target(const char *filename, const struct gcov_link *ext)
+{
+ const char *rel;
+ char *result;
+
+ if (strncmp(filename, objtree, strlen(objtree)) == 0) {
+ rel = filename + strlen(objtree) + 1;
+ if (ext->dir == SRC_TREE)
+ result = link_target(srctree, rel, ext->ext);
+ else
+ result = link_target(objtree, rel, ext->ext);
+ } else {
+ /* External compilation. */
+ result = link_target(NULL, filename, ext->ext);
+ }
+
+ return result;
+}
+
+#define SKEW_PREFIX ".tmp_"
+
+/*
+ * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
+ * for filename skewing caused by the mod-versioning mechanism.
+ */
+static const char *deskew(const char *basename)
+{
+ if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
+ return basename + sizeof(SKEW_PREFIX) - 1;
+ return basename;
+}
+
+/*
+ * Create links to additional files (usually .c and .gcno files) which the
+ * gcov tool expects to find in the same directory as the gcov data file.
+ */
+static void add_links(struct gcov_node *node, struct dentry *parent)
+{
+ const char *basename;
+ char *target;
+ int num;
+ int i;
+
+ for (num = 0; gcov_link[num].ext; num++)
+ /* Nothing. */;
+ node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
+ if (!node->links)
+ return;
+ for (i = 0; i < num; i++) {
+ target = get_link_target(
+ gcov_info_filename(get_node_info(node)),
+ &gcov_link[i]);
+ if (!target)
+ goto out_err;
+ basename = kbasename(target);
+ if (basename == target)
+ goto out_err;
+ node->links[i] = debugfs_create_symlink(deskew(basename),
+ parent, target);
+ if (!node->links[i])
+ goto out_err;
+ kfree(target);
+ }
+
+ return;
+out_err:
+ kfree(target);
+ while (i-- > 0)
+ debugfs_remove(node->links[i]);
+ kfree(node->links);
+ node->links = NULL;
+}
+
+static const struct file_operations gcov_data_fops = {
+ .open = gcov_seq_open,
+ .release = gcov_seq_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = gcov_seq_write,
+};
+
+/* Basic initialization of a new node. */
+static void init_node(struct gcov_node *node, struct gcov_info *info,
+ const char *name, struct gcov_node *parent)
+{
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->children);
+ INIT_LIST_HEAD(&node->all);
+ if (node->loaded_info) {
+ node->loaded_info[0] = info;
+ node->num_loaded = 1;
+ }
+ node->parent = parent;
+ if (name)
+ strcpy(node->name, name);
+}
+
+/*
+ * Create a new node and associated debugfs entry. Needs to be called with
+ * node_lock held.
+ */
+static struct gcov_node *new_node(struct gcov_node *parent,
+ struct gcov_info *info, const char *name)
+{
+ struct gcov_node *node;
+
+ node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
+ if (!node)
+ goto err_nomem;
+ if (info) {
+ node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
+ GFP_KERNEL);
+ if (!node->loaded_info)
+ goto err_nomem;
+ }
+ init_node(node, info, name, parent);
+ /* Differentiate between gcov data file nodes and directory nodes. */
+ if (info) {
+ node->dentry = debugfs_create_file(deskew(node->name), 0600,
+ parent->dentry, node, &gcov_data_fops);
+ } else
+ node->dentry = debugfs_create_dir(node->name, parent->dentry);
+ if (!node->dentry) {
+ pr_warn("could not create file\n");
+ kfree(node);
+ return NULL;
+ }
+ if (info)
+ add_links(node, parent->dentry);
+ list_add(&node->list, &parent->children);
+ list_add(&node->all, &all_head);
+
+ return node;
+
+err_nomem:
+ kfree(node);
+ pr_warn("out of memory\n");
+ return NULL;
+}
+
+/* Remove symbolic links associated with node. */
+static void remove_links(struct gcov_node *node)
+{
+ int i;
+
+ if (!node->links)
+ return;
+ for (i = 0; gcov_link[i].ext; i++)
+ debugfs_remove(node->links[i]);
+ kfree(node->links);
+ node->links = NULL;
+}
+
+/*
+ * Remove node from all lists and debugfs and release associated resources.
+ * Needs to be called with node_lock held.
+ */
+static void release_node(struct gcov_node *node)
+{
+ list_del(&node->list);
+ list_del(&node->all);
+ debugfs_remove(node->dentry);
+ remove_links(node);
+ kfree(node->loaded_info);
+ if (node->unloaded_info)
+ gcov_info_free(node->unloaded_info);
+ kfree(node);
+}
+
+/* Release node and empty parents. Needs to be called with node_lock held. */
+static void remove_node(struct gcov_node *node)
+{
+ struct gcov_node *parent;
+
+ while ((node != &root_node) && list_empty(&node->children)) {
+ parent = node->parent;
+ release_node(node);
+ node = parent;
+ }
+}
+
+/*
+ * Find child node with given basename. Needs to be called with node_lock
+ * held.
+ */
+static struct gcov_node *get_child_by_name(struct gcov_node *parent,
+ const char *name)
+{
+ struct gcov_node *node;
+
+ list_for_each_entry(node, &parent->children, list) {
+ if (strcmp(node->name, name) == 0)
+ return node;
+ }
+
+ return NULL;
+}
+
+/*
+ * write() implementation for reset file. Reset all profiling data to zero
+ * and remove nodes for which all associated object files are unloaded.
+ */
+static ssize_t reset_write(struct file *file, const char __user *addr,
+ size_t len, loff_t *pos)
+{
+ struct gcov_node *node;
+
+ mutex_lock(&node_lock);
+restart:
+ list_for_each_entry(node, &all_head, all) {
+ if (node->num_loaded > 0)
+ reset_node(node);
+ else if (list_empty(&node->children)) {
+ remove_node(node);
+ /* Several nodes may have gone - restart loop. */
+ goto restart;
+ }
+ }
+ mutex_unlock(&node_lock);
+
+ return len;
+}
+
+/* read() implementation for reset file. Unused. */
+static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
+ loff_t *pos)
+{
+ /* Allow read operation so that a recursive copy won't fail. */
+ return 0;
+}
+
+static const struct file_operations gcov_reset_fops = {
+ .write = reset_write,
+ .read = reset_read,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Create a node for a given profiling data set and add it to all lists and
+ * debugfs. Needs to be called with node_lock held.
+ */
+static void add_node(struct gcov_info *info)
+{
+ char *filename;
+ char *curr;
+ char *next;
+ struct gcov_node *parent;
+ struct gcov_node *node;
+
+ filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);
+ if (!filename)
+ return;
+ parent = &root_node;
+ /* Create directory nodes along the path. */
+ for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
+ if (curr == next)
+ continue;
+ *next = 0;
+ if (strcmp(curr, ".") == 0)
+ continue;
+ if (strcmp(curr, "..") == 0) {
+ if (!parent->parent)
+ goto err_remove;
+ parent = parent->parent;
+ continue;
+ }
+ node = get_child_by_name(parent, curr);
+ if (!node) {
+ node = new_node(parent, NULL, curr);
+ if (!node)
+ goto err_remove;
+ }
+ parent = node;
+ }
+ /* Create file node. */
+ node = new_node(parent, info, curr);
+ if (!node)
+ goto err_remove;
+out:
+ kfree(filename);
+ return;
+
+err_remove:
+ remove_node(parent);
+ goto out;
+}
+
+/*
+ * Associate a profiling data set with an existing node. Needs to be called
+ * with node_lock held.
+ */
+static void add_info(struct gcov_node *node, struct gcov_info *info)
+{
+ struct gcov_info **loaded_info;
+ int num = node->num_loaded;
+
+ /*
+ * Prepare new array. This is done first to simplify cleanup in
+ * case the new data set is incompatible, the node only contains
+ * unloaded data sets and there's not enough memory for the array.
+ */
+ loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
+ if (!loaded_info) {
+ pr_warn("could not add '%s' (out of memory)\n",
+ gcov_info_filename(info));
+ return;
+ }
+ memcpy(loaded_info, node->loaded_info,
+ num * sizeof(struct gcov_info *));
+ loaded_info[num] = info;
+ /* Check if the new data set is compatible. */
+ if (num == 0) {
+ /*
+ * A module was unloaded, modified and reloaded. The new
+ * data set replaces the copy of the last one.
+ */
+ if (!gcov_info_is_compatible(node->unloaded_info, info)) {
+ pr_warn("discarding saved data for %s "
+ "(incompatible version)\n",
+ gcov_info_filename(info));
+ gcov_info_free(node->unloaded_info);
+ node->unloaded_info = NULL;
+ }
+ } else {
+ /*
+ * Two different versions of the same object file are loaded.
+ * The initial one takes precedence.
+ */
+ if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
+ pr_warn("could not add '%s' (incompatible "
+ "version)\n", gcov_info_filename(info));
+ kfree(loaded_info);
+ return;
+ }
+ }
+ /* Overwrite previous array. */
+ kfree(node->loaded_info);
+ node->loaded_info = loaded_info;
+ node->num_loaded = num + 1;
+}
+
+/*
+ * Return the index of a profiling data set associated with a node.
+ */
+static int get_info_index(struct gcov_node *node, struct gcov_info *info)
+{
+ int i;
+
+ for (i = 0; i < node->num_loaded; i++) {
+ if (node->loaded_info[i] == info)
+ return i;
+ }
+ return -ENOENT;
+}
+
+/*
+ * Save the data of a profiling data set which is being unloaded.
+ */
+static void save_info(struct gcov_node *node, struct gcov_info *info)
+{
+ if (node->unloaded_info)
+ gcov_info_add(node->unloaded_info, info);
+ else {
+ node->unloaded_info = gcov_info_dup(info);
+ if (!node->unloaded_info) {
+ pr_warn("could not save data for '%s' "
+ "(out of memory)\n",
+ gcov_info_filename(info));
+ }
+ }
+}
+
+/*
+ * Disassociate a profiling data set from a node. Needs to be called with
+ * node_lock held.
+ */
+static void remove_info(struct gcov_node *node, struct gcov_info *info)
+{
+ int i;
+
+ i = get_info_index(node, info);
+ if (i < 0) {
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
+ return;
+ }
+ if (gcov_persist)
+ save_info(node, info);
+ /* Shrink array. */
+ node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
+ node->num_loaded--;
+ if (node->num_loaded > 0)
+ return;
+ /* Last loaded data set was removed. */
+ kfree(node->loaded_info);
+ node->loaded_info = NULL;
+ node->num_loaded = 0;
+ if (!node->unloaded_info)
+ remove_node(node);
+}
+
+/*
+ * Callback to create/remove profiling files when code compiled with
+ * -fprofile-arcs is loaded/unloaded.
+ */
+void gcov_event(enum gcov_action action, struct gcov_info *info)
+{
+ struct gcov_node *node;
+
+ mutex_lock(&node_lock);
+ node = get_node_by_name(gcov_info_filename(info));
+ switch (action) {
+ case GCOV_ADD:
+ if (node)
+ add_info(node, info);
+ else
+ add_node(info);
+ break;
+ case GCOV_REMOVE:
+ if (node)
+ remove_info(node, info);
+ else {
+ pr_warn("could not remove '%s' (not found)\n",
+ gcov_info_filename(info));
+ }
+ break;
+ }
+ mutex_unlock(&node_lock);
+}
+
+/* Create debugfs entries. */
+static __init int gcov_fs_init(void)
+{
+ int rc = -EIO;
+
+ init_node(&root_node, NULL, NULL, NULL);
+ /*
+ * /sys/kernel/debug/gcov will be parent for the reset control file
+ * and all profiling files.
+ */
+ root_node.dentry = debugfs_create_dir("gcov", NULL);
+ if (!root_node.dentry)
+ goto err_remove;
+ /*
+ * Create reset file which resets all profiling counts when written
+ * to.
+ */
+ reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
+ NULL, &gcov_reset_fops);
+ if (!reset_dentry)
+ goto err_remove;
+ /* Replay previous events to get our fs hierarchy up-to-date. */
+ gcov_enable_events();
+ return 0;
+
+err_remove:
+ pr_err("init failed\n");
+ debugfs_remove(root_node.dentry);
+
+ return rc;
+}
+device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 000000000..27bc88a35
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,562 @@
+/*
+ * This code provides functions to handle gcc's profiling data format
+ * introduced with gcc 3.4. Future versions of gcc may change the gcov
+ * format (as happened before), so all format-specific information needs
+ * to be kept modular and easily exchangeable.
+ *
+ * This file is based on gcc-internal definitions. Functions and data
+ * structures are defined to be compatible with gcc counterparts.
+ * For a better understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+#define GCOV_COUNTERS 5
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @ident: object file-unique function identifier
+ * @checksum: function checksum
+ * @n_ctrs: number of values per counter type belonging to this function
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ */
+struct gcov_fn_info {
+ unsigned int ident;
+ unsigned int checksum;
+ unsigned int n_ctrs[0];
+};
+
+/**
+ * struct gcov_ctr_info - profiling data per counter type
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ * @merge: merge function for counter values of this type (unused)
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+ void (*merge)(gcov_type *, unsigned int);
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: time stamp
+ * @filename: name of the associated gcov data file
+ * @n_functions: number of instrumented functions
+ * @functions: function data
+ * @ctr_mask: mask specifying which counter types are active
+ * @counts: counter data per counter type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ unsigned int n_functions;
+ const struct gcov_fn_info *functions;
+ unsigned int ctr_mask;
+ struct gcov_ctr_info counts[0];
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Based on gcc magic. Doesn't change
+ * at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+ return (1 << type) & info->ctr_mask;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+ unsigned int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(info, i))
+ result++;
+ }
+ return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ unsigned int active = num_counter_active(info);
+ unsigned int i;
+
+ for (i = 0; i < active; i++) {
+ memset(info->counts[i].values, 0,
+ info->counts[i].num * sizeof(gcov_type));
+ }
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
+{
+ unsigned int i;
+ unsigned int j;
+
+ for (i = 0; i < num_counter_active(dest); i++) {
+ for (j = 0; j < dest->counts[i].num; j++) {
+ dest->counts[i].values[j] +=
+ source->counts[i].values[j];
+ }
+ }
+}
+
+/* Get size of function info entry. Based on gcc magic. */
+static size_t get_fn_size(struct gcov_info *info)
+{
+ size_t size;
+
+ size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
+ sizeof(unsigned int);
+ if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
+ size = ALIGN(size, __alignof__(struct gcov_fn_info));
+ return size;
+}
+
+/* Get address of function info entry. Based on gcc magic. */
+static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
+{
+ return (struct gcov_fn_info *)
+ ((char *) info->functions + fn * get_fn_size(info));
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ unsigned int i;
+ unsigned int active;
+
+ /* Duplicate gcov_info. */
+ active = num_counter_active(info);
+ dup = kzalloc(sizeof(struct gcov_info) +
+ sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ dup->version = info->version;
+ dup->stamp = info->stamp;
+ dup->n_functions = info->n_functions;
+ dup->ctr_mask = info->ctr_mask;
+ /* Duplicate filename. */
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err_free;
+ /* Duplicate table of functions. */
+ dup->functions = kmemdup(info->functions, info->n_functions *
+ get_fn_size(info), GFP_KERNEL);
+ if (!dup->functions)
+ goto err_free;
+ /* Duplicate counter arrays. */
+ for (i = 0; i < active ; i++) {
+ struct gcov_ctr_info *ctr = &info->counts[i];
+ size_t size = ctr->num * sizeof(gcov_type);
+
+ dup->counts[i].num = ctr->num;
+ dup->counts[i].merge = ctr->merge;
+ dup->counts[i].values = vmalloc(size);
+ if (!dup->counts[i].values)
+ goto err_free;
+ memcpy(dup->counts[i].values, ctr->values, size);
+ }
+ return dup;
+
+err_free:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ unsigned int active = num_counter_active(info);
+ unsigned int i;
+
+ for (i = 0; i < active ; i++)
+ vfree(info->counts[i].values);
+ kfree(info->functions);
+ kfree(info->filename);
+ kfree(info);
+}
+
+/**
+ * struct type_info - iterator helper array
+ * @ctr_type: counter type
+ * @offset: index of the first value of the current function for this type
+ *
+ * This array is needed to convert the in-memory data format into the in-file
+ * data format:
+ *
+ * In-memory:
+ * for each counter type
+ * for each function
+ * values
+ *
+ * In-file:
+ * for each function
+ * for each counter type
+ * values
+ *
+ * See gcc source gcc/gcov-io.h for more information on data organization.
+ */
+struct type_info {
+ int ctr_type;
+ unsigned int offset;
+};
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @record: record type
+ * @function: function number
+ * @type: counter type
+ * @count: index into values array
+ * @num_types: number of counter types
+ * @type_info: helper array to get values-array offset for current function
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+
+ int record;
+ unsigned int function;
+ unsigned int type;
+ unsigned int count;
+
+ int num_types;
+ struct type_info type_info[0];
+};
+
+static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
+{
+ return get_fn_info(iter->info, iter->function);
+}
+
+static struct type_info *get_type(struct gcov_iterator *iter)
+{
+ return &iter->type_info[iter->type];
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator) +
+ num_counter_active(info) * sizeof(struct type_info),
+ GFP_KERNEL);
+ if (iter)
+ iter->info = info;
+
+ return iter;
+}
+
+/**
+ * gcov_iter_free - release memory for iterator
+ * @iter: file iterator to free
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ int i;
+
+ iter->record = 0;
+ iter->function = 0;
+ iter->type = 0;
+ iter->count = 0;
+ iter->num_types = 0;
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(iter->info, i)) {
+ iter->type_info[iter->num_types].ctr_type = i;
+ iter->type_info[iter->num_types++].offset = 0;
+ }
+ }
+}
+
+/* Mapping of logical record number to actual file content. */
+#define RECORD_FILE_MAGIC 0
+#define RECORD_GCOV_VERSION 1
+#define RECORD_TIME_STAMP 2
+#define RECORD_FUNCTION_TAG 3
+#define RECORD_FUNCTON_TAG_LEN 4
+#define RECORD_FUNCTION_IDENT 5
+#define RECORD_FUNCTION_CHECK 6
+#define RECORD_COUNT_TAG 7
+#define RECORD_COUNT_LEN 8
+#define RECORD_COUNT 9
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ switch (iter->record) {
+ case RECORD_FILE_MAGIC:
+ case RECORD_GCOV_VERSION:
+ case RECORD_FUNCTION_TAG:
+ case RECORD_FUNCTON_TAG_LEN:
+ case RECORD_FUNCTION_IDENT:
+ case RECORD_COUNT_TAG:
+ /* Advance to next record */
+ iter->record++;
+ break;
+ case RECORD_COUNT:
+ /* Advance to next count */
+ iter->count++;
+ /* fall through */
+ case RECORD_COUNT_LEN:
+ if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
+ iter->record = 9;
+ break;
+ }
+ /* Advance to next counter type */
+ get_type(iter)->offset += iter->count;
+ iter->count = 0;
+ iter->type++;
+ /* fall through */
+ case RECORD_FUNCTION_CHECK:
+ if (iter->type < iter->num_types) {
+ iter->record = 7;
+ break;
+ }
+ /* Advance to next function */
+ iter->type = 0;
+ iter->function++;
+ /* fall through */
+ case RECORD_TIME_STAMP:
+ if (iter->function < iter->info->n_functions)
+ iter->record = 3;
+ else
+ iter->record = -1;
+ break;
+ }
+ /* Check for EOF. */
+ if (iter->record == -1)
+ return -EINVAL;
+ else
+ return 0;
+}
+
+/**
+ * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file.
+ */
+static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
+{
+ return seq_write(seq, &v, sizeof(v));
+}
+
+/**
+ * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
+ * @seq: seq_file handle
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first.
+ */
+static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
+{
+ u32 data[2];
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ return seq_write(seq, data, sizeof(data));
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ int rc = -EINVAL;
+
+ switch (iter->record) {
+ case RECORD_FILE_MAGIC:
+ rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
+ break;
+ case RECORD_GCOV_VERSION:
+ rc = seq_write_gcov_u32(seq, iter->info->version);
+ break;
+ case RECORD_TIME_STAMP:
+ rc = seq_write_gcov_u32(seq, iter->info->stamp);
+ break;
+ case RECORD_FUNCTION_TAG:
+ rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
+ break;
+ case RECORD_FUNCTON_TAG_LEN:
+ rc = seq_write_gcov_u32(seq, 2);
+ break;
+ case RECORD_FUNCTION_IDENT:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
+ break;
+ case RECORD_FUNCTION_CHECK:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
+ break;
+ case RECORD_COUNT_TAG:
+ rc = seq_write_gcov_u32(seq,
+ GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
+ break;
+ case RECORD_COUNT_LEN:
+ rc = seq_write_gcov_u32(seq,
+ get_func(iter)->n_ctrs[iter->type] * 2);
+ break;
+ case RECORD_COUNT:
+ rc = seq_write_gcov_u64(seq,
+ iter->info->counts[iter->type].
+ values[iter->count + get_type(iter)->offset]);
+ break;
+ }
+ return rc;
+}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
new file mode 100644
index 000000000..826ba9fb5
--- /dev/null
+++ b/kernel/gcov/gcc_4_7.c
@@ -0,0 +1,565 @@
+/*
+ * This code provides functions to handle gcc's profiling data format
+ * introduced with gcc 4.7.
+ *
+ * This file is based heavily on gcc_3_4.c file.
+ *
+ * For a better understanding, refer to gcc source:
+ * gcc/gcov-io.h
+ * libgcc/libgcov.c
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#define GCOV_COUNTERS 9
+#else
+#define GCOV_COUNTERS 8
+#endif
+
+#define GCOV_TAG_FUNCTION_LENGTH 3
+
+static struct gcov_info *gcov_info_head;
+
+/**
+ * struct gcov_ctr_info - information about counters for a single function
+ * @num: number of counter values for this type
+ * @values: array of counter values for this type
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the values array.
+ */
+struct gcov_ctr_info {
+ unsigned int num;
+ gcov_type *values;
+};
+
+/**
+ * struct gcov_fn_info - profiling meta data per function
+ * @key: comdat key
+ * @ident: unique ident of function
+ * @lineno_checksum: function lineo_checksum
+ * @cfg_checksum: function cfg checksum
+ * @ctrs: instrumented counters
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time.
+ *
+ * Information about a single function. This uses the trailing array
+ * idiom. The number of counters is determined from the merge pointer
+ * array in gcov_info. The key is used to detect which of a set of
+ * comdat functions was selected -- it points to the gcov_info object
+ * of the object file containing the selected comdat function.
+ */
+struct gcov_fn_info {
+ const struct gcov_info *key;
+ unsigned int ident;
+ unsigned int lineno_checksum;
+ unsigned int cfg_checksum;
+ struct gcov_ctr_info ctrs[0];
+};
+
+/**
+ * struct gcov_info - profiling data per object file
+ * @version: gcov version magic indicating the gcc version used for compilation
+ * @next: list head for a singly-linked list
+ * @stamp: uniquifying time stamp
+ * @filename: name of the associated gcov data file
+ * @merge: merge functions (null for unused counter type)
+ * @n_functions: number of instrumented functions
+ * @functions: pointer to pointers to function information
+ *
+ * This data is generated by gcc during compilation and doesn't change
+ * at run-time with the exception of the next pointer.
+ */
+struct gcov_info {
+ unsigned int version;
+ struct gcov_info *next;
+ unsigned int stamp;
+ const char *filename;
+ void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
+ unsigned int n_functions;
+ struct gcov_fn_info **functions;
+};
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return gcov_info_head;
+
+ return info->next;
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ info->next = gcov_info_head;
+ gcov_info_head = info;
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ if (prev)
+ prev->next = info->next;
+ else
+ gcov_info_head = info->next;
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/*
+ * Determine whether a counter is active. Doesn't change at run-time.
+ */
+static int counter_active(struct gcov_info *info, unsigned int type)
+{
+ return info->merge[type] ? 1 : 0;
+}
+
+/* Determine number of active counters. Based on gcc magic. */
+static unsigned int num_counter_active(struct gcov_info *info)
+{
+ unsigned int i;
+ unsigned int result = 0;
+
+ for (i = 0; i < GCOV_COUNTERS; i++) {
+ if (counter_active(info, i))
+ result++;
+ }
+ return result;
+}
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ memset(ci_ptr->values, 0,
+ sizeof(gcov_type) * ci_ptr->num);
+ ci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ return (info1->stamp == info2->stamp);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_ctr_info *dci_ptr;
+ struct gcov_ctr_info *sci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int val_idx;
+
+ for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) {
+ dci_ptr = dst->functions[fi_idx]->ctrs;
+ sci_ptr = src->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(src, ct_idx))
+ continue;
+
+ for (val_idx = 0; val_idx < sci_ptr->num; val_idx++)
+ dci_ptr->values[val_idx] +=
+ sci_ptr->values[val_idx];
+
+ dci_ptr++;
+ sci_ptr++;
+ }
+ }
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_ctr_info *dci_ptr; /* dst counter info */
+ struct gcov_ctr_info *sci_ptr; /* src counter info */
+ unsigned int active;
+ unsigned int fi_idx; /* function info idx */
+ unsigned int ct_idx; /* counter type idx */
+ size_t fi_size; /* function info size */
+ size_t cv_size; /* counter values size */
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+
+ dup->next = NULL;
+ dup->filename = NULL;
+ dup->functions = NULL;
+
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err_free;
+
+ dup->functions = kcalloc(info->n_functions,
+ sizeof(struct gcov_fn_info *), GFP_KERNEL);
+ if (!dup->functions)
+ goto err_free;
+
+ active = num_counter_active(info);
+ fi_size = sizeof(struct gcov_fn_info);
+ fi_size += sizeof(struct gcov_ctr_info) * active;
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL);
+ if (!dup->functions[fi_idx])
+ goto err_free;
+
+ *(dup->functions[fi_idx]) = *(info->functions[fi_idx]);
+
+ sci_ptr = info->functions[fi_idx]->ctrs;
+ dci_ptr = dup->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++) {
+
+ cv_size = sizeof(gcov_type) * sci_ptr->num;
+
+ dci_ptr->values = vmalloc(cv_size);
+
+ if (!dci_ptr->values)
+ goto err_free;
+
+ dci_ptr->num = sci_ptr->num;
+ memcpy(dci_ptr->values, sci_ptr->values, cv_size);
+
+ sci_ptr++;
+ dci_ptr++;
+ }
+ }
+
+ return dup;
+err_free:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ unsigned int active;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ struct gcov_ctr_info *ci_ptr;
+
+ if (!info->functions)
+ goto free_info;
+
+ active = num_counter_active(info);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ if (!info->functions[fi_idx])
+ continue;
+
+ ci_ptr = info->functions[fi_idx]->ctrs;
+
+ for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
+ vfree(ci_ptr->values);
+
+ kfree(info->functions[fi_idx]);
+ }
+
+free_info:
+ kfree(info->functions);
+ kfree(info->filename);
+ kfree(info);
+}
+
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ void *buffer;
+ size_t size;
+ loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ struct gcov_ctr_info *ci_ptr;
+ unsigned int fi_idx;
+ unsigned int ct_idx;
+ unsigned int cv_idx;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->stamp);
+
+ for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
+ fi_ptr = info->functions[fi_idx];
+
+ /* Function record. */
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+ ci_ptr = fi_ptr->ctrs;
+
+ for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) {
+ if (!counter_active(info, ct_idx))
+ continue;
+
+ /* Counter record. */
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FOR_COUNTER(ct_idx));
+ pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+
+ for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
+ pos += store_gcov_u64(buffer, pos,
+ ci_ptr->values[cv_idx]);
+ }
+
+ ci_ptr++;
+ }
+ }
+
+ return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+ if (!iter)
+ goto err_free;
+
+ iter->info = info;
+ /* Dry-run to get the actual buffer size. */
+ iter->size = convert_to_gcda(NULL, info);
+ iter->buffer = vmalloc(iter->size);
+ if (!iter->buffer)
+ goto err_free;
+
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+
+err_free:
+ kfree(iter);
+ return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ vfree(iter->buffer);
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 000000000..92c8e22a2
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,85 @@
+/*
+ * Profiling infrastructure declarations.
+ *
+ * This file is based on gcc-internal definitions. Data structures are
+ * defined to be compatible with gcc counterparts. For a better
+ * understanding, refer to gcc source: gcc/gcov-io.h.
+ *
+ * Copyright IBM Corp. 2009
+ * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
+ *
+ * Uses gcc-internal data definitions.
+ */
+
+#ifndef GCOV_H
+#define GCOV_H GCOV_H
+
+#include <linux/types.h>
+
+/*
+ * Profiling data types used for gcc 3.4 and above - these are defined by
+ * gcc and need to be kept as close to the original definition as possible to
+ * remain compatible.
+ */
+#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
+#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
+#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
+#define GCOV_TAG_FOR_COUNTER(count) \
+ (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
+
+#if BITS_PER_LONG >= 64
+typedef long gcov_type;
+#else
+typedef long long gcov_type;
+#endif
+
+/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so
+ * we cannot use full definition here and they need to be placed in gcc specific
+ * implementation of gcov. This also means no direct access to the members in
+ * generic code and usage of the interface below.*/
+struct gcov_info;
+
+/* Interface to access gcov_info data */
+const char *gcov_info_filename(struct gcov_info *info);
+unsigned int gcov_info_version(struct gcov_info *info);
+struct gcov_info *gcov_info_next(struct gcov_info *info);
+void gcov_info_link(struct gcov_info *info);
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+
+/* Base interface. */
+enum gcov_action {
+ GCOV_ADD,
+ GCOV_REMOVE,
+};
+
+void gcov_event(enum gcov_action action, struct gcov_info *info);
+void gcov_enable_events(void);
+
+/* Iterator control. */
+struct seq_file;
+struct gcov_iterator;
+
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
+void gcov_iter_free(struct gcov_iterator *iter);
+void gcov_iter_start(struct gcov_iterator *iter);
+int gcov_iter_next(struct gcov_iterator *iter);
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+
+/* gcov_info control. */
+void gcov_info_reset(struct gcov_info *info);
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
+struct gcov_info *gcov_info_dup(struct gcov_info *info);
+void gcov_info_free(struct gcov_info *info);
+
+struct gcov_link {
+ enum {
+ OBJ_TREE,
+ SRC_TREE,
+ } dir;
+ const char *ext;
+};
+extern const struct gcov_link gcov_link[];
+
+#endif /* GCOV_H */
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000..74d431d25
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,277 @@
+/*
+ * Supplementary group IDs
+ */
+#include <linux/cred.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
+#include <asm/uaccess.h>
+
+struct group_info *groups_alloc(int gidsetsize)
+{
+ struct group_info *group_info;
+ int nblocks;
+ int i;
+
+ nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
+ /* Make sure we always allocate at least one indirect block pointer */
+ nblocks = nblocks ? : 1;
+ group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+ if (!group_info)
+ return NULL;
+ group_info->ngroups = gidsetsize;
+ group_info->nblocks = nblocks;
+ atomic_set(&group_info->usage, 1);
+
+ if (gidsetsize <= NGROUPS_SMALL)
+ group_info->blocks[0] = group_info->small_block;
+ else {
+ for (i = 0; i < nblocks; i++) {
+ kgid_t *b;
+ b = (void *)__get_free_page(GFP_USER);
+ if (!b)
+ goto out_undo_partial_alloc;
+ group_info->blocks[i] = b;
+ }
+ }
+ return group_info;
+
+out_undo_partial_alloc:
+ while (--i >= 0) {
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+ return NULL;
+}
+
+EXPORT_SYMBOL(groups_alloc);
+
+void groups_free(struct group_info *group_info)
+{
+ if (group_info->blocks[0] != group_info->small_block) {
+ int i;
+ for (i = 0; i < group_info->nblocks; i++)
+ free_page((unsigned long)group_info->blocks[i]);
+ }
+ kfree(group_info);
+}
+
+EXPORT_SYMBOL(groups_free);
+
+/* export the group_info to a user-space array */
+static int groups_to_user(gid_t __user *grouplist,
+ const struct group_info *group_info)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < count; i++) {
+ gid_t gid;
+ gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
+ if (put_user(gid, grouplist+i))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/* fill a group_info from a user-space array - it must be allocated already */
+static int groups_from_user(struct group_info *group_info,
+ gid_t __user *grouplist)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+ unsigned int count = group_info->ngroups;
+
+ for (i = 0; i < count; i++) {
+ gid_t gid;
+ kgid_t kgid;
+ if (get_user(gid, grouplist+i))
+ return -EFAULT;
+
+ kgid = make_kgid(user_ns, gid);
+ if (!gid_valid(kgid))
+ return -EINVAL;
+
+ GROUP_AT(group_info, i) = kgid;
+ }
+ return 0;
+}
+
+/* a simple Shell sort */
+static void groups_sort(struct group_info *group_info)
+{
+ int base, max, stride;
+ int gidsetsize = group_info->ngroups;
+
+ for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+ ; /* nothing */
+ stride /= 3;
+
+ while (stride) {
+ max = gidsetsize - stride;
+ for (base = 0; base < max; base++) {
+ int left = base;
+ int right = left + stride;
+ kgid_t tmp = GROUP_AT(group_info, right);
+
+ while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
+ GROUP_AT(group_info, right) =
+ GROUP_AT(group_info, left);
+ right = left;
+ left -= stride;
+ }
+ GROUP_AT(group_info, right) = tmp;
+ }
+ stride /= 3;
+ }
+}
+
+/* a simple bsearch */
+int groups_search(const struct group_info *group_info, kgid_t grp)
+{
+ unsigned int left, right;
+
+ if (!group_info)
+ return 0;
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ unsigned int mid = (left+right)/2;
+ if (gid_gt(grp, GROUP_AT(group_info, mid)))
+ left = mid + 1;
+ else if (gid_lt(grp, GROUP_AT(group_info, mid)))
+ right = mid;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
+ */
+void set_groups(struct cred *new, struct group_info *group_info)
+{
+ put_group_info(new->group_info);
+ groups_sort(group_info);
+ get_group_info(group_info);
+ new->group_info = group_info;
+}
+
+EXPORT_SYMBOL(set_groups);
+
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon current's task
+ * security record.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+ struct cred *new;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ set_groups(new, group_info);
+ return commit_creds(new);
+}
+
+EXPORT_SYMBOL(set_current_groups);
+
+SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ const struct cred *cred = current_cred();
+ int i;
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+
+ /* no need to grab task_lock here; it cannot change */
+ i = cred->group_info->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize) {
+ i = -EINVAL;
+ goto out;
+ }
+ if (groups_to_user(grouplist, cred->group_info)) {
+ i = -EFAULT;
+ goto out;
+ }
+ }
+out:
+ return i;
+}
+
+bool may_setgroups(void)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ return ns_capable(user_ns, CAP_SETGID) &&
+ userns_may_setgroups(user_ns);
+}
+
+/*
+ * SMP: Our groups are copy-on-write. We can set them safely
+ * without another task interfering.
+ */
+
+SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
+{
+ struct group_info *group_info;
+ int retval;
+
+ if (!may_setgroups())
+ return -EPERM;
+ if ((unsigned)gidsetsize > NGROUPS_MAX)
+ return -EINVAL;
+
+ group_info = groups_alloc(gidsetsize);
+ if (!group_info)
+ return -ENOMEM;
+ retval = groups_from_user(group_info, grouplist);
+ if (retval) {
+ put_group_info(group_info);
+ return retval;
+ }
+
+ retval = set_current_groups(group_info);
+ put_group_info(group_info);
+
+ return retval;
+}
+
+/*
+ * Check whether we're fsgid/egid or in the supplemental group..
+ */
+int in_group_p(kgid_t grp)
+{
+ const struct cred *cred = current_cred();
+ int retval = 1;
+
+ if (!gid_eq(grp, cred->fsgid))
+ retval = groups_search(cred->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_group_p);
+
+int in_egroup_p(kgid_t grp)
+{
+ const struct cred *cred = current_cred();
+ int retval = 1;
+
+ if (!gid_eq(grp, cred->egid))
+ retval = groups_search(cred->group_info, grp);
+ return retval;
+}
+
+EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 000000000..e0f90c2b5
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,251 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/export.h>
+#include <linux/sysctl.h>
+#include <linux/utsname.h>
+#include <trace/events/sched.h>
+
+/*
+ * The number of tasks checked:
+ */
+int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs to be upper-bound.
+ */
+#define HUNG_TASK_BATCHING 1024
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
+
+int __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+ int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
+
+ if (rc)
+ return rc;
+ return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = hung_task_panic,
+};
+
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ /*
+ * Ensure the task is not frozen.
+ * Also, skip vfork and any other user process that freezer should skip.
+ */
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+
+ /*
+ * When a freshly created task is scheduled once, changes its state to
+ * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
+ * musn't be checked.
+ */
+ if (unlikely(!switch_count))
+ return;
+
+ if (switch_count != t->last_switch_count) {
+ t->last_switch_count = switch_count;
+ return;
+ }
+
+ trace_sched_process_hang(t);
+
+ if (!sysctl_hung_task_warnings)
+ return;
+
+ if (sysctl_hung_task_warnings > 0)
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
+ t->comm, t->pid, timeout);
+ pr_err(" %s %s %.*s\n",
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ debug_show_held_locks(t);
+
+ touch_nmi_watchdog();
+
+ if (sysctl_hung_task_panic) {
+ trigger_all_cpu_backtrace();
+ panic("hung_task: blocked tasks");
+ }
+}
+
+/*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * to exit the grace period. For classic RCU, a reschedule is required.
+ */
+static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+ bool can_cont;
+
+ get_task_struct(g);
+ get_task_struct(t);
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ can_cont = pid_alive(g) && pid_alive(t);
+ put_task_struct(t);
+ put_task_struct(g);
+
+ return can_cont;
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
+{
+ int max_count = sysctl_hung_task_check_count;
+ int batch_count = HUNG_TASK_BATCHING;
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if (test_taint(TAINT_DIE) || did_panic)
+ return;
+
+ rcu_read_lock();
+ for_each_process_thread(g, t) {
+ if (!max_count--)
+ goto unlock;
+ if (!--batch_count) {
+ batch_count = HUNG_TASK_BATCHING;
+ if (!rcu_lock_break(g, t))
+ goto unlock;
+ }
+ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+ if (t->state == TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, timeout);
+ }
+ unlock:
+ rcu_read_unlock();
+}
+
+static unsigned long timeout_jiffies(unsigned long timeout)
+{
+ /* timeout of 0 will disable the watchdog */
+ return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto out;
+
+ wake_up_process(watchdog_task);
+
+ out:
+ return ret;
+}
+
+static atomic_t reset_hung_task = ATOMIC_INIT(0);
+
+void reset_hung_task_detector(void)
+{
+ atomic_set(&reset_hung_task, 1);
+}
+EXPORT_SYMBOL_GPL(reset_hung_task_detector);
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+ set_user_nice(current, 0);
+
+ for ( ; ; ) {
+ unsigned long timeout = sysctl_hung_task_timeout_secs;
+
+ while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+ timeout = sysctl_hung_task_timeout_secs;
+
+ if (atomic_xchg(&reset_hung_task, 0))
+ continue;
+
+ check_hung_uninterruptible_tasks(timeout);
+ }
+
+ return 0;
+}
+
+static int __init hung_task_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+ watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+ return 0;
+}
+subsys_initcall(hung_task_init);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000..9a76e3bed
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,103 @@
+menu "IRQ subsystem"
+# Options selectable by the architecture code
+
+# Make sparse irq Kconfig switch below available
+config MAY_HAVE_SPARSE_IRQ
+ bool
+
+# Legacy support, required for itanic
+config GENERIC_IRQ_LEGACY
+ bool
+
+# Enable the generic irq autoprobe mechanism
+config GENERIC_IRQ_PROBE
+ bool
+
+# Use the generic /proc/interrupts implementation
+config GENERIC_IRQ_SHOW
+ bool
+
+# Print level/edge extra information
+config GENERIC_IRQ_SHOW_LEVEL
+ bool
+
+# Facility to allocate a hardware interrupt. This is legacy support
+# and should not be used in new code. Use irq domains instead.
+config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ bool
+
+# Support for delayed migration from interrupt context
+config GENERIC_PENDING_IRQ
+ bool
+
+# Alpha specific irq affinity mechanism
+config AUTO_IRQ_AFFINITY
+ bool
+
+# Tasklet based software resend for pending interrupts on enable_irq()
+config HARDIRQS_SW_RESEND
+ bool
+
+# Preflow handler support for fasteoi (sparc64)
+config IRQ_PREFLOW_FASTEOI
+ bool
+
+# Edge style eoi based handler (cell)
+config IRQ_EDGE_EOI_HANDLER
+ bool
+
+# Generic configurable interrupt chip implementation
+config GENERIC_IRQ_CHIP
+ bool
+ select IRQ_DOMAIN
+
+# Generic irq_domain hw <--> linux irq number translation
+config IRQ_DOMAIN
+ bool
+
+# Support for hierarchical irq domains
+config IRQ_DOMAIN_HIERARCHY
+ bool
+ select IRQ_DOMAIN
+
+# Generic MSI interrupt support
+config GENERIC_MSI_IRQ
+ bool
+
+# Generic MSI hierarchical interrupt domain support
+config GENERIC_MSI_IRQ_DOMAIN
+ bool
+ select IRQ_DOMAIN_HIERARCHY
+ select GENERIC_MSI_IRQ
+
+config HANDLE_DOMAIN_IRQ
+ bool
+
+config IRQ_DOMAIN_DEBUG
+ bool "Expose hardware/virtual IRQ mapping via debugfs"
+ depends on IRQ_DOMAIN && DEBUG_FS
+ help
+ This option will show the mapping relationship between hardware irq
+ numbers and Linux irq numbers. The mapping is exposed via debugfs
+ in the file "irq_domain_mapping".
+
+ If you don't know what this means you don't need it.
+
+# Support forced irq threading
+config IRQ_FORCED_THREADING
+ bool
+
+config SPARSE_IRQ
+ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
+ ---help---
+
+ Sparse irq numbering is useful for distro kernels that want
+ to define a high CONFIG_NR_CPUS value but still want to have
+ low kernel memory footprint on smaller machines.
+
+ ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
+ out the interrupt descriptors in a more NUMA-friendly way. )
+
+ If you don't know what to do here, say N.
+
+endmenu
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
new file mode 100644
index 000000000..d12123526
--- /dev/null
+++ b/kernel/irq/Makefile
@@ -0,0 +1,9 @@
+
+obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
+obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
+obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
+obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
+obj-$(CONFIG_PM_SLEEP) += pm.o
+obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
new file mode 100644
index 000000000..0119b9d46
--- /dev/null
+++ b/kernel/irq/autoprobe.c
@@ -0,0 +1,185 @@
+/*
+ * linux/kernel/irq/autoprobe.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the interrupt probing code and driver APIs.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/async.h>
+
+#include "internals.h"
+
+/*
+ * Autodetection depends on the fact that any interrupt that
+ * comes in on to an unassigned handler will get stuck with
+ * "IRQS_WAITING" cleared and the interrupt disabled.
+ */
+static DEFINE_MUTEX(probing_active);
+
+/**
+ * probe_irq_on - begin an interrupt autodetect
+ *
+ * Commence probing for an interrupt. The interrupts are scanned
+ * and a mask of potential interrupt lines is returned.
+ *
+ */
+unsigned long probe_irq_on(void)
+{
+ struct irq_desc *desc;
+ unsigned long mask = 0;
+ int i;
+
+ /*
+ * quiesce the kernel, or at least the asynchronous portion
+ */
+ async_synchronize_full();
+ mutex_lock(&probing_active);
+ /*
+ * something may have generated an irq long ago and we want to
+ * flush such a longstanding irq before considering it as spurious.
+ */
+ for_each_irq_desc_reverse(i, desc) {
+ raw_spin_lock_irq(&desc->lock);
+ if (!desc->action && irq_settings_can_probe(desc)) {
+ /*
+ * Some chips need to know about probing in
+ * progress:
+ */
+ if (desc->irq_data.chip->irq_set_type)
+ desc->irq_data.chip->irq_set_type(&desc->irq_data,
+ IRQ_TYPE_PROBE);
+ irq_startup(desc, false);
+ }
+ raw_spin_unlock_irq(&desc->lock);
+ }
+
+ /* Wait for longstanding interrupts to trigger. */
+ msleep(20);
+
+ /*
+ * enable any unassigned irqs
+ * (we must startup again here because if a longstanding irq
+ * happened in the previous stage, it may have masked itself)
+ */
+ for_each_irq_desc_reverse(i, desc) {
+ raw_spin_lock_irq(&desc->lock);
+ if (!desc->action && irq_settings_can_probe(desc)) {
+ desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
+ if (irq_startup(desc, false))
+ desc->istate |= IRQS_PENDING;
+ }
+ raw_spin_unlock_irq(&desc->lock);
+ }
+
+ /*
+ * Wait for spurious interrupts to trigger
+ */
+ msleep(100);
+
+ /*
+ * Now filter out any obviously spurious interrupts
+ */
+ for_each_irq_desc(i, desc) {
+ raw_spin_lock_irq(&desc->lock);
+
+ if (desc->istate & IRQS_AUTODETECT) {
+ /* It triggered already - consider it spurious. */
+ if (!(desc->istate & IRQS_WAITING)) {
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
+ } else
+ if (i < 32)
+ mask |= 1 << i;
+ }
+ raw_spin_unlock_irq(&desc->lock);
+ }
+
+ return mask;
+}
+EXPORT_SYMBOL(probe_irq_on);
+
+/**
+ * probe_irq_mask - scan a bitmap of interrupt lines
+ * @val: mask of interrupts to consider
+ *
+ * Scan the interrupt lines and return a bitmap of active
+ * autodetect interrupts. The interrupt probe logic state
+ * is then returned to its previous value.
+ *
+ * Note: we need to scan all the irq's even though we will
+ * only return autodetect irq numbers - just so that we reset
+ * them all to a known state.
+ */
+unsigned int probe_irq_mask(unsigned long val)
+{
+ unsigned int mask = 0;
+ struct irq_desc *desc;
+ int i;
+
+ for_each_irq_desc(i, desc) {
+ raw_spin_lock_irq(&desc->lock);
+ if (desc->istate & IRQS_AUTODETECT) {
+ if (i < 16 && !(desc->istate & IRQS_WAITING))
+ mask |= 1 << i;
+
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
+ }
+ raw_spin_unlock_irq(&desc->lock);
+ }
+ mutex_unlock(&probing_active);
+
+ return mask & val;
+}
+EXPORT_SYMBOL(probe_irq_mask);
+
+/**
+ * probe_irq_off - end an interrupt autodetect
+ * @val: mask of potential interrupts (unused)
+ *
+ * Scans the unused interrupt lines and returns the line which
+ * appears to have triggered the interrupt. If no interrupt was
+ * found then zero is returned. If more than one interrupt is
+ * found then minus the first candidate is returned to indicate
+ * their is doubt.
+ *
+ * The interrupt probe logic state is returned to its previous
+ * value.
+ *
+ * BUGS: When used in a module (which arguably shouldn't happen)
+ * nothing prevents two IRQ probe callers from overlapping. The
+ * results of this are non-optimal.
+ */
+int probe_irq_off(unsigned long val)
+{
+ int i, irq_found = 0, nr_of_irqs = 0;
+ struct irq_desc *desc;
+
+ for_each_irq_desc(i, desc) {
+ raw_spin_lock_irq(&desc->lock);
+
+ if (desc->istate & IRQS_AUTODETECT) {
+ if (!(desc->istate & IRQS_WAITING)) {
+ if (!nr_of_irqs)
+ irq_found = i;
+ nr_of_irqs++;
+ }
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
+ }
+ raw_spin_unlock_irq(&desc->lock);
+ }
+ mutex_unlock(&probing_active);
+
+ if (nr_of_irqs > 1)
+ irq_found = -irq_found;
+
+ return irq_found;
+}
+EXPORT_SYMBOL(probe_irq_off);
+
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 000000000..eb9a4ea39
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,993 @@
+/*
+ * linux/kernel/irq/chip.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code, for irq-chip
+ * based architectures.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ */
+
+#include <linux/irq.h>
+#include <linux/msi.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/irqdomain.h>
+
+#include <trace/events/irq.h>
+
+#include "internals.h"
+
+/**
+ * irq_set_chip - set the irq chip for an irq
+ * @irq: irq number
+ * @chip: pointer to irq chip description structure
+ */
+int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+
+ if (!desc)
+ return -EINVAL;
+
+ if (!chip)
+ chip = &no_irq_chip;
+
+ desc->irq_data.chip = chip;
+ irq_put_desc_unlock(desc, flags);
+ /*
+ * For !CONFIG_SPARSE_IRQ make the irq show up in
+ * allocated_irqs.
+ */
+ irq_mark_irq(irq);
+ return 0;
+}
+EXPORT_SYMBOL(irq_set_chip);
+
+/**
+ * irq_set_type - set the irq trigger type for an irq
+ * @irq: irq number
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ */
+int irq_set_irq_type(unsigned int irq, unsigned int type)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ int ret = 0;
+
+ if (!desc)
+ return -EINVAL;
+
+ type &= IRQ_TYPE_SENSE_MASK;
+ ret = __irq_set_trigger(desc, irq, type);
+ irq_put_desc_busunlock(desc, flags);
+ return ret;
+}
+EXPORT_SYMBOL(irq_set_irq_type);
+
+/**
+ * irq_set_handler_data - set irq handler data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to interrupt specific data
+ *
+ * Set the hardware irq controller data for an irq
+ */
+int irq_set_handler_data(unsigned int irq, void *data)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+
+ if (!desc)
+ return -EINVAL;
+ desc->irq_data.handler_data = data;
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+EXPORT_SYMBOL(irq_set_handler_data);
+
+/**
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
+ *
+ * Set the MSI descriptor entry for an irq at offset
+ */
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+ struct msi_desc *entry)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+
+ if (!desc)
+ return -EINVAL;
+ desc->irq_data.msi_desc = entry;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+
+/**
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
+ *
+ * Set the MSI descriptor entry for an irq
+ */
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+{
+ return irq_set_msi_desc_off(irq, 0, entry);
+}
+
+/**
+ * irq_set_chip_data - set irq chip data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to chip specific data
+ *
+ * Set the hardware irq chip data for an irq
+ */
+int irq_set_chip_data(unsigned int irq, void *data)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+
+ if (!desc)
+ return -EINVAL;
+ desc->irq_data.chip_data = data;
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+EXPORT_SYMBOL(irq_set_chip_data);
+
+struct irq_data *irq_get_irq_data(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc ? &desc->irq_data : NULL;
+}
+EXPORT_SYMBOL_GPL(irq_get_irq_data);
+
+static void irq_state_clr_disabled(struct irq_desc *desc)
+{
+ irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
+}
+
+static void irq_state_set_disabled(struct irq_desc *desc)
+{
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+}
+
+static void irq_state_clr_masked(struct irq_desc *desc)
+{
+ irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
+}
+
+static void irq_state_set_masked(struct irq_desc *desc)
+{
+ irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
+}
+
+int irq_startup(struct irq_desc *desc, bool resend)
+{
+ int ret = 0;
+
+ irq_state_clr_disabled(desc);
+ desc->depth = 0;
+
+ irq_domain_activate_irq(&desc->irq_data);
+ if (desc->irq_data.chip->irq_startup) {
+ ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ } else {
+ irq_enable(desc);
+ }
+ if (resend)
+ check_irq_resend(desc, desc->irq_data.irq);
+ return ret;
+}
+
+void irq_shutdown(struct irq_desc *desc)
+{
+ irq_state_set_disabled(desc);
+ desc->depth = 1;
+ if (desc->irq_data.chip->irq_shutdown)
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ else if (desc->irq_data.chip->irq_disable)
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_domain_deactivate_irq(&desc->irq_data);
+ irq_state_set_masked(desc);
+}
+
+void irq_enable(struct irq_desc *desc)
+{
+ irq_state_clr_disabled(desc);
+ if (desc->irq_data.chip->irq_enable)
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ irq_state_clr_masked(desc);
+}
+
+/**
+ * irq_disable - Mark interrupt disabled
+ * @desc: irq descriptor which should be disabled
+ *
+ * If the chip does not implement the irq_disable callback, we
+ * use a lazy disable approach. That means we mark the interrupt
+ * disabled, but leave the hardware unmasked. That's an
+ * optimization because we avoid the hardware access for the
+ * common case where no interrupt happens after we marked it
+ * disabled. If an interrupt happens, then the interrupt flow
+ * handler masks the line at the hardware level and marks it
+ * pending.
+ */
+void irq_disable(struct irq_desc *desc)
+{
+ irq_state_set_disabled(desc);
+ if (desc->irq_data.chip->irq_disable) {
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ irq_state_set_masked(desc);
+ }
+}
+
+void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
+{
+ if (desc->irq_data.chip->irq_enable)
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ cpumask_set_cpu(cpu, desc->percpu_enabled);
+}
+
+void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
+{
+ if (desc->irq_data.chip->irq_disable)
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ cpumask_clear_cpu(cpu, desc->percpu_enabled);
+}
+
+static inline void mask_ack_irq(struct irq_desc *desc)
+{
+ if (desc->irq_data.chip->irq_mask_ack)
+ desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
+ else {
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ if (desc->irq_data.chip->irq_ack)
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
+ }
+ irq_state_set_masked(desc);
+}
+
+void mask_irq(struct irq_desc *desc)
+{
+ if (desc->irq_data.chip->irq_mask) {
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_state_set_masked(desc);
+ }
+}
+
+void unmask_irq(struct irq_desc *desc)
+{
+ if (desc->irq_data.chip->irq_unmask) {
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ }
+}
+
+void unmask_threaded_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ if (chip->flags & IRQCHIP_EOI_THREADED)
+ chip->irq_eoi(&desc->irq_data);
+
+ if (chip->irq_unmask) {
+ chip->irq_unmask(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ }
+}
+
+/*
+ * handle_nested_irq - Handle a nested irq from a irq thread
+ * @irq: the interrupt number
+ *
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling
+ * threads context.
+ */
+void handle_nested_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
+
+ might_sleep();
+
+ raw_spin_lock_irq(&desc->lock);
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ action = desc->action;
+ if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ goto out_unlock;
+ }
+
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ raw_spin_unlock_irq(&desc->lock);
+
+ action_ret = action->thread_fn(action->irq, action->dev_id);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
+
+ raw_spin_lock_irq(&desc->lock);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+
+out_unlock:
+ raw_spin_unlock_irq(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
+static bool irq_check_poll(struct irq_desc *desc)
+{
+ if (!(desc->istate & IRQS_POLL_INPROGRESS))
+ return false;
+ return irq_wait_for_poll(desc);
+}
+
+static bool irq_may_run(struct irq_desc *desc)
+{
+ unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+
+ /*
+ * If the interrupt is not in progress and is not an armed
+ * wakeup interrupt, proceed.
+ */
+ if (!irqd_has_set(&desc->irq_data, mask))
+ return true;
+
+ /*
+ * If the interrupt is an armed wakeup source, mark it pending
+ * and suspended, disable it and notify the pm core about the
+ * event.
+ */
+ if (irq_pm_check_wakeup(desc))
+ return false;
+
+ /*
+ * Handle a potential concurrent poll on a different core.
+ */
+ return irq_check_poll(desc);
+}
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control
+ * is necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and
+ * unmask issues if necessary.
+ */
+void
+handle_simple_irq(unsigned int irq, struct irq_desc *desc)
+{
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ goto out_unlock;
+ }
+
+ handle_irq_event(desc);
+
+out_unlock:
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_simple_irq);
+
+/*
+ * Called unconditionally from handle_level_irq() and only for oneshot
+ * interrupts from handle_fasteoi_irq()
+ */
+static void cond_unmask_irq(struct irq_desc *desc)
+{
+ /*
+ * We need to unmask in the following cases:
+ * - Standard level irq (IRQF_ONESHOT is not set)
+ * - Oneshot irq which did not wake the thread (caused by a
+ * spurious interrupt or a primary handler handling it
+ * completely).
+ */
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
+ unmask_irq(desc);
+}
+
+/**
+ * handle_level_irq - Level type irq handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Level type interrupts are active as long as the hardware line has
+ * the active level. This may require to mask the interrupt and unmask
+ * it after the associated handler has acknowledged the device, so the
+ * interrupt line is back to inactive.
+ */
+void
+handle_level_irq(unsigned int irq, struct irq_desc *desc)
+{
+ raw_spin_lock(&desc->lock);
+ mask_ack_irq(desc);
+
+ if (!irq_may_run(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ /*
+ * If its disabled or no action available
+ * keep it masked and get out of here
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ goto out_unlock;
+ }
+
+ handle_irq_event(desc);
+
+ cond_unmask_irq(desc);
+
+out_unlock:
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_level_irq);
+
+#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
+static inline void preflow_handler(struct irq_desc *desc)
+{
+ if (desc->preflow_handler)
+ desc->preflow_handler(&desc->irq_data);
+}
+#else
+static inline void preflow_handler(struct irq_desc *desc) { }
+#endif
+
+static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
+{
+ if (!(desc->istate & IRQS_ONESHOT)) {
+ chip->irq_eoi(&desc->irq_data);
+ return;
+ }
+ /*
+ * We need to unmask in the following cases:
+ * - Oneshot irq which did not wake the thread (caused by a
+ * spurious interrupt or a primary handler handling it
+ * completely).
+ */
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) {
+ chip->irq_eoi(&desc->irq_data);
+ unmask_irq(desc);
+ } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) {
+ chip->irq_eoi(&desc->irq_data);
+ }
+}
+
+/**
+ * handle_fasteoi_irq - irq handler for transparent controllers
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
+ */
+void
+handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ raw_spin_lock(&desc->lock);
+
+ if (!irq_may_run(desc))
+ goto out;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ /*
+ * If its disabled or no action available
+ * then mask it and get out of here:
+ */
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ mask_irq(desc);
+ goto out;
+ }
+
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ preflow_handler(desc);
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+
+ raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
+
+/**
+ * handle_edge_irq - edge type IRQ handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Interrupt occures on the falling and/or rising edge of a hardware
+ * signal. The occurrence is latched into the irq controller hardware
+ * and must be acked in order to be reenabled. After the ack another
+ * interrupt can happen on the same source even before the first one
+ * is handled by the associated event handler. If this happens it
+ * might be necessary to disable (mask) the interrupt depending on the
+ * controller hardware. This requires to reenable the interrupt inside
+ * of the loop which handles the interrupts which have arrived while
+ * the handler was running. If all pending interrupts are handled, the
+ * loop is left.
+ */
+void
+handle_edge_irq(unsigned int irq, struct irq_desc *desc)
+{
+ raw_spin_lock(&desc->lock);
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
+
+ /*
+ * If its disabled or no action available then mask it and get
+ * out of here.
+ */
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ /* Start handling the irq */
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
+
+ do {
+ if (unlikely(!desc->action)) {
+ mask_irq(desc);
+ goto out_unlock;
+ }
+
+ /*
+ * When another irq arrived while we were handling
+ * one, we could have masked the irq.
+ * Renable it, if it was not disabled in meantime.
+ */
+ if (unlikely(desc->istate & IRQS_PENDING)) {
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data))
+ unmask_irq(desc);
+ }
+
+ handle_irq_event(desc);
+
+ } while ((desc->istate & IRQS_PENDING) &&
+ !irqd_irq_disabled(&desc->irq_data));
+
+out_unlock:
+ raw_spin_unlock(&desc->lock);
+}
+EXPORT_SYMBOL(handle_edge_irq);
+
+#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
+/**
+ * handle_edge_eoi_irq - edge eoi type IRQ handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Similar as the above handle_edge_irq, but using eoi and w/o the
+ * mask/unmask logic.
+ */
+void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+
+ raw_spin_lock(&desc->lock);
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+
+ /*
+ * If its disabled or no action available then mask it and get
+ * out of here.
+ */
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ do {
+ if (unlikely(!desc->action))
+ goto out_eoi;
+
+ handle_irq_event(desc);
+
+ } while ((desc->istate & IRQS_PENDING) &&
+ !irqd_irq_disabled(&desc->irq_data));
+
+out_eoi:
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+#endif
+
+/**
+ * handle_percpu_irq - Per CPU local irq handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Per CPU interrupts on SMP machines without locking requirements
+ */
+void
+handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ if (chip->irq_ack)
+ chip->irq_ack(&desc->irq_data);
+
+ handle_irq_event_percpu(desc, desc->action);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+
+/**
+ * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Per CPU interrupts on SMP machines without locking requirements. Same as
+ * handle_percpu_irq() above but with the following extras:
+ *
+ * action->percpu_dev_id is a pointer to percpu variables which
+ * contain the real device id for the cpu on which this handler is
+ * called
+ */
+void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
+ irqreturn_t res;
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ if (chip->irq_ack)
+ chip->irq_ack(&desc->irq_data);
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+
+void
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+ if (!desc)
+ return;
+
+ if (!handle) {
+ handle = handle_bad_irq;
+ } else {
+ struct irq_data *irq_data = &desc->irq_data;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /*
+ * With hierarchical domains we might run into a
+ * situation where the outermost chip is not yet set
+ * up, but the inner chips are there. Instead of
+ * bailing we install the handler, but obviously we
+ * cannot enable/startup the interrupt at this point.
+ */
+ while (irq_data) {
+ if (irq_data->chip != &no_irq_chip)
+ break;
+ /*
+ * Bail out if the outer chip is not set up
+ * and the interrrupt supposed to be started
+ * right away.
+ */
+ if (WARN_ON(is_chained))
+ goto out;
+ /* Try the parent */
+ irq_data = irq_data->parent_data;
+ }
+#endif
+ if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
+ goto out;
+ }
+
+ /* Uninstall? */
+ if (handle == handle_bad_irq) {
+ if (desc->irq_data.chip != &no_irq_chip)
+ mask_ack_irq(desc);
+ irq_state_set_disabled(desc);
+ desc->depth = 1;
+ }
+ desc->handle_irq = handle;
+ desc->name = name;
+
+ if (handle != handle_bad_irq && is_chained) {
+ irq_settings_set_noprobe(desc);
+ irq_settings_set_norequest(desc);
+ irq_settings_set_nothread(desc);
+ irq_startup(desc, true);
+ }
+out:
+ irq_put_desc_busunlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(__irq_set_handler);
+
+void
+irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+ irq_flow_handler_t handle, const char *name)
+{
+ irq_set_chip(irq, chip);
+ __irq_set_handler(irq, handle, 0, name);
+}
+EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
+
+void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+
+ if (!desc)
+ return;
+ irq_settings_clr_and_set(desc, clr, set);
+
+ irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+ IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
+ if (irq_settings_has_no_balance_set(desc))
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ if (irq_settings_is_per_cpu(desc))
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ if (irq_settings_can_move_pcntxt(desc))
+ irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
+ if (irq_settings_is_level(desc))
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
+
+ irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+
+ irq_put_desc_unlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(irq_modify_status);
+
+/**
+ * irq_cpu_online - Invoke all irq_cpu_online functions.
+ *
+ * Iterate through all irqs and invoke the chip.irq_cpu_online()
+ * for each.
+ */
+void irq_cpu_online(void)
+{
+ struct irq_desc *desc;
+ struct irq_chip *chip;
+ unsigned long flags;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (chip && chip->irq_cpu_online &&
+ (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
+ !irqd_irq_disabled(&desc->irq_data)))
+ chip->irq_cpu_online(&desc->irq_data);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
+
+/**
+ * irq_cpu_offline - Invoke all irq_cpu_offline functions.
+ *
+ * Iterate through all irqs and invoke the chip.irq_cpu_offline()
+ * for each.
+ */
+void irq_cpu_offline(void)
+{
+ struct irq_desc *desc;
+ struct irq_chip *chip;
+ unsigned long flags;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (chip && chip->irq_cpu_offline &&
+ (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
+ !irqd_irq_disabled(&desc->irq_data)))
+ chip->irq_cpu_offline(&desc->irq_data);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_chip_ack_parent - Acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_ack(data);
+}
+
+/**
+ * irq_chip_mask_parent - Mask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask(data);
+}
+
+/**
+ * irq_chip_unmask_parent - Unmask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_unmask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_eoi_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_eoi(data);
+}
+
+/**
+ * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @dest: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Conditinal, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_affinity_parent(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_affinity)
+ return data->chip->irq_set_affinity(data, dest, force);
+
+ return -ENOSYS;
+}
+
+/**
+ * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
+ * @data: Pointer to interrupt specific data
+ *
+ * Iterate through the domain hierarchy of the interrupt and check
+ * whether a hw retrigger function exists. If yes, invoke it.
+ */
+int irq_chip_retrigger_hierarchy(struct irq_data *data)
+{
+ for (data = data->parent_data; data; data = data->parent_data)
+ if (data->chip && data->chip->irq_retrigger)
+ return data->chip->irq_retrigger(data);
+
+ return -ENOSYS;
+}
+
+/**
+ * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @on: Whether to set or reset the wake-up capability of this irq
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_wake)
+ return data->chip->irq_set_wake(data, on);
+
+ return -ENOSYS;
+}
+#endif
+
+/**
+ * irq_chip_compose_msi_msg - Componse msi message for a irq chip
+ * @data: Pointer to interrupt specific data
+ * @msg: Pointer to the MSI message
+ *
+ * For hierarchical domains we find the first chip in the hierarchy
+ * which implements the irq_compose_msi_msg callback. For non
+ * hierarchical we use the top level chip.
+ */
+int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct irq_data *pos = NULL;
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ for (; data; data = data->parent_data)
+#endif
+ if (data->chip && data->chip->irq_compose_msi_msg)
+ pos = data;
+ if (!pos)
+ return -ENOSYS;
+
+ pos->chip->irq_compose_msi_msg(pos, msg);
+
+ return 0;
+}
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000..e75e29e44
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,45 @@
+/*
+ * Debugging printout:
+ */
+
+#include <linux/kallsyms.h>
+
+#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
+#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
+/* FIXME */
+#define ___PD(f) do { } while (0)
+
+static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+ printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
+ irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
+ printk("->handle_irq(): %p, ", desc->handle_irq);
+ print_symbol("%s\n", (unsigned long)desc->handle_irq);
+ printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
+ print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
+ printk("->action(): %p\n", desc->action);
+ if (desc->action) {
+ printk("->action->handler(): %p, ", desc->action->handler);
+ print_symbol("%s\n", (unsigned long)desc->action->handler);
+ }
+
+ ___P(IRQ_LEVEL);
+ ___P(IRQ_PER_CPU);
+ ___P(IRQ_NOPROBE);
+ ___P(IRQ_NOREQUEST);
+ ___P(IRQ_NOTHREAD);
+ ___P(IRQ_NOAUTOEN);
+
+ ___PS(IRQS_AUTODETECT);
+ ___PS(IRQS_REPLAY);
+ ___PS(IRQS_WAITING);
+ ___PS(IRQS_PENDING);
+
+ ___PD(IRQS_INPROGRESS);
+ ___PD(IRQS_DISABLED);
+ ___PD(IRQS_MASKED);
+}
+
+#undef ___P
+#undef ___PS
+#undef ___PD
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
new file mode 100644
index 000000000..74d90a754
--- /dev/null
+++ b/kernel/irq/devres.c
@@ -0,0 +1,139 @@
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/gfp.h>
+
+/*
+ * Device resource management aware IRQ request/free implementation.
+ */
+struct irq_devres {
+ unsigned int irq;
+ void *dev_id;
+};
+
+static void devm_irq_release(struct device *dev, void *res)
+{
+ struct irq_devres *this = res;
+
+ free_irq(this->irq, this->dev_id);
+}
+
+static int devm_irq_match(struct device *dev, void *res, void *data)
+{
+ struct irq_devres *this = res, *match = data;
+
+ return this->irq == match->irq && this->dev_id == match->dev_id;
+}
+
+/**
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device
+ * @dev: device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @thread_fn: function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the
+ * same arguments and performs the same function as
+ * request_threaded_irq(). IRQs requested with this function will be
+ * automatically freed on driver detach.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ */
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
+{
+ struct irq_devres *dr;
+ int rc;
+
+ dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
+ GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
+ dev_id);
+ if (rc) {
+ devres_free(dr);
+ return rc;
+ }
+
+ dr->irq = irq;
+ dr->dev_id = dev_id;
+ devres_add(dev, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_threaded_irq);
+
+/**
+ * devm_request_any_context_irq - allocate an interrupt line for a managed device
+ * @dev: device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs
+ * @thread_fn: function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the
+ * same arguments and performs the same function as
+ * request_any_context_irq(). IRQs requested with this function will be
+ * automatically freed on driver detach.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ */
+int devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ struct irq_devres *dr;
+ int rc;
+
+ dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres),
+ GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
+ if (rc < 0) {
+ devres_free(dr);
+ return rc;
+ }
+
+ dr->irq = irq;
+ dr->dev_id = dev_id;
+ devres_add(dev, dr);
+
+ return rc;
+}
+EXPORT_SYMBOL(devm_request_any_context_irq);
+
+/**
+ * devm_free_irq - free an interrupt
+ * @dev: device to free interrupt for
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Except for the extra @dev argument, this function takes the
+ * same arguments and performs the same function as free_irq().
+ * This function instead of free_irq() should be used to manually
+ * free IRQs allocated with devm_request_irq().
+ */
+void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
+{
+ struct irq_devres match_data = { irq, dev_id };
+
+ WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
+ &match_data));
+ free_irq(irq, dev_id);
+}
+EXPORT_SYMBOL(devm_free_irq);
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000..2feb6feca
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the dummy interrupt chip implementation
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+
+#include "internals.h"
+
+/*
+ * What should we do if we get a hw irq event on an illegal vector?
+ * Each architecture has to answer this themself.
+ */
+static void ack_bad(struct irq_data *data)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ print_irq_desc(data->irq, desc);
+ ack_bad_irq(data->irq);
+}
+
+/*
+ * NOP functions
+ */
+static void noop(struct irq_data *data) { }
+
+static unsigned int noop_ret(struct irq_data *data)
+{
+ return 0;
+}
+
+/*
+ * Generic no controller implementation
+ */
+struct irq_chip no_irq_chip = {
+ .name = "none",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = ack_bad,
+};
+
+/*
+ * Generic dummy implementation which can be used for
+ * real dumb interrupt sources
+ */
+struct irq_chip dummy_irq_chip = {
+ .name = "dummy",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = noop,
+ .irq_mask = noop,
+ .irq_unmask = noop,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+EXPORT_SYMBOL_GPL(dummy_irq_chip);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000..61024e8ab
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,608 @@
+/*
+ * Library implementing the most common irq chip callback functions
+ *
+ * Copyright (C) 2011, Thomas Gleixner
+ */
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/syscore_ops.h>
+
+#include "internals.h"
+
+static LIST_HEAD(gc_list);
+static DEFINE_RAW_SPINLOCK(gc_lock);
+
+/**
+ * irq_gc_noop - NOOP function
+ * @d: irq_data
+ */
+void irq_gc_noop(struct irq_data *d)
+{
+}
+
+/**
+ * irq_gc_mask_disable_reg - Mask chip via disable register
+ * @d: irq_data
+ *
+ * Chip has separate enable/disable registers instead of a single mask
+ * register.
+ */
+void irq_gc_mask_disable_reg(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
+
+ irq_gc_lock(gc);
+ irq_reg_writel(gc, mask, ct->regs.disable);
+ *ct->mask_cache &= ~mask;
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
+ * @d: irq_data
+ *
+ * Chip has a single mask register. Values of this register are cached
+ * and protected by gc->lock
+ */
+void irq_gc_mask_set_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
+
+ irq_gc_lock(gc);
+ *ct->mask_cache |= mask;
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
+ irq_gc_unlock(gc);
+}
+EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
+
+/**
+ * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
+ * @d: irq_data
+ *
+ * Chip has a single mask register. Values of this register are cached
+ * and protected by gc->lock
+ */
+void irq_gc_mask_clr_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
+
+ irq_gc_lock(gc);
+ *ct->mask_cache &= ~mask;
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
+ irq_gc_unlock(gc);
+}
+EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
+
+/**
+ * irq_gc_unmask_enable_reg - Unmask chip via enable register
+ * @d: irq_data
+ *
+ * Chip has separate enable/disable registers instead of a single mask
+ * register.
+ */
+void irq_gc_unmask_enable_reg(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
+
+ irq_gc_lock(gc);
+ irq_reg_writel(gc, mask, ct->regs.enable);
+ *ct->mask_cache |= mask;
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_ack_set_bit - Ack pending interrupt via setting bit
+ * @d: irq_data
+ */
+void irq_gc_ack_set_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
+
+ irq_gc_lock(gc);
+ irq_reg_writel(gc, mask, ct->regs.ack);
+ irq_gc_unlock(gc);
+}
+EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
+
+/**
+ * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
+ * @d: irq_data
+ */
+void irq_gc_ack_clr_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = ~d->mask;
+
+ irq_gc_lock(gc);
+ irq_reg_writel(gc, mask, ct->regs.ack);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
+ * @d: irq_data
+ */
+void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
+
+ irq_gc_lock(gc);
+ irq_reg_writel(gc, mask, ct->regs.mask);
+ irq_reg_writel(gc, mask, ct->regs.ack);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_eoi - EOI interrupt
+ * @d: irq_data
+ */
+void irq_gc_eoi(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = irq_data_get_chip_type(d);
+ u32 mask = d->mask;
+
+ irq_gc_lock(gc);
+ irq_reg_writel(gc, mask, ct->regs.eoi);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_set_wake - Set/clr wake bit for an interrupt
+ * @d: irq_data
+ * @on: Indicates whether the wake bit should be set or cleared
+ *
+ * For chips where the wake from suspend functionality is not
+ * configured in a separate register and the wakeup active state is
+ * just stored in a bitmask.
+ */
+int irq_gc_set_wake(struct irq_data *d, unsigned int on)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = d->mask;
+
+ if (!(mask & gc->wake_enabled))
+ return -EINVAL;
+
+ irq_gc_lock(gc);
+ if (on)
+ gc->wake_active |= mask;
+ else
+ gc->wake_active &= ~mask;
+ irq_gc_unlock(gc);
+ return 0;
+}
+
+static u32 irq_readl_be(void __iomem *addr)
+{
+ return ioread32be(addr);
+}
+
+static void irq_writel_be(u32 val, void __iomem *addr)
+{
+ iowrite32be(val, addr);
+}
+
+static void
+irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+ int num_ct, unsigned int irq_base,
+ void __iomem *reg_base, irq_flow_handler_t handler)
+{
+ raw_spin_lock_init(&gc->lock);
+ gc->num_ct = num_ct;
+ gc->irq_base = irq_base;
+ gc->reg_base = reg_base;
+ gc->chip_types->chip.name = name;
+ gc->chip_types->handler = handler;
+}
+
+/**
+ * irq_alloc_generic_chip - Allocate a generic chip and initialize it
+ * @name: Name of the irq chip
+ * @num_ct: Number of irq_chip_type instances associated with this
+ * @irq_base: Interrupt base nr for this chip
+ * @reg_base: Register base address (virtual)
+ * @handler: Default flow handler associated with this chip
+ *
+ * Returns an initialized irq_chip_generic structure. The chip defaults
+ * to the primary (index 0) irq_chip_type and @handler
+ */
+struct irq_chip_generic *
+irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
+ void __iomem *reg_base, irq_flow_handler_t handler)
+{
+ struct irq_chip_generic *gc;
+ unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+
+ gc = kzalloc(sz, GFP_KERNEL);
+ if (gc) {
+ irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
+ handler);
+ }
+ return gc;
+}
+EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
+
+static void
+irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
+{
+ struct irq_chip_type *ct = gc->chip_types;
+ u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
+ int i;
+
+ for (i = 0; i < gc->num_ct; i++) {
+ if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
+ mskptr = &ct[i].mask_cache_priv;
+ mskreg = ct[i].regs.mask;
+ }
+ ct[i].mask_cache = mskptr;
+ if (flags & IRQ_GC_INIT_MASK_CACHE)
+ *mskptr = irq_reg_readl(gc, mskreg);
+ }
+}
+
+/**
+ * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @irqs_per_chip: Number of interrupts each chip handles
+ * @num_ct: Number of irq_chip_type instances associated with this
+ * @name: Name of the irq chip
+ * @handler: Default flow handler associated with these chips
+ * @clr: IRQ_* bits to clear in the mapping function
+ * @set: IRQ_* bits to set in the mapping function
+ * @gcflags: Generic chip specific setup flags
+ */
+int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+ int num_ct, const char *name,
+ irq_flow_handler_t handler,
+ unsigned int clr, unsigned int set,
+ enum irq_gc_flags gcflags)
+{
+ struct irq_domain_chip_generic *dgc;
+ struct irq_chip_generic *gc;
+ int numchips, sz, i;
+ unsigned long flags;
+ void *tmp;
+
+ if (d->gc)
+ return -EBUSY;
+
+ numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
+ if (!numchips)
+ return -EINVAL;
+
+ /* Allocate a pointer, generic chip and chiptypes for each chip */
+ sz = sizeof(*dgc) + numchips * sizeof(gc);
+ sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
+
+ tmp = dgc = kzalloc(sz, GFP_KERNEL);
+ if (!dgc)
+ return -ENOMEM;
+ dgc->irqs_per_chip = irqs_per_chip;
+ dgc->num_chips = numchips;
+ dgc->irq_flags_to_set = set;
+ dgc->irq_flags_to_clear = clr;
+ dgc->gc_flags = gcflags;
+ d->gc = dgc;
+
+ /* Calc pointer to the first generic chip */
+ tmp += sizeof(*dgc) + numchips * sizeof(gc);
+ for (i = 0; i < numchips; i++) {
+ /* Store the pointer to the generic chip */
+ dgc->gc[i] = gc = tmp;
+ irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
+ NULL, handler);
+
+ gc->domain = d;
+ if (gcflags & IRQ_GC_BE_IO) {
+ gc->reg_readl = &irq_readl_be;
+ gc->reg_writel = &irq_writel_be;
+ }
+
+ raw_spin_lock_irqsave(&gc_lock, flags);
+ list_add_tail(&gc->list, &gc_list);
+ raw_spin_unlock_irqrestore(&gc_lock, flags);
+ /* Calc pointer to the next generic chip */
+ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+ }
+ d->name = name;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
+
+/**
+ * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
+ * @d: irq domain pointer
+ * @hw_irq: Hardware interrupt number
+ */
+struct irq_chip_generic *
+irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+{
+ struct irq_domain_chip_generic *dgc = d->gc;
+ int idx;
+
+ if (!dgc)
+ return NULL;
+ idx = hw_irq / dgc->irqs_per_chip;
+ if (idx >= dgc->num_chips)
+ return NULL;
+ return dgc->gc[idx];
+}
+EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
+
+/*
+ * Separate lockdep class for interrupt chip which can nest irq_desc
+ * lock.
+ */
+static struct lock_class_key irq_nested_lock_class;
+
+/*
+ * irq_map_generic_chip - Map a generic chip for an irq domain
+ */
+int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
+ irq_hw_number_t hw_irq)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ struct irq_chip_generic *gc;
+ struct irq_chip_type *ct;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int idx;
+
+ if (!d->gc)
+ return -ENODEV;
+
+ idx = hw_irq / dgc->irqs_per_chip;
+ if (idx >= dgc->num_chips)
+ return -EINVAL;
+ gc = dgc->gc[idx];
+
+ idx = hw_irq % dgc->irqs_per_chip;
+
+ if (test_bit(idx, &gc->unused))
+ return -ENOTSUPP;
+
+ if (test_bit(idx, &gc->installed))
+ return -EBUSY;
+
+ ct = gc->chip_types;
+ chip = &ct->chip;
+
+ /* We only init the cache for the first mapping of a generic chip */
+ if (!gc->installed) {
+ raw_spin_lock_irqsave(&gc->lock, flags);
+ irq_gc_init_mask_cache(gc, dgc->gc_flags);
+ raw_spin_unlock_irqrestore(&gc->lock, flags);
+ }
+
+ /* Mark the interrupt as installed */
+ set_bit(idx, &gc->installed);
+
+ if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
+ irq_set_lockdep_class(virq, &irq_nested_lock_class);
+
+ if (chip->irq_calc_mask)
+ chip->irq_calc_mask(data);
+ else
+ data->mask = 1 << idx;
+
+ irq_set_chip_and_handler(virq, chip, ct->handler);
+ irq_set_chip_data(virq, gc);
+ irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_map_generic_chip);
+
+struct irq_domain_ops irq_generic_chip_ops = {
+ .map = irq_map_generic_chip,
+ .xlate = irq_domain_xlate_onetwocell,
+};
+EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
+
+/**
+ * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
+ * @gc: Generic irq chip holding all data
+ * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
+ * @flags: Flags for initialization
+ * @clr: IRQ_* bits to clear
+ * @set: IRQ_* bits to set
+ *
+ * Set up max. 32 interrupts starting from gc->irq_base. Note, this
+ * initializes all interrupts to the primary irq_chip_type and its
+ * associated handler.
+ */
+void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
+ enum irq_gc_flags flags, unsigned int clr,
+ unsigned int set)
+{
+ struct irq_chip_type *ct = gc->chip_types;
+ struct irq_chip *chip = &ct->chip;
+ unsigned int i;
+
+ raw_spin_lock(&gc_lock);
+ list_add_tail(&gc->list, &gc_list);
+ raw_spin_unlock(&gc_lock);
+
+ irq_gc_init_mask_cache(gc, flags);
+
+ for (i = gc->irq_base; msk; msk >>= 1, i++) {
+ if (!(msk & 0x01))
+ continue;
+
+ if (flags & IRQ_GC_INIT_NESTED_LOCK)
+ irq_set_lockdep_class(i, &irq_nested_lock_class);
+
+ if (!(flags & IRQ_GC_NO_MASK)) {
+ struct irq_data *d = irq_get_irq_data(i);
+
+ if (chip->irq_calc_mask)
+ chip->irq_calc_mask(d);
+ else
+ d->mask = 1 << (i - gc->irq_base);
+ }
+ irq_set_chip_and_handler(i, chip, ct->handler);
+ irq_set_chip_data(i, gc);
+ irq_modify_status(i, clr, set);
+ }
+ gc->irq_cnt = i - gc->irq_base;
+}
+EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
+
+/**
+ * irq_setup_alt_chip - Switch to alternative chip
+ * @d: irq_data for this interrupt
+ * @type: Flow type to be initialized
+ *
+ * Only to be called from chip->irq_set_type() callbacks.
+ */
+int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = gc->chip_types;
+ unsigned int i;
+
+ for (i = 0; i < gc->num_ct; i++, ct++) {
+ if (ct->type & type) {
+ d->chip = &ct->chip;
+ irq_data_to_desc(d)->handle_irq = ct->handler;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
+
+/**
+ * irq_remove_generic_chip - Remove a chip
+ * @gc: Generic irq chip holding all data
+ * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
+ * @clr: IRQ_* bits to clear
+ * @set: IRQ_* bits to set
+ *
+ * Remove up to 32 interrupts starting from gc->irq_base.
+ */
+void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
+ unsigned int clr, unsigned int set)
+{
+ unsigned int i = gc->irq_base;
+
+ raw_spin_lock(&gc_lock);
+ list_del(&gc->list);
+ raw_spin_unlock(&gc_lock);
+
+ for (; msk; msk >>= 1, i++) {
+ if (!(msk & 0x01))
+ continue;
+
+ /* Remove handler first. That will mask the irq line */
+ irq_set_handler(i, NULL);
+ irq_set_chip(i, &no_irq_chip);
+ irq_set_chip_data(i, NULL);
+ irq_modify_status(i, clr, set);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
+
+static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
+{
+ unsigned int virq;
+
+ if (!gc->domain)
+ return irq_get_irq_data(gc->irq_base);
+
+ /*
+ * We don't know which of the irqs has been actually
+ * installed. Use the first one.
+ */
+ if (!gc->installed)
+ return NULL;
+
+ virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
+ return virq ? irq_get_irq_data(virq) : NULL;
+}
+
+#ifdef CONFIG_PM
+static int irq_gc_suspend(void)
+{
+ struct irq_chip_generic *gc;
+
+ list_for_each_entry(gc, &gc_list, list) {
+ struct irq_chip_type *ct = gc->chip_types;
+
+ if (ct->chip.irq_suspend) {
+ struct irq_data *data = irq_gc_get_irq_data(gc);
+
+ if (data)
+ ct->chip.irq_suspend(data);
+ }
+ }
+ return 0;
+}
+
+static void irq_gc_resume(void)
+{
+ struct irq_chip_generic *gc;
+
+ list_for_each_entry(gc, &gc_list, list) {
+ struct irq_chip_type *ct = gc->chip_types;
+
+ if (ct->chip.irq_resume) {
+ struct irq_data *data = irq_gc_get_irq_data(gc);
+
+ if (data)
+ ct->chip.irq_resume(data);
+ }
+ }
+}
+#else
+#define irq_gc_suspend NULL
+#define irq_gc_resume NULL
+#endif
+
+static void irq_gc_shutdown(void)
+{
+ struct irq_chip_generic *gc;
+
+ list_for_each_entry(gc, &gc_list, list) {
+ struct irq_chip_type *ct = gc->chip_types;
+
+ if (ct->chip.irq_pm_shutdown) {
+ struct irq_data *data = irq_gc_get_irq_data(gc);
+
+ if (data)
+ ct->chip.irq_pm_shutdown(data);
+ }
+ }
+}
+
+static struct syscore_ops irq_gc_syscore_ops = {
+ .suspend = irq_gc_suspend,
+ .resume = irq_gc_resume,
+ .shutdown = irq_gc_shutdown,
+};
+
+static int __init irq_gc_init_ops(void)
+{
+ register_syscore_ops(&irq_gc_syscore_ops);
+ return 0;
+}
+device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
new file mode 100644
index 000000000..635480270
--- /dev/null
+++ b/kernel/irq/handle.c
@@ -0,0 +1,197 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+
+#include <linux/irq.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include <trace/events/irq.h>
+
+#include "internals.h"
+
+/**
+ * handle_bad_irq - handle spurious and unhandled irqs
+ * @irq: the interrupt number
+ * @desc: description of the interrupt
+ *
+ * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
+ */
+void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+{
+ print_irq_desc(irq, desc);
+ kstat_incr_irqs_this_cpu(irq, desc);
+ ack_bad_irq(irq);
+}
+
+/*
+ * Special, empty irq handler:
+ */
+irqreturn_t no_action(int cpl, void *dev_id)
+{
+ return IRQ_NONE;
+}
+EXPORT_SYMBOL_GPL(no_action);
+
+static void warn_no_thread(unsigned int irq, struct irqaction *action)
+{
+ if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
+ return;
+
+ printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
+ "but no thread function available.", irq, action->name);
+}
+
+void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
+{
+ /*
+ * In case the thread crashed and was killed we just pretend that
+ * we handled the interrupt. The hardirq handler has disabled the
+ * device interrupt, so no irq storm is lurking.
+ */
+ if (action->thread->flags & PF_EXITING)
+ return;
+
+ /*
+ * Wake up the handler thread for this action. If the
+ * RUNTHREAD bit is already set, nothing to do.
+ */
+ if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ return;
+
+ /*
+ * It's safe to OR the mask lockless here. We have only two
+ * places which write to threads_oneshot: This code and the
+ * irq thread.
+ *
+ * This code is the hard irq context and can never run on two
+ * cpus in parallel. If it ever does we have more serious
+ * problems than this bitmask.
+ *
+ * The irq threads of this irq which clear their "running" bit
+ * in threads_oneshot are serialized via desc->lock against
+ * each other and they are serialized against this code by
+ * IRQS_INPROGRESS.
+ *
+ * Hard irq handler:
+ *
+ * spin_lock(desc->lock);
+ * desc->state |= IRQS_INPROGRESS;
+ * spin_unlock(desc->lock);
+ * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+ * desc->threads_oneshot |= mask;
+ * spin_lock(desc->lock);
+ * desc->state &= ~IRQS_INPROGRESS;
+ * spin_unlock(desc->lock);
+ *
+ * irq thread:
+ *
+ * again:
+ * spin_lock(desc->lock);
+ * if (desc->state & IRQS_INPROGRESS) {
+ * spin_unlock(desc->lock);
+ * while(desc->state & IRQS_INPROGRESS)
+ * cpu_relax();
+ * goto again;
+ * }
+ * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ * desc->threads_oneshot &= ~mask;
+ * spin_unlock(desc->lock);
+ *
+ * So either the thread waits for us to clear IRQS_INPROGRESS
+ * or we are waiting in the flow handler for desc->lock to be
+ * released before we reach this point. The thread also checks
+ * IRQTF_RUNTHREAD under desc->lock. If set it leaves
+ * threads_oneshot untouched and runs the thread another time.
+ */
+ desc->threads_oneshot |= action->thread_mask;
+
+ /*
+ * We increment the threads_active counter in case we wake up
+ * the irq thread. The irq thread decrements the counter when
+ * it returns from the handler or in the exit path and wakes
+ * up waiters which are stuck in synchronize_irq() when the
+ * active count becomes zero. synchronize_irq() is serialized
+ * against this code (hard irq handler) via IRQS_INPROGRESS
+ * like the finalize_oneshot() code. See comment above.
+ */
+ atomic_inc(&desc->threads_active);
+
+ wake_up_process(action->thread);
+}
+
+irqreturn_t
+handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
+{
+ irqreturn_t retval = IRQ_NONE;
+ unsigned int flags = 0, irq = desc->irq_data.irq;
+
+ do {
+ irqreturn_t res;
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
+ irq, action->handler))
+ local_irq_disable();
+
+ switch (res) {
+ case IRQ_WAKE_THREAD:
+ /*
+ * Catch drivers which return WAKE_THREAD but
+ * did not set up a thread function
+ */
+ if (unlikely(!action->thread_fn)) {
+ warn_no_thread(irq, action);
+ break;
+ }
+
+ __irq_wake_thread(desc, action);
+
+ /* Fall through to add to randomness */
+ case IRQ_HANDLED:
+ flags |= action->flags;
+ break;
+
+ default:
+ break;
+ }
+
+ retval |= res;
+ action = action->next;
+ } while (action);
+
+ add_interrupt_randomness(irq, flags);
+
+ if (!noirqdebug)
+ note_interrupt(irq, desc, retval);
+ return retval;
+}
+
+irqreturn_t handle_irq_event(struct irq_desc *desc)
+{
+ struct irqaction *action = desc->action;
+ irqreturn_t ret;
+
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ raw_spin_unlock(&desc->lock);
+
+ ret = handle_irq_event_percpu(desc, action);
+
+ raw_spin_lock(&desc->lock);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ return ret;
+}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
new file mode 100644
index 000000000..df553b0af
--- /dev/null
+++ b/kernel/irq/internals.h
@@ -0,0 +1,212 @@
+/*
+ * IRQ subsystem internal functions and variables:
+ *
+ * Do not ever include this file from anything else than
+ * kernel/irq/. Do not even think about using any information outside
+ * of this file for your non core code.
+ */
+#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
+
+#ifdef CONFIG_SPARSE_IRQ
+# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+#else
+# define IRQ_BITMAP_BITS NR_IRQS
+#endif
+
+#define istate core_internal_state__do_not_mess_with_it
+
+extern bool noirqdebug;
+
+/*
+ * Bits used by threaded handlers:
+ * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
+ * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
+ * IRQTF_AFFINITY - irq thread is requested to adjust affinity
+ * IRQTF_FORCED_THREAD - irq action is force threaded
+ */
+enum {
+ IRQTF_RUNTHREAD,
+ IRQTF_WARNED,
+ IRQTF_AFFINITY,
+ IRQTF_FORCED_THREAD,
+};
+
+/*
+ * Bit masks for desc->core_internal_state__do_not_mess_with_it
+ *
+ * IRQS_AUTODETECT - autodetection in progress
+ * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
+ * detection
+ * IRQS_POLL_INPROGRESS - polling in progress
+ * IRQS_ONESHOT - irq is not unmasked in primary handler
+ * IRQS_REPLAY - irq is replayed
+ * IRQS_WAITING - irq is waiting
+ * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_SUSPENDED - irq is suspended
+ */
+enum {
+ IRQS_AUTODETECT = 0x00000001,
+ IRQS_SPURIOUS_DISABLED = 0x00000002,
+ IRQS_POLL_INPROGRESS = 0x00000008,
+ IRQS_ONESHOT = 0x00000020,
+ IRQS_REPLAY = 0x00000040,
+ IRQS_WAITING = 0x00000080,
+ IRQS_PENDING = 0x00000200,
+ IRQS_SUSPENDED = 0x00000800,
+};
+
+#include "debug.h"
+#include "settings.h"
+
+#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
+
+extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
+ unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
+
+extern int irq_startup(struct irq_desc *desc, bool resend);
+extern void irq_shutdown(struct irq_desc *desc);
+extern void irq_enable(struct irq_desc *desc);
+extern void irq_disable(struct irq_desc *desc);
+extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
+extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
+extern void mask_irq(struct irq_desc *desc);
+extern void unmask_irq(struct irq_desc *desc);
+extern void unmask_threaded_irq(struct irq_desc *desc);
+
+#ifdef CONFIG_SPARSE_IRQ
+static inline void irq_mark_irq(unsigned int irq) { }
+extern void irq_lock_sparse(void);
+extern void irq_unlock_sparse(void);
+#else
+extern void irq_mark_irq(unsigned int irq);
+static inline void irq_lock_sparse(void) { }
+static inline void irq_unlock_sparse(void) { }
+#endif
+
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
+
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
+irqreturn_t handle_irq_event(struct irq_desc *desc);
+
+/* Resending of interrupts :*/
+void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+bool irq_wait_for_poll(struct irq_desc *desc);
+void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
+
+#ifdef CONFIG_PROC_FS
+extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
+extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
+extern void register_handler_proc(unsigned int irq, struct irqaction *action);
+extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
+#else
+static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
+static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
+static inline void register_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+static inline void unregister_handler_proc(unsigned int irq,
+ struct irqaction *action) { }
+#endif
+
+extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
+
+extern void irq_set_thread_affinity(struct irq_desc *desc);
+
+extern int irq_do_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force);
+
+/* Inline functions for support of irq chips on slow busses */
+static inline void chip_bus_lock(struct irq_desc *desc)
+{
+ if (unlikely(desc->irq_data.chip->irq_bus_lock))
+ desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
+}
+
+static inline void chip_bus_sync_unlock(struct irq_desc *desc)
+{
+ if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
+ desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
+}
+
+#define _IRQ_DESC_CHECK (1 << 0)
+#define _IRQ_DESC_PERCPU (1 << 1)
+
+#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
+#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
+
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check);
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
+
+static inline struct irq_desc *
+irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
+{
+ return __irq_get_desc_lock(irq, flags, true, check);
+}
+
+static inline void
+irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+{
+ __irq_put_desc_unlock(desc, flags, true);
+}
+
+static inline struct irq_desc *
+irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
+{
+ return __irq_get_desc_lock(irq, flags, false, check);
+}
+
+static inline void
+irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
+{
+ __irq_put_desc_unlock(desc, flags, false);
+}
+
+/*
+ * Manipulation functions for irq_data.state
+ */
+static inline void irqd_set_move_pending(struct irq_data *d)
+{
+ d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+}
+
+static inline void irqd_clr_move_pending(struct irq_data *d)
+{
+ d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+}
+
+static inline void irqd_clear(struct irq_data *d, unsigned int mask)
+{
+ d->state_use_accessors &= ~mask;
+}
+
+static inline void irqd_set(struct irq_data *d, unsigned int mask)
+{
+ d->state_use_accessors |= mask;
+}
+
+static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
+{
+ return d->state_use_accessors & mask;
+}
+
+static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
+{
+ __this_cpu_inc(*desc->kstat_irqs);
+ __this_cpu_inc(kstat.irqs_sum);
+}
+
+#ifdef CONFIG_PM_SLEEP
+bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
+#else
+static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void
+irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
+static inline void
+irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
+#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000..99793b9b6
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the interrupt descriptor management code
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/radix-tree.h>
+#include <linux/bitmap.h>
+#include <linux/irqdomain.h>
+
+#include "internals.h"
+
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
+#if defined(CONFIG_SMP)
+static void __init init_irq_default_affinity(void)
+{
+ alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
+{
+ if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
+ return -ENOMEM;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
+ free_cpumask_var(desc->irq_data.affinity);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+static void desc_smp_init(struct irq_desc *desc, int node)
+{
+ desc->irq_data.node = node;
+ cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ cpumask_clear(desc->pending_mask);
+#endif
+}
+
+static inline int desc_node(struct irq_desc *desc)
+{
+ return desc->irq_data.node;
+}
+
+#else
+static inline int
+alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
+static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline int desc_node(struct irq_desc *desc) { return 0; }
+#endif
+
+static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
+ struct module *owner)
+{
+ int cpu;
+
+ desc->irq_data.irq = irq;
+ desc->irq_data.chip = &no_irq_chip;
+ desc->irq_data.chip_data = NULL;
+ desc->irq_data.handler_data = NULL;
+ desc->irq_data.msi_desc = NULL;
+ irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+ desc->handle_irq = handle_bad_irq;
+ desc->depth = 1;
+ desc->irq_count = 0;
+ desc->irqs_unhandled = 0;
+ desc->name = NULL;
+ desc->owner = owner;
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
+ desc_smp_init(desc, node);
+}
+
+int nr_irqs = NR_IRQS;
+EXPORT_SYMBOL_GPL(nr_irqs);
+
+static DEFINE_MUTEX(sparse_irq_lock);
+static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
+
+#ifdef CONFIG_SPARSE_IRQ
+
+static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
+
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+ radix_tree_insert(&irq_desc_tree, irq, desc);
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+ return radix_tree_lookup(&irq_desc_tree, irq);
+}
+EXPORT_SYMBOL(irq_to_desc);
+
+static void delete_irq_desc(unsigned int irq)
+{
+ radix_tree_delete(&irq_desc_tree, irq);
+}
+
+#ifdef CONFIG_SMP
+static void free_masks(struct irq_desc *desc)
+{
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ free_cpumask_var(desc->pending_mask);
+#endif
+ free_cpumask_var(desc->irq_data.affinity);
+}
+#else
+static inline void free_masks(struct irq_desc *desc) { }
+#endif
+
+void irq_lock_sparse(void)
+{
+ mutex_lock(&sparse_irq_lock);
+}
+
+void irq_unlock_sparse(void)
+{
+ mutex_unlock(&sparse_irq_lock);
+}
+
+static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
+{
+ struct irq_desc *desc;
+ gfp_t gfp = GFP_KERNEL;
+
+ desc = kzalloc_node(sizeof(*desc), gfp, node);
+ if (!desc)
+ return NULL;
+ /* allocate based on nr_cpu_ids */
+ desc->kstat_irqs = alloc_percpu(unsigned int);
+ if (!desc->kstat_irqs)
+ goto err_desc;
+
+ if (alloc_masks(desc, gfp, node))
+ goto err_kstat;
+
+ raw_spin_lock_init(&desc->lock);
+ lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+
+ desc_set_defaults(irq, desc, node, owner);
+
+ return desc;
+
+err_kstat:
+ free_percpu(desc->kstat_irqs);
+err_desc:
+ kfree(desc);
+ return NULL;
+}
+
+static void free_desc(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ unregister_irq_proc(irq, desc);
+
+ /*
+ * sparse_irq_lock protects also show_interrupts() and
+ * kstat_irq_usr(). Once we deleted the descriptor from the
+ * sparse tree we can free it. Access in proc will fail to
+ * lookup the descriptor.
+ */
+ mutex_lock(&sparse_irq_lock);
+ delete_irq_desc(irq);
+ mutex_unlock(&sparse_irq_lock);
+
+ free_masks(desc);
+ free_percpu(desc->kstat_irqs);
+ kfree(desc);
+}
+
+static int alloc_descs(unsigned int start, unsigned int cnt, int node,
+ struct module *owner)
+{
+ struct irq_desc *desc;
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ desc = alloc_desc(start + i, node, owner);
+ if (!desc)
+ goto err;
+ mutex_lock(&sparse_irq_lock);
+ irq_insert_desc(start + i, desc);
+ mutex_unlock(&sparse_irq_lock);
+ }
+ return start;
+
+err:
+ for (i--; i >= 0; i--)
+ free_desc(start + i);
+
+ mutex_lock(&sparse_irq_lock);
+ bitmap_clear(allocated_irqs, start, cnt);
+ mutex_unlock(&sparse_irq_lock);
+ return -ENOMEM;
+}
+
+static int irq_expand_nr_irqs(unsigned int nr)
+{
+ if (nr > IRQ_BITMAP_BITS)
+ return -ENOMEM;
+ nr_irqs = nr;
+ return 0;
+}
+
+int __init early_irq_init(void)
+{
+ int i, initcnt, node = first_online_node;
+ struct irq_desc *desc;
+
+ init_irq_default_affinity();
+
+ /* Let arch update nr_irqs and return the nr of preallocated irqs */
+ initcnt = arch_probe_nr_irqs();
+ printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+
+ if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
+ nr_irqs = IRQ_BITMAP_BITS;
+
+ if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
+ initcnt = IRQ_BITMAP_BITS;
+
+ if (initcnt > nr_irqs)
+ nr_irqs = initcnt;
+
+ for (i = 0; i < initcnt; i++) {
+ desc = alloc_desc(i, node, NULL);
+ set_bit(i, allocated_irqs);
+ irq_insert_desc(i, desc);
+ }
+ return arch_early_irq_init();
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
+ [0 ... NR_IRQS-1] = {
+ .handle_irq = handle_bad_irq,
+ .depth = 1,
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
+ }
+};
+
+int __init early_irq_init(void)
+{
+ int count, i, node = first_online_node;
+ struct irq_desc *desc;
+
+ init_irq_default_affinity();
+
+ printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
+ desc = irq_desc;
+ count = ARRAY_SIZE(irq_desc);
+
+ for (i = 0; i < count; i++) {
+ desc[i].kstat_irqs = alloc_percpu(unsigned int);
+ alloc_masks(&desc[i], GFP_KERNEL, node);
+ raw_spin_lock_init(&desc[i].lock);
+ lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ desc_set_defaults(i, &desc[i], node, NULL);
+ }
+ return arch_early_irq_init();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+ return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+}
+EXPORT_SYMBOL(irq_to_desc);
+
+static void free_desc(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc_set_defaults(irq, desc, desc_node(desc), NULL);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
+ struct module *owner)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++) {
+ struct irq_desc *desc = irq_to_desc(start + i);
+
+ desc->owner = owner;
+ }
+ return start;
+}
+
+static int irq_expand_nr_irqs(unsigned int nr)
+{
+ return -ENOMEM;
+}
+
+void irq_mark_irq(unsigned int irq)
+{
+ mutex_lock(&sparse_irq_lock);
+ bitmap_set(allocated_irqs, irq, 1);
+ mutex_unlock(&sparse_irq_lock);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_LEGACY
+void irq_init_desc(unsigned int irq)
+{
+ free_desc(irq);
+}
+#endif
+
+#endif /* !CONFIG_SPARSE_IRQ */
+
+/**
+ * generic_handle_irq - Invoke the handler for a particular irq
+ * @irq: The irq number to handle
+ *
+ */
+int generic_handle_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return -EINVAL;
+ generic_handle_irq_desc(irq, desc);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq);
+
+#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+/**
+ * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @lookup: Whether to perform the domain lookup or not
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
+ bool lookup, struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq = hwirq;
+ int ret = 0;
+
+ irq_enter();
+
+#ifdef CONFIG_IRQ_DOMAIN
+ if (lookup)
+ irq = irq_find_mapping(domain, hwirq);
+#endif
+
+ /*
+ * Some hardware gives randomly wrong interrupts. Rather
+ * than crashing, do something sensible.
+ */
+ if (unlikely(!irq || irq >= nr_irqs)) {
+ ack_bad_irq(irq);
+ ret = -EINVAL;
+ } else {
+ generic_handle_irq(irq);
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
+
+/* Dynamic interrupt handling */
+
+/**
+ * irq_free_descs - free irq descriptors
+ * @from: Start of descriptor range
+ * @cnt: Number of consecutive irqs to free
+ */
+void irq_free_descs(unsigned int from, unsigned int cnt)
+{
+ int i;
+
+ if (from >= nr_irqs || (from + cnt) > nr_irqs)
+ return;
+
+ for (i = 0; i < cnt; i++)
+ free_desc(from + i);
+
+ mutex_lock(&sparse_irq_lock);
+ bitmap_clear(allocated_irqs, from, cnt);
+ mutex_unlock(&sparse_irq_lock);
+}
+EXPORT_SYMBOL_GPL(irq_free_descs);
+
+/**
+ * irq_alloc_descs - allocate and initialize a range of irq descriptors
+ * @irq: Allocate for specific irq number if irq >= 0
+ * @from: Start the search from this irq number
+ * @cnt: Number of consecutive irqs to allocate.
+ * @node: Preferred node on which the irq descriptor should be allocated
+ * @owner: Owning module (can be NULL)
+ *
+ * Returns the first irq number or error code
+ */
+int __ref
+__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
+ struct module *owner)
+{
+ int start, ret;
+
+ if (!cnt)
+ return -EINVAL;
+
+ if (irq >= 0) {
+ if (from > irq)
+ return -EINVAL;
+ from = irq;
+ } else {
+ /*
+ * For interrupts which are freely allocated the
+ * architecture can force a lower bound to the @from
+ * argument. x86 uses this to exclude the GSI space.
+ */
+ from = arch_dynirq_lower_bound(from);
+ }
+
+ mutex_lock(&sparse_irq_lock);
+
+ start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
+ from, cnt, 0);
+ ret = -EEXIST;
+ if (irq >=0 && start != irq)
+ goto err;
+
+ if (start + cnt > nr_irqs) {
+ ret = irq_expand_nr_irqs(start + cnt);
+ if (ret)
+ goto err;
+ }
+
+ bitmap_set(allocated_irqs, start, cnt);
+ mutex_unlock(&sparse_irq_lock);
+ return alloc_descs(start, cnt, node, owner);
+
+err:
+ mutex_unlock(&sparse_irq_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__irq_alloc_descs);
+
+#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+/**
+ * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
+ * @cnt: number of interrupts to allocate
+ * @node: node on which to allocate
+ *
+ * Returns an interrupt number > 0 or 0, if the allocation fails.
+ */
+unsigned int irq_alloc_hwirqs(int cnt, int node)
+{
+ int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
+
+ if (irq < 0)
+ return 0;
+
+ for (i = irq; cnt > 0; i++, cnt--) {
+ if (arch_setup_hwirq(i, node))
+ goto err;
+ irq_clear_status_flags(i, _IRQ_NOREQUEST);
+ }
+ return irq;
+
+err:
+ for (i--; i >= irq; i--) {
+ irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
+ arch_teardown_hwirq(i);
+ }
+ irq_free_descs(irq, cnt);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
+
+/**
+ * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
+ * @from: Free from irq number
+ * @cnt: number of interrupts to free
+ *
+ */
+void irq_free_hwirqs(unsigned int from, int cnt)
+{
+ int i, j;
+
+ for (i = from, j = cnt; j > 0; i++, j--) {
+ irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
+ arch_teardown_hwirq(i);
+ }
+ irq_free_descs(from, cnt);
+}
+EXPORT_SYMBOL_GPL(irq_free_hwirqs);
+#endif
+
+/**
+ * irq_get_next_irq - get next allocated irq number
+ * @offset: where to start the search
+ *
+ * Returns next irq number after offset or nr_irqs if none is found.
+ */
+unsigned int irq_get_next_irq(unsigned int offset)
+{
+ return find_next_bit(allocated_irqs, nr_irqs, offset);
+}
+
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ if (check & _IRQ_DESC_CHECK) {
+ if ((check & _IRQ_DESC_PERCPU) &&
+ !irq_settings_is_per_cpu_devid(desc))
+ return NULL;
+
+ if (!(check & _IRQ_DESC_PERCPU) &&
+ irq_settings_is_per_cpu_devid(desc))
+ return NULL;
+ }
+
+ if (bus)
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, *flags);
+ }
+ return desc;
+}
+
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+{
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (bus)
+ chip_bus_sync_unlock(desc);
+}
+
+int irq_set_percpu_devid(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return -EINVAL;
+
+ if (desc->percpu_enabled)
+ return -EINVAL;
+
+ desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
+
+ if (!desc->percpu_enabled)
+ return -ENOMEM;
+
+ irq_set_percpu_devid_flags(irq);
+ return 0;
+}
+
+void kstat_incr_irq_this_cpu(unsigned int irq)
+{
+ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+}
+
+/**
+ * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
+ * @irq: The interrupt number
+ * @cpu: The cpu number
+ *
+ * Returns the sum of interrupt counts on @cpu since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc && desc->kstat_irqs ?
+ *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+}
+
+/**
+ * kstat_irqs - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. The caller must ensure that the interrupt is not removed
+ * concurrently.
+ */
+unsigned int kstat_irqs(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int cpu;
+ int sum = 0;
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ for_each_possible_cpu(cpu)
+ sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
+ return sum;
+}
+
+/**
+ * kstat_irqs_usr - Get the statistics for an interrupt
+ * @irq: The interrupt number
+ *
+ * Returns the sum of interrupt counts on all cpus since boot for
+ * @irq. Contrary to kstat_irqs() this can be called from any
+ * preemptible context. It's protected against concurrent removal of
+ * an interrupt descriptor when sparse irqs are enabled.
+ */
+unsigned int kstat_irqs_usr(unsigned int irq)
+{
+ int sum;
+
+ irq_lock_sparse();
+ sum = kstat_irqs(irq);
+ irq_unlock_sparse();
+ return sum;
+}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
new file mode 100644
index 000000000..7fac31105
--- /dev/null
+++ b/kernel/irq/irqdomain.c
@@ -0,0 +1,1238 @@
+#define pr_fmt(fmt) "irq: " fmt
+
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqdomain.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/topology.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+
+static LIST_HEAD(irq_domain_list);
+static DEFINE_MUTEX(irq_domain_mutex);
+
+static DEFINE_MUTEX(revmap_trees_mutex);
+static struct irq_domain *irq_default_domain;
+
+static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
+ irq_hw_number_t hwirq, int node);
+static void irq_domain_check_hierarchy(struct irq_domain *domain);
+
+/**
+ * __irq_domain_add() - Allocate a new irq_domain data structure
+ * @of_node: optional device-tree node of the interrupt controller
+ * @size: Size of linear map; 0 for radix mapping only
+ * @hwirq_max: Maximum number of interrupts supported by controller
+ * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
+ * direct mapping
+ * @ops: domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates and initialize and irq_domain structure.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
+ irq_hw_number_t hwirq_max, int direct_max,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
+ GFP_KERNEL, of_node_to_nid(of_node));
+ if (WARN_ON(!domain))
+ return NULL;
+
+ /* Fill structure */
+ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
+ domain->ops = ops;
+ domain->host_data = host_data;
+ domain->of_node = of_node_get(of_node);
+ domain->hwirq_max = hwirq_max;
+ domain->revmap_size = size;
+ domain->revmap_direct_max_irq = direct_max;
+ irq_domain_check_hierarchy(domain);
+
+ mutex_lock(&irq_domain_mutex);
+ list_add(&domain->link, &irq_domain_list);
+ mutex_unlock(&irq_domain_mutex);
+
+ pr_debug("Added domain %s\n", domain->name);
+ return domain;
+}
+EXPORT_SYMBOL_GPL(__irq_domain_add);
+
+/**
+ * irq_domain_remove() - Remove an irq domain.
+ * @domain: domain to remove
+ *
+ * This routine is used to remove an irq domain. The caller must ensure
+ * that all mappings within the domain have been disposed of prior to
+ * use, depending on the revmap type.
+ */
+void irq_domain_remove(struct irq_domain *domain)
+{
+ mutex_lock(&irq_domain_mutex);
+
+ /*
+ * radix_tree_delete() takes care of destroying the root
+ * node when all entries are removed. Shout if there are
+ * any mappings left.
+ */
+ WARN_ON(domain->revmap_tree.height);
+
+ list_del(&domain->link);
+
+ /*
+ * If the going away domain is the default one, reset it.
+ */
+ if (unlikely(irq_default_domain == domain))
+ irq_set_default_host(NULL);
+
+ mutex_unlock(&irq_domain_mutex);
+
+ pr_debug("Removed domain %s\n", domain->name);
+
+ of_node_put(domain->of_node);
+ kfree(domain);
+}
+EXPORT_SYMBOL_GPL(irq_domain_remove);
+
+/**
+ * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
+ * @of_node: pointer to interrupt controller's device tree node.
+ * @size: total number of irqs in mapping
+ * @first_irq: first number of irq block assigned to the domain,
+ * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
+ * pre-map all of the irqs in the domain to virqs starting at first_irq.
+ * @ops: domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates an irq_domain, and optionally if first_irq is positive then also
+ * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.
+ *
+ * This is intended to implement the expected behaviour for most
+ * interrupt controllers. If device tree is used, then first_irq will be 0 and
+ * irqs get mapped dynamically on the fly. However, if the controller requires
+ * static virq assignments (non-DT boot) then it will set that up correctly.
+ */
+struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
+ unsigned int size,
+ unsigned int first_irq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
+ if (!domain)
+ return NULL;
+
+ if (first_irq > 0) {
+ if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
+ /* attempt to allocated irq_descs */
+ int rc = irq_alloc_descs(first_irq, first_irq, size,
+ of_node_to_nid(of_node));
+ if (rc < 0)
+ pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+ first_irq);
+ }
+ irq_domain_associate_many(domain, first_irq, 0, size);
+ }
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+
+/**
+ * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
+ * @of_node: pointer to interrupt controller's device tree node.
+ * @size: total number of irqs in legacy mapping
+ * @first_irq: first number of irq block assigned to the domain
+ * @first_hwirq: first hwirq number to use for the translation. Should normally
+ * be '0', but a positive integer can be used if the effective
+ * hwirqs numbering does not begin at zero.
+ * @ops: map/unmap domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Note: the map() callback will be called before this function returns
+ * for all legacy interrupts except 0 (which is always the invalid irq for
+ * a legacy controller).
+ */
+struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
+ unsigned int size,
+ unsigned int first_irq,
+ irq_hw_number_t first_hwirq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ domain = __irq_domain_add(of_node, first_hwirq + size,
+ first_hwirq + size, 0, ops, host_data);
+ if (domain)
+ irq_domain_associate_many(domain, first_irq, first_hwirq, size);
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
+
+/**
+ * irq_find_host() - Locates a domain for a given device node
+ * @node: device-tree node of the interrupt controller
+ */
+struct irq_domain *irq_find_host(struct device_node *node)
+{
+ struct irq_domain *h, *found = NULL;
+ int rc;
+
+ /* We might want to match the legacy controller last since
+ * it might potentially be set to match all interrupts in
+ * the absence of a device node. This isn't a problem so far
+ * yet though...
+ */
+ mutex_lock(&irq_domain_mutex);
+ list_for_each_entry(h, &irq_domain_list, link) {
+ if (h->ops->match)
+ rc = h->ops->match(h, node);
+ else
+ rc = (h->of_node != NULL) && (h->of_node == node);
+
+ if (rc) {
+ found = h;
+ break;
+ }
+ }
+ mutex_unlock(&irq_domain_mutex);
+ return found;
+}
+EXPORT_SYMBOL_GPL(irq_find_host);
+
+/**
+ * irq_set_default_host() - Set a "default" irq domain
+ * @domain: default domain pointer
+ *
+ * For convenience, it's possible to set a "default" domain that will be used
+ * whenever NULL is passed to irq_create_mapping(). It makes life easier for
+ * platforms that want to manipulate a few hard coded interrupt numbers that
+ * aren't properly represented in the device-tree.
+ */
+void irq_set_default_host(struct irq_domain *domain)
+{
+ pr_debug("Default domain set to @0x%p\n", domain);
+
+ irq_default_domain = domain;
+}
+EXPORT_SYMBOL_GPL(irq_set_default_host);
+
+void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ irq_hw_number_t hwirq;
+
+ if (WARN(!irq_data || irq_data->domain != domain,
+ "virq%i doesn't exist; cannot disassociate\n", irq))
+ return;
+
+ hwirq = irq_data->hwirq;
+ irq_set_status_flags(irq, IRQ_NOREQUEST);
+
+ /* remove chip and handler */
+ irq_set_chip_and_handler(irq, NULL, NULL);
+
+ /* Make sure it's completed */
+ synchronize_irq(irq);
+
+ /* Tell the PIC about it */
+ if (domain->ops->unmap)
+ domain->ops->unmap(domain, irq);
+ smp_mb();
+
+ irq_data->domain = NULL;
+ irq_data->hwirq = 0;
+
+ /* Clear reverse map for this hwirq */
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ int ret;
+
+ if (WARN(hwirq >= domain->hwirq_max,
+ "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
+ return -EINVAL;
+ if (WARN(!irq_data, "error: virq%i is not allocated", virq))
+ return -EINVAL;
+ if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
+ return -EINVAL;
+
+ mutex_lock(&irq_domain_mutex);
+ irq_data->hwirq = hwirq;
+ irq_data->domain = domain;
+ if (domain->ops->map) {
+ ret = domain->ops->map(domain, virq, hwirq);
+ if (ret != 0) {
+ /*
+ * If map() returns -EPERM, this interrupt is protected
+ * by the firmware or some other service and shall not
+ * be mapped. Don't bother telling the user about it.
+ */
+ if (ret != -EPERM) {
+ pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
+ domain->name, hwirq, virq, ret);
+ }
+ irq_data->domain = NULL;
+ irq_data->hwirq = 0;
+ mutex_unlock(&irq_domain_mutex);
+ return ret;
+ }
+
+ /* If not already assigned, give the domain the chip's name */
+ if (!domain->name && irq_data->chip)
+ domain->name = irq_data->chip->name;
+ }
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = virq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+ mutex_unlock(&irq_domain_mutex);
+
+ irq_clear_status_flags(virq, IRQ_NOREQUEST);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_associate);
+
+void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
+ irq_hw_number_t hwirq_base, int count)
+{
+ int i;
+
+ pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
+ of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
+
+ for (i = 0; i < count; i++) {
+ irq_domain_associate(domain, irq_base + i, hwirq_base + i);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_domain_associate_many);
+
+/**
+ * irq_create_direct_mapping() - Allocate an irq for direct mapping
+ * @domain: domain to allocate the irq for or NULL for default domain
+ *
+ * This routine is used for irq controllers which can choose the hardware
+ * interrupt numbers they generate. In such a case it's simplest to use
+ * the linux irq as the hardware interrupt number. It still uses the linear
+ * or radix tree to store the mapping, but the irq controller can optimize
+ * the revmap path by using the hwirq directly.
+ */
+unsigned int irq_create_direct_mapping(struct irq_domain *domain)
+{
+ unsigned int virq;
+
+ if (domain == NULL)
+ domain = irq_default_domain;
+
+ virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ if (!virq) {
+ pr_debug("create_direct virq allocation failed\n");
+ return 0;
+ }
+ if (virq >= domain->revmap_direct_max_irq) {
+ pr_err("ERROR: no free irqs available below %i maximum\n",
+ domain->revmap_direct_max_irq);
+ irq_free_desc(virq);
+ return 0;
+ }
+ pr_debug("create_direct obtained virq %d\n", virq);
+
+ if (irq_domain_associate(domain, virq, virq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ return virq;
+}
+EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
+
+/**
+ * irq_create_mapping() - Map a hardware interrupt into linux irq space
+ * @domain: domain owning this hardware interrupt or NULL for default domain
+ * @hwirq: hardware irq number in that domain space
+ *
+ * Only one mapping per hardware interrupt is permitted. Returns a linux
+ * irq number.
+ * If the sense/trigger is to be specified, set_irq_type() should be called
+ * on the number returned from that call.
+ */
+unsigned int irq_create_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Look for default domain if nececssary */
+ if (domain == NULL)
+ domain = irq_default_domain;
+ if (domain == NULL) {
+ WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
+ return 0;
+ }
+ pr_debug("-> using domain @%p\n", domain);
+
+ /* Check if mapping already exists */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq) {
+ pr_debug("-> existing mapping on virq %d\n", virq);
+ return virq;
+ }
+
+ /* Allocate a virtual interrupt number */
+ virq = irq_domain_alloc_descs(-1, 1, hwirq,
+ of_node_to_nid(domain->of_node));
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(domain->of_node), virq);
+
+ return virq;
+}
+EXPORT_SYMBOL_GPL(irq_create_mapping);
+
+/**
+ * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
+ * @domain: domain owning the interrupt range
+ * @irq_base: beginning of linux IRQ range
+ * @hwirq_base: beginning of hardware IRQ range
+ * @count: Number of interrupts to map
+ *
+ * This routine is used for allocating and mapping a range of hardware
+ * irqs to linux irqs where the linux irq numbers are at pre-defined
+ * locations. For use by controllers that already have static mappings
+ * to insert in to the domain.
+ *
+ * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
+ * domain insertion.
+ *
+ * 0 is returned upon success, while any failure to establish a static
+ * mapping is treated as an error.
+ */
+int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
+ irq_hw_number_t hwirq_base, int count)
+{
+ int ret;
+
+ ret = irq_alloc_descs(irq_base, irq_base, count,
+ of_node_to_nid(domain->of_node));
+ if (unlikely(ret < 0))
+ return ret;
+
+ irq_domain_associate_many(domain, irq_base, hwirq_base, count);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
+
+unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
+{
+ struct irq_domain *domain;
+ irq_hw_number_t hwirq;
+ unsigned int type = IRQ_TYPE_NONE;
+ int virq;
+
+ domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
+ if (!domain) {
+ pr_warn("no irq domain found for %s !\n",
+ of_node_full_name(irq_data->np));
+ return 0;
+ }
+
+ /* If domain has no translation, then we assume interrupt line */
+ if (domain->ops->xlate == NULL)
+ hwirq = irq_data->args[0];
+ else {
+ if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
+ irq_data->args_count, &hwirq, &type))
+ return 0;
+ }
+
+ if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq)
+ return virq;
+
+ virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
+ if (virq <= 0)
+ return 0;
+ } else {
+ /* Create mapping */
+ virq = irq_create_mapping(domain, hwirq);
+ if (!virq)
+ return virq;
+ }
+
+ /* Set type if specified and different than the current one */
+ if (type != IRQ_TYPE_NONE &&
+ type != irq_get_trigger_type(virq))
+ irq_set_irq_type(virq, type);
+ return virq;
+}
+EXPORT_SYMBOL_GPL(irq_create_of_mapping);
+
+/**
+ * irq_dispose_mapping() - Unmap an interrupt
+ * @virq: linux irq number of the interrupt to unmap
+ */
+void irq_dispose_mapping(unsigned int virq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
+
+ if (!virq || !irq_data)
+ return;
+
+ domain = irq_data->domain;
+ if (WARN_ON(domain == NULL))
+ return;
+
+ irq_domain_disassociate(domain, virq);
+ irq_free_desc(virq);
+}
+EXPORT_SYMBOL_GPL(irq_dispose_mapping);
+
+/**
+ * irq_find_mapping() - Find a linux irq from an hw irq number.
+ * @domain: domain owning this hardware interrupt
+ * @hwirq: hardware irq number in that domain space
+ */
+unsigned int irq_find_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ struct irq_data *data;
+
+ /* Look for default domain if nececssary */
+ if (domain == NULL)
+ domain = irq_default_domain;
+ if (domain == NULL)
+ return 0;
+
+ if (hwirq < domain->revmap_direct_max_irq) {
+ data = irq_domain_get_irq_data(domain, hwirq);
+ if (data && data->hwirq == hwirq)
+ return hwirq;
+ }
+
+ /* Check if the hwirq is in the linear revmap. */
+ if (hwirq < domain->revmap_size)
+ return domain->linear_revmap[hwirq];
+
+ rcu_read_lock();
+ data = radix_tree_lookup(&domain->revmap_tree, hwirq);
+ rcu_read_unlock();
+ return data ? data->irq : 0;
+}
+EXPORT_SYMBOL_GPL(irq_find_mapping);
+
+#ifdef CONFIG_IRQ_DOMAIN_DEBUG
+static int virq_debug_show(struct seq_file *m, void *private)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+ struct irq_domain *domain;
+ struct radix_tree_iter iter;
+ void *data, **slot;
+ int i;
+
+ seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
+ "name", "mapped", "linear-max", "direct-max", "devtree-node");
+ mutex_lock(&irq_domain_mutex);
+ list_for_each_entry(domain, &irq_domain_list, link) {
+ int count = 0;
+ radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
+ count++;
+ seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
+ domain == irq_default_domain ? '*' : ' ', domain->name,
+ domain->revmap_size + count, domain->revmap_size,
+ domain->revmap_direct_max_irq,
+ domain->of_node ? of_node_full_name(domain->of_node) : "");
+ }
+ mutex_unlock(&irq_domain_mutex);
+
+ seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
+ "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
+ "active", "type", "domain");
+
+ for (i = 1; i < nr_irqs; i++) {
+ desc = irq_to_desc(i);
+ if (!desc)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ domain = desc->irq_data.domain;
+
+ if (domain) {
+ struct irq_chip *chip;
+ int hwirq = desc->irq_data.hwirq;
+ bool direct;
+
+ seq_printf(m, "%5d ", i);
+ seq_printf(m, "0x%05x ", hwirq);
+
+ chip = irq_desc_get_chip(desc);
+ seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
+
+ data = irq_desc_get_chip_data(desc);
+ seq_printf(m, data ? "0x%p " : " %p ", data);
+
+ seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
+ direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
+ seq_printf(m, "%6s%-8s ",
+ (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
+ direct ? "(DIRECT)" : "");
+ seq_printf(m, "%s\n", desc->irq_data.domain->name);
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+
+ return 0;
+}
+
+static int virq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, virq_debug_show, inode->i_private);
+}
+
+static const struct file_operations virq_debug_fops = {
+ .open = virq_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init irq_debugfs_init(void)
+{
+ if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
+ NULL, &virq_debug_fops) == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+__initcall(irq_debugfs_init);
+#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
+
+/**
+ * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
+ *
+ * Device Tree IRQ specifier translation function which works with one cell
+ * bindings where the cell value maps directly to the hwirq number.
+ */
+int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (WARN_ON(intsize < 1))
+ return -EINVAL;
+ *out_hwirq = intspec[0];
+ *out_type = IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
+
+/**
+ * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
+ *
+ * Device Tree IRQ specifier translation function which works with two cell
+ * bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ */
+int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_type)
+{
+ if (WARN_ON(intsize < 2))
+ return -EINVAL;
+ *out_hwirq = intspec[0];
+ *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
+
+/**
+ * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
+ *
+ * Device Tree IRQ specifier translation function which works with either one
+ * or two cell bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ *
+ * Note: don't use this function unless your interrupt controller explicitly
+ * supports both one and two cell bindings. For the majority of controllers
+ * the _onecell() or _twocell() variants above should be used.
+ */
+int irq_domain_xlate_onetwocell(struct irq_domain *d,
+ struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (WARN_ON(intsize < 1))
+ return -EINVAL;
+ *out_hwirq = intspec[0];
+ *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
+
+const struct irq_domain_ops irq_domain_simple_ops = {
+ .xlate = irq_domain_xlate_onetwocell,
+};
+EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+
+static int irq_domain_alloc_descs(int virq, unsigned int cnt,
+ irq_hw_number_t hwirq, int node)
+{
+ unsigned int hint;
+
+ if (virq >= 0) {
+ virq = irq_alloc_descs(virq, virq, cnt, node);
+ } else {
+ hint = hwirq % nr_irqs;
+ if (hint == 0)
+ hint++;
+ virq = irq_alloc_descs_from(hint, cnt, node);
+ if (virq <= 0 && hint > 1)
+ virq = irq_alloc_descs_from(1, cnt, node);
+ }
+
+ return virq;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy
+ * @parent: Parent irq domain to associate with the new domain
+ * @flags: Irq domain flags associated to the domain
+ * @size: Size of the domain. See below
+ * @node: Optional device-tree node of the interrupt controller
+ * @ops: Pointer to the interrupt domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * If @size is 0 a tree domain is created, otherwise a linear domain.
+ *
+ * If successful the parent is associated to the new domain and the
+ * domain flags are set.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
+ unsigned int flags,
+ unsigned int size,
+ struct device_node *node,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ if (size)
+ domain = irq_domain_add_linear(node, size, ops, host_data);
+ else
+ domain = irq_domain_add_tree(node, ops, host_data);
+ if (domain) {
+ domain->parent = parent;
+ domain->flags |= flags;
+ }
+
+ return domain;
+}
+
+static void irq_domain_insert_irq(int virq)
+{
+ struct irq_data *data;
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = virq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+
+ /* If not already assigned, give the domain the chip's name */
+ if (!domain->name && data->chip)
+ domain->name = data->chip->name;
+ }
+
+ irq_clear_status_flags(virq, IRQ_NOREQUEST);
+}
+
+static void irq_domain_remove_irq(int virq)
+{
+ struct irq_data *data;
+
+ irq_set_status_flags(virq, IRQ_NOREQUEST);
+ irq_set_chip_and_handler(virq, NULL, NULL);
+ synchronize_irq(virq);
+ smp_mb();
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+ }
+}
+
+static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
+ struct irq_data *child)
+{
+ struct irq_data *irq_data;
+
+ irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+ if (irq_data) {
+ child->parent_data = irq_data;
+ irq_data->irq = child->irq;
+ irq_data->node = child->node;
+ irq_data->domain = domain;
+ }
+
+ return irq_data;
+}
+
+static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data, *tmp;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ tmp = irq_data->parent_data;
+ irq_data->parent_data = NULL;
+ irq_data->domain = NULL;
+
+ while (tmp) {
+ irq_data = tmp;
+ tmp = tmp->parent_data;
+ kfree(irq_data);
+ }
+ }
+}
+
+static int irq_domain_alloc_irq_data(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct irq_domain *parent;
+ int i;
+
+ /* The outermost irq_data is embedded in struct irq_desc */
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ irq_data->domain = domain;
+
+ for (parent = domain->parent; parent; parent = parent->parent) {
+ irq_data = irq_domain_insert_irq_data(parent, irq_data);
+ if (!irq_data) {
+ irq_domain_free_irq_data(virq, i + 1);
+ return -ENOMEM;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data;
+
+ for (irq_data = irq_get_irq_data(virq); irq_data;
+ irq_data = irq_data->parent_data)
+ if (irq_data->domain == domain)
+ return irq_data;
+
+ return NULL;
+}
+
+/**
+ * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hwirq number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated chip data
+ */
+int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data)
+{
+ struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+ if (!irq_data)
+ return -ENOENT;
+
+ irq_data->hwirq = hwirq;
+ irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip_data = chip_data;
+
+ return 0;
+}
+
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hardware interrupt number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated interrupt chip data
+ * @handler: The interrupt flow handler
+ * @handler_data: The interrupt flow handler data
+ * @handler_name: The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data, irq_flow_handler_t handler,
+ void *handler_data, const char *handler_name)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data);
+ __irq_set_handler(virq, handler, 0, handler_name);
+ irq_set_handler_data(virq, handler_data);
+}
+
+/**
+ * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
+ * @irq_data: The pointer to irq_data
+ */
+void irq_domain_reset_irq_data(struct irq_data *irq_data)
+{
+ irq_data->hwirq = 0;
+ irq_data->chip = &no_irq_chip;
+ irq_data->chip_data = NULL;
+}
+
+/**
+ * irq_domain_free_irqs_common - Clear irq_data and free the parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data)
+ irq_domain_reset_irq_data(irq_data);
+ }
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+/**
+ * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_handler_data(virq + i, NULL);
+ irq_set_handler(virq + i, NULL);
+ }
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
+{
+ return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
+}
+
+static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs)
+{
+ domain->ops->free(domain, irq_base, nr_irqs);
+ if (irq_domain_is_auto_recursive(domain)) {
+ BUG_ON(!domain->parent);
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+ }
+}
+
+static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
+{
+ int ret = 0;
+ struct irq_domain *parent = domain->parent;
+ bool recursive = irq_domain_is_auto_recursive(domain);
+
+ BUG_ON(recursive && !parent);
+ if (recursive)
+ ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
+ nr_irqs, arg);
+ if (ret >= 0)
+ ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+ if (ret < 0 && recursive)
+ irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
+
+ return ret;
+}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ nubmer if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ *
+ * Allocate IRQ numbers and initialized all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program hardwares with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc)
+{
+ int i, ret, virq;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
+ if (realloc && irq_base >= 0) {
+ virq = irq_base;
+ } else {
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+ if (virq < 0) {
+ pr_debug("cannot allocate IRQ(base %d, count %d)\n",
+ irq_base, nr_irqs);
+ return virq;
+ }
+ }
+
+ if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
+ pr_debug("cannot allocate memory for IRQ%d\n", virq);
+ ret = -ENOMEM;
+ goto out_free_desc;
+ }
+
+ mutex_lock(&irq_domain_mutex);
+ ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
+ if (ret < 0) {
+ mutex_unlock(&irq_domain_mutex);
+ goto out_free_irq_data;
+ }
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_insert_irq(virq + i);
+ mutex_unlock(&irq_domain_mutex);
+
+ return virq;
+
+out_free_irq_data:
+ irq_domain_free_irq_data(virq, nr_irqs);
+out_free_desc:
+ irq_free_descs(virq, nr_irqs);
+ return ret;
+}
+
+/**
+ * irq_domain_free_irqs - Free IRQ number and associated data structures
+ * @virq: base IRQ number
+ * @nr_irqs: number of IRQs to free
+ */
+void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ int i;
+
+ if (WARN(!data || !data->domain || !data->domain->ops->free,
+ "NULL pointer, cannot free irq\n"))
+ return;
+
+ mutex_lock(&irq_domain_mutex);
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_remove_irq(virq + i);
+ irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
+ mutex_unlock(&irq_domain_mutex);
+
+ irq_domain_free_irq_data(virq, nr_irqs);
+ irq_free_descs(virq, nr_irqs);
+}
+
+/**
+ * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to allocate
+ * @arg: Allocation data (arch/domain specific)
+ *
+ * Check whether the domain has been setup recursive. If not allocate
+ * through the parent domain.
+ */
+int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs,
+ void *arg)
+{
+ /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
+ if (irq_domain_is_auto_recursive(domain))
+ return 0;
+
+ domain = domain->parent;
+ if (domain)
+ return irq_domain_alloc_irqs_recursive(domain, irq_base,
+ nr_irqs, arg);
+ return -ENOSYS;
+}
+
+/**
+ * irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to free
+ *
+ * Check whether the domain has been setup recursive. If not free
+ * through the parent domain.
+ */
+void irq_domain_free_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs)
+{
+ /* irq_domain_free_irqs_recursive() will call parent's free */
+ if (!irq_domain_is_auto_recursive(domain) && domain->parent)
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+}
+
+/**
+ * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
+ * interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * This is the second step to call domain_ops->activate to program interrupt
+ * controllers, so the interrupt could actually get delivered.
+ */
+void irq_domain_activate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (irq_data->parent_data)
+ irq_domain_activate_irq(irq_data->parent_data);
+ if (domain->ops->activate)
+ domain->ops->activate(domain, irq_data);
+ }
+}
+
+/**
+ * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
+ * deactivate interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * It calls domain_ops->deactivate to program interrupt controllers to disable
+ * interrupt delivery.
+ */
+void irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+ /* Hierarchy irq_domains must implement callback alloc() */
+ if (domain->ops->alloc)
+ domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
+}
+#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+
+ return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+}
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
new file mode 100644
index 000000000..e68932bb3
--- /dev/null
+++ b/kernel/irq/manage.c
@@ -0,0 +1,1891 @@
+/*
+ * linux/kernel/irq/manage.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006 Thomas Gleixner
+ *
+ * This file contains driver APIs to the irq subsystem.
+ */
+
+#define pr_fmt(fmt) "genirq: " fmt
+
+#include <linux/irq.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/task_work.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_IRQ_FORCED_THREADING
+__read_mostly bool force_irqthreads;
+
+static int __init setup_forced_irqthreads(char *arg)
+{
+ force_irqthreads = true;
+ return 0;
+}
+early_param("threadirqs", setup_forced_irqthreads);
+#endif
+
+static void __synchronize_hardirq(struct irq_desc *desc)
+{
+ bool inprogress;
+
+ do {
+ unsigned long flags;
+
+ /*
+ * Wait until we're out of the critical section. This might
+ * give the wrong answer due to the lack of memory barriers.
+ */
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+
+ /* Ok, that indicated we're done: double-check carefully. */
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ inprogress = irqd_irq_inprogress(&desc->irq_data);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ /* Oops, that failed? */
+ } while (inprogress);
+}
+
+/**
+ * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this
+ * function while holding a resource the IRQ handler may need you
+ * will deadlock. It does not take associated threaded handlers
+ * into account.
+ *
+ * Do not use this for shutdown scenarios where you must be sure
+ * that all parts (hardirq and threaded handler) have completed.
+ *
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+bool synchronize_hardirq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ __synchronize_hardirq(desc);
+ return !atomic_read(&desc->threads_active);
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(synchronize_hardirq);
+
+/**
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void synchronize_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ __synchronize_hardirq(desc);
+ /*
+ * We made sure that no hardirq handler is
+ * running. Now verify that no threaded handlers are
+ * active.
+ */
+ wait_event(desc->wait_for_threads,
+ !atomic_read(&desc->threads_active));
+ }
+}
+EXPORT_SYMBOL(synchronize_irq);
+
+#ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
+
+/**
+ * irq_can_set_affinity - Check if the affinity of a given irq can be set
+ * @irq: Interrupt to check
+ *
+ */
+int irq_can_set_affinity(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !irqd_can_balance(&desc->irq_data) ||
+ !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * irq_set_thread_affinity - Notify irq threads to adjust affinity
+ * @desc: irq descriptor which has affitnity changed
+ *
+ * We just set IRQTF_AFFINITY and delegate the affinity setting
+ * to the interrupt thread itself. We can not call
+ * set_cpus_allowed_ptr() here as we hold desc->lock and this
+ * code can be called from hard interrupt context.
+ */
+void irq_set_thread_affinity(struct irq_desc *desc)
+{
+ struct irqaction *action = desc->action;
+
+ while (action) {
+ if (action->thread)
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ action = action->next;
+ }
+}
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool irq_can_move_pcntxt(struct irq_data *data)
+{
+ return irqd_can_move_in_process_context(data);
+}
+static inline bool irq_move_pending(struct irq_data *data)
+{
+ return irqd_is_setaffinity_pending(data);
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+ cpumask_copy(desc->pending_mask, mask);
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+ cpumask_copy(mask, desc->pending_mask);
+}
+#else
+static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
+static inline bool irq_move_pending(struct irq_data *data) { return false; }
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
+#endif
+
+int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ int ret;
+
+ ret = chip->irq_set_affinity(data, mask, force);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
+ cpumask_copy(data->affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ struct irq_desc *desc = irq_data_to_desc(data);
+ int ret = 0;
+
+ if (!chip || !chip->irq_set_affinity)
+ return -EINVAL;
+
+ if (irq_can_move_pcntxt(data)) {
+ ret = irq_do_set_affinity(data, mask, force);
+ } else {
+ irqd_set_move_pending(data);
+ irq_copy_pending(desc, mask);
+ }
+
+ if (desc->affinity_notify) {
+ kref_get(&desc->affinity_notify->kref);
+ schedule_work(&desc->affinity_notify->work);
+ }
+ irqd_set(data, IRQD_AFFINITY_SET);
+
+ return ret;
+}
+
+int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ int ret;
+
+ if (!desc)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+}
+
+int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+
+ if (!desc)
+ return -EINVAL;
+ desc->affinity_hint = m;
+ irq_put_desc_unlock(desc, flags);
+ /* set the initial affinity to prevent every interrupt being on CPU0 */
+ if (m)
+ __irq_set_affinity(irq, m, false);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+
+static void irq_affinity_notify(struct work_struct *work)
+{
+ struct irq_affinity_notify *notify =
+ container_of(work, struct irq_affinity_notify, work);
+ struct irq_desc *desc = irq_to_desc(notify->irq);
+ cpumask_var_t cpumask;
+ unsigned long flags;
+
+ if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ goto out;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ if (irq_move_pending(&desc->irq_data))
+ irq_get_pending(cpumask, desc);
+ else
+ cpumask_copy(cpumask, desc->irq_data.affinity);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ notify->notify(notify, cpumask);
+
+ free_cpumask_var(cpumask);
+out:
+ kref_put(&notify->kref, notify->release);
+}
+
+/**
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated and must be disabled before the IRQ is
+ * freed using free_irq().
+ */
+int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_affinity_notify *old_notify;
+ unsigned long flags;
+
+ /* The release function is promised process context */
+ might_sleep();
+
+ if (!desc)
+ return -EINVAL;
+
+ /* Complete initialisation of *notify */
+ if (notify) {
+ notify->irq = irq;
+ kref_init(&notify->kref);
+ INIT_WORK(&notify->work, irq_affinity_notify);
+ }
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ if (old_notify)
+ kref_put(&old_notify->kref, old_notify->release);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
+
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+/*
+ * Generic version of the affinity autoselector.
+ */
+static int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+{
+ struct cpumask *set = irq_default_affinity;
+ int node = desc->irq_data.node;
+
+ /* Excludes PER_CPU and NO_BALANCE interrupts */
+ if (!irq_can_set_affinity(irq))
+ return 0;
+
+ /*
+ * Preserve an userspace affinity setup, but make sure that
+ * one of the targets is online.
+ */
+ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+ if (cpumask_intersects(desc->irq_data.affinity,
+ cpu_online_mask))
+ set = desc->irq_data.affinity;
+ else
+ irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
+ }
+
+ cpumask_and(mask, cpu_online_mask, set);
+ if (node != NUMA_NO_NODE) {
+ const struct cpumask *nodemask = cpumask_of_node(node);
+
+ /* make sure at least one of the cpus in nodemask is online */
+ if (cpumask_intersects(mask, nodemask))
+ cpumask_and(mask, mask, nodemask);
+ }
+ irq_do_set_affinity(&desc->irq_data, mask, false);
+ return 0;
+}
+#else
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
+{
+ return irq_select_affinity(irq);
+}
+#endif
+
+/*
+ * Called when affinity is set via /proc/irq
+ */
+int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ ret = setup_affinity(irq, desc, mask);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+}
+
+#else
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+{
+ return 0;
+}
+#endif
+
+void __disable_irq(struct irq_desc *desc, unsigned int irq)
+{
+ if (!desc->depth++)
+ irq_disable(desc);
+}
+
+static int __disable_irq_nosync(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+
+ if (!desc)
+ return -EINVAL;
+ __disable_irq(desc, irq);
+ irq_put_desc_busunlock(desc, flags);
+ return 0;
+}
+
+/**
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ *
+ * This function may be called from IRQ context.
+ */
+void disable_irq_nosync(unsigned int irq)
+{
+ __disable_irq_nosync(irq);
+}
+EXPORT_SYMBOL(disable_irq_nosync);
+
+/**
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while
+ * holding a resource the IRQ handler may need you will deadlock.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+void disable_irq(unsigned int irq)
+{
+ if (!__disable_irq_nosync(irq))
+ synchronize_irq(irq);
+}
+EXPORT_SYMBOL(disable_irq);
+
+/**
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are
+ * nested.
+ * This function waits for any pending hard IRQ handlers for this
+ * interrupt to complete before returning. If you use this function while
+ * holding a resource the hard IRQ handler may need you will deadlock.
+ *
+ * When used to optimistically disable an interrupt from atomic context
+ * the return value must be checked.
+ *
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
+ */
+bool disable_hardirq(unsigned int irq)
+{
+ if (!__disable_irq_nosync(irq))
+ return synchronize_hardirq(irq);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(disable_hardirq);
+
+void __enable_irq(struct irq_desc *desc, unsigned int irq)
+{
+ switch (desc->depth) {
+ case 0:
+ err_out:
+ WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+ break;
+ case 1: {
+ if (desc->istate & IRQS_SUSPENDED)
+ goto err_out;
+ /* Prevent probing on this irq: */
+ irq_settings_set_noprobe(desc);
+ irq_enable(desc);
+ check_irq_resend(desc, irq);
+ /* fall-through */
+ }
+ default:
+ desc->depth--;
+ }
+}
+
+/**
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
+ *
+ * Undoes the effect of one call to disable_irq(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ *
+ * This function may be called from IRQ context only when
+ * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ */
+void enable_irq(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+
+ if (!desc)
+ return;
+ if (WARN(!desc->irq_data.chip,
+ KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
+ goto out;
+
+ __enable_irq(desc, irq);
+out:
+ irq_put_desc_busunlock(desc, flags);
+}
+EXPORT_SYMBOL(enable_irq);
+
+static int set_irq_wake_real(unsigned int irq, unsigned int on)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int ret = -ENXIO;
+
+ if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
+ if (desc->irq_data.chip->irq_set_wake)
+ ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
+
+ return ret;
+}
+
+/**
+ * irq_set_irq_wake - control irq power management wakeup
+ * @irq: interrupt to control
+ * @on: enable/disable power management wakeup
+ *
+ * Enable/disable power management wakeup mode, which is
+ * disabled by default. Enables and disables must match,
+ * just as they match for non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep
+ * states like "suspend to RAM".
+ */
+int irq_set_irq_wake(unsigned int irq, unsigned int on)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ int ret = 0;
+
+ if (!desc)
+ return -EINVAL;
+
+ /* wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
+ if (on) {
+ if (desc->wake_depth++ == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 0;
+ else
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
+ } else {
+ if (desc->wake_depth == 0) {
+ WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
+ } else if (--desc->wake_depth == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 1;
+ else
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
+ }
+ irq_put_desc_busunlock(desc, flags);
+ return ret;
+}
+EXPORT_SYMBOL(irq_set_irq_wake);
+
+/*
+ * Internal function that tells the architecture code whether a
+ * particular irq has been exclusively allocated or is available
+ * for driver use.
+ */
+int can_request_irq(unsigned int irq, unsigned long irqflags)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ int canrequest = 0;
+
+ if (!desc)
+ return 0;
+
+ if (irq_settings_can_request(desc)) {
+ if (!desc->action ||
+ irqflags & desc->action->flags & IRQF_SHARED)
+ canrequest = 1;
+ }
+ irq_put_desc_unlock(desc, flags);
+ return canrequest;
+}
+
+int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
+ unsigned long flags)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+ int ret, unmask = 0;
+
+ if (!chip || !chip->irq_set_type) {
+ /*
+ * IRQF_TRIGGER_* but the PIC does not support multiple
+ * flow-types?
+ */
+ pr_debug("No set_type function for IRQ %d (%s)\n", irq,
+ chip ? (chip->name ? : "unknown") : "unknown");
+ return 0;
+ }
+
+ flags &= IRQ_TYPE_SENSE_MASK;
+
+ if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
+ if (!irqd_irq_masked(&desc->irq_data))
+ mask_irq(desc);
+ if (!irqd_irq_disabled(&desc->irq_data))
+ unmask = 1;
+ }
+
+ /* caller masked out all except trigger mode flags */
+ ret = chip->irq_set_type(&desc->irq_data, flags);
+
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
+ irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
+ irqd_set(&desc->irq_data, flags);
+
+ case IRQ_SET_MASK_OK_NOCOPY:
+ flags = irqd_get_trigger_type(&desc->irq_data);
+ irq_settings_set_trigger_mask(desc, flags);
+ irqd_clear(&desc->irq_data, IRQD_LEVEL);
+ irq_settings_clr_level(desc);
+ if (flags & IRQ_TYPE_LEVEL_MASK) {
+ irq_settings_set_level(desc);
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
+ }
+
+ ret = 0;
+ break;
+ default:
+ pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
+ flags, irq, chip->irq_set_type);
+ }
+ if (unmask)
+ unmask_irq(desc);
+ return ret;
+}
+
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+int irq_set_parent(int irq, int parent_irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+
+ if (!desc)
+ return -EINVAL;
+
+ desc->parent_irq = parent_irq;
+
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+#endif
+
+/*
+ * Default primary interrupt handler for threaded interrupts. Is
+ * assigned as primary handler when request_threaded_irq is called
+ * with handler == NULL. Useful for oneshot interrupts.
+ */
+static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
+{
+ return IRQ_WAKE_THREAD;
+}
+
+/*
+ * Primary handler for nested threaded interrupts. Should never be
+ * called.
+ */
+static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
+{
+ WARN(1, "Primary handler called for nested irq %d\n", irq);
+ return IRQ_NONE;
+}
+
+static int irq_wait_for_interrupt(struct irqaction *action)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (test_and_clear_bit(IRQTF_RUNTHREAD,
+ &action->thread_flags)) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return -1;
+}
+
+/*
+ * Oneshot interrupts keep the irq line masked until the threaded
+ * handler finished. unmask if the interrupt has not been disabled and
+ * is marked MASKED.
+ */
+static void irq_finalize_oneshot(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ if (!(desc->istate & IRQS_ONESHOT))
+ return;
+again:
+ chip_bus_lock(desc);
+ raw_spin_lock_irq(&desc->lock);
+
+ /*
+ * Implausible though it may be we need to protect us against
+ * the following scenario:
+ *
+ * The thread is faster done than the hard interrupt handler
+ * on the other CPU. If we unmask the irq line then the
+ * interrupt can come in again and masks the line, leaves due
+ * to IRQS_INPROGRESS and the irq line is masked forever.
+ *
+ * This also serializes the state of shared oneshot handlers
+ * versus "desc->threads_onehsot |= action->thread_mask;" in
+ * irq_wake_thread(). See the comment there which explains the
+ * serialization.
+ */
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
+ raw_spin_unlock_irq(&desc->lock);
+ chip_bus_sync_unlock(desc);
+ cpu_relax();
+ goto again;
+ }
+
+ /*
+ * Now check again, whether the thread should run. Otherwise
+ * we would clear the threads_oneshot bit of this thread which
+ * was just set.
+ */
+ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ goto out_unlock;
+
+ desc->threads_oneshot &= ~action->thread_mask;
+
+ if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data))
+ unmask_threaded_irq(desc);
+
+out_unlock:
+ raw_spin_unlock_irq(&desc->lock);
+ chip_bus_sync_unlock(desc);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Check whether we need to change the affinity of the interrupt thread.
+ */
+static void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+{
+ cpumask_var_t mask;
+ bool valid = true;
+
+ if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
+ return;
+
+ /*
+ * In case we are out of memory we set IRQTF_AFFINITY again and
+ * try again next time
+ */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ return;
+ }
+
+ raw_spin_lock_irq(&desc->lock);
+ /*
+ * This code is triggered unconditionally. Check the affinity
+ * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
+ */
+ if (desc->irq_data.affinity)
+ cpumask_copy(mask, desc->irq_data.affinity);
+ else
+ valid = false;
+ raw_spin_unlock_irq(&desc->lock);
+
+ if (valid)
+ set_cpus_allowed_ptr(current, mask);
+ free_cpumask_var(mask);
+}
+#else
+static inline void
+irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
+#endif
+
+/*
+ * Interrupts which are not explicitely requested as threaded
+ * interrupts rely on the implicit bh/preempt disable of the hard irq
+ * context. So we need to disable bh here to avoid deadlocks and other
+ * side effects.
+ */
+static irqreturn_t
+irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
+{
+ irqreturn_t ret;
+
+ local_bh_disable();
+ ret = action->thread_fn(action->irq, action->dev_id);
+ irq_finalize_oneshot(desc, action);
+ local_bh_enable();
+ return ret;
+}
+
+/*
+ * Interrupts explicitly requested as threaded interrupts want to be
+ * preemtible - many of them need to sleep and wait for slow busses to
+ * complete.
+ */
+static irqreturn_t irq_thread_fn(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ irqreturn_t ret;
+
+ ret = action->thread_fn(action->irq, action->dev_id);
+ irq_finalize_oneshot(desc, action);
+ return ret;
+}
+
+static void wake_threads_waitq(struct irq_desc *desc)
+{
+ if (atomic_dec_and_test(&desc->threads_active))
+ wake_up(&desc->wait_for_threads);
+}
+
+static void irq_thread_dtor(struct callback_head *unused)
+{
+ struct task_struct *tsk = current;
+ struct irq_desc *desc;
+ struct irqaction *action;
+
+ if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
+ return;
+
+ action = kthread_data(tsk);
+
+ pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+ tsk->comm, tsk->pid, action->irq);
+
+
+ desc = irq_to_desc(action->irq);
+ /*
+ * If IRQTF_RUNTHREAD is set, we need to decrement
+ * desc->threads_active and wake possible waiters.
+ */
+ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ wake_threads_waitq(desc);
+
+ /* Prevent a stale desc->threads_oneshot */
+ irq_finalize_oneshot(desc, action);
+}
+
+/*
+ * Interrupt handler thread
+ */
+static int irq_thread(void *data)
+{
+ struct callback_head on_exit_work;
+ struct irqaction *action = data;
+ struct irq_desc *desc = irq_to_desc(action->irq);
+ irqreturn_t (*handler_fn)(struct irq_desc *desc,
+ struct irqaction *action);
+
+ if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
+ &action->thread_flags))
+ handler_fn = irq_forced_thread_fn;
+ else
+ handler_fn = irq_thread_fn;
+
+ init_task_work(&on_exit_work, irq_thread_dtor);
+ task_work_add(current, &on_exit_work, false);
+
+ irq_thread_check_affinity(desc, action);
+
+ while (!irq_wait_for_interrupt(action)) {
+ irqreturn_t action_ret;
+
+ irq_thread_check_affinity(desc, action);
+
+ action_ret = handler_fn(desc, action);
+ if (action_ret == IRQ_HANDLED)
+ atomic_inc(&desc->threads_handled);
+
+ wake_threads_waitq(desc);
+ }
+
+ /*
+ * This is the regular exit path. __free_irq() is stopping the
+ * thread via kthread_stop() after calling
+ * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
+ * oneshot mask bit can be set. We cannot verify that as we
+ * cannot touch the oneshot mask at this point anymore as
+ * __setup_irq() might have given out currents thread_mask
+ * again.
+ */
+ task_work_cancel(current, irq_thread_dtor);
+ return 0;
+}
+
+/**
+ * irq_wake_thread - wake the irq thread for the action identified by dev_id
+ * @irq: Interrupt line
+ * @dev_id: Device identity for which the thread should be woken
+ *
+ */
+void irq_wake_thread(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned long flags;
+
+ if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action; action; action = action->next) {
+ if (action->dev_id == dev_id) {
+ if (action->thread)
+ __irq_wake_thread(desc, action);
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL_GPL(irq_wake_thread);
+
+static void irq_setup_forced_threading(struct irqaction *new)
+{
+ if (!force_irqthreads)
+ return;
+ if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
+ return;
+
+ new->flags |= IRQF_ONESHOT;
+
+ if (!new->thread_fn) {
+ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
+ new->thread_fn = new->handler;
+ new->handler = irq_default_primary_handler;
+ }
+}
+
+static int irq_request_resources(struct irq_desc *desc)
+{
+ struct irq_data *d = &desc->irq_data;
+ struct irq_chip *c = d->chip;
+
+ return c->irq_request_resources ? c->irq_request_resources(d) : 0;
+}
+
+static void irq_release_resources(struct irq_desc *desc)
+{
+ struct irq_data *d = &desc->irq_data;
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_release_resources)
+ c->irq_release_resources(d);
+}
+
+/*
+ * Internal function to register an irqaction - typically used to
+ * allocate special interrupts that are part of the architecture.
+ */
+static int
+__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
+{
+ struct irqaction *old, **old_ptr;
+ unsigned long flags, thread_mask = 0;
+ int ret, nested, shared = 0;
+ cpumask_var_t mask;
+
+ if (!desc)
+ return -EINVAL;
+
+ if (desc->irq_data.chip == &no_irq_chip)
+ return -ENOSYS;
+ if (!try_module_get(desc->owner))
+ return -ENODEV;
+
+ /*
+ * Check whether the interrupt nests into another interrupt
+ * thread.
+ */
+ nested = irq_settings_is_nested_thread(desc);
+ if (nested) {
+ if (!new->thread_fn) {
+ ret = -EINVAL;
+ goto out_mput;
+ }
+ /*
+ * Replace the primary handler which was provided from
+ * the driver for non nested interrupt handling by the
+ * dummy function which warns when called.
+ */
+ new->handler = irq_nested_primary_handler;
+ } else {
+ if (irq_settings_can_thread(desc))
+ irq_setup_forced_threading(new);
+ }
+
+ /*
+ * Create a handler thread when a thread function is supplied
+ * and the interrupt does not nest into another interrupt
+ * thread.
+ */
+ if (new->thread_fn && !nested) {
+ struct task_struct *t;
+ static const struct sched_param param = {
+ .sched_priority = MAX_USER_RT_PRIO/2,
+ };
+
+ t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+ new->name);
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
+ goto out_mput;
+ }
+
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+
+ /*
+ * We keep the reference to the task struct even if
+ * the thread dies to avoid that the interrupt code
+ * references an already freed task_struct.
+ */
+ get_task_struct(t);
+ new->thread = t;
+ /*
+ * Tell the thread to set its affinity. This is
+ * important for shared interrupt handlers as we do
+ * not invoke setup_affinity() for the secondary
+ * handlers as everything is already set up. Even for
+ * interrupts marked with IRQF_NO_BALANCE this is
+ * correct as we want the thread to move to the cpu(s)
+ * on which the requesting code placed the interrupt.
+ */
+ set_bit(IRQTF_AFFINITY, &new->thread_flags);
+ }
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out_thread;
+ }
+
+ /*
+ * Drivers are often written to work w/o knowledge about the
+ * underlying irq chip implementation, so a request for a
+ * threaded irq without a primary hard irq context handler
+ * requires the ONESHOT flag to be set. Some irq chips like
+ * MSI based interrupts are per se one shot safe. Check the
+ * chip flags, so we can avoid the unmask dance at the end of
+ * the threaded handler for those.
+ */
+ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
+ new->flags &= ~IRQF_ONESHOT;
+
+ /*
+ * The following block of code has to be executed atomically
+ */
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ old_ptr = &desc->action;
+ old = *old_ptr;
+ if (old) {
+ /*
+ * Can't share interrupts unless both agree to and are
+ * the same type (level, edge, polarity). So both flag
+ * fields must have IRQF_SHARED set and the bits which
+ * set the trigger type must match. Also all must
+ * agree on ONESHOT.
+ */
+ if (!((old->flags & new->flags) & IRQF_SHARED) ||
+ ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
+ ((old->flags ^ new->flags) & IRQF_ONESHOT))
+ goto mismatch;
+
+ /* All handlers must agree on per-cpuness */
+ if ((old->flags & IRQF_PERCPU) !=
+ (new->flags & IRQF_PERCPU))
+ goto mismatch;
+
+ /* add new interrupt at end of irq queue */
+ do {
+ /*
+ * Or all existing action->thread_mask bits,
+ * so we can find the next zero bit for this
+ * new action.
+ */
+ thread_mask |= old->thread_mask;
+ old_ptr = &old->next;
+ old = *old_ptr;
+ } while (old);
+ shared = 1;
+ }
+
+ /*
+ * Setup the thread mask for this irqaction for ONESHOT. For
+ * !ONESHOT irqs the thread mask is 0 so we can avoid a
+ * conditional in irq_wake_thread().
+ */
+ if (new->flags & IRQF_ONESHOT) {
+ /*
+ * Unlikely to have 32 resp 64 irqs sharing one line,
+ * but who knows.
+ */
+ if (thread_mask == ~0UL) {
+ ret = -EBUSY;
+ goto out_mask;
+ }
+ /*
+ * The thread_mask for the action is or'ed to
+ * desc->thread_active to indicate that the
+ * IRQF_ONESHOT thread handler has been woken, but not
+ * yet finished. The bit is cleared when a thread
+ * completes. When all threads of a shared interrupt
+ * line have completed desc->threads_active becomes
+ * zero and the interrupt line is unmasked. See
+ * handle.c:irq_wake_thread() for further information.
+ *
+ * If no thread is woken by primary (hard irq context)
+ * interrupt handlers, then desc->threads_active is
+ * also checked for zero to unmask the irq line in the
+ * affected hard irq flow handlers
+ * (handle_[fasteoi|level]_irq).
+ *
+ * The new action gets the first zero bit of
+ * thread_mask assigned. See the loop above which or's
+ * all existing action->thread_mask bits.
+ */
+ new->thread_mask = 1 << ffz(thread_mask);
+
+ } else if (new->handler == irq_default_primary_handler &&
+ !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
+ /*
+ * The interrupt was requested with handler = NULL, so
+ * we use the default primary handler for it. But it
+ * does not have the oneshot flag set. In combination
+ * with level interrupts this is deadly, because the
+ * default primary handler just wakes the thread, then
+ * the irq lines is reenabled, but the device still
+ * has the level irq asserted. Rinse and repeat....
+ *
+ * While this works for edge type interrupts, we play
+ * it safe and reject unconditionally because we can't
+ * say for sure which type this interrupt really
+ * has. The type flags are unreliable as the
+ * underlying chip implementation can override them.
+ */
+ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
+ irq);
+ ret = -EINVAL;
+ goto out_mask;
+ }
+
+ if (!shared) {
+ ret = irq_request_resources(desc);
+ if (ret) {
+ pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+ new->name, irq, desc->irq_data.chip->name);
+ goto out_mask;
+ }
+
+ init_waitqueue_head(&desc->wait_for_threads);
+
+ /* Setup the type (level, edge polarity) if configured: */
+ if (new->flags & IRQF_TRIGGER_MASK) {
+ ret = __irq_set_trigger(desc, irq,
+ new->flags & IRQF_TRIGGER_MASK);
+
+ if (ret)
+ goto out_mask;
+ }
+
+ desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
+ IRQS_ONESHOT | IRQS_WAITING);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+
+ if (new->flags & IRQF_PERCPU) {
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ irq_settings_set_per_cpu(desc);
+ }
+
+ if (new->flags & IRQF_ONESHOT)
+ desc->istate |= IRQS_ONESHOT;
+
+ if (irq_settings_can_autoenable(desc))
+ irq_startup(desc, true);
+ else
+ /* Undo nested disables: */
+ desc->depth = 1;
+
+ /* Exclude IRQ from balancing if requested */
+ if (new->flags & IRQF_NOBALANCING) {
+ irq_settings_set_no_balancing(desc);
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ }
+
+ /* Set default affinity mask once everything is setup */
+ setup_affinity(irq, desc, mask);
+
+ } else if (new->flags & IRQF_TRIGGER_MASK) {
+ unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
+ unsigned int omsk = irq_settings_get_trigger_mask(desc);
+
+ if (nmsk != omsk)
+ /* hope the handler works with current trigger mode */
+ pr_warning("irq %d uses trigger mode %u; requested %u\n",
+ irq, nmsk, omsk);
+ }
+
+ new->irq = irq;
+ *old_ptr = new;
+
+ irq_pm_install_action(desc, new);
+
+ /* Reset broken irq detection when installing new handler */
+ desc->irq_count = 0;
+ desc->irqs_unhandled = 0;
+
+ /*
+ * Check whether we disabled the irq via the spurious handler
+ * before. Reenable it and give it another chance.
+ */
+ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
+ desc->istate &= ~IRQS_SPURIOUS_DISABLED;
+ __enable_irq(desc, irq);
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ /*
+ * Strictly no need to wake it up, but hung_task complains
+ * when no hard interrupt wakes the thread up.
+ */
+ if (new->thread)
+ wake_up_process(new->thread);
+
+ register_irq_proc(irq, desc);
+ new->dir = NULL;
+ register_handler_proc(irq, new);
+ free_cpumask_var(mask);
+
+ return 0;
+
+mismatch:
+ if (!(new->flags & IRQF_PROBE_SHARED)) {
+ pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
+ irq, new->flags, new->name, old->flags, old->name);
+#ifdef CONFIG_DEBUG_SHIRQ
+ dump_stack();
+#endif
+ }
+ ret = -EBUSY;
+
+out_mask:
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ free_cpumask_var(mask);
+
+out_thread:
+ if (new->thread) {
+ struct task_struct *t = new->thread;
+
+ new->thread = NULL;
+ kthread_stop(t);
+ put_task_struct(t);
+ }
+out_mput:
+ module_put(desc->owner);
+ return ret;
+}
+
+/**
+ * setup_irq - setup an interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
+ *
+ * Used to statically setup interrupts in the early boot process.
+ */
+int setup_irq(unsigned int irq, struct irqaction *act)
+{
+ int retval;
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return -EINVAL;
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, act);
+ chip_bus_sync_unlock(desc);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(setup_irq);
+
+/*
+ * Internal function to unregister an irqaction - used to free
+ * regular and special interrupts that are part of the architecture.
+ */
+static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action, **action_ptr;
+ unsigned long flags;
+
+ WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
+
+ if (!desc)
+ return NULL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /*
+ * There can be multiple actions per IRQ descriptor, find the right
+ * one based on the dev_id:
+ */
+ action_ptr = &desc->action;
+ for (;;) {
+ action = *action_ptr;
+
+ if (!action) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return NULL;
+ }
+
+ if (action->dev_id == dev_id)
+ break;
+ action_ptr = &action->next;
+ }
+
+ /* Found it - now remove it from the list of entries: */
+ *action_ptr = action->next;
+
+ irq_pm_remove_action(desc, action);
+
+ /* If this was the last handler, shut down the IRQ line: */
+ if (!desc->action) {
+ irq_shutdown(desc);
+ irq_release_resources(desc);
+ }
+
+#ifdef CONFIG_SMP
+ /* make sure affinity_hint is cleaned up */
+ if (WARN_ON_ONCE(desc->affinity_hint))
+ desc->affinity_hint = NULL;
+#endif
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ unregister_handler_proc(irq, action);
+
+ /* Make sure it's not being used on another CPU: */
+ synchronize_irq(irq);
+
+#ifdef CONFIG_DEBUG_SHIRQ
+ /*
+ * It's a shared IRQ -- the driver ought to be prepared for an IRQ
+ * event to happen even now it's being freed, so let's make sure that
+ * is so by doing an extra call to the handler ....
+ *
+ * ( We do this after actually deregistering it, to make sure that a
+ * 'real' IRQ doesn't run in * parallel with our fake. )
+ */
+ if (action->flags & IRQF_SHARED) {
+ local_irq_save(flags);
+ action->handler(irq, dev_id);
+ local_irq_restore(flags);
+ }
+#endif
+
+ if (action->thread) {
+ kthread_stop(action->thread);
+ put_task_struct(action->thread);
+ }
+
+ module_put(desc->owner);
+ return action;
+}
+
+/**
+ * remove_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically setup by the early boot process.
+ */
+void remove_irq(unsigned int irq, struct irqaction *act)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ __free_irq(irq, act->dev_id);
+}
+EXPORT_SYMBOL_GPL(remove_irq);
+
+/**
+ * free_irq - free an interrupt allocated with request_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return;
+
+#ifdef CONFIG_SMP
+ if (WARN_ON(desc->affinity_notify))
+ desc->affinity_notify = NULL;
+#endif
+
+ chip_bus_lock(desc);
+ kfree(__free_irq(irq, dev_id));
+ chip_bus_sync_unlock(desc);
+}
+EXPORT_SYMBOL(free_irq);
+
+/**
+ * request_threaded_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Primary handler for threaded interrupts
+ * If NULL and thread_fn != NULL the default
+ * primary handler is installed
+ * @thread_fn: Function called from the irq handler thread
+ * If NULL, no irq thread is created
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. From the point this
+ * call is made your handler function may be invoked. Since
+ * your handler function must clear any interrupt the board
+ * raises, you must take care both to initialise your hardware
+ * and to set up the interrupt handler in the right order.
+ *
+ * If you want to set up a threaded irq handler for your device
+ * then you need to supply @handler and @thread_fn. @handler is
+ * still called in hard interrupt context and has to check
+ * whether the interrupt originates from the device. If yes it
+ * needs to disable the interrupt on the device and return
+ * IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support
+ * shared interrupts.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id
+ * as this is required when freeing the interrupt.
+ *
+ * Flags:
+ *
+ * IRQF_SHARED Interrupt is shared
+ * IRQF_TRIGGER_* Specify active edge(s) or level
+ *
+ */
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+ irq_handler_t thread_fn, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ int retval;
+
+ /*
+ * Sanity-check: shared interrupts must pass in a real dev-ID,
+ * otherwise we'll have trouble later trying to figure out
+ * which interrupt is which (messes up the interrupt freeing
+ * logic etc).
+ *
+ * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
+ * it cannot be set along with IRQF_NO_SUSPEND.
+ */
+ if (((irqflags & IRQF_SHARED) && !dev_id) ||
+ (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
+ ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ return -EINVAL;
+
+ if (!irq_settings_can_request(desc) ||
+ WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return -EINVAL;
+
+ if (!handler) {
+ if (!thread_fn)
+ return -EINVAL;
+ handler = irq_default_primary_handler;
+ }
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->thread_fn = thread_fn;
+ action->flags = irqflags;
+ action->name = devname;
+ action->dev_id = dev_id;
+
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, action);
+ chip_bus_sync_unlock(desc);
+
+ if (retval)
+ kfree(action);
+
+#ifdef CONFIG_DEBUG_SHIRQ_FIXME
+ if (!retval && (irqflags & IRQF_SHARED)) {
+ /*
+ * It's a shared IRQ -- the driver ought to be prepared for it
+ * to happen immediately, so let's make sure....
+ * We disable the irq to make sure that a 'real' IRQ doesn't
+ * run in parallel with our fake.
+ */
+ unsigned long flags;
+
+ disable_irq(irq);
+ local_irq_save(flags);
+
+ handler(irq, dev_id);
+
+ local_irq_restore(flags);
+ enable_irq(irq);
+ }
+#endif
+ return retval;
+}
+EXPORT_SYMBOL(request_threaded_irq);
+
+/**
+ * request_any_context_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @flags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. It selects either a
+ * hardirq or threaded handling method depending on the
+ * context.
+ *
+ * On failure, it returns a negative value. On success,
+ * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
+ */
+int request_any_context_irq(unsigned int irq, irq_handler_t handler,
+ unsigned long flags, const char *name, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int ret;
+
+ if (!desc)
+ return -EINVAL;
+
+ if (irq_settings_is_nested_thread(desc)) {
+ ret = request_threaded_irq(irq, NULL, handler,
+ flags, name, dev_id);
+ return !ret ? IRQC_IS_NESTED : ret;
+ }
+
+ ret = request_irq(irq, handler, flags, name, dev_id);
+ return !ret ? IRQC_IS_HARDIRQ : ret;
+}
+EXPORT_SYMBOL_GPL(request_any_context_irq);
+
+void enable_percpu_irq(unsigned int irq, unsigned int type)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+ if (!desc)
+ return;
+
+ type &= IRQ_TYPE_SENSE_MASK;
+ if (type != IRQ_TYPE_NONE) {
+ int ret;
+
+ ret = __irq_set_trigger(desc, irq, type);
+
+ if (ret) {
+ WARN(1, "failed to set type for IRQ%d\n", irq);
+ goto out;
+ }
+ }
+
+ irq_percpu_enable(desc, cpu);
+out:
+ irq_put_desc_unlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(enable_percpu_irq);
+
+void disable_percpu_irq(unsigned int irq)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+ if (!desc)
+ return;
+
+ irq_percpu_disable(desc, cpu);
+ irq_put_desc_unlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(disable_percpu_irq);
+
+/*
+ * Internal function to unregister a percpu irqaction.
+ */
+static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned long flags;
+
+ WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
+
+ if (!desc)
+ return NULL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ action = desc->action;
+ if (!action || action->percpu_dev_id != dev_id) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ goto bad;
+ }
+
+ if (!cpumask_empty(desc->percpu_enabled)) {
+ WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
+ irq, cpumask_first(desc->percpu_enabled));
+ goto bad;
+ }
+
+ /* Found it - now remove it from the list of entries: */
+ desc->action = NULL;
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ unregister_handler_proc(irq, action);
+
+ module_put(desc->owner);
+ return action;
+
+bad:
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return NULL;
+}
+
+/**
+ * remove_percpu_irq - free a per-cpu interrupt
+ * @irq: Interrupt line to free
+ * @act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically setup by the early boot process.
+ */
+void remove_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc && irq_settings_is_per_cpu_devid(desc))
+ __free_percpu_irq(irq, act->percpu_dev_id);
+}
+
+/**
+ * free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove a percpu interrupt handler. The handler is removed, but
+ * the interrupt line is not disabled. This must be done on each
+ * CPU before calling this function. The function does not return
+ * until any executing interrupts for this IRQ have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return;
+
+ chip_bus_lock(desc);
+ kfree(__free_percpu_irq(irq, dev_id));
+ chip_bus_sync_unlock(desc);
+}
+
+/**
+ * setup_percpu_irq - setup a per-cpu interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
+ *
+ * Used to statically setup per-cpu interrupts in the early boot process.
+ */
+int setup_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int retval;
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return -EINVAL;
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, act);
+ chip_bus_sync_unlock(desc);
+
+ return retval;
+}
+
+/**
+ * request_percpu_irq - allocate a percpu interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources, but doesn't
+ * automatically enable the interrupt. It has to be done on each
+ * CPU using enable_percpu_irq().
+ *
+ * Dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
+ */
+int request_percpu_irq(unsigned int irq, irq_handler_t handler,
+ const char *devname, void __percpu *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ int retval;
+
+ if (!dev_id)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+ if (!desc || !irq_settings_can_request(desc) ||
+ !irq_settings_is_per_cpu_devid(desc))
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
+ action->name = devname;
+ action->percpu_dev_id = dev_id;
+
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, action);
+ chip_bus_sync_unlock(desc);
+
+ if (retval)
+ kfree(action);
+
+ return retval;
+}
+
+/**
+ * irq_get_irqchip_state - returns the irqchip state of a interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be storeed
+ *
+ * This call snapshots the internal irqchip state of an
+ * interrupt, returning into @state the bit corresponding to
+ * stage @which
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
+
+/**
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * This call sets the internal irqchip state of an interrupt,
+ * depending on the value of @which.
+ *
+ * This function should be called with preemption disabled if the
+ * interrupt controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
+ bool val)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ desc = irq_get_desc_buslock(irq, &flags, 0);
+ if (!desc)
+ return err;
+
+ data = irq_desc_get_irq_data(desc);
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_set_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_set_irqchip_state(data, which, val);
+
+ irq_put_desc_busunlock(desc, flags);
+ return err;
+}
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 000000000..ca3f4aaff
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,72 @@
+
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+void irq_move_masked_irq(struct irq_data *idata)
+{
+ struct irq_desc *desc = irq_data_to_desc(idata);
+ struct irq_chip *chip = idata->chip;
+
+ if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
+ return;
+
+ /*
+ * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
+ */
+ if (!irqd_can_balance(&desc->irq_data)) {
+ WARN_ON(1);
+ return;
+ }
+
+ irqd_clr_move_pending(&desc->irq_data);
+
+ if (unlikely(cpumask_empty(desc->pending_mask)))
+ return;
+
+ if (!chip->irq_set_affinity)
+ return;
+
+ assert_raw_spin_locked(&desc->lock);
+
+ /*
+ * If there was a valid mask to work with, please
+ * do the disable, re-program, enable sequence.
+ * This is *not* particularly important for level triggered
+ * but in a edge trigger case, we might be setting rte
+ * when an active trigger is coming in. This could
+ * cause some ioapics to mal-function.
+ * Being paranoid i guess!
+ *
+ * For correct operation this depends on the caller
+ * masking the irqs.
+ */
+ if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
+ irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
+
+ cpumask_clear(desc->pending_mask);
+}
+
+void irq_move_irq(struct irq_data *idata)
+{
+ bool masked;
+
+ if (likely(!irqd_is_setaffinity_pending(idata)))
+ return;
+
+ if (unlikely(irqd_irq_disabled(idata)))
+ return;
+
+ /*
+ * Be careful vs. already masked interrupts. If this is a
+ * threaded interrupt with ONESHOT set, we can end up with an
+ * interrupt storm.
+ */
+ masked = irqd_irq_masked(idata);
+ if (!masked)
+ idata->chip->irq_mask(idata);
+ irq_move_masked_irq(idata);
+ if (!masked)
+ idata->chip->irq_unmask(idata);
+}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
new file mode 100644
index 000000000..474de5cb3
--- /dev/null
+++ b/kernel/irq/msi.c
@@ -0,0 +1,337 @@
+/*
+ * linux/kernel/irq/msi.c
+ *
+ * Copyright (C) 2014 Intel Corp.
+ * Author: Jiang Liu <jiang.liu@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file contains common code to support Message Signalled Interrupt for
+ * PCI compatible and non PCI compatible devices.
+ */
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+
+/* Temparory solution for building, will be removed later */
+#include <linux/pci.h>
+
+void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+ *msg = entry->msg;
+}
+
+void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+ struct msi_desc *entry = irq_get_msi_desc(irq);
+
+ __get_cached_msi_msg(entry, msg);
+}
+EXPORT_SYMBOL_GPL(get_cached_msi_msg);
+
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static inline void irq_chip_write_msi_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ data->chip->irq_write_msi_msg(data, msg);
+}
+
+/**
+ * msi_domain_set_affinity - Generic affinity setter function for MSI domains
+ * @irq_data: The irq data associated to the interrupt
+ * @mask: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Intended to be used by MSI interrupt controllers which are
+ * implemented with hierarchical domains.
+ */
+int msi_domain_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ struct msi_msg msg;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+ }
+
+ return ret;
+}
+
+static void msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static void msi_domain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ memset(&msg, 0, sizeof(msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
+ int i, ret;
+
+ if (irq_find_mapping(domain, hwirq) > 0)
+ return -EEXIST;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
+ if (ret < 0) {
+ if (ops->msi_free) {
+ for (i--; i > 0; i--)
+ ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct msi_domain_info *info = domain->host_data;
+ int i;
+
+ if (info->ops->msi_free) {
+ for (i = 0; i < nr_irqs; i++)
+ info->ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+static struct irq_domain_ops msi_domain_ops = {
+ .alloc = msi_domain_alloc,
+ .free = msi_domain_free,
+ .activate = msi_domain_activate,
+ .deactivate = msi_domain_deactivate,
+};
+
+#ifdef GENERIC_MSI_DOMAIN_OPS
+static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->hwirq;
+}
+
+static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ memset(arg, 0, sizeof(*arg));
+ return 0;
+}
+
+static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
+ struct msi_desc *desc)
+{
+ arg->desc = desc;
+}
+#else
+#define msi_domain_ops_get_hwirq NULL
+#define msi_domain_ops_prepare NULL
+#define msi_domain_ops_set_desc NULL
+#endif /* !GENERIC_MSI_DOMAIN_OPS */
+
+static int msi_domain_ops_init(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ unsigned int virq, irq_hw_number_t hwirq,
+ msi_alloc_info_t *arg)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
+ info->chip_data);
+ if (info->handler && info->handler_name) {
+ __irq_set_handler(virq, info->handler, 0, info->handler_name);
+ if (info->handler_data)
+ irq_set_handler_data(virq, info->handler_data);
+ }
+ return 0;
+}
+
+static int msi_domain_ops_check(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ return 0;
+}
+
+static struct msi_domain_ops msi_domain_ops_default = {
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_check = msi_domain_ops_check,
+ .msi_prepare = msi_domain_ops_prepare,
+ .set_desc = msi_domain_ops_set_desc,
+};
+
+static void msi_domain_update_dom_ops(struct msi_domain_info *info)
+{
+ struct msi_domain_ops *ops = info->ops;
+
+ if (ops == NULL) {
+ info->ops = &msi_domain_ops_default;
+ return;
+ }
+
+ if (ops->get_hwirq == NULL)
+ ops->get_hwirq = msi_domain_ops_default.get_hwirq;
+ if (ops->msi_init == NULL)
+ ops->msi_init = msi_domain_ops_default.msi_init;
+ if (ops->msi_check == NULL)
+ ops->msi_check = msi_domain_ops_default.msi_check;
+ if (ops->msi_prepare == NULL)
+ ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->set_desc == NULL)
+ ops->set_desc = msi_domain_ops_default.set_desc;
+}
+
+static void msi_domain_update_chip_ops(struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+
+ BUG_ON(!chip);
+ if (!chip->irq_mask)
+ chip->irq_mask = pci_msi_mask_irq;
+ if (!chip->irq_unmask)
+ chip->irq_unmask = pci_msi_unmask_irq;
+ if (!chip->irq_set_affinity)
+ chip->irq_set_affinity = msi_domain_set_affinity;
+}
+
+/**
+ * msi_create_irq_domain - Create a MSI interrupt domain
+ * @of_node: Optional device-tree node of the interrupt controller
+ * @info: MSI domain info
+ * @parent: Parent irq domain
+ */
+struct irq_domain *msi_create_irq_domain(struct device_node *node,
+ struct msi_domain_info *info,
+ struct irq_domain *parent)
+{
+ if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
+ info);
+}
+
+/**
+ * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Returns 0 on success or an error code.
+ */
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ msi_alloc_info_t arg;
+ struct msi_desc *desc;
+ int i, ret, virq = -1;
+
+ ret = ops->msi_check(domain, info, dev);
+ if (ret == 0)
+ ret = ops->msi_prepare(domain, dev, nvec, &arg);
+ if (ret)
+ return ret;
+
+ for_each_msi_entry(desc, dev) {
+ ops->set_desc(&arg, desc);
+ if (info->flags & MSI_FLAG_IDENTITY_MAP)
+ virq = (int)ops->get_hwirq(info, &arg);
+ else
+ virq = -1;
+
+ virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ dev_to_node(dev), &arg, false);
+ if (virq < 0) {
+ ret = -ENOSPC;
+ if (ops->handle_error)
+ ret = ops->handle_error(domain, desc, ret);
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, ret);
+ return ret;
+ }
+
+ for (i = 0; i < desc->nvec_used; i++)
+ irq_set_msi_desc_off(virq, i, desc);
+ }
+
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, 0);
+
+ for_each_msi_entry(desc, dev) {
+ if (desc->nvec_used == 1)
+ dev_dbg(dev, "irq %d for MSI\n", virq);
+ else
+ dev_dbg(dev, "irq [%d-%d] for MSI\n",
+ virq, virq + desc->nvec_used - 1);
+ }
+
+ return 0;
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
+ * @domain: The domain to managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are free
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ struct msi_desc *desc;
+
+ for_each_msi_entry(desc, dev) {
+ /*
+ * We might have failed to allocate an MSI early
+ * enough that there is no IRQ associated to this
+ * entry. If that's the case, don't do anything.
+ */
+ if (desc->irq) {
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ desc->irq = 0;
+ }
+ }
+}
+
+/**
+ * msi_get_domain_info - Get the MSI interrupt domain info for @domain
+ * @domain: The interrupt domain to retrieve data from
+ *
+ * Returns the pointer to the msi_domain_info stored in
+ * @domain->host_data.
+ */
+struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
+{
+ return (struct msi_domain_info *)domain->host_data;
+}
+
+#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
new file mode 100644
index 000000000..5204a6d1b
--- /dev/null
+++ b/kernel/irq/pm.c
@@ -0,0 +1,206 @@
+/*
+ * linux/kernel/irq/pm.c
+ *
+ * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file contains power management functions related to interrupts.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/syscore_ops.h>
+
+#include "internals.h"
+
+bool irq_pm_check_wakeup(struct irq_desc *desc)
+{
+ if (irqd_is_wakeup_armed(&desc->irq_data)) {
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_wakeup();
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Called from __setup_irq() with desc->lock held after @action has
+ * been installed in the action chain.
+ */
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions++;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth++;
+
+ WARN_ON_ONCE(desc->force_resume_depth &&
+ desc->force_resume_depth != desc->nr_actions);
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth++;
+ else if (action->flags & IRQF_COND_SUSPEND)
+ desc->cond_suspend_depth++;
+
+ WARN_ON_ONCE(desc->no_suspend_depth &&
+ (desc->no_suspend_depth +
+ desc->cond_suspend_depth) != desc->nr_actions);
+}
+
+/*
+ * Called from __free_irq() with desc->lock held after @action has
+ * been removed from the action chain.
+ */
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions--;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth--;
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth--;
+ else if (action->flags & IRQF_COND_SUSPEND)
+ desc->cond_suspend_depth--;
+}
+
+static bool suspend_device_irq(struct irq_desc *desc, int irq)
+{
+ if (!desc->action || desc->no_suspend_depth)
+ return false;
+
+ if (irqd_is_wakeup_set(&desc->irq_data)) {
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ /*
+ * We return true here to force the caller to issue
+ * synchronize_irq(). We need to make sure that the
+ * IRQD_WAKEUP_ARMED is visible before we return from
+ * suspend_device_irqs().
+ */
+ return true;
+ }
+
+ desc->istate |= IRQS_SUSPENDED;
+ __disable_irq(desc, irq);
+
+ /*
+ * Hardware which has no wakeup source configuration facility
+ * requires that the non wakeup interrupts are masked at the
+ * chip level. The chip implementation indicates that with
+ * IRQCHIP_MASK_ON_SUSPEND.
+ */
+ if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ mask_irq(desc);
+ return true;
+}
+
+/**
+ * suspend_device_irqs - disable all currently enabled interrupt lines
+ *
+ * During system-wide suspend or hibernation device drivers need to be
+ * prevented from receiving interrupts and this function is provided
+ * for this purpose.
+ *
+ * So we disable all interrupts and mark them IRQS_SUSPENDED except
+ * for those which are unused, those which are marked as not
+ * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
+ * set and those which are marked as active wakeup sources.
+ *
+ * The active wakeup sources are handled by the flow handler entry
+ * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
+ * interrupt and notifies the pm core about the wakeup.
+ */
+void suspend_device_irqs(void)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ for_each_irq_desc(irq, desc) {
+ unsigned long flags;
+ bool sync;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ sync = suspend_device_irq(desc, irq);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ if (sync)
+ synchronize_irq(irq);
+ }
+}
+EXPORT_SYMBOL_GPL(suspend_device_irqs);
+
+static void resume_irq(struct irq_desc *desc, int irq)
+{
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+
+ if (desc->istate & IRQS_SUSPENDED)
+ goto resume;
+
+ /* Force resume the interrupt? */
+ if (!desc->force_resume_depth)
+ return;
+
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+resume:
+ desc->istate &= ~IRQS_SUSPENDED;
+ __enable_irq(desc, irq);
+}
+
+static void resume_irqs(bool want_early)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ for_each_irq_desc(irq, desc) {
+ unsigned long flags;
+ bool is_early = desc->action &&
+ desc->action->flags & IRQF_EARLY_RESUME;
+
+ if (!is_early && want_early)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ resume_irq(desc, irq);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
+
+/**
+ * irq_pm_syscore_ops - enable interrupt lines early
+ *
+ * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
+ */
+static void irq_pm_syscore_resume(void)
+{
+ resume_irqs(true);
+}
+
+static struct syscore_ops irq_pm_syscore_ops = {
+ .resume = irq_pm_syscore_resume,
+};
+
+static int __init irq_pm_init_ops(void)
+{
+ register_syscore_ops(&irq_pm_syscore_ops);
+ return 0;
+}
+
+device_initcall(irq_pm_init_ops);
+
+/**
+ * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
+ *
+ * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
+ * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
+ * set as well as those with %IRQF_FORCE_RESUME.
+ */
+void resume_device_irqs(void)
+{
+ resume_irqs(false);
+}
+EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
new file mode 100644
index 000000000..df2f4642d
--- /dev/null
+++ b/kernel/irq/proc.c
@@ -0,0 +1,501 @@
+/*
+ * linux/kernel/irq/proc.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the /proc/irq/ handling code.
+ */
+
+#include <linux/irq.h>
+#include <linux/gfp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+/*
+ * Access rules:
+ *
+ * procfs protects read/write of /proc/irq/N/ files against a
+ * concurrent free of the interrupt descriptor. remove_proc_entry()
+ * immediately prevents new read/writes to happen and waits for
+ * already running read/write functions to complete.
+ *
+ * We remove the proc entries first and then delete the interrupt
+ * descriptor from the radix tree and free it. So it is guaranteed
+ * that irq_to_desc(N) is valid as long as the read/writes are
+ * permitted by procfs.
+ *
+ * The read from /proc/interrupts is a different problem because there
+ * is no protection. So the lookup and the access to irqdesc
+ * information must be protected by sparse_irq_lock.
+ */
+static struct proc_dir_entry *root_irq_dir;
+
+#ifdef CONFIG_SMP
+
+static int show_irq_affinity(int type, struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long)m->private);
+ const struct cpumask *mask = desc->irq_data.affinity;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (irqd_is_setaffinity_pending(&desc->irq_data))
+ mask = desc->pending_mask;
+#endif
+ if (type)
+ seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
+ else
+ seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
+ return 0;
+}
+
+static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long)m->private);
+ unsigned long flags;
+ cpumask_var_t mask;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ if (desc->affinity_hint)
+ cpumask_copy(mask, desc->affinity_hint);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
+ free_cpumask_var(mask);
+
+ return 0;
+}
+
+#ifndef is_affinity_mask_valid
+#define is_affinity_mask_valid(val) 1
+#endif
+
+int no_irq_affinity;
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(0, m, v);
+}
+
+static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(1, m, v);
+}
+
+
+static ssize_t write_irq_affinity(int type, struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
+ cpumask_var_t new_value;
+ int err;
+
+ if (!irq_can_set_affinity(irq) || no_irq_affinity)
+ return -EIO;
+
+ if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (type)
+ err = cpumask_parselist_user(buffer, count, new_value);
+ else
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ goto free_cpumask;
+
+ if (!is_affinity_mask_valid(new_value)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
+ /*
+ * Do not allow disabling IRQs completely - it's a too easy
+ * way to make the system unusable accidentally :-) At least
+ * one online CPU still has to be targeted.
+ */
+ if (!cpumask_intersects(new_value, cpu_online_mask)) {
+ /* Special case for empty set - allow the architecture
+ code to set default SMP affinity. */
+ err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
+ } else {
+ irq_set_affinity(irq, new_value);
+ err = count;
+ }
+
+free_cpumask:
+ free_cpumask_var(new_value);
+ return err;
+}
+
+static ssize_t irq_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(0, file, buffer, count, pos);
+}
+
+static ssize_t irq_affinity_list_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(1, file, buffer, count, pos);
+}
+
+static int irq_affinity_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
+}
+
+static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
+}
+
+static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations irq_affinity_proc_fops = {
+ .open = irq_affinity_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_proc_write,
+};
+
+static const struct file_operations irq_affinity_hint_proc_fops = {
+ .open = irq_affinity_hint_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations irq_affinity_list_proc_fops = {
+ .open = irq_affinity_list_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_list_proc_write,
+};
+
+static int default_affinity_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
+ return 0;
+}
+
+static ssize_t default_affinity_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
+{
+ cpumask_var_t new_value;
+ int err;
+
+ if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (err)
+ goto out;
+
+ if (!is_affinity_mask_valid(new_value)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Do not allow disabling IRQs completely - it's a too easy
+ * way to make the system unusable accidentally :-) At least
+ * one online CPU still has to be targeted.
+ */
+ if (!cpumask_intersects(new_value, cpu_online_mask)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ cpumask_copy(irq_default_affinity, new_value);
+ err = count;
+
+out:
+ free_cpumask_var(new_value);
+ return err;
+}
+
+static int default_affinity_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, default_affinity_show, PDE_DATA(inode));
+}
+
+static const struct file_operations default_affinity_proc_fops = {
+ .open = default_affinity_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = default_affinity_write,
+};
+
+static int irq_node_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long) m->private);
+
+ seq_printf(m, "%d\n", desc->irq_data.node);
+ return 0;
+}
+
+static int irq_node_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_node_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations irq_node_proc_fops = {
+ .open = irq_node_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static int irq_spurious_proc_show(struct seq_file *m, void *v)
+{
+ struct irq_desc *desc = irq_to_desc((long) m->private);
+
+ seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n",
+ desc->irq_count, desc->irqs_unhandled,
+ jiffies_to_msecs(desc->last_unhandled));
+ return 0;
+}
+
+static int irq_spurious_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations irq_spurious_proc_fops = {
+ .open = irq_spurious_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned long flags;
+ int ret = 1;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action ; action; action = action->next) {
+ if ((action != new_action) && action->name &&
+ !strcmp(new_action->name, action->name)) {
+ ret = 0;
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ char name [MAX_NAMELEN];
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc->dir || action->dir || !action->name ||
+ !name_unique(irq, action))
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+ /* create /proc/irq/1234/handler/ */
+ action->dir = proc_mkdir(name, desc->dir);
+}
+
+#undef MAX_NAMELEN
+
+#define MAX_NAMELEN 10
+
+void register_irq_proc(unsigned int irq, struct irq_desc *desc)
+{
+ char name [MAX_NAMELEN];
+
+ if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
+ return;
+
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%d", irq);
+
+ /* create /proc/irq/1234 */
+ desc->dir = proc_mkdir(name, root_irq_dir);
+ if (!desc->dir)
+ return;
+
+#ifdef CONFIG_SMP
+ /* create /proc/irq/<irq>/smp_affinity */
+ proc_create_data("smp_affinity", 0644, desc->dir,
+ &irq_affinity_proc_fops, (void *)(long)irq);
+
+ /* create /proc/irq/<irq>/affinity_hint */
+ proc_create_data("affinity_hint", 0444, desc->dir,
+ &irq_affinity_hint_proc_fops, (void *)(long)irq);
+
+ /* create /proc/irq/<irq>/smp_affinity_list */
+ proc_create_data("smp_affinity_list", 0644, desc->dir,
+ &irq_affinity_list_proc_fops, (void *)(long)irq);
+
+ proc_create_data("node", 0444, desc->dir,
+ &irq_node_proc_fops, (void *)(long)irq);
+#endif
+
+ proc_create_data("spurious", 0444, desc->dir,
+ &irq_spurious_proc_fops, (void *)(long)irq);
+}
+
+void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
+{
+ char name [MAX_NAMELEN];
+
+ if (!root_irq_dir || !desc->dir)
+ return;
+#ifdef CONFIG_SMP
+ remove_proc_entry("smp_affinity", desc->dir);
+ remove_proc_entry("affinity_hint", desc->dir);
+ remove_proc_entry("smp_affinity_list", desc->dir);
+ remove_proc_entry("node", desc->dir);
+#endif
+ remove_proc_entry("spurious", desc->dir);
+
+ memset(name, 0, MAX_NAMELEN);
+ sprintf(name, "%u", irq);
+ remove_proc_entry(name, root_irq_dir);
+}
+
+#undef MAX_NAMELEN
+
+void unregister_handler_proc(unsigned int irq, struct irqaction *action)
+{
+ proc_remove(action->dir);
+}
+
+static void register_default_affinity_proc(void)
+{
+#ifdef CONFIG_SMP
+ proc_create("irq/default_smp_affinity", 0644, NULL,
+ &default_affinity_proc_fops);
+#endif
+}
+
+void init_irq_proc(void)
+{
+ unsigned int irq;
+ struct irq_desc *desc;
+
+ /* create /proc/irq */
+ root_irq_dir = proc_mkdir("irq", NULL);
+ if (!root_irq_dir)
+ return;
+
+ register_default_affinity_proc();
+
+ /*
+ * Create entries for all existing IRQs.
+ */
+ for_each_irq_desc(irq, desc) {
+ if (!desc)
+ continue;
+
+ register_irq_proc(irq, desc);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IRQ_SHOW
+
+int __weak arch_show_interrupts(struct seq_file *p, int prec)
+{
+ return 0;
+}
+
+#ifndef ACTUAL_NR_IRQS
+# define ACTUAL_NR_IRQS nr_irqs
+#endif
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ static int prec;
+
+ unsigned long flags, any_count = 0;
+ int i = *(loff_t *) v, j;
+ struct irqaction *action;
+ struct irq_desc *desc;
+
+ if (i > ACTUAL_NR_IRQS)
+ return 0;
+
+ if (i == ACTUAL_NR_IRQS)
+ return arch_show_interrupts(p, prec);
+
+ /* print header and calculate the width of the first column */
+ if (i == 0) {
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
+ seq_printf(p, "%*s", prec + 8, "");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d", j);
+ seq_putc(p, '\n');
+ }
+
+ irq_lock_sparse();
+ desc = irq_to_desc(i);
+ if (!desc)
+ goto outsparse;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for_each_online_cpu(j)
+ any_count |= kstat_irqs_cpu(i, j);
+ action = desc->action;
+ if (!action && !any_count)
+ goto out;
+
+ seq_printf(p, "%*d: ", prec, i);
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+
+ if (desc->irq_data.chip) {
+ if (desc->irq_data.chip->irq_print_chip)
+ desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
+ else if (desc->irq_data.chip->name)
+ seq_printf(p, " %8s", desc->irq_data.chip->name);
+ else
+ seq_printf(p, " %8s", "-");
+ } else {
+ seq_printf(p, " %8s", "None");
+ }
+ if (desc->irq_data.domain)
+ seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
+ seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
+#endif
+ if (desc->name)
+ seq_printf(p, "-%-8s", desc->name);
+
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+
+ seq_putc(p, '\n');
+out:
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+outsparse:
+ irq_unlock_sparse();
+ return 0;
+}
+#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 000000000..9065107f0
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,91 @@
+/*
+ * linux/kernel/irq/resend.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner
+ *
+ * This file contains the IRQ-resend code
+ *
+ * If the interrupt is waiting to be processed, we try to re-run it.
+ * We can't directly run it from here since the caller might be in an
+ * interrupt-protected region. Not all irq controller chips can
+ * retrigger interrupts at the hardware level, so in those cases
+ * we allow the resending of IRQs via a tasklet.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+
+/* Bitmap to handle software resend of interrupts: */
+static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
+
+/*
+ * Run software resends of IRQ's
+ */
+static void resend_irqs(unsigned long arg)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ while (!bitmap_empty(irqs_resend, nr_irqs)) {
+ irq = find_first_bit(irqs_resend, nr_irqs);
+ clear_bit(irq, irqs_resend);
+ desc = irq_to_desc(irq);
+ local_irq_disable();
+ desc->handle_irq(irq, desc);
+ local_irq_enable();
+ }
+}
+
+/* Tasklet to handle resend: */
+static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+
+#endif
+
+/*
+ * IRQ resend
+ *
+ * Is called with interrupts disabled and desc->lock held.
+ */
+void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+{
+ /*
+ * We do not resend level type interrupts. Level type
+ * interrupts are resent by hardware when they are still
+ * active. Clear the pending bit so suspend/resume does not
+ * get confused.
+ */
+ if (irq_settings_is_level(desc)) {
+ desc->istate &= ~IRQS_PENDING;
+ return;
+ }
+ if (desc->istate & IRQS_REPLAY)
+ return;
+ if (desc->istate & IRQS_PENDING) {
+ desc->istate &= ~IRQS_PENDING;
+ desc->istate |= IRQS_REPLAY;
+
+ if (!desc->irq_data.chip->irq_retrigger ||
+ !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
+#ifdef CONFIG_HARDIRQS_SW_RESEND
+ /*
+ * If the interrupt has a parent irq and runs
+ * in the thread context of the parent irq,
+ * retrigger the parent.
+ */
+ if (desc->parent_irq &&
+ irq_settings_is_nested_thread(desc))
+ irq = desc->parent_irq;
+ /* Set it pending and activate the softirq: */
+ set_bit(irq, irqs_resend);
+ tasklet_schedule(&resend_tasklet);
+#endif
+ }
+ }
+}
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000..3320b84cc
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,156 @@
+/*
+ * Internal header to deal with irq_desc->status which will be renamed
+ * to irq_desc->settings.
+ */
+enum {
+ _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
+ _IRQ_PER_CPU = IRQ_PER_CPU,
+ _IRQ_LEVEL = IRQ_LEVEL,
+ _IRQ_NOPROBE = IRQ_NOPROBE,
+ _IRQ_NOREQUEST = IRQ_NOREQUEST,
+ _IRQ_NOTHREAD = IRQ_NOTHREAD,
+ _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
+ _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
+ _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
+ _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
+ _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
+ _IRQ_IS_POLLED = IRQ_IS_POLLED,
+ _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
+};
+
+#define IRQ_PER_CPU GOT_YOU_MORON
+#define IRQ_NO_BALANCING GOT_YOU_MORON
+#define IRQ_LEVEL GOT_YOU_MORON
+#define IRQ_NOPROBE GOT_YOU_MORON
+#define IRQ_NOREQUEST GOT_YOU_MORON
+#define IRQ_NOTHREAD GOT_YOU_MORON
+#define IRQ_NOAUTOEN GOT_YOU_MORON
+#define IRQ_NESTED_THREAD GOT_YOU_MORON
+#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
+#define IRQ_IS_POLLED GOT_YOU_MORON
+#undef IRQF_MODIFY_MASK
+#define IRQF_MODIFY_MASK GOT_YOU_MORON
+
+static inline void
+irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
+{
+ desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
+ desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
+}
+
+static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_PER_CPU;
+}
+
+static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
+}
+
+static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_PER_CPU;
+}
+
+static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NO_BALANCING;
+}
+
+static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NO_BALANCING;
+}
+
+static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline void
+irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
+{
+ desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
+ desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline bool irq_settings_is_level(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_LEVEL;
+}
+
+static inline void irq_settings_clr_level(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_LEVEL;
+}
+
+static inline void irq_settings_set_level(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_LEVEL;
+}
+
+static inline bool irq_settings_can_request(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOREQUEST);
+}
+
+static inline void irq_settings_clr_norequest(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOREQUEST;
+}
+
+static inline void irq_settings_set_norequest(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOREQUEST;
+}
+
+static inline bool irq_settings_can_thread(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOTHREAD);
+}
+
+static inline void irq_settings_clr_nothread(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOTHREAD;
+}
+
+static inline void irq_settings_set_nothread(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOTHREAD;
+}
+
+static inline bool irq_settings_can_probe(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOPROBE);
+}
+
+static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOPROBE;
+}
+
+static inline void irq_settings_set_noprobe(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOPROBE;
+}
+
+static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
+}
+
+static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
+}
+
+static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NESTED_THREAD;
+}
+
+static inline bool irq_settings_is_polled(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_IS_POLLED;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
new file mode 100644
index 000000000..e2514b0e4
--- /dev/null
+++ b/kernel/irq/spurious.c
@@ -0,0 +1,467 @@
+/*
+ * linux/kernel/irq/spurious.c
+ *
+ * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains spurious interrupt handling.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/moduleparam.h>
+#include <linux/timer.h>
+
+#include "internals.h"
+
+static int irqfixup __read_mostly;
+
+#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
+static void poll_spurious_irqs(unsigned long dummy);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static int irq_poll_cpu;
+static atomic_t irq_poll_active;
+
+/*
+ * We wait here for a poller to finish.
+ *
+ * If the poll runs on this CPU, then we yell loudly and return
+ * false. That will leave the interrupt line disabled in the worst
+ * case, but it should never happen.
+ *
+ * We wait until the poller is done and then recheck disabled and
+ * action (about to be disabled). Only if it's still active, we return
+ * true and let the handler run.
+ */
+bool irq_wait_for_poll(struct irq_desc *desc)
+{
+ if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+ "irq poll in progress on cpu %d for irq %d\n",
+ smp_processor_id(), desc->irq_data.irq))
+ return false;
+
+#ifdef CONFIG_SMP
+ do {
+ raw_spin_unlock(&desc->lock);
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ } while (irqd_irq_inprogress(&desc->irq_data));
+ /* Might have been disabled in meantime */
+ return !irqd_irq_disabled(&desc->irq_data) && desc->action;
+#else
+ return false;
+#endif
+}
+
+
+/*
+ * Recovery handler for misrouted interrupts.
+ */
+static int try_one_irq(int irq, struct irq_desc *desc, bool force)
+{
+ irqreturn_t ret = IRQ_NONE;
+ struct irqaction *action;
+
+ raw_spin_lock(&desc->lock);
+
+ /*
+ * PER_CPU, nested thread interrupts and interrupts explicitely
+ * marked polled are excluded from polling.
+ */
+ if (irq_settings_is_per_cpu(desc) ||
+ irq_settings_is_nested_thread(desc) ||
+ irq_settings_is_polled(desc))
+ goto out;
+
+ /*
+ * Do not poll disabled interrupts unless the spurious
+ * disabled poller asks explicitely.
+ */
+ if (irqd_irq_disabled(&desc->irq_data) && !force)
+ goto out;
+
+ /*
+ * All handlers must agree on IRQF_SHARED, so we test just the
+ * first.
+ */
+ action = desc->action;
+ if (!action || !(action->flags & IRQF_SHARED) ||
+ (action->flags & __IRQF_TIMER))
+ goto out;
+
+ /* Already running on another processor */
+ if (irqd_irq_inprogress(&desc->irq_data)) {
+ /*
+ * Already running: If it is shared get the other
+ * CPU to go looking for our mystery interrupt too
+ */
+ desc->istate |= IRQS_PENDING;
+ goto out;
+ }
+
+ /* Mark it poll in progress */
+ desc->istate |= IRQS_POLL_INPROGRESS;
+ do {
+ if (handle_irq_event(desc) == IRQ_HANDLED)
+ ret = IRQ_HANDLED;
+ /* Make sure that there is still a valid action */
+ action = desc->action;
+ } while ((desc->istate & IRQS_PENDING) && action);
+ desc->istate &= ~IRQS_POLL_INPROGRESS;
+out:
+ raw_spin_unlock(&desc->lock);
+ return ret == IRQ_HANDLED;
+}
+
+static int misrouted_irq(int irq)
+{
+ struct irq_desc *desc;
+ int i, ok = 0;
+
+ if (atomic_inc_return(&irq_poll_active) != 1)
+ goto out;
+
+ irq_poll_cpu = smp_processor_id();
+
+ for_each_irq_desc(i, desc) {
+ if (!i)
+ continue;
+
+ if (i == irq) /* Already tried */
+ continue;
+
+ if (try_one_irq(i, desc, false))
+ ok = 1;
+ }
+out:
+ atomic_dec(&irq_poll_active);
+ /* So the caller can adjust the irq error counts */
+ return ok;
+}
+
+static void poll_spurious_irqs(unsigned long dummy)
+{
+ struct irq_desc *desc;
+ int i;
+
+ if (atomic_inc_return(&irq_poll_active) != 1)
+ goto out;
+ irq_poll_cpu = smp_processor_id();
+
+ for_each_irq_desc(i, desc) {
+ unsigned int state;
+
+ if (!i)
+ continue;
+
+ /* Racy but it doesn't matter */
+ state = desc->istate;
+ barrier();
+ if (!(state & IRQS_SPURIOUS_DISABLED))
+ continue;
+
+ local_irq_disable();
+ try_one_irq(i, desc, true);
+ local_irq_enable();
+ }
+out:
+ atomic_dec(&irq_poll_active);
+ mod_timer(&poll_spurious_irq_timer,
+ jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+}
+
+static inline int bad_action_ret(irqreturn_t action_ret)
+{
+ if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+ return 0;
+ return 1;
+}
+
+/*
+ * If 99,900 of the previous 100,000 interrupts have not been handled
+ * then assume that the IRQ is stuck in some manner. Drop a diagnostic
+ * and try to turn the IRQ off.
+ *
+ * (The other 100-of-100,000 interrupts may have been a correctly
+ * functioning device sharing an IRQ with the failing one)
+ */
+static void
+__report_bad_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
+{
+ struct irqaction *action;
+ unsigned long flags;
+
+ if (bad_action_ret(action_ret)) {
+ printk(KERN_ERR "irq event %d: bogus return value %x\n",
+ irq, action_ret);
+ } else {
+ printk(KERN_ERR "irq %d: nobody cared (try booting with "
+ "the \"irqpoll\" option)\n", irq);
+ }
+ dump_stack();
+ printk(KERN_ERR "handlers:\n");
+
+ /*
+ * We need to take desc->lock here. note_interrupt() is called
+ * w/o desc->lock held, but IRQ_PROGRESS set. We might race
+ * with something else removing an action. It's ok to take
+ * desc->lock here. See synchronize_irq().
+ */
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ action = desc->action;
+ while (action) {
+ printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+ if (action->thread_fn)
+ printk(KERN_CONT " threaded [<%p>] %pf",
+ action->thread_fn, action->thread_fn);
+ printk(KERN_CONT "\n");
+ action = action->next;
+ }
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+static void
+report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+{
+ static int count = 100;
+
+ if (count > 0) {
+ count--;
+ __report_bad_irq(irq, desc, action_ret);
+ }
+}
+
+static inline int
+try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
+{
+ struct irqaction *action;
+
+ if (!irqfixup)
+ return 0;
+
+ /* We didn't actually handle the IRQ - see if it was misrouted? */
+ if (action_ret == IRQ_NONE)
+ return 1;
+
+ /*
+ * But for 'irqfixup == 2' we also do it for handled interrupts if
+ * they are marked as IRQF_IRQPOLL (or for irq zero, which is the
+ * traditional PC timer interrupt.. Legacy)
+ */
+ if (irqfixup < 2)
+ return 0;
+
+ if (!irq)
+ return 1;
+
+ /*
+ * Since we don't get the descriptor lock, "action" can
+ * change under us. We don't really care, but we don't
+ * want to follow a NULL pointer. So tell the compiler to
+ * just load it once by using a barrier.
+ */
+ action = desc->action;
+ barrier();
+ return action && (action->flags & IRQF_IRQPOLL);
+}
+
+#define SPURIOUS_DEFERRED 0x80000000
+
+void note_interrupt(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
+{
+ if (desc->istate & IRQS_POLL_INPROGRESS ||
+ irq_settings_is_polled(desc))
+ return;
+
+ if (bad_action_ret(action_ret)) {
+ report_bad_irq(irq, desc, action_ret);
+ return;
+ }
+
+ /*
+ * We cannot call note_interrupt from the threaded handler
+ * because we need to look at the compound of all handlers
+ * (primary and threaded). Aside of that in the threaded
+ * shared case we have no serialization against an incoming
+ * hardware interrupt while we are dealing with a threaded
+ * result.
+ *
+ * So in case a thread is woken, we just note the fact and
+ * defer the analysis to the next hardware interrupt.
+ *
+ * The threaded handlers store whether they sucessfully
+ * handled an interrupt and we check whether that number
+ * changed versus the last invocation.
+ *
+ * We could handle all interrupts with the delayed by one
+ * mechanism, but for the non forced threaded case we'd just
+ * add pointless overhead to the straight hardirq interrupts
+ * for the sake of a few lines less code.
+ */
+ if (action_ret & IRQ_WAKE_THREAD) {
+ /*
+ * There is a thread woken. Check whether one of the
+ * shared primary handlers returned IRQ_HANDLED. If
+ * not we defer the spurious detection to the next
+ * interrupt.
+ */
+ if (action_ret == IRQ_WAKE_THREAD) {
+ int handled;
+ /*
+ * We use bit 31 of thread_handled_last to
+ * denote the deferred spurious detection
+ * active. No locking necessary as
+ * thread_handled_last is only accessed here
+ * and we have the guarantee that hard
+ * interrupts are not reentrant.
+ */
+ if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
+ desc->threads_handled_last |= SPURIOUS_DEFERRED;
+ return;
+ }
+ /*
+ * Check whether one of the threaded handlers
+ * returned IRQ_HANDLED since the last
+ * interrupt happened.
+ *
+ * For simplicity we just set bit 31, as it is
+ * set in threads_handled_last as well. So we
+ * avoid extra masking. And we really do not
+ * care about the high bits of the handled
+ * count. We just care about the count being
+ * different than the one we saw before.
+ */
+ handled = atomic_read(&desc->threads_handled);
+ handled |= SPURIOUS_DEFERRED;
+ if (handled != desc->threads_handled_last) {
+ action_ret = IRQ_HANDLED;
+ /*
+ * Note: We keep the SPURIOUS_DEFERRED
+ * bit set. We are handling the
+ * previous invocation right now.
+ * Keep it for the current one, so the
+ * next hardware interrupt will
+ * account for it.
+ */
+ desc->threads_handled_last = handled;
+ } else {
+ /*
+ * None of the threaded handlers felt
+ * responsible for the last interrupt
+ *
+ * We keep the SPURIOUS_DEFERRED bit
+ * set in threads_handled_last as we
+ * need to account for the current
+ * interrupt as well.
+ */
+ action_ret = IRQ_NONE;
+ }
+ } else {
+ /*
+ * One of the primary handlers returned
+ * IRQ_HANDLED. So we don't care about the
+ * threaded handlers on the same line. Clear
+ * the deferred detection bit.
+ *
+ * In theory we could/should check whether the
+ * deferred bit is set and take the result of
+ * the previous run into account here as
+ * well. But it's really not worth the
+ * trouble. If every other interrupt is
+ * handled we never trigger the spurious
+ * detector. And if this is just the one out
+ * of 100k unhandled ones which is handled
+ * then we merily delay the spurious detection
+ * by one hard interrupt. Not a real problem.
+ */
+ desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
+ }
+ }
+
+ if (unlikely(action_ret == IRQ_NONE)) {
+ /*
+ * If we are seeing only the odd spurious IRQ caused by
+ * bus asynchronicity then don't eventually trigger an error,
+ * otherwise the counter becomes a doomsday timer for otherwise
+ * working systems
+ */
+ if (time_after(jiffies, desc->last_unhandled + HZ/10))
+ desc->irqs_unhandled = 1;
+ else
+ desc->irqs_unhandled++;
+ desc->last_unhandled = jiffies;
+ }
+
+ if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
+ int ok = misrouted_irq(irq);
+ if (action_ret == IRQ_NONE)
+ desc->irqs_unhandled -= ok;
+ }
+
+ desc->irq_count++;
+ if (likely(desc->irq_count < 100000))
+ return;
+
+ desc->irq_count = 0;
+ if (unlikely(desc->irqs_unhandled > 99900)) {
+ /*
+ * The interrupt is stuck
+ */
+ __report_bad_irq(irq, desc, action_ret);
+ /*
+ * Now kill the IRQ
+ */
+ printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ desc->istate |= IRQS_SPURIOUS_DISABLED;
+ desc->depth++;
+ irq_disable(desc);
+
+ mod_timer(&poll_spurious_irq_timer,
+ jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ }
+ desc->irqs_unhandled = 0;
+}
+
+bool noirqdebug __read_mostly;
+
+int noirqdebug_setup(char *str)
+{
+ noirqdebug = 1;
+ printk(KERN_INFO "IRQ lockup detection disabled\n");
+
+ return 1;
+}
+
+__setup("noirqdebug", noirqdebug_setup);
+module_param(noirqdebug, bool, 0644);
+MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
+
+static int __init irqfixup_setup(char *str)
+{
+ irqfixup = 1;
+ printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
+ printk(KERN_WARNING "This may impact system performance.\n");
+
+ return 1;
+}
+
+__setup("irqfixup", irqfixup_setup);
+module_param(irqfixup, int, 0644);
+
+static int __init irqpoll_setup(char *str)
+{
+ irqfixup = 2;
+ printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
+ "enabled\n");
+ printk(KERN_WARNING "This may significantly impact system "
+ "performance\n");
+ return 1;
+}
+
+__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000..cbf9fb899
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/irq_work.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <asm/processor.h>
+
+
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);
+
+/*
+ * Claim the entry so that no one else will poke at it.
+ */
+static bool irq_work_claim(struct irq_work *work)
+{
+ unsigned long flags, oflags, nflags;
+
+ /*
+ * Start with our best wish as a premise but only trust any
+ * flag value after cmpxchg() result.
+ */
+ flags = work->flags & ~IRQ_WORK_PENDING;
+ for (;;) {
+ nflags = flags | IRQ_WORK_FLAGS;
+ oflags = cmpxchg(&work->flags, flags, nflags);
+ if (oflags == flags)
+ break;
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
+ flags = oflags;
+ cpu_relax();
+ }
+
+ return true;
+}
+
+void __weak arch_irq_work_raise(void)
+{
+ /*
+ * Lame architectures will get the timer tick callback
+ */
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Enqueue the irq_work @work on @cpu unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
+{
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(cpu));
+
+ /* Arch remote IPI send/receive backend aren't NMI safe */
+ WARN_ON_ONCE(in_nmi());
+
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return false;
+
+ if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+ arch_send_call_function_single_ipi(cpu);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue_on);
+#endif
+
+/* Enqueue the irq work @work on the current CPU */
+bool irq_work_queue(struct irq_work *work)
+{
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return false;
+
+ /* Queue the entry and raise the IPI if needed. */
+ preempt_disable();
+
+ /* If the work is "lazy", handle it from next tick if any */
+ if (work->flags & IRQ_WORK_LAZY) {
+ if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+ tick_nohz_tick_stopped())
+ arch_irq_work_raise();
+ } else {
+ if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+ arch_irq_work_raise();
+ }
+
+ preempt_enable();
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue);
+
+bool irq_work_needs_cpu(void)
+{
+ struct llist_head *raised, *lazy;
+
+ raised = this_cpu_ptr(&raised_list);
+ lazy = this_cpu_ptr(&lazy_list);
+
+ if (llist_empty(raised) || arch_irq_work_has_interrupt())
+ if (llist_empty(lazy))
+ return false;
+
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+
+ return true;
+}
+
+static void irq_work_run_list(struct llist_head *list)
+{
+ unsigned long flags;
+ struct irq_work *work;
+ struct llist_node *llnode;
+
+ BUG_ON(!irqs_disabled());
+
+ if (llist_empty(list))
+ return;
+
+ llnode = llist_del_all(list);
+ while (llnode != NULL) {
+ work = llist_entry(llnode, struct irq_work, llnode);
+
+ llnode = llist_next(llnode);
+
+ /*
+ * Clear the PENDING bit, after this point the @work
+ * can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
+ */
+ flags = work->flags & ~IRQ_WORK_PENDING;
+ xchg(&work->flags, flags);
+
+ work->func(work);
+ /*
+ * Clear the BUSY bit and return to the free state if
+ * no-one else claimed it meanwhile.
+ */
+ (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ }
+}
+
+/*
+ * hotplug calls this through:
+ * hotplug_cfd() -> flush_smp_call_function_queue()
+ */
+void irq_work_run(void)
+{
+ irq_work_run_list(this_cpu_ptr(&raised_list));
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+EXPORT_SYMBOL_GPL(irq_work_run);
+
+void irq_work_tick(void)
+{
+ struct llist_head *raised = this_cpu_ptr(&raised_list);
+
+ if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
+ irq_work_run_list(raised);
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *work)
+{
+ WARN_ON_ONCE(irqs_disabled());
+
+ while (work->flags & IRQ_WORK_BUSY)
+ cpu_relax();
+}
+EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000..9019f15de
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,460 @@
+/*
+ * jump label support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ */
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/err.h>
+#include <linux/static_key.h>
+#include <linux/jump_label_ratelimit.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+/* mutex to protect coming/going of the the jump_label table */
+static DEFINE_MUTEX(jump_label_mutex);
+
+void jump_label_lock(void)
+{
+ mutex_lock(&jump_label_mutex);
+}
+
+void jump_label_unlock(void)
+{
+ mutex_unlock(&jump_label_mutex);
+}
+
+static int jump_label_cmp(const void *a, const void *b)
+{
+ const struct jump_entry *jea = a;
+ const struct jump_entry *jeb = b;
+
+ if (jea->key < jeb->key)
+ return -1;
+
+ if (jea->key > jeb->key)
+ return 1;
+
+ return 0;
+}
+
+static void
+jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
+{
+ unsigned long size;
+
+ size = (((unsigned long)stop - (unsigned long)start)
+ / sizeof(struct jump_entry));
+ sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
+}
+
+static void jump_label_update(struct static_key *key, int enable);
+
+void static_key_slow_inc(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE();
+ if (atomic_inc_not_zero(&key->enabled))
+ return;
+
+ jump_label_lock();
+ if (atomic_read(&key->enabled) == 0) {
+ if (!jump_label_get_branch_default(key))
+ jump_label_update(key, JUMP_LABEL_ENABLE);
+ else
+ jump_label_update(key, JUMP_LABEL_DISABLE);
+ }
+ atomic_inc(&key->enabled);
+ jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_slow_inc);
+
+static void __static_key_slow_dec(struct static_key *key,
+ unsigned long rate_limit, struct delayed_work *work)
+{
+ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
+ WARN(atomic_read(&key->enabled) < 0,
+ "jump label: negative count!\n");
+ return;
+ }
+
+ if (rate_limit) {
+ atomic_inc(&key->enabled);
+ schedule_delayed_work(work, rate_limit);
+ } else {
+ if (!jump_label_get_branch_default(key))
+ jump_label_update(key, JUMP_LABEL_DISABLE);
+ else
+ jump_label_update(key, JUMP_LABEL_ENABLE);
+ }
+ jump_label_unlock();
+}
+
+static void jump_label_update_timeout(struct work_struct *work)
+{
+ struct static_key_deferred *key =
+ container_of(work, struct static_key_deferred, work.work);
+ __static_key_slow_dec(&key->key, 0, NULL);
+}
+
+void static_key_slow_dec(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE();
+ __static_key_slow_dec(key, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(static_key_slow_dec);
+
+void static_key_slow_dec_deferred(struct static_key_deferred *key)
+{
+ STATIC_KEY_CHECK_USE();
+ __static_key_slow_dec(&key->key, key->timeout, &key->work);
+}
+EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+
+void jump_label_rate_limit(struct static_key_deferred *key,
+ unsigned long rl)
+{
+ STATIC_KEY_CHECK_USE();
+ key->timeout = rl;
+ INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
+}
+EXPORT_SYMBOL_GPL(jump_label_rate_limit);
+
+static int addr_conflict(struct jump_entry *entry, void *start, void *end)
+{
+ if (entry->code <= (unsigned long)end &&
+ entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+ return 1;
+
+ return 0;
+}
+
+static int __jump_label_text_reserved(struct jump_entry *iter_start,
+ struct jump_entry *iter_stop, void *start, void *end)
+{
+ struct jump_entry *iter;
+
+ iter = iter_start;
+ while (iter < iter_stop) {
+ if (addr_conflict(iter, start, end))
+ return 1;
+ iter++;
+ }
+
+ return 0;
+}
+
+/*
+ * Update code which is definitely not currently executing.
+ * Architectures which need heavyweight synchronization to modify
+ * running code can override this to make the non-live update case
+ * cheaper.
+ */
+void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ arch_jump_label_transform(entry, type);
+}
+
+static void __jump_label_update(struct static_key *key,
+ struct jump_entry *entry,
+ struct jump_entry *stop, int enable)
+{
+ for (; (entry < stop) &&
+ (entry->key == (jump_label_t)(unsigned long)key);
+ entry++) {
+ /*
+ * entry->code set to 0 invalidates module init text sections
+ * kernel_text_address() verifies we are not in core kernel
+ * init code, see jump_label_invalidate_module_init().
+ */
+ if (entry->code && kernel_text_address(entry->code))
+ arch_jump_label_transform(entry, enable);
+ }
+}
+
+static enum jump_label_type jump_label_type(struct static_key *key)
+{
+ bool true_branch = jump_label_get_branch_default(key);
+ bool state = static_key_enabled(key);
+
+ if ((!true_branch && state) || (true_branch && !state))
+ return JUMP_LABEL_ENABLE;
+
+ return JUMP_LABEL_DISABLE;
+}
+
+void __init jump_label_init(void)
+{
+ struct jump_entry *iter_start = __start___jump_table;
+ struct jump_entry *iter_stop = __stop___jump_table;
+ struct static_key *key = NULL;
+ struct jump_entry *iter;
+
+ jump_label_lock();
+ jump_label_sort_entries(iter_start, iter_stop);
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ struct static_key *iterk;
+
+ iterk = (struct static_key *)(unsigned long)iter->key;
+ arch_jump_label_transform_static(iter, jump_label_type(iterk));
+ if (iterk == key)
+ continue;
+
+ key = iterk;
+ /*
+ * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
+ */
+ *((unsigned long *)&key->entries) += (unsigned long)iter;
+#ifdef CONFIG_MODULES
+ key->next = NULL;
+#endif
+ }
+ static_key_initialized = true;
+ jump_label_unlock();
+}
+
+#ifdef CONFIG_MODULES
+
+struct static_key_mod {
+ struct static_key_mod *next;
+ struct jump_entry *entries;
+ struct module *mod;
+};
+
+static int __jump_label_mod_text_reserved(void *start, void *end)
+{
+ struct module *mod;
+
+ mod = __module_text_address((unsigned long)start);
+ if (!mod)
+ return 0;
+
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+
+ return __jump_label_text_reserved(mod->jump_entries,
+ mod->jump_entries + mod->num_jump_entries,
+ start, end);
+}
+
+static void __jump_label_mod_update(struct static_key *key, int enable)
+{
+ struct static_key_mod *mod = key->next;
+
+ while (mod) {
+ struct module *m = mod->mod;
+
+ __jump_label_update(key, mod->entries,
+ m->jump_entries + m->num_jump_entries,
+ enable);
+ mod = mod->next;
+ }
+}
+
+/***
+ * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
+ * @mod: module to patch
+ *
+ * Allow for run-time selection of the optimal nops. Before the module
+ * loads patch these with arch_get_jump_label_nop(), which is specified by
+ * the arch specific jump label code.
+ */
+void jump_label_apply_nops(struct module *mod)
+{
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
+
+ /* if the module doesn't have jump label entries, just return */
+ if (iter_start == iter_stop)
+ return;
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+ }
+}
+
+static int jump_label_add_module(struct module *mod)
+{
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
+ struct static_key *key = NULL;
+ struct static_key_mod *jlm;
+
+ /* if the module doesn't have jump label entries, just return */
+ if (iter_start == iter_stop)
+ return 0;
+
+ jump_label_sort_entries(iter_start, iter_stop);
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ struct static_key *iterk;
+
+ iterk = (struct static_key *)(unsigned long)iter->key;
+ if (iterk == key)
+ continue;
+
+ key = iterk;
+ if (__module_address(iter->key) == mod) {
+ /*
+ * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
+ */
+ *((unsigned long *)&key->entries) += (unsigned long)iter;
+ key->next = NULL;
+ continue;
+ }
+ jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
+ if (!jlm)
+ return -ENOMEM;
+ jlm->mod = mod;
+ jlm->entries = iter;
+ jlm->next = key->next;
+ key->next = jlm;
+
+ if (jump_label_type(key) == JUMP_LABEL_ENABLE)
+ __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+ }
+
+ return 0;
+}
+
+static void jump_label_del_module(struct module *mod)
+{
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
+ struct static_key *key = NULL;
+ struct static_key_mod *jlm, **prev;
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ if (iter->key == (jump_label_t)(unsigned long)key)
+ continue;
+
+ key = (struct static_key *)(unsigned long)iter->key;
+
+ if (__module_address(iter->key) == mod)
+ continue;
+
+ prev = &key->next;
+ jlm = key->next;
+
+ while (jlm && jlm->mod != mod) {
+ prev = &jlm->next;
+ jlm = jlm->next;
+ }
+
+ if (jlm) {
+ *prev = jlm->next;
+ kfree(jlm);
+ }
+ }
+}
+
+static void jump_label_invalidate_module_init(struct module *mod)
+{
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ if (within_module_init(iter->code, mod))
+ iter->code = 0;
+ }
+}
+
+static int
+jump_label_module_notify(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ struct module *mod = data;
+ int ret = 0;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ jump_label_lock();
+ ret = jump_label_add_module(mod);
+ if (ret)
+ jump_label_del_module(mod);
+ jump_label_unlock();
+ break;
+ case MODULE_STATE_GOING:
+ jump_label_lock();
+ jump_label_del_module(mod);
+ jump_label_unlock();
+ break;
+ case MODULE_STATE_LIVE:
+ jump_label_lock();
+ jump_label_invalidate_module_init(mod);
+ jump_label_unlock();
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+struct notifier_block jump_label_module_nb = {
+ .notifier_call = jump_label_module_notify,
+ .priority = 1, /* higher than tracepoints */
+};
+
+static __init int jump_label_init_module(void)
+{
+ return register_module_notifier(&jump_label_module_nb);
+}
+early_initcall(jump_label_init_module);
+
+#endif /* CONFIG_MODULES */
+
+/***
+ * jump_label_text_reserved - check if addr range is reserved
+ * @start: start text addr
+ * @end: end text addr
+ *
+ * checks if the text addr located between @start and @end
+ * overlaps with any of the jump label patch addresses. Code
+ * that wants to modify kernel text should first verify that
+ * it does not overlap with any of the jump label addresses.
+ * Caller must hold jump_label_mutex.
+ *
+ * returns 1 if there is an overlap, 0 otherwise
+ */
+int jump_label_text_reserved(void *start, void *end)
+{
+ int ret = __jump_label_text_reserved(__start___jump_table,
+ __stop___jump_table, start, end);
+
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_MODULES
+ ret = __jump_label_mod_text_reserved(start, end);
+#endif
+ return ret;
+}
+
+static void jump_label_update(struct static_key *key, int enable)
+{
+ struct jump_entry *stop = __stop___jump_table;
+ struct jump_entry *entry = jump_label_get_entries(key);
+
+#ifdef CONFIG_MODULES
+ struct module *mod = __module_address((unsigned long)key);
+
+ __jump_label_mod_update(key, enable);
+
+ if (mod)
+ stop = mod->jump_entries + mod->num_jump_entries;
+#endif
+ /* if there are no users, entry can be NULL */
+ if (entry)
+ __jump_label_update(key, entry, stop, enable);
+}
+
+#endif
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
new file mode 100644
index 000000000..5c5987f10
--- /dev/null
+++ b/kernel/kallsyms.c
@@ -0,0 +1,608 @@
+/*
+ * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
+ *
+ * Rewritten and vastly simplified by Rusty Russell for in-kernel
+ * module loader:
+ * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ *
+ * ChangeLog:
+ *
+ * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com>
+ * Changed the compression method from stem compression to "table lookup"
+ * compression (see scripts/kallsyms.c for a more complete description)
+ */
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/kdb.h>
+#include <linux/err.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h> /* for cond_resched */
+#include <linux/mm.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/compiler.h>
+
+#include <asm/sections.h>
+
+#ifdef CONFIG_KALLSYMS_ALL
+#define all_var 1
+#else
+#define all_var 0
+#endif
+
+/*
+ * These will be re-linked against their real values
+ * during the second link stage.
+ */
+extern const unsigned long kallsyms_addresses[] __weak;
+extern const u8 kallsyms_names[] __weak;
+
+/*
+ * Tell the compiler that the count isn't in the small data section if the arch
+ * has one (eg: FRV).
+ */
+extern const unsigned long kallsyms_num_syms
+__attribute__((weak, section(".rodata")));
+
+extern const u8 kallsyms_token_table[] __weak;
+extern const u16 kallsyms_token_index[] __weak;
+
+extern const unsigned long kallsyms_markers[] __weak;
+
+static inline int is_kernel_inittext(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sinittext
+ && addr <= (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+ if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
+ arch_is_kernel_text(addr))
+ return 1;
+ return in_gate_area_no_mm(addr);
+}
+
+static inline int is_kernel(unsigned long addr)
+{
+ if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
+ return 1;
+ return in_gate_area_no_mm(addr);
+}
+
+static int is_ksym_addr(unsigned long addr)
+{
+ if (all_var)
+ return is_kernel(addr);
+
+ return is_kernel_text(addr) || is_kernel_inittext(addr);
+}
+
+/*
+ * Expand a compressed symbol data into the resulting uncompressed string,
+ * if uncompressed string is too long (>= maxlen), it will be truncated,
+ * given the offset to where the symbol is in the compressed stream.
+ */
+static unsigned int kallsyms_expand_symbol(unsigned int off,
+ char *result, size_t maxlen)
+{
+ int len, skipped_first = 0;
+ const u8 *tptr, *data;
+
+ /* Get the compressed symbol length from the first symbol byte. */
+ data = &kallsyms_names[off];
+ len = *data;
+ data++;
+
+ /*
+ * Update the offset to return the offset for the next symbol on
+ * the compressed stream.
+ */
+ off += len + 1;
+
+ /*
+ * For every byte on the compressed symbol data, copy the table
+ * entry for that byte.
+ */
+ while (len) {
+ tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
+ data++;
+ len--;
+
+ while (*tptr) {
+ if (skipped_first) {
+ if (maxlen <= 1)
+ goto tail;
+ *result = *tptr;
+ result++;
+ maxlen--;
+ } else
+ skipped_first = 1;
+ tptr++;
+ }
+ }
+
+tail:
+ if (maxlen)
+ *result = '\0';
+
+ /* Return to offset to the next symbol. */
+ return off;
+}
+
+/*
+ * Get symbol type information. This is encoded as a single char at the
+ * beginning of the symbol name.
+ */
+static char kallsyms_get_symbol_type(unsigned int off)
+{
+ /*
+ * Get just the first code, look it up in the token table,
+ * and return the first char from this token.
+ */
+ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
+}
+
+
+/*
+ * Find the offset on the compressed stream given and index in the
+ * kallsyms array.
+ */
+static unsigned int get_symbol_offset(unsigned long pos)
+{
+ const u8 *name;
+ int i;
+
+ /*
+ * Use the closest marker we have. We have markers every 256 positions,
+ * so that should be close enough.
+ */
+ name = &kallsyms_names[kallsyms_markers[pos >> 8]];
+
+ /*
+ * Sequentially scan all the symbols up to the point we're searching
+ * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
+ * so we just need to add the len to the current pointer for every
+ * symbol we wish to skip.
+ */
+ for (i = 0; i < (pos & 0xFF); i++)
+ name = name + (*name) + 1;
+
+ return name - kallsyms_names;
+}
+
+/* Lookup the address for this symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name)
+{
+ char namebuf[KSYM_NAME_LEN];
+ unsigned long i;
+ unsigned int off;
+
+ for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+
+ if (strcmp(namebuf, name) == 0)
+ return kallsyms_addresses[i];
+ }
+ return module_kallsyms_lookup_name(name);
+}
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
+
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+ unsigned long),
+ void *data)
+{
+ char namebuf[KSYM_NAME_LEN];
+ unsigned long i;
+ unsigned int off;
+ int ret;
+
+ for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+ if (ret != 0)
+ return ret;
+ }
+ return module_kallsyms_on_each_symbol(fn, data);
+}
+EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
+
+static unsigned long get_symbol_pos(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset)
+{
+ unsigned long symbol_start = 0, symbol_end = 0;
+ unsigned long i, low, high, mid;
+
+ /* This kernel should never had been booted. */
+ BUG_ON(!kallsyms_addresses);
+
+ /* Do a binary search on the sorted kallsyms_addresses array. */
+ low = 0;
+ high = kallsyms_num_syms;
+
+ while (high - low > 1) {
+ mid = low + (high - low) / 2;
+ if (kallsyms_addresses[mid] <= addr)
+ low = mid;
+ else
+ high = mid;
+ }
+
+ /*
+ * Search for the first aliased symbol. Aliased
+ * symbols are symbols with the same address.
+ */
+ while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+ --low;
+
+ symbol_start = kallsyms_addresses[low];
+
+ /* Search for next non-aliased symbol. */
+ for (i = low + 1; i < kallsyms_num_syms; i++) {
+ if (kallsyms_addresses[i] > symbol_start) {
+ symbol_end = kallsyms_addresses[i];
+ break;
+ }
+ }
+
+ /* If we found no next symbol, we use the end of the section. */
+ if (!symbol_end) {
+ if (is_kernel_inittext(addr))
+ symbol_end = (unsigned long)_einittext;
+ else if (all_var)
+ symbol_end = (unsigned long)_end;
+ else
+ symbol_end = (unsigned long)_etext;
+ }
+
+ if (symbolsize)
+ *symbolsize = symbol_end - symbol_start;
+ if (offset)
+ *offset = addr - symbol_start;
+
+ return low;
+}
+
+/*
+ * Lookup an address but don't bother to find any names.
+ */
+int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
+ unsigned long *offset)
+{
+ char namebuf[KSYM_NAME_LEN];
+ if (is_ksym_addr(addr))
+ return !!get_symbol_pos(addr, symbolsize, offset);
+
+ return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
+}
+
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if.
+ * It resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
+ */
+const char *kallsyms_lookup(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset,
+ char **modname, char *namebuf)
+{
+ namebuf[KSYM_NAME_LEN - 1] = 0;
+ namebuf[0] = 0;
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, symbolsize, offset);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ namebuf, KSYM_NAME_LEN);
+ if (modname)
+ *modname = NULL;
+ return namebuf;
+ }
+
+ /* See if it's in a module. */
+ return module_address_lookup(addr, symbolsize, offset, modname,
+ namebuf);
+}
+
+int lookup_symbol_name(unsigned long addr, char *symname)
+{
+ symname[0] = '\0';
+ symname[KSYM_NAME_LEN - 1] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, NULL, NULL);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ symname, KSYM_NAME_LEN);
+ return 0;
+ }
+ /* See if it's in a module. */
+ return lookup_module_symbol_name(addr, symname);
+}
+
+int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ name[0] = '\0';
+ name[KSYM_NAME_LEN - 1] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, size, offset);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ name, KSYM_NAME_LEN);
+ modname[0] = '\0';
+ return 0;
+ }
+ /* See if it's in a module. */
+ return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+}
+
+/* Look up a kernel symbol and return it in a text buffer. */
+static int __sprint_symbol(char *buffer, unsigned long address,
+ int symbol_offset, int add_offset)
+{
+ char *modname;
+ const char *name;
+ unsigned long offset, size;
+ int len;
+
+ address += symbol_offset;
+ name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
+ if (!name)
+ return sprintf(buffer, "0x%lx", address - symbol_offset);
+
+ if (name != buffer)
+ strcpy(buffer, name);
+ len = strlen(buffer);
+ offset -= symbol_offset;
+
+ if (add_offset)
+ len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
+
+ if (modname)
+ len += sprintf(buffer + len, " [%s]", modname);
+
+ return len;
+}
+
+/**
+ * sprint_symbol - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size and module name to @buffer if possible. If no symbol was found,
+ * just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0, 1);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol);
+
+/**
+ * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name
+ * and module name to @buffer if possible. If no symbol was found, just saves
+ * its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_no_offset(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0, 0);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
+
+/**
+ * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, -1, 1);
+}
+
+/* Look up a kernel symbol and print it to the kernel messages. */
+void __print_symbol(const char *fmt, unsigned long address)
+{
+ char buffer[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(buffer, address);
+
+ printk(fmt, buffer);
+}
+EXPORT_SYMBOL(__print_symbol);
+
+/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
+struct kallsym_iter {
+ loff_t pos;
+ unsigned long value;
+ unsigned int nameoff; /* If iterating in core kernel symbols. */
+ char type;
+ char name[KSYM_NAME_LEN];
+ char module_name[MODULE_NAME_LEN];
+ int exported;
+};
+
+static int get_ksymbol_mod(struct kallsym_iter *iter)
+{
+ if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
+ &iter->type, iter->name, iter->module_name,
+ &iter->exported) < 0)
+ return 0;
+ return 1;
+}
+
+/* Returns space to next name. */
+static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
+{
+ unsigned off = iter->nameoff;
+
+ iter->module_name[0] = '\0';
+ iter->value = kallsyms_addresses[iter->pos];
+
+ iter->type = kallsyms_get_symbol_type(off);
+
+ off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
+
+ return off - iter->nameoff;
+}
+
+static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
+{
+ iter->name[0] = '\0';
+ iter->nameoff = get_symbol_offset(new_pos);
+ iter->pos = new_pos;
+}
+
+/* Returns false if pos at or past end of file. */
+static int update_iter(struct kallsym_iter *iter, loff_t pos)
+{
+ /* Module symbols can be accessed randomly. */
+ if (pos >= kallsyms_num_syms) {
+ iter->pos = pos;
+ return get_ksymbol_mod(iter);
+ }
+
+ /* If we're not on the desired position, reset to new position. */
+ if (pos != iter->pos)
+ reset_iter(iter, pos);
+
+ iter->nameoff += get_ksymbol_core(iter);
+ iter->pos++;
+
+ return 1;
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ (*pos)++;
+
+ if (!update_iter(m->private, *pos))
+ return NULL;
+ return p;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ if (!update_iter(m->private, *pos))
+ return NULL;
+ return m->private;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct kallsym_iter *iter = m->private;
+
+ /* Some debugging symbols have no name. Ignore them. */
+ if (!iter->name[0])
+ return 0;
+
+ if (iter->module_name[0]) {
+ char type;
+
+ /*
+ * Label it "global" if it is exported,
+ * "local" if not exported.
+ */
+ type = iter->exported ? toupper(iter->type) :
+ tolower(iter->type);
+ seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ type, iter->name, iter->module_name);
+ } else
+ seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ iter->type, iter->name);
+ return 0;
+}
+
+static const struct seq_operations kallsyms_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show
+};
+
+static int kallsyms_open(struct inode *inode, struct file *file)
+{
+ /*
+ * We keep iterator in m->private, since normal case is to
+ * s_start from where we left off, so we avoid doing
+ * using get_symbol_offset for every symbol.
+ */
+ struct kallsym_iter *iter;
+ iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+ reset_iter(iter, 0);
+
+ return 0;
+}
+
+#ifdef CONFIG_KGDB_KDB
+const char *kdb_walk_kallsyms(loff_t *pos)
+{
+ static struct kallsym_iter kdb_walk_kallsyms_iter;
+ if (*pos == 0) {
+ memset(&kdb_walk_kallsyms_iter, 0,
+ sizeof(kdb_walk_kallsyms_iter));
+ reset_iter(&kdb_walk_kallsyms_iter, 0);
+ }
+ while (1) {
+ if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
+ return NULL;
+ ++*pos;
+ /* Some debugging symbols have no name. Ignore them. */
+ if (kdb_walk_kallsyms_iter.name[0])
+ return kdb_walk_kallsyms_iter.name;
+ }
+}
+#endif /* CONFIG_KGDB_KDB */
+
+static const struct file_operations kallsyms_operations = {
+ .open = kallsyms_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init kallsyms_init(void)
+{
+ proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
+ return 0;
+}
+device_initcall(kallsyms_init);
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
new file mode 100644
index 000000000..0aa69ea1d
--- /dev/null
+++ b/kernel/kcmp.c
@@ -0,0 +1,198 @@
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fdtable.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/cache.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/kcmp.h>
+
+#include <asm/unistd.h>
+
+/*
+ * We don't expose the real in-memory order of objects for security reasons.
+ * But still the comparison results should be suitable for sorting. So we
+ * obfuscate kernel pointers values and compare the production instead.
+ *
+ * The obfuscation is done in two steps. First we xor the kernel pointer with
+ * a random value, which puts pointer into a new position in a reordered space.
+ * Secondly we multiply the xor production with a large odd random number to
+ * permute its bits even more (the odd multiplier guarantees that the product
+ * is unique ever after the high bits are truncated, since any odd number is
+ * relative prime to 2^n).
+ *
+ * Note also that the obfuscation itself is invisible to userspace and if needed
+ * it can be changed to an alternate scheme.
+ */
+static unsigned long cookies[KCMP_TYPES][2] __read_mostly;
+
+static long kptr_obfuscate(long v, int type)
+{
+ return (v ^ cookies[type][0]) * cookies[type][1];
+}
+
+/*
+ * 0 - equal, i.e. v1 = v2
+ * 1 - less than, i.e. v1 < v2
+ * 2 - greater than, i.e. v1 > v2
+ * 3 - not equal but ordering unavailable (reserved for future)
+ */
+static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
+{
+ long t1, t2;
+
+ t1 = kptr_obfuscate((long)v1, type);
+ t2 = kptr_obfuscate((long)v2, type);
+
+ return (t1 < t2) | ((t1 > t2) << 1);
+}
+
+/* The caller must have pinned the task */
+static struct file *
+get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+{
+ struct file *file = NULL;
+
+ task_lock(task);
+ rcu_read_lock();
+
+ if (task->files)
+ file = fcheck_files(task->files, idx);
+
+ rcu_read_unlock();
+ task_unlock(task);
+
+ return file;
+}
+
+static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+{
+ if (likely(m2 != m1))
+ mutex_unlock(m2);
+ mutex_unlock(m1);
+}
+
+static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+{
+ int err;
+
+ if (m2 > m1)
+ swap(m1, m2);
+
+ err = mutex_lock_killable(m1);
+ if (!err && likely(m1 != m2)) {
+ err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+ if (err)
+ mutex_unlock(m1);
+ }
+
+ return err;
+}
+
+SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
+ unsigned long, idx1, unsigned long, idx2)
+{
+ struct task_struct *task1, *task2;
+ int ret;
+
+ rcu_read_lock();
+
+ /*
+ * Tasks are looked up in caller's PID namespace only.
+ */
+ task1 = find_task_by_vpid(pid1);
+ task2 = find_task_by_vpid(pid2);
+ if (!task1 || !task2)
+ goto err_no_task;
+
+ get_task_struct(task1);
+ get_task_struct(task2);
+
+ rcu_read_unlock();
+
+ /*
+ * One should have enough rights to inspect task details.
+ */
+ ret = kcmp_lock(&task1->signal->cred_guard_mutex,
+ &task2->signal->cred_guard_mutex);
+ if (ret)
+ goto err;
+ if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
+ !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+ ret = -EPERM;
+ goto err_unlock;
+ }
+
+ switch (type) {
+ case KCMP_FILE: {
+ struct file *filp1, *filp2;
+
+ filp1 = get_file_raw_ptr(task1, idx1);
+ filp2 = get_file_raw_ptr(task2, idx2);
+
+ if (filp1 && filp2)
+ ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
+ else
+ ret = -EBADF;
+ break;
+ }
+ case KCMP_VM:
+ ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
+ break;
+ case KCMP_FILES:
+ ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
+ break;
+ case KCMP_FS:
+ ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
+ break;
+ case KCMP_SIGHAND:
+ ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
+ break;
+ case KCMP_IO:
+ ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
+ break;
+ case KCMP_SYSVSEM:
+#ifdef CONFIG_SYSVIPC
+ ret = kcmp_ptr(task1->sysvsem.undo_list,
+ task2->sysvsem.undo_list,
+ KCMP_SYSVSEM);
+#else
+ ret = -EOPNOTSUPP;
+#endif
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+err_unlock:
+ kcmp_unlock(&task1->signal->cred_guard_mutex,
+ &task2->signal->cred_guard_mutex);
+err:
+ put_task_struct(task1);
+ put_task_struct(task2);
+
+ return ret;
+
+err_no_task:
+ rcu_read_unlock();
+ return -ESRCH;
+}
+
+static __init int kcmp_cookies_init(void)
+{
+ int i;
+
+ get_random_bytes(cookies, sizeof(cookies));
+
+ for (i = 0; i < KCMP_TYPES; i++)
+ cookies[i][1] |= (~(~0UL >> 1) | 1);
+
+ return 0;
+}
+arch_initcall(kcmp_cookies_init);
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000..7a36fdcca
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,2769 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+#ifdef CONFIG_KEXEC_FILE
+static int kexec_calculate_store_digests(struct kimage *image);
+#endif
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
+
+static int copy_user_segment_list(struct kimage *image,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
+{
+ int ret;
+ size_t segment_bytes;
+
+ /* Read in the segments */
+ image->nr_segments = nr_segments;
+ segment_bytes = nr_segments * sizeof(*segments);
+ ret = copy_from_user(image->segment, segments, segment_bytes);
+ if (ret)
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ *
+ * Since the kernel does everything in page size chunks ensure
+ * the destination addresses are page aligned. Too many
+ * special cases crop of when we don't do this. The most
+ * insidious is getting overlapping destination addresses
+ * simply because addresses are changed to page size
+ * granularity.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+ return result;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ return result;
+ }
+
+ /* Verify our destination addresses do not overlap.
+ * If we alloed overlapping destination addresses
+ * through very weird things can happen with no
+ * easy explanation as one segment stops on another.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ unsigned long j;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ for (j = 0; j < i; j++) {
+ unsigned long pstart, pend;
+ pstart = image->segment[j].mem;
+ pend = pstart + image->segment[j].memsz;
+ /* Do the segments overlap ? */
+ if ((mend > pstart) && (mstart < pend))
+ return result;
+ }
+ }
+
+ /* Ensure our buffer sizes are strictly less than
+ * our memory sizes. This should always be the case,
+ * and it is easier to check up front than to be surprised
+ * later on.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ if (image->segment[i].bufsz > image->segment[i].memsz)
+ return result;
+ }
+
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+
+ return 0;
+}
+
+static struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
+}
+
+static void kimage_free_page_list(struct list_head *list);
+
+static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments,
+ unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+
+ if (kexec_on_panic) {
+ /* Verify we have a valid entry point */
+ if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Allocate and initialize a controlling structure */
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->start = entry;
+
+ ret = copy_user_segment_list(image, nr_segments, segments);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
+ /* Enable the special crash kernel control page allocation policy. */
+ if (kexec_on_panic) {
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ /*
+ * Find a location for the control code buffer, and add it
+ * the vector of segments so that it's pages will also be
+ * counted as destination pages.
+ */
+ ret = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_image;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
+ }
+
+ *rimage = image;
+ return 0;
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_image:
+ kfree(image);
+ return ret;
+}
+
+#ifdef CONFIG_KEXEC_FILE
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+ struct fd f = fdget(fd);
+ int ret;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
+
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if (ret)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *buf = vmalloc(stat.size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(*buf);
+ ret = bytes;
+ goto out;
+ }
+
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+
+ if (pos != stat.size) {
+ ret = -EBADF;
+ vfree(*buf);
+ goto out;
+ }
+
+ *buf_len = pos;
+out:
+ fdput(f);
+ return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by kernel, initrd, and command line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+ * been loaded into separate segments and have been copied elsewhere.
+ */
+static void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
+ /*
+ * Above call should have called into bootloader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be ok now to free it up.
+ */
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that there no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
+ }
+
+ *rimage = image;
+ return 0;
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
+ kfree(image);
+ return ret;
+}
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend))
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *pages;
+
+ pages = alloc_pages(gfp_mask, order);
+ if (pages) {
+ unsigned int count, i;
+ pages->mapping = NULL;
+ set_page_private(pages, order);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ SetPageReserved(pages + i);
+ }
+
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+
+ order = page_private(page);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+ struct list_head *pos, *next;
+
+ list_for_each_safe(pos, next, list) {
+ struct page *page;
+
+ page = list_entry(pos, struct page, lru);
+ list_del(&page->lru);
+ kimage_free_pages(page);
+ }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in it's destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everything to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * Control pages are also the only pags we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ break;
+ }
+ }
+ if (pages)
+ image->control_page = hole_end;
+
+ return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ struct page *pages = NULL;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ pages = kimage_alloc_normal_control_pages(image, order);
+ break;
+ case KEXEC_TYPE_CRASH:
+ pages = kimage_alloc_crash_control_pages(image, order);
+ break;
+ }
+
+ return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+
+ return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
+{
+ int result;
+
+ destination &= PAGE_MASK;
+ result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+ return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ int result;
+
+ page &= PAGE_MASK;
+ result = kimage_add_entry(image, page | IND_SOURCE);
+
+ return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unusable pages I have cached */
+ kimage_free_page_list(&image->unusable_pages);
+
+}
+static void kimage_terminate(struct kimage *image)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION) ? \
+ phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ } else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION)
+ destination = entry & PAGE_MASK;
+ else if (entry & IND_SOURCE) {
+ if (page == destination)
+ return ptr;
+ destination += PAGE_SIZE;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will not occur is trivial, and the
+ * implementation is simply to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return NULL;
+ /* If the page cannot be used file it away */
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unusable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
+ break;
+
+ /*
+ * I know that the page is someones destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it if it's
+ * gfp_flags honor the ones passed in.
+ */
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
+ addr = old_addr;
+ page = old_page;
+ break;
+ } else {
+ /* Place the page on the destination list I
+ * will use it later.
+ */
+ list_add(&page->lru, &image->dest_pages);
+ }
+ }
+
+ return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap(page);
+ /* Start with a clear page */
+ clear_page(ptr);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ /* For crash dumps kernels we simply copy the data from
+ * user space to it's destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = pfn_to_page(maddr >> PAGE_SHIFT);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ ptr = kmap(page);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+static int kimage_load_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ int result = -ENOMEM;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ result = kimage_load_normal_segment(image, segment);
+ break;
+ case KEXEC_TYPE_CRASH:
+ result = kimage_load_crash_segment(image, segment);
+ break;
+ }
+
+ return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ * address space, and very carefully places the data in the
+ * allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ * the devices to shut down. Preventing on-going dmas, and placing
+ * the devices in a consistent state so a later kernel can
+ * reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ * and then copies the image to it's final destination. And
+ * jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+static DEFINE_MUTEX(kexec_mutex);
+
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+ struct kexec_segment __user *, segments, unsigned long, flags)
+{
+ struct kimage **dest_image, *image;
+ int result;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /*
+ * Verify we have a legal set of flags
+ * This leaves us room for future extensions.
+ */
+ if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+ return -EINVAL;
+
+ /* Verify we are on the appropriate architecture */
+ if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+ ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+ return -EINVAL;
+
+ /* Put an artificial cap on the number
+ * of segments passed to kexec_load.
+ */
+ if (nr_segments > KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ image = NULL;
+ result = 0;
+
+ /* Because we write directly to the reserved memory
+ * region when loading crash kernels we need a mutex here to
+ * prevent multiple crash kernels from attempting to load
+ * simultaneously, and to prevent a crash kernel from loading
+ * over the top of a in use crash kernel.
+ *
+ * KISS: always take the mutex.
+ */
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_ON_CRASH)
+ dest_image = &kexec_crash_image;
+ if (nr_segments > 0) {
+ unsigned long i;
+
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
+ * we corrupt it.
+ */
+
+ kimage_free(xchg(&kexec_crash_image, NULL));
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
+ crash_map_reserved_pages();
+ } else {
+ /* Loading another kernel to reboot into. */
+
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
+ }
+ if (result)
+ goto out;
+
+ if (flags & KEXEC_PRESERVE_CONTEXT)
+ image->preserve_context = 1;
+ result = machine_kexec_prepare(image);
+ if (result)
+ goto out;
+
+ for (i = 0; i < nr_segments; i++) {
+ result = kimage_load_segment(image, &image->segment[i]);
+ if (result)
+ goto out;
+ }
+ kimage_terminate(image);
+ if (flags & KEXEC_ON_CRASH)
+ crash_unmap_reserved_pages();
+ }
+ /* Install the new kernel, and Uninstall the old */
+ image = xchg(dest_image, image);
+
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+
+ return result;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+ compat_ulong_t, nr_segments,
+ struct compat_kexec_segment __user *, segments,
+ compat_ulong_t, flags)
+{
+ struct compat_kexec_segment in;
+ struct kexec_segment out, __user *ksegments;
+ unsigned long i, result;
+
+ /* Don't allow clients that don't understand the native
+ * architecture to do anything.
+ */
+ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+ return -EINVAL;
+
+ if (nr_segments > KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+ for (i = 0; i < nr_segments; i++) {
+ result = copy_from_user(&in, &segments[i], sizeof(in));
+ if (result)
+ return -EFAULT;
+
+ out.buf = compat_ptr(in.buf);
+ out.bufsz = in.bufsz;
+ out.mem = in.mem;
+ out.memsz = in.memsz;
+
+ result = copy_to_user(&ksegments[i], &out, sizeof(out));
+ if (result)
+ return -EFAULT;
+ }
+
+ return sys_kexec_load(entry, nr_segments, ksegments, flags);
+}
+#endif
+
+#ifdef CONFIG_KEXEC_FILE
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, new kernel gets loaded in reserved region. It is
+ * same memory where old crash kernel might be loaded. Free any
+ * current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
+
+#endif /* CONFIG_KEXEC_FILE */
+
+void crash_kexec(struct pt_regs *regs)
+{
+ /* Take the kexec_mutex here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
+ }
+ mutex_unlock(&kexec_mutex);
+ }
+}
+
+size_t crash_get_memory_size(void)
+{
+ size_t size = 0;
+ mutex_lock(&kexec_mutex);
+ if (crashk_res.end != crashk_res.start)
+ size = resource_size(&crashk_res);
+ mutex_unlock(&kexec_mutex);
+ return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+ unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+ int ret = 0;
+ unsigned long start, end;
+ unsigned long old_size;
+ struct resource *ram_res;
+
+ mutex_lock(&kexec_mutex);
+
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ start = crashk_res.start;
+ end = crashk_res.end;
+ old_size = (end == 0) ? 0 : end - start + 1;
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+ end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+ crash_map_reserved_pages();
+ crash_free_reserved_phys_range(end, crashk_res.end);
+
+ if ((start == end) && (crashk_res.parent != NULL))
+ release_resource(&crashk_res);
+
+ ram_res->start = end;
+ ram_res->end = crashk_res.end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ ram_res->name = "System RAM";
+
+ crashk_res.end = end - 1;
+
+ insert_resource(&iomem_resource, ram_res);
+ crash_unmap_reserved_pages();
+
+unlock:
+ mutex_unlock(&kexec_mutex);
+ return ret;
+}
+
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+ size_t data_len)
+{
+ struct elf_note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = data_len;
+ note.n_type = type;
+ memcpy(buf, &note, sizeof(note));
+ buf += (sizeof(note) + 3)/4;
+ memcpy(buf, name, note.n_namesz);
+ buf += (note.n_namesz + 3)/4;
+ memcpy(buf, data, note.n_descsz);
+ buf += (note.n_descsz + 3)/4;
+
+ return buf;
+}
+
+static void final_note(u32 *buf)
+{
+ struct elf_note note;
+
+ note.n_namesz = 0;
+ note.n_descsz = 0;
+ note.n_type = 0;
+ memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.pr_pid = current->pid;
+ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ crash_notes = alloc_percpu(note_buf_t);
+ if (!crash_notes) {
+ pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, than we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= system_ram) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+ if (!ck_cmdline)
+ return -EINVAL;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+static void update_vmcoreinfo_note(void)
+{
+ u32 *buf = vmcoreinfo_note;
+
+ if (!vmcoreinfo_size)
+ return;
+ buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+ vmcoreinfo_size);
+ final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+ va_list args;
+ char buf[0x50];
+ size_t r;
+
+ va_start(args, fmt);
+ r = vscnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+ memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+ vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ log_buf_kexec_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+#ifdef CONFIG_KEXEC_FILE
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz;
+
+ do {
+ /* align down start */
+ temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ unsigned long sz = end - start + 1;
+
+ /* Returning 0 will take to next memory range */
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+ * Allocate memory top down with-in ram range. Otherwise bottom up
+ * allocation.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+ /* Currently adding segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add buffer after allocating
+ * control pages. All segments need to be placed first before
+ * any control pages are allocated. As control page allocation
+ * logic goes through list of segments to make sure there are
+ * no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = kbuf->buffer;
+ ksegment->bufsz = kbuf->bufsz;
+ ksegment->mem = kbuf->mem;
+ ksegment->memsz = kbuf->memsz;
+ image->nr_segments++;
+ *load_addr = ksegment->mem;
+ return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t desc_size, nullsz;
+ char *digest;
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out;
+ }
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions)
+ goto out_free_desc;
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto out_free_sha_regions;
+
+ digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_sha_regions;
+ }
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ ret = crypto_shash_update(desc, ksegment->kbuf,
+ ksegment->bufsz);
+ if (ret)
+ break;
+
+ /*
+ * Assume rest of the buffer is filled with zero and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ ret = crypto_shash_update(desc, zero_buf, bytes);
+ if (ret)
+ break;
+ nullsz -= bytes;
+ }
+
+ if (ret)
+ break;
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ if (!ret) {
+ ret = crypto_shash_final(desc, digest);
+ if (ret)
+ goto out_free_digest;
+ ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_digest;
+
+ ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+ if (ret)
+ goto out_free_digest;
+ }
+
+out_free_digest:
+ kfree(digest);
+out_free_sha_regions:
+ vfree(sha_regions);
+out_free_desc:
+ kfree(desc);
+out_free_tfm:
+ kfree(tfm);
+out:
+ return ret;
+}
+
+/* Actually load purgatory. Lot of code taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+ * sechdrs_c points to section headers in purgatory and are read
+ * only. No modifications allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+ * We can not modify sechdrs_c[] and its fields. It is read only.
+ * Copy it over to a local copy where one can store some temporary
+ * data and free it at the end. We need to modify ->sh_addr and
+ * ->sh_offset fields to keep track of permanent and temporary
+ * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+ * We seem to have multiple copies of sections. First copy is which
+ * is embedded in kernel in read only section. Some of these sections
+ * will be copied to a temporary buffer and relocated. And these
+ * sections will finally be copied to their final destination at
+ * segment load time.
+ *
+ * Use ->sh_offset to reflect section address in memory. It will
+ * point to original read only copy if section is not allocatable.
+ * Otherwise it will point to temporary copy which will be relocated.
+ *
+ * Use ->sh_addr to contain final address of the section where it
+ * will go during execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /* Allocate buffer for purgatory */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+ /* We already modifed ->sh_offset to keep src addr */
+ src = (char *) sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store load address and source address of section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+ * Used later to identify which section is purgatory and skip it
+ * from checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Shdr *sechdrs = pi->sechdrs;
+
+ /* Apply relocations */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ Elf_Shdr *section, *symtab;
+
+ if (sechdrs[i].sh_type != SHT_RELA &&
+ sechdrs[i].sh_type != SHT_REL)
+ continue;
+
+ /*
+ * For section of type SHT_RELA/SHT_REL,
+ * ->sh_link contains section header index of associated
+ * symbol table. And ->sh_info contains section header
+ * index of section to which relocations apply.
+ */
+ if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+ sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = &sechdrs[sechdrs[i].sh_info];
+ symtab = &sechdrs[sechdrs[i].sh_link];
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+ * symtab->sh_link contain section header index of associated
+ * string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+ * Respective architecture needs to provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (sechdrs[i].sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi->ehdr,
+ sechdrs, i);
+ else if (sechdrs[i].sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi->ehdr,
+ sechdrs, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down,
+ unsigned long *load_addr)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+ return -ENOEXEC;
+
+ pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+ if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+ || pi->ehdr->e_type != ET_REL
+ || !elf_check_arch(pi->ehdr)
+ || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (pi->ehdr->e_shoff >= kexec_purgatory_size
+ || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+ kexec_purgatory_size - pi->ehdr->e_shoff))
+ return -ENOEXEC;
+
+ ret = __kexec_load_purgatory(image, min, max, top_down);
+ if (ret)
+ return ret;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ *load_addr = pi->purgatory_load_addr;
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ vfree(pi->purgatory_buf);
+ return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ Elf_Sym *syms;
+ Elf_Shdr *sechdrs;
+ Elf_Ehdr *ehdr;
+ int i, k;
+ const char *strtab;
+
+ if (!pi->sechdrs || !pi->ehdr)
+ return NULL;
+
+ sechdrs = pi->sechdrs;
+ ehdr = pi->ehdr;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+ * Returns the address where symbol will finally be loaded after
+ * kexec_load_segment()
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ Elf_Sym *sym;
+ Elf_Shdr *sechdrs;
+ struct purgatory_info *pi = &image->purgatory_info;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sechdrs = pi->sechdrs;
+
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+ sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ lock_system_sleep();
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = dpm_suspend_start(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_end(). We *must* call
+ * dpm_suspend_end() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ {
+ kexec_in_progress = true;
+ kernel_restart_prepare(NULL);
+ migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_emerg("Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ syscore_resume();
+ Enable_irqs:
+ local_irq_enable();
+ Enable_cpus:
+ enable_nonboot_cpus();
+ dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+ dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ unlock_system_sleep();
+ }
+#endif
+
+ Unlock:
+ mutex_unlock(&kexec_mutex);
+ return error;
+}
diff --git a/kernel/kmod.c b/kernel/kmod.c
new file mode 100644
index 000000000..2777f40a9
--- /dev/null
+++ b/kernel/kmod.c
@@ -0,0 +1,694 @@
+/*
+ kmod, the new module loader (replaces kerneld)
+ Kirk Petersen
+
+ Reorganized not to be a daemon by Adam Richter, with guidance
+ from Greg Zornetzer.
+
+ Modified to avoid chroot and file sharing problems.
+ Mikael Pettersson
+
+ Limit the concurrent number of kmod modprobes to catch loops from
+ "modprobe needs a service that is in a module".
+ Keith Owens <kaos@ocs.com.au> December 1999
+
+ Unblock all signals when we exec a usermode process.
+ Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000
+
+ call_usermodehelper wait flag, and remove exec_usermodehelper.
+ Rusty Russell <rusty@rustcorp.com.au> Jan 2003
+*/
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <asm/uaccess.h>
+
+#include <trace/events/module.h>
+
+extern int max_threads;
+
+static struct workqueue_struct *khelper_wq;
+
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+static DECLARE_RWSEM(umhelper_sem);
+
+#ifdef CONFIG_MODULES
+
+/*
+ modprobe_path is set via /proc/sys.
+*/
+char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+
+static void free_modprobe_argv(struct subprocess_info *info)
+{
+ kfree(info->argv[3]); /* check call_modprobe() */
+ kfree(info->argv);
+}
+
+static int call_modprobe(char *module_name, int wait)
+{
+ struct subprocess_info *info;
+ static char *envp[] = {
+ "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+
+ char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+ if (!argv)
+ goto out;
+
+ module_name = kstrdup(module_name, GFP_KERNEL);
+ if (!module_name)
+ goto free_argv;
+
+ argv[0] = modprobe_path;
+ argv[1] = "-q";
+ argv[2] = "--";
+ argv[3] = module_name; /* check free_modprobe_argv() */
+ argv[4] = NULL;
+
+ info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
+ NULL, free_modprobe_argv, NULL);
+ if (!info)
+ goto free_module_name;
+
+ return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+
+free_module_name:
+ kfree(module_name);
+free_argv:
+ kfree(argv);
+out:
+ return -ENOMEM;
+}
+
+/**
+ * __request_module - try to load a kernel module
+ * @wait: wait (or not) for the operation to complete
+ * @fmt: printf style format string for the name of the module
+ * @...: arguments as specified in the format string
+ *
+ * Load a module using the user mode module loader. The function returns
+ * zero on success or a negative errno code on failure. Note that a
+ * successful module load does not mean the module did not then unload
+ * and exit on an error of its own. Callers must check that the service
+ * they requested is now available not blindly invoke it.
+ *
+ * If module auto-loading support is disabled then this function
+ * becomes a no-operation.
+ */
+int __request_module(bool wait, const char *fmt, ...)
+{
+ va_list args;
+ char module_name[MODULE_NAME_LEN];
+ unsigned int max_modprobes;
+ int ret;
+ static atomic_t kmod_concurrent = ATOMIC_INIT(0);
+#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
+ static int kmod_loop_msg;
+
+ /*
+ * We don't allow synchronous module loading from async. Module
+ * init may invoke async_synchronize_full() which will end up
+ * waiting for this task which already is waiting for the module
+ * loading to complete, leading to a deadlock.
+ */
+ WARN_ON_ONCE(wait && current_is_async());
+
+ if (!modprobe_path[0])
+ return 0;
+
+ va_start(args, fmt);
+ ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ va_end(args);
+ if (ret >= MODULE_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ret = security_kernel_module_request(module_name);
+ if (ret)
+ return ret;
+
+ /* If modprobe needs a service that is in a module, we get a recursive
+ * loop. Limit the number of running kmod threads to max_threads/2 or
+ * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
+ * would be to run the parents of this process, counting how many times
+ * kmod was invoked. That would mean accessing the internals of the
+ * process tables to get the command line, proc_pid_cmdline is static
+ * and it is not worth changing the proc code just to handle this case.
+ * KAO.
+ *
+ * "trace the ppid" is simple, but will fail if someone's
+ * parent exits. I think this is as good as it gets. --RR
+ */
+ max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
+ atomic_inc(&kmod_concurrent);
+ if (atomic_read(&kmod_concurrent) > max_modprobes) {
+ /* We may be blaming an innocent here, but unlikely */
+ if (kmod_loop_msg < 5) {
+ printk(KERN_ERR
+ "request_module: runaway loop modprobe %s\n",
+ module_name);
+ kmod_loop_msg++;
+ }
+ atomic_dec(&kmod_concurrent);
+ return -ENOMEM;
+ }
+
+ trace_module_request(module_name, wait, _RET_IP_);
+
+ ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+
+ atomic_dec(&kmod_concurrent);
+ return ret;
+}
+EXPORT_SYMBOL(__request_module);
+#endif /* CONFIG_MODULES */
+
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
+/*
+ * This is the task which runs the usermode application
+ */
+static int ____call_usermodehelper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ struct cred *new;
+ int retval;
+
+ spin_lock_irq(&current->sighand->siglock);
+ flush_signal_handlers(current, 1);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /* We can run anywhere, unlike our parent keventd(). */
+ set_cpus_allowed_ptr(current, cpu_all_mask);
+
+ /*
+ * Our parent is keventd, which runs with elevated scheduling priority.
+ * Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto out;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ if (sub_info->init) {
+ retval = sub_info->init(sub_info, new);
+ if (retval) {
+ abort_creds(new);
+ goto out;
+ }
+ }
+
+ commit_creds(new);
+
+ retval = do_execve(getname_kernel(sub_info->path),
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+ if (!(sub_info->wait & UMH_WAIT_PROC))
+ umh_complete(sub_info);
+ if (!retval)
+ return 0;
+ do_exit(0);
+}
+
+/* Keventd can't block, but this (a child) can. */
+static int wait_for_helper(void *data)
+{
+ struct subprocess_info *sub_info = data;
+ pid_t pid;
+
+ /* If SIGCLD is ignored sys_wait4 won't populate the status. */
+ kernel_sigaction(SIGCHLD, SIG_DFL);
+ pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ } else {
+ int ret = -ECHILD;
+ /*
+ * Normally it is bogus to call wait4() from in-kernel because
+ * wait4() wants to write the exit code to a userspace address.
+ * But wait_for_helper() always runs as keventd, and put_user()
+ * to a kernel address works OK for kernel threads, due to their
+ * having an mm_segment_t which spans the entire address space.
+ *
+ * Thus the __user pointer cast is valid here.
+ */
+ sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+ /*
+ * If ret is 0, either ____call_usermodehelper failed and the
+ * real error code is already in sub_info->retval or
+ * sub_info->retval is 0 anyway, so don't mess with it then.
+ */
+ if (ret)
+ sub_info->retval = ret;
+ }
+
+ umh_complete(sub_info);
+ do_exit(0);
+}
+
+/* This is run by khelper thread */
+static void __call_usermodehelper(struct work_struct *work)
+{
+ struct subprocess_info *sub_info =
+ container_of(work, struct subprocess_info, work);
+ pid_t pid;
+
+ if (sub_info->wait & UMH_WAIT_PROC)
+ pid = kernel_thread(wait_for_helper, sub_info,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
+ else
+ pid = kernel_thread(____call_usermodehelper, sub_info,
+ SIGCHLD);
+
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
+}
+
+/*
+ * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
+ * (used for preventing user land processes from being created after the user
+ * land has been frozen during a system-wide hibernation or suspend operation).
+ * Should always be manipulated under umhelper_sem acquired for write.
+ */
+static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
+
+/* Number of helpers running */
+static atomic_t running_helpers = ATOMIC_INIT(0);
+
+/*
+ * Wait queue head used by usermodehelper_disable() to wait for all running
+ * helpers to finish.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
+
+/*
+ * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
+ * to become 'false'.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
+
+/*
+ * Time to wait for running_helpers to become zero before the setting of
+ * usermodehelper_disabled in usermodehelper_disable() fails
+ */
+#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
+
+int usermodehelper_read_trylock(void)
+{
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ if (usermodehelper_disabled == UMH_DISABLED)
+ ret = -EAGAIN;
+
+ up_read(&umhelper_sem);
+
+ if (ret)
+ break;
+
+ schedule();
+ try_to_freeze();
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
+
+long usermodehelper_read_lock_wait(long timeout)
+{
+ DEFINE_WAIT(wait);
+
+ if (timeout < 0)
+ return -EINVAL;
+
+ down_read(&umhelper_sem);
+ for (;;) {
+ prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!usermodehelper_disabled)
+ break;
+
+ up_read(&umhelper_sem);
+
+ timeout = schedule_timeout(timeout);
+ if (!timeout)
+ break;
+
+ down_read(&umhelper_sem);
+ }
+ finish_wait(&usermodehelper_disabled_waitq, &wait);
+ return timeout;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
+
+void usermodehelper_read_unlock(void)
+{
+ up_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
+
+/**
+ * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Change the value of usermodehelper_disabled (under umhelper_sem locked for
+ * writing) and wakeup tasks waiting for it to change.
+ */
+void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
+{
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ wake_up(&usermodehelper_disabled_waitq);
+ up_write(&umhelper_sem);
+}
+
+/**
+ * __usermodehelper_disable - Prevent new helpers from being started.
+ * @depth: New value to assign to usermodehelper_disabled.
+ *
+ * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
+ */
+int __usermodehelper_disable(enum umh_disable_depth depth)
+{
+ long retval;
+
+ if (!depth)
+ return -EINVAL;
+
+ down_write(&umhelper_sem);
+ usermodehelper_disabled = depth;
+ up_write(&umhelper_sem);
+
+ /*
+ * From now on call_usermodehelper_exec() won't start any new
+ * helpers, so it is sufficient if running_helpers turns out to
+ * be zero at one point (it may be increased later, but that
+ * doesn't matter).
+ */
+ retval = wait_event_timeout(running_helpers_waitq,
+ atomic_read(&running_helpers) == 0,
+ RUNNING_HELPERS_TIMEOUT);
+ if (retval)
+ return 0;
+
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+ return -EAGAIN;
+}
+
+static void helper_lock(void)
+{
+ atomic_inc(&running_helpers);
+ smp_mb__after_atomic();
+}
+
+static void helper_unlock(void)
+{
+ if (atomic_dec_and_test(&running_helpers))
+ wake_up(&running_helpers_waitq);
+}
+
+/**
+ * call_usermodehelper_setup - prepare to call a usermode helper
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
+ *
+ * Returns either %NULL on allocation failure, or a subprocess_info
+ * structure. This should be passed to call_usermodehelper_exec to
+ * exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is just before ethe subprocess_info is about to
+ * be freed. This can be used for freeing the argv and envp. The
+ * Function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
+ */
+struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
+{
+ struct subprocess_info *sub_info;
+ sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
+ if (!sub_info)
+ goto out;
+
+ INIT_WORK(&sub_info->work, __call_usermodehelper);
+ sub_info->path = path;
+ sub_info->argv = argv;
+ sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
+ out:
+ return sub_info;
+}
+EXPORT_SYMBOL(call_usermodehelper_setup);
+
+/**
+ * call_usermodehelper_exec - start a usermode application
+ * @sub_info: information about the subprocessa
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * Runs a user-space application. The application is started
+ * asynchronously if wait is not set, and runs as a child of keventd.
+ * (ie. it runs with full root capabilities).
+ */
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ int retval = 0;
+
+ if (!sub_info->path) {
+ call_usermodehelper_freeinfo(sub_info);
+ return -EINVAL;
+ }
+ helper_lock();
+ if (!khelper_wq || usermodehelper_disabled) {
+ retval = -EBUSY;
+ goto out;
+ }
+ /*
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
+ */
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
+ sub_info->wait = wait;
+
+ queue_work(khelper_wq, &sub_info->work);
+ if (wait == UMH_NO_WAIT) /* task has freed sub_info */
+ goto unlock;
+
+ if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_killable(&done);
+ if (!retval)
+ goto wait_done;
+
+ /* umh_complete() will see NULL and free sub_info */
+ if (xchg(&sub_info->complete, NULL))
+ goto unlock;
+ /* fallthrough, umh_complete() was already called */
+ }
+
+ wait_for_completion(&done);
+wait_done:
+ retval = sub_info->retval;
+out:
+ call_usermodehelper_freeinfo(sub_info);
+unlock:
+ helper_unlock();
+ return retval;
+}
+EXPORT_SYMBOL(call_usermodehelper_exec);
+
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * This function is the equivalent to use call_usermodehelper_setup() and
+ * call_usermodehelper_exec().
+ */
+int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+{
+ struct subprocess_info *info;
+ gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
+
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
+ if (info == NULL)
+ return -ENOMEM;
+
+ return call_usermodehelper_exec(info, wait);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write and array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
+
+void __init usermodehelper_init(void)
+{
+ khelper_wq = create_singlethread_workqueue("khelper");
+ BUG_ON(!khelper_wq);
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
new file mode 100644
index 000000000..c90e417bb
--- /dev/null
+++ b/kernel/kprobes.c
@@ -0,0 +1,2472 @@
+/*
+ * Kernel Probes (KProbes)
+ * kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation (includes suggestions from
+ * Rusty Russell).
+ * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
+ * hlists and exceptions notifier as suggested by Andi Kleen.
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
+ * exceptions notifier to be first on the priority list.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
+ */
+#include <linux/kprobes.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/export.h>
+#include <linux/moduleloader.h>
+#include <linux/kallsyms.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/sysctl.h>
+#include <linux/kdebug.h>
+#include <linux/memory.h>
+#include <linux/ftrace.h>
+#include <linux/cpu.h>
+#include <linux/jump_label.h>
+
+#include <asm-generic/sections.h>
+#include <asm/cacheflush.h>
+#include <asm/errno.h>
+#include <asm/uaccess.h>
+
+#define KPROBE_HASH_BITS 6
+#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
+
+
+/*
+ * Some oddball architectures like 64bit powerpc have function descriptors
+ * so this must be overridable.
+ */
+#ifndef kprobe_lookup_name
+#define kprobe_lookup_name(name, addr) \
+ addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
+#endif
+
+static int kprobes_initialized;
+static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
+
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobes_all_disarmed;
+
+/* This protects kprobe_table and optimizing_list */
+static DEFINE_MUTEX(kprobe_mutex);
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct {
+ raw_spinlock_t lock ____cacheline_aligned_in_smp;
+} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+
+static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+{
+ return &(kretprobe_table_locks[hash].lock);
+}
+
+/* Blacklist -- list of struct kprobe_blacklist_entry */
+static LIST_HEAD(kprobe_blacklist);
+
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+/*
+ * kprobe->ainsn.insn points to the copy of the instruction to be
+ * single-stepped. x86_64, POWER4 and above have no-exec support and
+ * stepping on the instruction on a vmalloced/kmalloced/data page
+ * is a recipe for disaster
+ */
+struct kprobe_insn_page {
+ struct list_head list;
+ kprobe_opcode_t *insns; /* Page of instruction slots */
+ struct kprobe_insn_cache *cache;
+ int nused;
+ int ngarbage;
+ char slot_used[];
+};
+
+#define KPROBE_INSN_PAGE_SIZE(slots) \
+ (offsetof(struct kprobe_insn_page, slot_used) + \
+ (sizeof(char) * (slots)))
+
+static int slots_per_page(struct kprobe_insn_cache *c)
+{
+ return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
+}
+
+enum kprobe_slot_state {
+ SLOT_CLEAN = 0,
+ SLOT_DIRTY = 1,
+ SLOT_USED = 2,
+};
+
+static void *alloc_insn_page(void)
+{
+ return module_alloc(PAGE_SIZE);
+}
+
+static void free_insn_page(void *page)
+{
+ module_memfree(page);
+}
+
+struct kprobe_insn_cache kprobe_insn_slots = {
+ .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
+ .alloc = alloc_insn_page,
+ .free = free_insn_page,
+ .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
+ .insn_size = MAX_INSN_SIZE,
+ .nr_garbage = 0,
+};
+static int collect_garbage_slots(struct kprobe_insn_cache *c);
+
+/**
+ * __get_insn_slot() - Find a slot on an executable page for an instruction.
+ * We allocate an executable page if there's no room on existing ones.
+ */
+kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
+{
+ struct kprobe_insn_page *kip;
+ kprobe_opcode_t *slot = NULL;
+
+ mutex_lock(&c->mutex);
+ retry:
+ list_for_each_entry(kip, &c->pages, list) {
+ if (kip->nused < slots_per_page(c)) {
+ int i;
+ for (i = 0; i < slots_per_page(c); i++) {
+ if (kip->slot_used[i] == SLOT_CLEAN) {
+ kip->slot_used[i] = SLOT_USED;
+ kip->nused++;
+ slot = kip->insns + (i * c->insn_size);
+ goto out;
+ }
+ }
+ /* kip->nused is broken. Fix it. */
+ kip->nused = slots_per_page(c);
+ WARN_ON(1);
+ }
+ }
+
+ /* If there are any garbage slots, collect it and try again. */
+ if (c->nr_garbage && collect_garbage_slots(c) == 0)
+ goto retry;
+
+ /* All out of space. Need to allocate a new page. */
+ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
+ if (!kip)
+ goto out;
+
+ /*
+ * Use module_alloc so this page is within +/- 2GB of where the
+ * kernel image and loaded module images reside. This is required
+ * so x86_64 can correctly handle the %rip-relative fixups.
+ */
+ kip->insns = c->alloc();
+ if (!kip->insns) {
+ kfree(kip);
+ goto out;
+ }
+ INIT_LIST_HEAD(&kip->list);
+ memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
+ kip->slot_used[0] = SLOT_USED;
+ kip->nused = 1;
+ kip->ngarbage = 0;
+ kip->cache = c;
+ list_add(&kip->list, &c->pages);
+ slot = kip->insns;
+out:
+ mutex_unlock(&c->mutex);
+ return slot;
+}
+
+/* Return 1 if all garbages are collected, otherwise 0. */
+static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+ kip->slot_used[idx] = SLOT_CLEAN;
+ kip->nused--;
+ if (kip->nused == 0) {
+ /*
+ * Page is no longer in use. Free it unless
+ * it's the last one. We keep the last one
+ * so as not to have to set it up again the
+ * next time somebody inserts a probe.
+ */
+ if (!list_is_singular(&kip->list)) {
+ list_del(&kip->list);
+ kip->cache->free(kip->insns);
+ kfree(kip);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int collect_garbage_slots(struct kprobe_insn_cache *c)
+{
+ struct kprobe_insn_page *kip, *next;
+
+ /* Ensure no-one is interrupted on the garbages */
+ synchronize_sched();
+
+ list_for_each_entry_safe(kip, next, &c->pages, list) {
+ int i;
+ if (kip->ngarbage == 0)
+ continue;
+ kip->ngarbage = 0; /* we will collect all garbages */
+ for (i = 0; i < slots_per_page(c); i++) {
+ if (kip->slot_used[i] == SLOT_DIRTY &&
+ collect_one_slot(kip, i))
+ break;
+ }
+ }
+ c->nr_garbage = 0;
+ return 0;
+}
+
+void __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
+{
+ struct kprobe_insn_page *kip;
+
+ mutex_lock(&c->mutex);
+ list_for_each_entry(kip, &c->pages, list) {
+ long idx = ((long)slot - (long)kip->insns) /
+ (c->insn_size * sizeof(kprobe_opcode_t));
+ if (idx >= 0 && idx < slots_per_page(c)) {
+ WARN_ON(kip->slot_used[idx] != SLOT_USED);
+ if (dirty) {
+ kip->slot_used[idx] = SLOT_DIRTY;
+ kip->ngarbage++;
+ if (++c->nr_garbage > slots_per_page(c))
+ collect_garbage_slots(c);
+ } else
+ collect_one_slot(kip, idx);
+ goto out;
+ }
+ }
+ /* Could not free this slot. */
+ WARN_ON(1);
+out:
+ mutex_unlock(&c->mutex);
+}
+
+#ifdef CONFIG_OPTPROBES
+/* For optimized_kprobe buffer */
+struct kprobe_insn_cache kprobe_optinsn_slots = {
+ .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
+ .alloc = alloc_insn_page,
+ .free = free_insn_page,
+ .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
+ /* .insn_size is initialized later */
+ .nr_garbage = 0,
+};
+#endif
+#endif
+
+/* We have preemption disabled.. so it is safe to use __ versions */
+static inline void set_kprobe_instance(struct kprobe *kp)
+{
+ __this_cpu_write(kprobe_instance, kp);
+}
+
+static inline void reset_kprobe_instance(void)
+{
+ __this_cpu_write(kprobe_instance, NULL);
+}
+
+/*
+ * This routine is called either:
+ * - under the kprobe_mutex - during kprobe_[un]register()
+ * OR
+ * - with preemption disabled - from arch/xxx/kernel/kprobes.c
+ */
+struct kprobe *get_kprobe(void *addr)
+{
+ struct hlist_head *head;
+ struct kprobe *p;
+
+ head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
+ hlist_for_each_entry_rcu(p, head, hlist) {
+ if (p->addr == addr)
+ return p;
+ }
+
+ return NULL;
+}
+NOKPROBE_SYMBOL(get_kprobe);
+
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
+
+/* Return true if the kprobe is an aggregator */
+static inline int kprobe_aggrprobe(struct kprobe *p)
+{
+ return p->pre_handler == aggr_pre_handler;
+}
+
+/* Return true(!0) if the kprobe is unused */
+static inline int kprobe_unused(struct kprobe *p)
+{
+ return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
+ list_empty(&p->list);
+}
+
+/*
+ * Keep all fields in the kprobe consistent
+ */
+static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+ memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
+ memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
+}
+
+#ifdef CONFIG_OPTPROBES
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobes_allow_optimization;
+
+/*
+ * Call all pre_handler on the list, but ignores its return value.
+ * This must be called from arch-dep optimized caller.
+ */
+void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &p->list, list) {
+ if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
+ set_kprobe_instance(kp);
+ kp->pre_handler(kp, regs);
+ }
+ reset_kprobe_instance();
+ }
+}
+NOKPROBE_SYMBOL(opt_pre_handler);
+
+/* Free optimized instructions and optimized_kprobe */
+static void free_aggr_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ arch_remove_optimized_kprobe(op);
+ arch_remove_kprobe(p);
+ kfree(op);
+}
+
+/* Return true(!0) if the kprobe is ready for optimization. */
+static inline int kprobe_optready(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ if (kprobe_aggrprobe(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ return arch_prepared_optinsn(&op->optinsn);
+ }
+
+ return 0;
+}
+
+/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
+static inline int kprobe_disarmed(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
+ if (!kprobe_aggrprobe(p))
+ return kprobe_disabled(p);
+
+ op = container_of(p, struct optimized_kprobe, kp);
+
+ return kprobe_disabled(p) && list_empty(&op->list);
+}
+
+/* Return true(!0) if the probe is queued on (un)optimizing lists */
+static int kprobe_queued(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ if (kprobe_aggrprobe(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!list_empty(&op->list))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Return an optimized kprobe whose optimizing code replaces
+ * instructions including addr (exclude breakpoint).
+ */
+static struct kprobe *get_optimized_kprobe(unsigned long addr)
+{
+ int i;
+ struct kprobe *p = NULL;
+ struct optimized_kprobe *op;
+
+ /* Don't check i == 0, since that is a breakpoint case. */
+ for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
+ p = get_kprobe((void *)(addr - i));
+
+ if (p && kprobe_optready(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (arch_within_optimized_kprobe(op, addr))
+ return p;
+ }
+
+ return NULL;
+}
+
+/* Optimization staging list, protected by kprobe_mutex */
+static LIST_HEAD(optimizing_list);
+static LIST_HEAD(unoptimizing_list);
+static LIST_HEAD(freeing_list);
+
+static void kprobe_optimizer(struct work_struct *work);
+static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+#define OPTIMIZE_DELAY 5
+
+/*
+ * Optimize (replace a breakpoint with a jump) kprobes listed on
+ * optimizing_list.
+ */
+static void do_optimize_kprobes(void)
+{
+ /* Optimization never be done when disarmed */
+ if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+ list_empty(&optimizing_list))
+ return;
+
+ /*
+ * The optimization/unoptimization refers online_cpus via
+ * stop_machine() and cpu-hotplug modifies online_cpus.
+ * And same time, text_mutex will be held in cpu-hotplug and here.
+ * This combination can cause a deadlock (cpu-hotplug try to lock
+ * text_mutex but stop_machine can not be done because online_cpus
+ * has been changed)
+ * To avoid this deadlock, we need to call get_online_cpus()
+ * for preventing cpu-hotplug outside of text_mutex locking.
+ */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ arch_optimize_kprobes(&optimizing_list);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+/*
+ * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
+ * if need) kprobes listed on unoptimizing_list.
+ */
+static void do_unoptimize_kprobes(void)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ /* Unoptimization must be done anytime */
+ if (list_empty(&unoptimizing_list))
+ return;
+
+ /* Ditto to do_optimize_kprobes */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
+ /* Loop free_list for disarming */
+ list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+ /* Disarm probes if marked disabled */
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
+ if (kprobe_unused(&op->kp)) {
+ /*
+ * Remove unused probes from hash list. After waiting
+ * for synchronization, these probes are reclaimed.
+ * (reclaiming is done by do_free_cleaned_kprobes.)
+ */
+ hlist_del_rcu(&op->kp.hlist);
+ } else
+ list_del_init(&op->list);
+ }
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+/* Reclaim all kprobes on the free_list */
+static void do_free_cleaned_kprobes(void)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+ BUG_ON(!kprobe_unused(&op->kp));
+ list_del_init(&op->list);
+ free_aggr_kprobe(&op->kp);
+ }
+}
+
+/* Start optimizer after OPTIMIZE_DELAY passed */
+static void kick_kprobe_optimizer(void)
+{
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+
+/* Kprobe jump optimizer */
+static void kprobe_optimizer(struct work_struct *work)
+{
+ mutex_lock(&kprobe_mutex);
+ /* Lock modules while optimizing kprobes */
+ mutex_lock(&module_mutex);
+
+ /*
+ * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+ * kprobes before waiting for quiesence period.
+ */
+ do_unoptimize_kprobes();
+
+ /*
+ * Step 2: Wait for quiesence period to ensure all running interrupts
+ * are done. Because optprobe may modify multiple instructions
+ * there is a chance that Nth instruction is interrupted. In that
+ * case, running interrupt can return to 2nd-Nth byte of jump
+ * instruction. This wait is for avoiding it.
+ */
+ synchronize_sched();
+
+ /* Step 3: Optimize kprobes after quiesence period */
+ do_optimize_kprobes();
+
+ /* Step 4: Free cleaned kprobes after quiesence period */
+ do_free_cleaned_kprobes();
+
+ mutex_unlock(&module_mutex);
+ mutex_unlock(&kprobe_mutex);
+
+ /* Step 5: Kick optimizer again if needed */
+ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
+ kick_kprobe_optimizer();
+}
+
+/* Wait for completing optimization and unoptimization */
+static void wait_for_kprobe_optimizer(void)
+{
+ mutex_lock(&kprobe_mutex);
+
+ while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
+ mutex_unlock(&kprobe_mutex);
+
+ /* this will also make optimizing_work execute immmediately */
+ flush_delayed_work(&optimizing_work);
+ /* @optimizing_work might not have been queued yet, relax */
+ cpu_relax();
+
+ mutex_lock(&kprobe_mutex);
+ }
+
+ mutex_unlock(&kprobe_mutex);
+}
+
+/* Optimize kprobe if p is ready to be optimized */
+static void optimize_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ /* Check if the kprobe is disabled or not ready for optimization. */
+ if (!kprobe_optready(p) || !kprobes_allow_optimization ||
+ (kprobe_disabled(p) || kprobes_all_disarmed))
+ return;
+
+ /* Both of break_handler and post_handler are not supported. */
+ if (p->break_handler || p->post_handler)
+ return;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+
+ /* Check there is no other kprobes at the optimized instructions */
+ if (arch_check_optimized_kprobe(op) < 0)
+ return;
+
+ /* Check if it is already optimized. */
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ return;
+ op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
+
+ if (!list_empty(&op->list))
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ else {
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
+ }
+}
+
+/* Short cut to direct unoptimizing */
+static void force_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ get_online_cpus();
+ arch_unoptimize_kprobe(op);
+ put_online_cpus();
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
+}
+
+/* Unoptimize a kprobe if p is optimized */
+static void unoptimize_kprobe(struct kprobe *p, bool force)
+{
+ struct optimized_kprobe *op;
+
+ if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
+ return; /* This is not an optprobe nor optimized */
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!kprobe_optimized(p)) {
+ /* Unoptimized or unoptimizing case */
+ if (force && !list_empty(&op->list)) {
+ /*
+ * Only if this is unoptimizing kprobe and forced,
+ * forcibly unoptimize it. (No need to unoptimize
+ * unoptimized kprobe again :)
+ */
+ list_del_init(&op->list);
+ force_unoptimize_kprobe(op);
+ }
+ return;
+ }
+
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ if (!list_empty(&op->list)) {
+ /* Dequeue from the optimization queue */
+ list_del_init(&op->list);
+ return;
+ }
+ /* Optimized kprobe case */
+ if (force)
+ /* Forcibly update the code: this is a special case */
+ force_unoptimize_kprobe(op);
+ else {
+ list_add(&op->list, &unoptimizing_list);
+ kick_kprobe_optimizer();
+ }
+}
+
+/* Cancel unoptimizing for reusing */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ struct optimized_kprobe *op;
+
+ BUG_ON(!kprobe_unused(ap));
+ /*
+ * Unused kprobe MUST be on the way of delayed unoptimizing (means
+ * there is still a relative jump) and disabled.
+ */
+ op = container_of(ap, struct optimized_kprobe, kp);
+ if (unlikely(list_empty(&op->list)))
+ printk(KERN_WARNING "Warning: found a stray unused "
+ "aggrprobe@%p\n", ap->addr);
+ /* Enable the probe again */
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ /* Optimize it again (remove from op->list) */
+ BUG_ON(!kprobe_optready(ap));
+ optimize_kprobe(ap);
+}
+
+/* Remove optimized instructions */
+static void kill_optimized_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!list_empty(&op->list))
+ /* Dequeue from the (un)optimization queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+
+ if (kprobe_unused(p)) {
+ /* Enqueue if it is unused */
+ list_add(&op->list, &freeing_list);
+ /*
+ * Remove unused probes from the hash list. After waiting
+ * for synchronization, this probe is reclaimed.
+ * (reclaiming is done by do_free_cleaned_kprobes().)
+ */
+ hlist_del_rcu(&op->kp.hlist);
+ }
+
+ /* Don't touch the code, because it is already freed. */
+ arch_remove_optimized_kprobe(op);
+}
+
+/* Try to prepare optimized instructions */
+static void prepare_optimized_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ arch_prepare_optimized_kprobe(op, p);
+}
+
+/* Allocate new optimized_kprobe and try to prepare optimized instructions */
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
+ if (!op)
+ return NULL;
+
+ INIT_LIST_HEAD(&op->list);
+ op->kp.addr = p->addr;
+ arch_prepare_optimized_kprobe(op, p);
+
+ return &op->kp;
+}
+
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
+
+/*
+ * Prepare an optimized_kprobe and optimize it
+ * NOTE: p must be a normal registered kprobe
+ */
+static void try_to_optimize_kprobe(struct kprobe *p)
+{
+ struct kprobe *ap;
+ struct optimized_kprobe *op;
+
+ /* Impossible to optimize ftrace-based kprobe */
+ if (kprobe_ftrace(p))
+ return;
+
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ jump_label_lock();
+ mutex_lock(&text_mutex);
+
+ ap = alloc_aggr_kprobe(p);
+ if (!ap)
+ goto out;
+
+ op = container_of(ap, struct optimized_kprobe, kp);
+ if (!arch_prepared_optinsn(&op->optinsn)) {
+ /* If failed to setup optimizing, fallback to kprobe */
+ arch_remove_optimized_kprobe(op);
+ kfree(op);
+ goto out;
+ }
+
+ init_aggr_kprobe(ap, p);
+ optimize_kprobe(ap); /* This just kicks optimizer thread */
+
+out:
+ mutex_unlock(&text_mutex);
+ jump_label_unlock();
+}
+
+#ifdef CONFIG_SYSCTL
+static void optimize_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+ /* If optimization is already allowed, just return */
+ if (kprobes_allow_optimization)
+ goto out;
+
+ kprobes_allow_optimization = true;
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, head, hlist)
+ if (!kprobe_disabled(p))
+ optimize_kprobe(p);
+ }
+ printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+ mutex_unlock(&kprobe_mutex);
+}
+
+static void unoptimize_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+ /* If optimization is already prohibited, just return */
+ if (!kprobes_allow_optimization) {
+ mutex_unlock(&kprobe_mutex);
+ return;
+ }
+
+ kprobes_allow_optimization = false;
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, head, hlist) {
+ if (!kprobe_disabled(p))
+ unoptimize_kprobe(p, false);
+ }
+ }
+ mutex_unlock(&kprobe_mutex);
+
+ /* Wait for unoptimizing completion */
+ wait_for_kprobe_optimizer();
+ printk(KERN_INFO "Kprobes globally unoptimized\n");
+}
+
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
+int sysctl_kprobes_optimization;
+int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&kprobe_sysctl_mutex);
+ sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ if (sysctl_kprobes_optimization)
+ optimize_all_kprobes();
+ else
+ unoptimize_all_kprobes();
+ mutex_unlock(&kprobe_sysctl_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
+/* Put a breakpoint for a probe. Must be called with text_mutex locked */
+static void __arm_kprobe(struct kprobe *p)
+{
+ struct kprobe *_p;
+
+ /* Check collision with other optimized kprobes */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p))
+ /* Fallback to unoptimized kprobe */
+ unoptimize_kprobe(_p, true);
+
+ arch_arm_kprobe(p);
+ optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
+}
+
+/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+static void __disarm_kprobe(struct kprobe *p, bool reopt)
+{
+ struct kprobe *_p;
+
+ /* Try to unoptimize */
+ unoptimize_kprobe(p, kprobes_all_disarmed);
+
+ if (!kprobe_queued(p)) {
+ arch_disarm_kprobe(p);
+ /* If another kprobe was blocked, optimize it. */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p) && reopt)
+ optimize_kprobe(_p);
+ }
+ /* TODO: reoptimize others after unoptimized this probe */
+}
+
+#else /* !CONFIG_OPTPROBES */
+
+#define optimize_kprobe(p) do {} while (0)
+#define unoptimize_kprobe(p, f) do {} while (0)
+#define kill_optimized_kprobe(p) do {} while (0)
+#define prepare_optimized_kprobe(p) do {} while (0)
+#define try_to_optimize_kprobe(p) do {} while (0)
+#define __arm_kprobe(p) arch_arm_kprobe(p)
+#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
+#define kprobe_disarmed(p) kprobe_disabled(p)
+#define wait_for_kprobe_optimizer() do {} while (0)
+
+/* There should be no unused kprobes can be reused without optimization */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+ BUG_ON(kprobe_unused(ap));
+}
+
+static void free_aggr_kprobe(struct kprobe *p)
+{
+ arch_remove_kprobe(p);
+ kfree(p);
+}
+
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+{
+ return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+}
+#endif /* CONFIG_OPTPROBES */
+
+#ifdef CONFIG_KPROBES_ON_FTRACE
+static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
+ .func = kprobe_ftrace_handler,
+ .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
+};
+static int kprobe_ftrace_enabled;
+
+/* Must ensure p->addr is really on ftrace */
+static int prepare_kprobe(struct kprobe *p)
+{
+ if (!kprobe_ftrace(p))
+ return arch_prepare_kprobe(p);
+
+ return arch_prepare_kprobe_ftrace(p);
+}
+
+/* Caller must lock kprobe_mutex */
+static void arm_kprobe_ftrace(struct kprobe *p)
+{
+ int ret;
+
+ ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+ (unsigned long)p->addr, 0, 0);
+ WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+ kprobe_ftrace_enabled++;
+ if (kprobe_ftrace_enabled == 1) {
+ ret = register_ftrace_function(&kprobe_ftrace_ops);
+ WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ }
+}
+
+/* Caller must lock kprobe_mutex */
+static void disarm_kprobe_ftrace(struct kprobe *p)
+{
+ int ret;
+
+ kprobe_ftrace_enabled--;
+ if (kprobe_ftrace_enabled == 0) {
+ ret = unregister_ftrace_function(&kprobe_ftrace_ops);
+ WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ }
+ ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+ (unsigned long)p->addr, 1, 0);
+ WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+}
+#else /* !CONFIG_KPROBES_ON_FTRACE */
+#define prepare_kprobe(p) arch_prepare_kprobe(p)
+#define arm_kprobe_ftrace(p) do {} while (0)
+#define disarm_kprobe_ftrace(p) do {} while (0)
+#endif
+
+/* Arm a kprobe with text_mutex */
+static void arm_kprobe(struct kprobe *kp)
+{
+ if (unlikely(kprobe_ftrace(kp))) {
+ arm_kprobe_ftrace(kp);
+ return;
+ }
+ /*
+ * Here, since __arm_kprobe() doesn't use stop_machine(),
+ * this doesn't cause deadlock on text_mutex. So, we don't
+ * need get_online_cpus().
+ */
+ mutex_lock(&text_mutex);
+ __arm_kprobe(kp);
+ mutex_unlock(&text_mutex);
+}
+
+/* Disarm a kprobe with text_mutex */
+static void disarm_kprobe(struct kprobe *kp, bool reopt)
+{
+ if (unlikely(kprobe_ftrace(kp))) {
+ disarm_kprobe_ftrace(kp);
+ return;
+ }
+ /* Ditto */
+ mutex_lock(&text_mutex);
+ __disarm_kprobe(kp, reopt);
+ mutex_unlock(&text_mutex);
+}
+
+/*
+ * Aggregate handlers for multiple kprobes support - these handlers
+ * take care of invoking the individual kprobe handlers on p->list
+ */
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &p->list, list) {
+ if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
+ set_kprobe_instance(kp);
+ if (kp->pre_handler(kp, regs))
+ return 1;
+ }
+ reset_kprobe_instance();
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(aggr_pre_handler);
+
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &p->list, list) {
+ if (kp->post_handler && likely(!kprobe_disabled(kp))) {
+ set_kprobe_instance(kp);
+ kp->post_handler(kp, regs, flags);
+ reset_kprobe_instance();
+ }
+ }
+}
+NOKPROBE_SYMBOL(aggr_post_handler);
+
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
+{
+ struct kprobe *cur = __this_cpu_read(kprobe_instance);
+
+ /*
+ * if we faulted "during" the execution of a user specified
+ * probe handler, invoke just that probe's fault handler
+ */
+ if (cur && cur->fault_handler) {
+ if (cur->fault_handler(cur, regs, trapnr))
+ return 1;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(aggr_fault_handler);
+
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe *cur = __this_cpu_read(kprobe_instance);
+ int ret = 0;
+
+ if (cur && cur->break_handler) {
+ if (cur->break_handler(cur, regs))
+ ret = 1;
+ }
+ reset_kprobe_instance();
+ return ret;
+}
+NOKPROBE_SYMBOL(aggr_break_handler);
+
+/* Walks the list and increments nmissed count for multiprobe case */
+void kprobes_inc_nmissed_count(struct kprobe *p)
+{
+ struct kprobe *kp;
+ if (!kprobe_aggrprobe(p)) {
+ p->nmissed++;
+ } else {
+ list_for_each_entry_rcu(kp, &p->list, list)
+ kp->nmissed++;
+ }
+ return;
+}
+NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
+
+void recycle_rp_inst(struct kretprobe_instance *ri,
+ struct hlist_head *head)
+{
+ struct kretprobe *rp = ri->rp;
+
+ /* remove rp inst off the rprobe_inst_table */
+ hlist_del(&ri->hlist);
+ INIT_HLIST_NODE(&ri->hlist);
+ if (likely(rp)) {
+ raw_spin_lock(&rp->lock);
+ hlist_add_head(&ri->hlist, &rp->free_instances);
+ raw_spin_unlock(&rp->lock);
+ } else
+ /* Unregistering */
+ hlist_add_head(&ri->hlist, head);
+}
+NOKPROBE_SYMBOL(recycle_rp_inst);
+
+void kretprobe_hash_lock(struct task_struct *tsk,
+ struct hlist_head **head, unsigned long *flags)
+__acquires(hlist_lock)
+{
+ unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+ raw_spinlock_t *hlist_lock;
+
+ *head = &kretprobe_inst_table[hash];
+ hlist_lock = kretprobe_table_lock_ptr(hash);
+ raw_spin_lock_irqsave(hlist_lock, *flags);
+}
+NOKPROBE_SYMBOL(kretprobe_hash_lock);
+
+static void kretprobe_table_lock(unsigned long hash,
+ unsigned long *flags)
+__acquires(hlist_lock)
+{
+ raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ raw_spin_lock_irqsave(hlist_lock, *flags);
+}
+NOKPROBE_SYMBOL(kretprobe_table_lock);
+
+void kretprobe_hash_unlock(struct task_struct *tsk,
+ unsigned long *flags)
+__releases(hlist_lock)
+{
+ unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+ raw_spinlock_t *hlist_lock;
+
+ hlist_lock = kretprobe_table_lock_ptr(hash);
+ raw_spin_unlock_irqrestore(hlist_lock, *flags);
+}
+NOKPROBE_SYMBOL(kretprobe_hash_unlock);
+
+static void kretprobe_table_unlock(unsigned long hash,
+ unsigned long *flags)
+__releases(hlist_lock)
+{
+ raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ raw_spin_unlock_irqrestore(hlist_lock, *flags);
+}
+NOKPROBE_SYMBOL(kretprobe_table_unlock);
+
+/*
+ * This function is called from finish_task_switch when task tk becomes dead,
+ * so that we can recycle any function-return probe instances associated
+ * with this task. These left over instances represent probed functions
+ * that have been called but will never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+ struct kretprobe_instance *ri;
+ struct hlist_head *head, empty_rp;
+ struct hlist_node *tmp;
+ unsigned long hash, flags = 0;
+
+ if (unlikely(!kprobes_initialized))
+ /* Early boot. kretprobe_table_locks not yet initialized. */
+ return;
+
+ INIT_HLIST_HEAD(&empty_rp);
+ hash = hash_ptr(tk, KPROBE_HASH_BITS);
+ head = &kretprobe_inst_table[hash];
+ kretprobe_table_lock(hash, &flags);
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task == tk)
+ recycle_rp_inst(ri, &empty_rp);
+ }
+ kretprobe_table_unlock(hash, &flags);
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+}
+NOKPROBE_SYMBOL(kprobe_flush_task);
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ struct hlist_node *next;
+
+ hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+}
+
+static void cleanup_rp_inst(struct kretprobe *rp)
+{
+ unsigned long flags, hash;
+ struct kretprobe_instance *ri;
+ struct hlist_node *next;
+ struct hlist_head *head;
+
+ /* No race here */
+ for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
+ kretprobe_table_lock(hash, &flags);
+ head = &kretprobe_inst_table[hash];
+ hlist_for_each_entry_safe(ri, next, head, hlist) {
+ if (ri->rp == rp)
+ ri->rp = NULL;
+ }
+ kretprobe_table_unlock(hash, &flags);
+ }
+ free_rp_inst(rp);
+}
+NOKPROBE_SYMBOL(cleanup_rp_inst);
+
+/*
+* Add the new probe to ap->list. Fail if this is the
+* second jprobe at the address - two jprobes can't coexist
+*/
+static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+ BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
+
+ if (p->break_handler || p->post_handler)
+ unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
+
+ if (p->break_handler) {
+ if (ap->break_handler)
+ return -EEXIST;
+ list_add_tail_rcu(&p->list, &ap->list);
+ ap->break_handler = aggr_break_handler;
+ } else
+ list_add_rcu(&p->list, &ap->list);
+ if (p->post_handler && !ap->post_handler)
+ ap->post_handler = aggr_post_handler;
+
+ return 0;
+}
+
+/*
+ * Fill in the required fields of the "manager kprobe". Replace the
+ * earlier kprobe in the hlist with the manager kprobe
+ */
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+ /* Copy p's insn slot to ap */
+ copy_kprobe(p, ap);
+ flush_insn_slot(ap);
+ ap->addr = p->addr;
+ ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
+ ap->pre_handler = aggr_pre_handler;
+ ap->fault_handler = aggr_fault_handler;
+ /* We don't care the kprobe which has gone. */
+ if (p->post_handler && !kprobe_gone(p))
+ ap->post_handler = aggr_post_handler;
+ if (p->break_handler && !kprobe_gone(p))
+ ap->break_handler = aggr_break_handler;
+
+ INIT_LIST_HEAD(&ap->list);
+ INIT_HLIST_NODE(&ap->hlist);
+
+ list_add_rcu(&p->list, &ap->list);
+ hlist_replace_rcu(&p->hlist, &ap->hlist);
+}
+
+/*
+ * This is the second or subsequent kprobe at the address - handle
+ * the intricacies
+ */
+static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
+{
+ int ret = 0;
+ struct kprobe *ap = orig_p;
+
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ jump_label_lock();
+ /*
+ * Get online CPUs to avoid text_mutex deadlock.with stop machine,
+ * which is invoked by unoptimize_kprobe() in add_new_kprobe()
+ */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+
+ if (!kprobe_aggrprobe(orig_p)) {
+ /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+ ap = alloc_aggr_kprobe(orig_p);
+ if (!ap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ init_aggr_kprobe(ap, orig_p);
+ } else if (kprobe_unused(ap))
+ /* This probe is going to die. Rescue it */
+ reuse_unused_kprobe(ap);
+
+ if (kprobe_gone(ap)) {
+ /*
+ * Attempting to insert new probe at the same location that
+ * had a probe in the module vaddr area which already
+ * freed. So, the instruction slot has already been
+ * released. We need a new slot for the new probe.
+ */
+ ret = arch_prepare_kprobe(ap);
+ if (ret)
+ /*
+ * Even if fail to allocate new slot, don't need to
+ * free aggr_probe. It will be used next time, or
+ * freed by unregister_kprobe.
+ */
+ goto out;
+
+ /* Prepare optimized instructions if possible. */
+ prepare_optimized_kprobe(ap);
+
+ /*
+ * Clear gone flag to prevent allocating new slot again, and
+ * set disabled flag because it is not armed yet.
+ */
+ ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+ | KPROBE_FLAG_DISABLED;
+ }
+
+ /* Copy ap's insn slot to p */
+ copy_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
+
+out:
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+ jump_label_unlock();
+
+ if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ if (!kprobes_all_disarmed)
+ /* Arm the breakpoint again. */
+ arm_kprobe(ap);
+ }
+ return ret;
+}
+
+bool __weak arch_within_kprobe_blacklist(unsigned long addr)
+{
+ /* The __kprobes marked functions and entry code must not be probed */
+ return addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end;
+}
+
+static bool within_kprobe_blacklist(unsigned long addr)
+{
+ struct kprobe_blacklist_entry *ent;
+
+ if (arch_within_kprobe_blacklist(addr))
+ return true;
+ /*
+ * If there exists a kprobe_blacklist, verify and
+ * fail any probe registration in the prohibited area
+ */
+ list_for_each_entry(ent, &kprobe_blacklist, list) {
+ if (addr >= ent->start_addr && addr < ent->end_addr)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * If we have a symbol_name argument, look it up and add the offset field
+ * to it. This way, we can specify a relative address to a symbol.
+ * This returns encoded errors if it fails to look up symbol or invalid
+ * combination of parameters.
+ */
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+ kprobe_opcode_t *addr = p->addr;
+
+ if ((p->symbol_name && p->addr) ||
+ (!p->symbol_name && !p->addr))
+ goto invalid;
+
+ if (p->symbol_name) {
+ kprobe_lookup_name(p->symbol_name, addr);
+ if (!addr)
+ return ERR_PTR(-ENOENT);
+ }
+
+ addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
+ if (addr)
+ return addr;
+
+invalid:
+ return ERR_PTR(-EINVAL);
+}
+
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe *__get_valid_kprobe(struct kprobe *p)
+{
+ struct kprobe *ap, *list_p;
+
+ ap = get_kprobe(p->addr);
+ if (unlikely(!ap))
+ return NULL;
+
+ if (p != ap) {
+ list_for_each_entry_rcu(list_p, &ap->list, list)
+ if (list_p == p)
+ /* kprobe p is a valid probe */
+ goto valid;
+ return NULL;
+ }
+valid:
+ return ap;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+ int ret = 0;
+
+ mutex_lock(&kprobe_mutex);
+ if (__get_valid_kprobe(p))
+ ret = -EINVAL;
+ mutex_unlock(&kprobe_mutex);
+
+ return ret;
+}
+
+int __weak arch_check_ftrace_location(struct kprobe *p)
+{
+ unsigned long ftrace_addr;
+
+ ftrace_addr = ftrace_location((unsigned long)p->addr);
+ if (ftrace_addr) {
+#ifdef CONFIG_KPROBES_ON_FTRACE
+ /* Given address is not on the instruction boundary */
+ if ((unsigned long)p->addr != ftrace_addr)
+ return -EILSEQ;
+ p->flags |= KPROBE_FLAG_FTRACE;
+#else /* !CONFIG_KPROBES_ON_FTRACE */
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
+{
+ int ret;
+
+ ret = arch_check_ftrace_location(p);
+ if (ret)
+ return ret;
+ jump_label_lock();
+ preempt_disable();
+
+ /* Ensure it is not in reserved area nor out of text */
+ if (!kernel_text_address((unsigned long) p->addr) ||
+ within_kprobe_blacklist((unsigned long) p->addr) ||
+ jump_label_text_reserved(p->addr, p->addr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Check if are we probing a module */
+ *probed_mod = __module_text_address((unsigned long) p->addr);
+ if (*probed_mod) {
+ /*
+ * We must hold a refcount of the probed module while updating
+ * its code to prohibit unexpected unloading.
+ */
+ if (unlikely(!try_module_get(*probed_mod))) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * If the module freed .init.text, we couldn't insert
+ * kprobes in there.
+ */
+ if (within_module_init((unsigned long)p->addr, *probed_mod) &&
+ (*probed_mod)->state != MODULE_STATE_COMING) {
+ module_put(*probed_mod);
+ *probed_mod = NULL;
+ ret = -ENOENT;
+ }
+ }
+out:
+ preempt_enable();
+ jump_label_unlock();
+
+ return ret;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret;
+ struct kprobe *old_p;
+ struct module *probed_mod;
+ kprobe_opcode_t *addr;
+
+ /* Adjust probe address from symbol */
+ addr = kprobe_addr(p);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+ p->addr = addr;
+
+ ret = check_kprobe_rereg(p);
+ if (ret)
+ return ret;
+
+ /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+ p->flags &= KPROBE_FLAG_DISABLED;
+ p->nmissed = 0;
+ INIT_LIST_HEAD(&p->list);
+
+ ret = check_kprobe_address_safe(p, &probed_mod);
+ if (ret)
+ return ret;
+
+ mutex_lock(&kprobe_mutex);
+
+ old_p = get_kprobe(p->addr);
+ if (old_p) {
+ /* Since this may unoptimize old_p, locking text_mutex. */
+ ret = register_aggr_kprobe(old_p, p);
+ goto out;
+ }
+
+ mutex_lock(&text_mutex); /* Avoiding text modification */
+ ret = prepare_kprobe(p);
+ mutex_unlock(&text_mutex);
+ if (ret)
+ goto out;
+
+ INIT_HLIST_NODE(&p->hlist);
+ hlist_add_head_rcu(&p->hlist,
+ &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ if (!kprobes_all_disarmed && !kprobe_disabled(p))
+ arm_kprobe(p);
+
+ /* Try to optimize kprobe */
+ try_to_optimize_kprobe(p);
+
+out:
+ mutex_unlock(&kprobe_mutex);
+
+ if (probed_mod)
+ module_put(probed_mod);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_kprobe);
+
+/* Check if all probes on the aggrprobe are disabled */
+static int aggr_kprobe_disabled(struct kprobe *ap)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &ap->list, list)
+ if (!kprobe_disabled(kp))
+ /*
+ * There is an active probe on the list.
+ * We can't disable this ap.
+ */
+ return 0;
+
+ return 1;
+}
+
+/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
+static struct kprobe *__disable_kprobe(struct kprobe *p)
+{
+ struct kprobe *orig_p;
+
+ /* Get an original kprobe for return */
+ orig_p = __get_valid_kprobe(p);
+ if (unlikely(orig_p == NULL))
+ return NULL;
+
+ if (!kprobe_disabled(p)) {
+ /* Disable probe if it is a child probe */
+ if (p != orig_p)
+ p->flags |= KPROBE_FLAG_DISABLED;
+
+ /* Try to disarm and disable this/parent probe */
+ if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+ /*
+ * If kprobes_all_disarmed is set, orig_p
+ * should have already been disarmed, so
+ * skip unneed disarming process.
+ */
+ if (!kprobes_all_disarmed)
+ disarm_kprobe(orig_p, true);
+ orig_p->flags |= KPROBE_FLAG_DISABLED;
+ }
+ }
+
+ return orig_p;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ */
+static int __unregister_kprobe_top(struct kprobe *p)
+{
+ struct kprobe *ap, *list_p;
+
+ /* Disable kprobe. This will disarm it if needed. */
+ ap = __disable_kprobe(p);
+ if (ap == NULL)
+ return -EINVAL;
+
+ if (ap == p)
+ /*
+ * This probe is an independent(and non-optimized) kprobe
+ * (not an aggrprobe). Remove from the hash list.
+ */
+ goto disarmed;
+
+ /* Following process expects this probe is an aggrprobe */
+ WARN_ON(!kprobe_aggrprobe(ap));
+
+ if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+ /*
+ * !disarmed could be happen if the probe is under delayed
+ * unoptimizing.
+ */
+ goto disarmed;
+ else {
+ /* If disabling probe has special handlers, update aggrprobe */
+ if (p->break_handler && !kprobe_gone(p))
+ ap->break_handler = NULL;
+ if (p->post_handler && !kprobe_gone(p)) {
+ list_for_each_entry_rcu(list_p, &ap->list, list) {
+ if ((list_p != p) && (list_p->post_handler))
+ goto noclean;
+ }
+ ap->post_handler = NULL;
+ }
+noclean:
+ /*
+ * Remove from the aggrprobe: this path will do nothing in
+ * __unregister_kprobe_bottom().
+ */
+ list_del_rcu(&p->list);
+ if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /*
+ * Try to optimize this probe again, because post
+ * handler may have been changed.
+ */
+ optimize_kprobe(ap);
+ }
+ return 0;
+
+disarmed:
+ BUG_ON(!kprobe_disarmed(ap));
+ hlist_del_rcu(&ap->hlist);
+ return 0;
+}
+
+static void __unregister_kprobe_bottom(struct kprobe *p)
+{
+ struct kprobe *ap;
+
+ if (list_empty(&p->list))
+ /* This is an independent kprobe */
+ arch_remove_kprobe(p);
+ else if (list_is_singular(&p->list)) {
+ /* This is the last child of an aggrprobe */
+ ap = list_entry(p->list.next, struct kprobe, list);
+ list_del(&p->list);
+ free_aggr_kprobe(ap);
+ }
+ /* Otherwise, do nothing. */
+}
+
+int register_kprobes(struct kprobe **kps, int num)
+{
+ int i, ret = 0;
+
+ if (num <= 0)
+ return -EINVAL;
+ for (i = 0; i < num; i++) {
+ ret = register_kprobe(kps[i]);
+ if (ret < 0) {
+ if (i > 0)
+ unregister_kprobes(kps, i);
+ break;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_kprobes);
+
+void unregister_kprobe(struct kprobe *p)
+{
+ unregister_kprobes(&p, 1);
+}
+EXPORT_SYMBOL_GPL(unregister_kprobe);
+
+void unregister_kprobes(struct kprobe **kps, int num)
+{
+ int i;
+
+ if (num <= 0)
+ return;
+ mutex_lock(&kprobe_mutex);
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(kps[i]) < 0)
+ kps[i]->addr = NULL;
+ mutex_unlock(&kprobe_mutex);
+
+ synchronize_sched();
+ for (i = 0; i < num; i++)
+ if (kps[i]->addr)
+ __unregister_kprobe_bottom(kps[i]);
+}
+EXPORT_SYMBOL_GPL(unregister_kprobes);
+
+static struct notifier_block kprobe_exceptions_nb = {
+ .notifier_call = kprobe_exceptions_notify,
+ .priority = 0x7fffffff /* we need to be notified first */
+};
+
+unsigned long __weak arch_deref_entry_point(void *entry)
+{
+ return (unsigned long)entry;
+}
+
+int register_jprobes(struct jprobe **jps, int num)
+{
+ struct jprobe *jp;
+ int ret = 0, i;
+
+ if (num <= 0)
+ return -EINVAL;
+ for (i = 0; i < num; i++) {
+ unsigned long addr, offset;
+ jp = jps[i];
+ addr = arch_deref_entry_point(jp->entry);
+
+ /* Verify probepoint is a function entry point */
+ if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
+ offset == 0) {
+ jp->kp.pre_handler = setjmp_pre_handler;
+ jp->kp.break_handler = longjmp_break_handler;
+ ret = register_kprobe(&jp->kp);
+ } else
+ ret = -EINVAL;
+
+ if (ret < 0) {
+ if (i > 0)
+ unregister_jprobes(jps, i);
+ break;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_jprobes);
+
+int register_jprobe(struct jprobe *jp)
+{
+ return register_jprobes(&jp, 1);
+}
+EXPORT_SYMBOL_GPL(register_jprobe);
+
+void unregister_jprobe(struct jprobe *jp)
+{
+ unregister_jprobes(&jp, 1);
+}
+EXPORT_SYMBOL_GPL(unregister_jprobe);
+
+void unregister_jprobes(struct jprobe **jps, int num)
+{
+ int i;
+
+ if (num <= 0)
+ return;
+ mutex_lock(&kprobe_mutex);
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(&jps[i]->kp) < 0)
+ jps[i]->kp.addr = NULL;
+ mutex_unlock(&kprobe_mutex);
+
+ synchronize_sched();
+ for (i = 0; i < num; i++) {
+ if (jps[i]->kp.addr)
+ __unregister_kprobe_bottom(&jps[i]->kp);
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_jprobes);
+
+#ifdef CONFIG_KRETPROBES
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ unsigned long hash, flags = 0;
+ struct kretprobe_instance *ri;
+
+ /*
+ * To avoid deadlocks, prohibit return probing in NMI contexts,
+ * just skip the probe and increase the (inexact) 'nmissed'
+ * statistical counter, so that the user is informed that
+ * something happened:
+ */
+ if (unlikely(in_nmi())) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ /* TODO: consider to only swap the RA after the last pre_handler fired */
+ hash = hash_ptr(current, KPROBE_HASH_BITS);
+ raw_spin_lock_irqsave(&rp->lock, flags);
+ if (!hlist_empty(&rp->free_instances)) {
+ ri = hlist_entry(rp->free_instances.first,
+ struct kretprobe_instance, hlist);
+ hlist_del(&ri->hlist);
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
+
+ ri->rp = rp;
+ ri->task = current;
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ raw_spin_lock_irqsave(&rp->lock, flags);
+ hlist_add_head(&ri->hlist, &rp->free_instances);
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
+ return 0;
+ }
+
+ arch_prepare_kretprobe(ri, regs);
+
+ /* XXX(hch): why is there no hlist_move_head? */
+ INIT_HLIST_NODE(&ri->hlist);
+ kretprobe_table_lock(hash, &flags);
+ hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+ kretprobe_table_unlock(hash, &flags);
+ } else {
+ rp->nmissed++;
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
+
+int register_kretprobe(struct kretprobe *rp)
+{
+ int ret = 0;
+ struct kretprobe_instance *inst;
+ int i;
+ void *addr;
+
+ if (kretprobe_blacklist_size) {
+ addr = kprobe_addr(&rp->kp);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+
+ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
+ if (kretprobe_blacklist[i].addr == addr)
+ return -EINVAL;
+ }
+ }
+
+ rp->kp.pre_handler = pre_handler_kretprobe;
+ rp->kp.post_handler = NULL;
+ rp->kp.fault_handler = NULL;
+ rp->kp.break_handler = NULL;
+
+ /* Pre-allocate memory for max kretprobe instances */
+ if (rp->maxactive <= 0) {
+#ifdef CONFIG_PREEMPT
+ rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
+#else
+ rp->maxactive = num_possible_cpus();
+#endif
+ }
+ raw_spin_lock_init(&rp->lock);
+ INIT_HLIST_HEAD(&rp->free_instances);
+ for (i = 0; i < rp->maxactive; i++) {
+ inst = kmalloc(sizeof(struct kretprobe_instance) +
+ rp->data_size, GFP_KERNEL);
+ if (inst == NULL) {
+ free_rp_inst(rp);
+ return -ENOMEM;
+ }
+ INIT_HLIST_NODE(&inst->hlist);
+ hlist_add_head(&inst->hlist, &rp->free_instances);
+ }
+
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = register_kprobe(&rp->kp);
+ if (ret != 0)
+ free_rp_inst(rp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_kretprobe);
+
+int register_kretprobes(struct kretprobe **rps, int num)
+{
+ int ret = 0, i;
+
+ if (num <= 0)
+ return -EINVAL;
+ for (i = 0; i < num; i++) {
+ ret = register_kretprobe(rps[i]);
+ if (ret < 0) {
+ if (i > 0)
+ unregister_kretprobes(rps, i);
+ break;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
+void unregister_kretprobe(struct kretprobe *rp)
+{
+ unregister_kretprobes(&rp, 1);
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+
+void unregister_kretprobes(struct kretprobe **rps, int num)
+{
+ int i;
+
+ if (num <= 0)
+ return;
+ mutex_lock(&kprobe_mutex);
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(&rps[i]->kp) < 0)
+ rps[i]->kp.addr = NULL;
+ mutex_unlock(&kprobe_mutex);
+
+ synchronize_sched();
+ for (i = 0; i < num; i++) {
+ if (rps[i]->kp.addr) {
+ __unregister_kprobe_bottom(&rps[i]->kp);
+ cleanup_rp_inst(rps[i]);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
+
+#else /* CONFIG_KRETPROBES */
+int register_kretprobe(struct kretprobe *rp)
+{
+ return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(register_kretprobe);
+
+int register_kretprobes(struct kretprobe **rps, int num)
+{
+ return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
+void unregister_kretprobe(struct kretprobe *rp)
+{
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+
+void unregister_kretprobes(struct kretprobe **rps, int num)
+{
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
+
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
+
+#endif /* CONFIG_KRETPROBES */
+
+/* Set the kprobe gone and remove its instruction buffer. */
+static void kill_kprobe(struct kprobe *p)
+{
+ struct kprobe *kp;
+
+ p->flags |= KPROBE_FLAG_GONE;
+ if (kprobe_aggrprobe(p)) {
+ /*
+ * If this is an aggr_kprobe, we have to list all the
+ * chained probes and mark them GONE.
+ */
+ list_for_each_entry_rcu(kp, &p->list, list)
+ kp->flags |= KPROBE_FLAG_GONE;
+ p->post_handler = NULL;
+ p->break_handler = NULL;
+ kill_optimized_kprobe(p);
+ }
+ /*
+ * Here, we can remove insn_slot safely, because no thread calls
+ * the original probed function (which will be freed soon) any more.
+ */
+ arch_remove_kprobe(p);
+}
+
+/* Disable one kprobe */
+int disable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Disable this kprobe */
+ if (__disable_kprobe(kp) == NULL)
+ ret = -EINVAL;
+
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int enable_kprobe(struct kprobe *kp)
+{
+ int ret = 0;
+ struct kprobe *p;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Check whether specified probe is valid. */
+ p = __get_valid_kprobe(kp);
+ if (unlikely(p == NULL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (kprobe_gone(kp)) {
+ /* This kprobe has gone, we couldn't enable it. */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (p != kp)
+ kp->flags &= ~KPROBE_FLAG_DISABLED;
+
+ if (!kprobes_all_disarmed && kprobe_disabled(p)) {
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ arm_kprobe(p);
+ }
+out:
+ mutex_unlock(&kprobe_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
+void dump_kprobe(struct kprobe *kp)
+{
+ printk(KERN_WARNING "Dumping kprobe:\n");
+ printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+ kp->symbol_name, kp->addr, kp->offset);
+}
+NOKPROBE_SYMBOL(dump_kprobe);
+
+/*
+ * Lookup and populate the kprobe_blacklist.
+ *
+ * Unlike the kretprobe blacklist, we'll need to determine
+ * the range of addresses that belong to the said functions,
+ * since a kprobe need not necessarily be at the beginning
+ * of a function.
+ */
+static int __init populate_kprobe_blacklist(unsigned long *start,
+ unsigned long *end)
+{
+ unsigned long *iter;
+ struct kprobe_blacklist_entry *ent;
+ unsigned long entry, offset = 0, size = 0;
+
+ for (iter = start; iter < end; iter++) {
+ entry = arch_deref_entry_point((void *)*iter);
+
+ if (!kernel_text_address(entry) ||
+ !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+ pr_err("Failed to find blacklist at %p\n",
+ (void *)entry);
+ continue;
+ }
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->start_addr = entry;
+ ent->end_addr = entry + size;
+ INIT_LIST_HEAD(&ent->list);
+ list_add_tail(&ent->list, &kprobe_blacklist);
+ }
+ return 0;
+}
+
+/* Module notifier call back, checking kprobes on the module */
+static int kprobes_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ struct hlist_head *head;
+ struct kprobe *p;
+ unsigned int i;
+ int checkcore = (val == MODULE_STATE_GOING);
+
+ if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
+ return NOTIFY_DONE;
+
+ /*
+ * When MODULE_STATE_GOING was notified, both of module .text and
+ * .init.text sections would be freed. When MODULE_STATE_LIVE was
+ * notified, only .init.text section would be freed. We need to
+ * disable kprobes which have been inserted in the sections.
+ */
+ mutex_lock(&kprobe_mutex);
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, head, hlist)
+ if (within_module_init((unsigned long)p->addr, mod) ||
+ (checkcore &&
+ within_module_core((unsigned long)p->addr, mod))) {
+ /*
+ * The vaddr this probe is installed will soon
+ * be vfreed buy not synced to disk. Hence,
+ * disarming the breakpoint isn't needed.
+ */
+ kill_kprobe(p);
+ }
+ }
+ mutex_unlock(&kprobe_mutex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kprobe_module_nb = {
+ .notifier_call = kprobes_module_callback,
+ .priority = 0
+};
+
+/* Markers of _kprobe_blacklist section */
+extern unsigned long __start_kprobe_blacklist[];
+extern unsigned long __stop_kprobe_blacklist[];
+
+static int __init init_kprobes(void)
+{
+ int i, err = 0;
+
+ /* FIXME allocate the probe table, currently defined statically */
+ /* initialize all list heads */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ INIT_HLIST_HEAD(&kprobe_table[i]);
+ INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+ raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
+ }
+
+ err = populate_kprobe_blacklist(__start_kprobe_blacklist,
+ __stop_kprobe_blacklist);
+ if (err) {
+ pr_err("kprobes: failed to populate blacklist: %d\n", err);
+ pr_err("Please take care of using kprobes.\n");
+ }
+
+ if (kretprobe_blacklist_size) {
+ /* lookup the function address from its name */
+ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
+ kprobe_lookup_name(kretprobe_blacklist[i].name,
+ kretprobe_blacklist[i].addr);
+ if (!kretprobe_blacklist[i].addr)
+ printk("kretprobe: lookup failed: %s\n",
+ kretprobe_blacklist[i].name);
+ }
+ }
+
+#if defined(CONFIG_OPTPROBES)
+#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+ /* Init kprobe_optinsn_slots */
+ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+ /* By default, kprobes can be optimized */
+ kprobes_allow_optimization = true;
+#endif
+
+ /* By default, kprobes are armed */
+ kprobes_all_disarmed = false;
+
+ err = arch_init_kprobes();
+ if (!err)
+ err = register_die_notifier(&kprobe_exceptions_nb);
+ if (!err)
+ err = register_module_notifier(&kprobe_module_nb);
+
+ kprobes_initialized = (err == 0);
+
+ if (!err)
+ init_test_probes();
+ return err;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void report_probe(struct seq_file *pi, struct kprobe *p,
+ const char *sym, int offset, char *modname, struct kprobe *pp)
+{
+ char *kprobe_type;
+
+ if (p->pre_handler == pre_handler_kretprobe)
+ kprobe_type = "r";
+ else if (p->pre_handler == setjmp_pre_handler)
+ kprobe_type = "j";
+ else
+ kprobe_type = "k";
+
+ if (sym)
+ seq_printf(pi, "%p %s %s+0x%x %s ",
+ p->addr, kprobe_type, sym, offset,
+ (modname ? modname : " "));
+ else
+ seq_printf(pi, "%p %s %p ",
+ p->addr, kprobe_type, p->addr);
+
+ if (!pp)
+ pp = p;
+ seq_printf(pi, "%s%s%s%s\n",
+ (kprobe_gone(p) ? "[GONE]" : ""),
+ ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
+ (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
+ (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
+}
+
+static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
+{
+ return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
+}
+
+static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ (*pos)++;
+ if (*pos >= KPROBE_TABLE_SIZE)
+ return NULL;
+ return pos;
+}
+
+static void kprobe_seq_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+static int show_kprobe_addr(struct seq_file *pi, void *v)
+{
+ struct hlist_head *head;
+ struct kprobe *p, *kp;
+ const char *sym = NULL;
+ unsigned int i = *(loff_t *) v;
+ unsigned long offset = 0;
+ char *modname, namebuf[KSYM_NAME_LEN];
+
+ head = &kprobe_table[i];
+ preempt_disable();
+ hlist_for_each_entry_rcu(p, head, hlist) {
+ sym = kallsyms_lookup((unsigned long)p->addr, NULL,
+ &offset, &modname, namebuf);
+ if (kprobe_aggrprobe(p)) {
+ list_for_each_entry_rcu(kp, &p->list, list)
+ report_probe(pi, kp, sym, offset, modname, p);
+ } else
+ report_probe(pi, p, sym, offset, modname, NULL);
+ }
+ preempt_enable();
+ return 0;
+}
+
+static const struct seq_operations kprobes_seq_ops = {
+ .start = kprobe_seq_start,
+ .next = kprobe_seq_next,
+ .stop = kprobe_seq_stop,
+ .show = show_kprobe_addr
+};
+
+static int kprobes_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &kprobes_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobes_operations = {
+ .open = kprobes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* kprobes/blacklist -- shows which functions can not be probed */
+static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
+{
+ return seq_list_start(&kprobe_blacklist, *pos);
+}
+
+static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &kprobe_blacklist, pos);
+}
+
+static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
+{
+ struct kprobe_blacklist_entry *ent =
+ list_entry(v, struct kprobe_blacklist_entry, list);
+
+ seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
+ (void *)ent->end_addr, (void *)ent->start_addr);
+ return 0;
+}
+
+static const struct seq_operations kprobe_blacklist_seq_ops = {
+ .start = kprobe_blacklist_seq_start,
+ .next = kprobe_blacklist_seq_next,
+ .stop = kprobe_seq_stop, /* Reuse void function */
+ .show = kprobe_blacklist_seq_show,
+};
+
+static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &kprobe_blacklist_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobe_blacklist_ops = {
+ .open = kprobe_blacklist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void arm_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are armed, just return */
+ if (!kprobes_all_disarmed)
+ goto already_enabled;
+
+ /*
+ * optimize_kprobe() called by arm_kprobe() checks
+ * kprobes_all_disarmed, so set kprobes_all_disarmed before
+ * arm_kprobe.
+ */
+ kprobes_all_disarmed = false;
+ /* Arming kprobes doesn't optimize kprobe itself */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, head, hlist)
+ if (!kprobe_disabled(p))
+ arm_kprobe(p);
+ }
+
+ printk(KERN_INFO "Kprobes globally enabled\n");
+
+already_enabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+static void disarm_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already disarmed, just return */
+ if (kprobes_all_disarmed) {
+ mutex_unlock(&kprobe_mutex);
+ return;
+ }
+
+ kprobes_all_disarmed = true;
+ printk(KERN_INFO "Kprobes globally disabled\n");
+
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, head, hlist) {
+ if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
+ disarm_kprobe(p, false);
+ }
+ }
+ mutex_unlock(&kprobe_mutex);
+
+ /* Wait for disarming all kprobes by optimizer */
+ wait_for_kprobe_optimizer();
+}
+
+/*
+ * XXX: The debugfs bool file interface doesn't allow for callbacks
+ * when the bool state is switched. We can reuse that facility when
+ * available
+ */
+static ssize_t read_enabled_file_bool(struct file *file,
+ char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[3];
+
+ if (!kprobes_all_disarmed)
+ buf[0] = '1';
+ else
+ buf[0] = '0';
+ buf[1] = '\n';
+ buf[2] = 0x00;
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t write_enabled_file_bool(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ size_t buf_size;
+
+ buf_size = min(count, (sizeof(buf)-1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ switch (buf[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ arm_all_kprobes();
+ break;
+ case 'n':
+ case 'N':
+ case '0':
+ disarm_all_kprobes();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return count;
+}
+
+static const struct file_operations fops_kp = {
+ .read = read_enabled_file_bool,
+ .write = write_enabled_file_bool,
+ .llseek = default_llseek,
+};
+
+static int __init debugfs_kprobe_init(void)
+{
+ struct dentry *dir, *file;
+ unsigned int value = 1;
+
+ dir = debugfs_create_dir("kprobes", NULL);
+ if (!dir)
+ return -ENOMEM;
+
+ file = debugfs_create_file("list", 0444, dir, NULL,
+ &debugfs_kprobes_operations);
+ if (!file)
+ goto error;
+
+ file = debugfs_create_file("enabled", 0600, dir,
+ &value, &fops_kp);
+ if (!file)
+ goto error;
+
+ file = debugfs_create_file("blacklist", 0444, dir, NULL,
+ &debugfs_kprobe_blacklist_ops);
+ if (!file)
+ goto error;
+
+ return 0;
+
+error:
+ debugfs_remove(dir);
+ return -ENOMEM;
+}
+
+late_initcall(debugfs_kprobe_init);
+#endif /* CONFIG_DEBUG_FS */
+
+module_init(init_kprobes);
+
+/* defined in arch/.../kernel/kprobes.c */
+EXPORT_SYMBOL_GPL(jprobe_return);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
new file mode 100644
index 000000000..6683ccef9
--- /dev/null
+++ b/kernel/ksysfs.c
@@ -0,0 +1,243 @@
+/*
+ * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
+ * are not related to any other subsystem
+ *
+ * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
+ *
+ * This file is release under the GPLv2
+ *
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kexec.h>
+#include <linux/profile.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/capability.h>
+#include <linux/compiler.h>
+
+#include <linux/rcupdate.h> /* rcu_expedited */
+
+#define KERNEL_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+/* current uevent sequence number */
+static ssize_t uevent_seqnum_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+}
+KERNEL_ATTR_RO(uevent_seqnum);
+
+#ifdef CONFIG_UEVENT_HELPER
+/* uevent helper program, used during early boot */
+static ssize_t uevent_helper_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", uevent_helper);
+}
+static ssize_t uevent_helper_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (count+1 > UEVENT_HELPER_PATH_LEN)
+ return -ENOENT;
+ memcpy(uevent_helper, buf, count);
+ uevent_helper[count] = '\0';
+ if (count && uevent_helper[count-1] == '\n')
+ uevent_helper[count-1] = '\0';
+ return count;
+}
+KERNEL_ATTR_RW(uevent_helper);
+#endif
+
+#ifdef CONFIG_PROFILING
+static ssize_t profiling_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", prof_on);
+}
+static ssize_t profiling_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret;
+
+ if (prof_on)
+ return -EEXIST;
+ /*
+ * This eventually calls into get_option() which
+ * has a ton of callers and is not const. It is
+ * easiest to cast it away here.
+ */
+ profile_setup((char *)buf);
+ ret = profile_init();
+ if (ret)
+ return ret;
+ ret = create_proc_profile();
+ if (ret)
+ return ret;
+ return count;
+}
+KERNEL_ATTR_RW(profiling);
+#endif
+
+#ifdef CONFIG_KEXEC
+static ssize_t kexec_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", !!kexec_image);
+}
+KERNEL_ATTR_RO(kexec_loaded);
+
+static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", !!kexec_crash_image);
+}
+KERNEL_ATTR_RO(kexec_crash_loaded);
+
+static ssize_t kexec_crash_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%zu\n", crash_get_memory_size());
+}
+static ssize_t kexec_crash_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long cnt;
+ int ret;
+
+ if (kstrtoul(buf, 0, &cnt))
+ return -EINVAL;
+
+ ret = crash_shrink_memory(cnt);
+ return ret < 0 ? ret : count;
+}
+KERNEL_ATTR_RW(kexec_crash_size);
+
+static ssize_t vmcoreinfo_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lx %x\n",
+ paddr_vmcoreinfo_note(),
+ (unsigned int)sizeof(vmcoreinfo_note));
+}
+KERNEL_ATTR_RO(vmcoreinfo);
+
+#endif /* CONFIG_KEXEC */
+
+/* whether file capabilities are enabled */
+static ssize_t fscaps_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", file_caps_enabled);
+}
+KERNEL_ATTR_RO(fscaps);
+
+int rcu_expedited;
+static ssize_t rcu_expedited_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", rcu_expedited);
+}
+static ssize_t rcu_expedited_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (kstrtoint(buf, 0, &rcu_expedited))
+ return -EINVAL;
+
+ return count;
+}
+KERNEL_ATTR_RW(rcu_expedited);
+
+/*
+ * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
+ */
+extern const void __start_notes __weak;
+extern const void __stop_notes __weak;
+#define notes_size (&__stop_notes - &__start_notes)
+
+static ssize_t notes_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, &__start_notes + off, count);
+ return count;
+}
+
+static struct bin_attribute notes_attr = {
+ .attr = {
+ .name = "notes",
+ .mode = S_IRUGO,
+ },
+ .read = &notes_read,
+};
+
+struct kobject *kernel_kobj;
+EXPORT_SYMBOL_GPL(kernel_kobj);
+
+static struct attribute * kernel_attrs[] = {
+ &fscaps_attr.attr,
+ &uevent_seqnum_attr.attr,
+#ifdef CONFIG_UEVENT_HELPER
+ &uevent_helper_attr.attr,
+#endif
+#ifdef CONFIG_PROFILING
+ &profiling_attr.attr,
+#endif
+#ifdef CONFIG_KEXEC
+ &kexec_loaded_attr.attr,
+ &kexec_crash_loaded_attr.attr,
+ &kexec_crash_size_attr.attr,
+ &vmcoreinfo_attr.attr,
+#endif
+ &rcu_expedited_attr.attr,
+ NULL
+};
+
+static struct attribute_group kernel_attr_group = {
+ .attrs = kernel_attrs,
+};
+
+static int __init ksysfs_init(void)
+{
+ int error;
+
+ kernel_kobj = kobject_create_and_add("kernel", NULL);
+ if (!kernel_kobj) {
+ error = -ENOMEM;
+ goto exit;
+ }
+ error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
+ if (error)
+ goto kset_exit;
+
+ if (notes_size > 0) {
+ notes_attr.size = notes_size;
+ error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
+ if (error)
+ goto group_exit;
+ }
+
+ return 0;
+
+group_exit:
+ sysfs_remove_group(kernel_kobj, &kernel_attr_group);
+kset_exit:
+ kobject_put(kernel_kobj);
+exit:
+ return error;
+}
+
+core_initcall(ksysfs_init);
diff --git a/kernel/kthread.c b/kernel/kthread.c
new file mode 100644
index 000000000..c4237f12c
--- /dev/null
+++ b/kernel/kthread.c
@@ -0,0 +1,692 @@
+/* Kernel thread helper functions.
+ * Copyright (C) 2004 IBM Corporation, Rusty Russell.
+ *
+ * Creation is done via kthreadd, so that we get a clean environment
+ * even if we're invoked from userspace (think modprobe, hotplug cpu,
+ * etc.).
+ */
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/cpuset.h>
+#include <linux/unistd.h>
+#include <linux/file.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/freezer.h>
+#include <linux/ptrace.h>
+#include <linux/uaccess.h>
+#include <trace/events/sched.h>
+
+static DEFINE_SPINLOCK(kthread_create_lock);
+static LIST_HEAD(kthread_create_list);
+struct task_struct *kthreadd_task;
+
+struct kthread_create_info
+{
+ /* Information passed to kthread() from kthreadd. */
+ int (*threadfn)(void *data);
+ void *data;
+ int node;
+
+ /* Result passed back to kthread_create() from kthreadd. */
+ struct task_struct *result;
+ struct completion *done;
+
+ struct list_head list;
+};
+
+struct kthread {
+ unsigned long flags;
+ unsigned int cpu;
+ void *data;
+ struct completion parked;
+ struct completion exited;
+};
+
+enum KTHREAD_BITS {
+ KTHREAD_IS_PER_CPU = 0,
+ KTHREAD_SHOULD_STOP,
+ KTHREAD_SHOULD_PARK,
+ KTHREAD_IS_PARKED,
+};
+
+#define __to_kthread(vfork) \
+ container_of(vfork, struct kthread, exited)
+
+static inline struct kthread *to_kthread(struct task_struct *k)
+{
+ return __to_kthread(k->vfork_done);
+}
+
+static struct kthread *to_live_kthread(struct task_struct *k)
+{
+ struct completion *vfork = ACCESS_ONCE(k->vfork_done);
+ if (likely(vfork))
+ return __to_kthread(vfork);
+ return NULL;
+}
+
+/**
+ * kthread_should_stop - should this kthread return now?
+ *
+ * When someone calls kthread_stop() on your kthread, it will be woken
+ * and this will return true. You should then return, and your return
+ * value will be passed through to kthread_stop().
+ */
+bool kthread_should_stop(void)
+{
+ return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
+}
+EXPORT_SYMBOL(kthread_should_stop);
+
+/**
+ * kthread_should_park - should this kthread park now?
+ *
+ * When someone calls kthread_park() on your kthread, it will be woken
+ * and this will return true. You should then do the necessary
+ * cleanup and call kthread_parkme()
+ *
+ * Similar to kthread_should_stop(), but this keeps the thread alive
+ * and in a park position. kthread_unpark() "restarts" the thread and
+ * calls the thread function again.
+ */
+bool kthread_should_park(void)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+}
+
+/**
+ * kthread_freezable_should_stop - should this freezable kthread return now?
+ * @was_frozen: optional out parameter, indicates whether %current was frozen
+ *
+ * kthread_should_stop() for freezable kthreads, which will enter
+ * refrigerator if necessary. This function is safe from kthread_stop() /
+ * freezer deadlock and freezable kthreads should use this function instead
+ * of calling try_to_freeze() directly.
+ */
+bool kthread_freezable_should_stop(bool *was_frozen)
+{
+ bool frozen = false;
+
+ might_sleep();
+
+ if (unlikely(freezing(current)))
+ frozen = __refrigerator(true);
+
+ if (was_frozen)
+ *was_frozen = frozen;
+
+ return kthread_should_stop();
+}
+EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
+
+/**
+ * kthread_data - return data value specified on kthread creation
+ * @task: kthread task in question
+ *
+ * Return the data value specified when kthread @task was created.
+ * The caller is responsible for ensuring the validity of @task when
+ * calling this function.
+ */
+void *kthread_data(struct task_struct *task)
+{
+ return to_kthread(task)->data;
+}
+
+/**
+ * probe_kthread_data - speculative version of kthread_data()
+ * @task: possible kthread task in question
+ *
+ * @task could be a kthread task. Return the data value specified when it
+ * was created if accessible. If @task isn't a kthread task or its data is
+ * inaccessible for any reason, %NULL is returned. This function requires
+ * that @task itself is safe to dereference.
+ */
+void *probe_kthread_data(struct task_struct *task)
+{
+ struct kthread *kthread = to_kthread(task);
+ void *data = NULL;
+
+ probe_kernel_read(&data, &kthread->data, sizeof(data));
+ return data;
+}
+
+static void __kthread_parkme(struct kthread *self)
+{
+ __set_current_state(TASK_PARKED);
+ while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
+ if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
+ complete(&self->parked);
+ schedule();
+ __set_current_state(TASK_PARKED);
+ }
+ clear_bit(KTHREAD_IS_PARKED, &self->flags);
+ __set_current_state(TASK_RUNNING);
+}
+
+void kthread_parkme(void)
+{
+ __kthread_parkme(to_kthread(current));
+}
+
+static int kthread(void *_create)
+{
+ /* Copy data: it's on kthread's stack */
+ struct kthread_create_info *create = _create;
+ int (*threadfn)(void *data) = create->threadfn;
+ void *data = create->data;
+ struct completion *done;
+ struct kthread self;
+ int ret;
+
+ self.flags = 0;
+ self.data = data;
+ init_completion(&self.exited);
+ init_completion(&self.parked);
+ current->vfork_done = &self.exited;
+
+ /* If user was SIGKILLed, I release the structure. */
+ done = xchg(&create->done, NULL);
+ if (!done) {
+ kfree(create);
+ do_exit(-EINTR);
+ }
+ /* OK, tell user we're spawned, wait for stop or wakeup */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ create->result = current;
+ complete(done);
+ schedule();
+
+ ret = -EINTR;
+
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+ __kthread_parkme(&self);
+ ret = threadfn(data);
+ }
+ /* we can't just return, we must preserve "self" on stack */
+ do_exit(ret);
+}
+
+/* called from do_fork() to get node information for about to be created task */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+ if (tsk == kthreadd_task)
+ return tsk->pref_node_fork;
+#endif
+ return NUMA_NO_NODE;
+}
+
+static void create_kthread(struct kthread_create_info *create)
+{
+ int pid;
+
+#ifdef CONFIG_NUMA
+ current->pref_node_fork = create->node;
+#endif
+ /* We want our own signal handler (we take no signals by default). */
+ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ if (pid < 0) {
+ /* If user was SIGKILLed, I release the structure. */
+ struct completion *done = xchg(&create->done, NULL);
+
+ if (!done) {
+ kfree(create);
+ return;
+ }
+ create->result = ERR_PTR(pid);
+ complete(done);
+ }
+}
+
+/**
+ * kthread_create_on_node - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @node: memory node number.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread. The thread will be stopped: use wake_up_process() to start
+ * it. See also kthread_run().
+ *
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn() can either call do_exit() directly if it is a
+ * standalone thread for which no one will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called). The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
+ */
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+ void *data, int node,
+ const char namefmt[],
+ ...)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct task_struct *task;
+ struct kthread_create_info *create = kmalloc(sizeof(*create),
+ GFP_KERNEL | ___GFP_TOI_NOTRACK);
+
+ if (!create)
+ return ERR_PTR(-ENOMEM);
+ create->threadfn = threadfn;
+ create->data = data;
+ create->node = node;
+ create->done = &done;
+
+ spin_lock(&kthread_create_lock);
+ list_add_tail(&create->list, &kthread_create_list);
+ spin_unlock(&kthread_create_lock);
+
+ wake_up_process(kthreadd_task);
+ /*
+ * Wait for completion in killable state, for I might be chosen by
+ * the OOM killer while kthreadd is trying to allocate memory for
+ * new kernel thread.
+ */
+ if (unlikely(wait_for_completion_killable(&done))) {
+ /*
+ * If I was SIGKILLed before kthreadd (or new kernel thread)
+ * calls complete(), leave the cleanup of this structure to
+ * that thread.
+ */
+ if (xchg(&create->done, NULL))
+ return ERR_PTR(-EINTR);
+ /*
+ * kthreadd (or new kernel thread) will call complete()
+ * shortly.
+ */
+ wait_for_completion(&done);
+ }
+ task = create->result;
+ if (!IS_ERR(task)) {
+ static const struct sched_param param = { .sched_priority = 0 };
+ va_list args;
+
+ va_start(args, namefmt);
+ vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+ va_end(args);
+ /*
+ * root may have changed our (kthreadd's) priority or CPU mask.
+ * The kernel thread should not inherit these properties.
+ */
+ sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(task, cpu_all_mask);
+ }
+ kfree(create);
+ return task;
+}
+EXPORT_SYMBOL(kthread_create_on_node);
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(p, state)) {
+ WARN_ON(1);
+ return;
+ }
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
+ p->flags |= PF_NO_SETAFFINITY;
+}
+
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @p: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create()).
+ */
+void kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+ __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(kthread_bind);
+
+/**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu on which the thread should be bound,
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
+ *
+ * Description: This helper function creates and names a kernel thread
+ * The thread will be woken and put into park mode.
+ */
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data, unsigned int cpu,
+ const char *namefmt)
+{
+ struct task_struct *p;
+
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
+ cpu);
+ if (IS_ERR(p))
+ return p;
+ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ to_kthread(p)->cpu = cpu;
+ /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
+ kthread_park(p);
+ return p;
+}
+
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+{
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+ wake_up_state(k, TASK_PARKED);
+ }
+}
+
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false, wakes it, and
+ * waits for it to return. If the thread is marked percpu then its
+ * bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
+{
+ struct kthread *kthread = to_live_kthread(k);
+
+ if (kthread)
+ __kthread_unpark(k, kthread);
+}
+
+/**
+ * kthread_park - park a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return true, wakes it, and
+ * waits for it to return. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will park without
+ * calling threadfn().
+ *
+ * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
+ * If called by the kthread itself just the park bit is set.
+ */
+int kthread_park(struct task_struct *k)
+{
+ struct kthread *kthread = to_live_kthread(k);
+ int ret = -ENOSYS;
+
+ if (kthread) {
+ if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
+ }
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+/**
+ * kthread_stop - stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_stop() for @k to return true, wakes it, and
+ * waits for it to exit. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will exit without
+ * calling threadfn().
+ *
+ * If threadfn() may call do_exit() itself, the caller must ensure
+ * task_struct can't go away.
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+ * was never called.
+ */
+int kthread_stop(struct task_struct *k)
+{
+ struct kthread *kthread;
+ int ret;
+
+ trace_sched_kthread_stop(k);
+
+ get_task_struct(k);
+ kthread = to_live_kthread(k);
+ if (kthread) {
+ set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+ __kthread_unpark(k, kthread);
+ wake_up_process(k);
+ wait_for_completion(&kthread->exited);
+ }
+ ret = k->exit_code;
+ put_task_struct(k);
+
+ trace_sched_kthread_stop_ret(ret);
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop);
+
+int kthreadd(void *unused)
+{
+ struct task_struct *tsk = current;
+
+ /* Setup a clean context for our children to inherit. */
+ set_task_comm(tsk, "kthreadd");
+ ignore_signals(tsk);
+ set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_mems_allowed(node_states[N_MEMORY]);
+
+ current->flags |= PF_NOFREEZE;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&kthread_create_list))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ spin_lock(&kthread_create_lock);
+ while (!list_empty(&kthread_create_list)) {
+ struct kthread_create_info *create;
+
+ create = list_entry(kthread_create_list.next,
+ struct kthread_create_info, list);
+ list_del_init(&create->list);
+ spin_unlock(&kthread_create_lock);
+
+ create_kthread(create);
+
+ spin_lock(&kthread_create_lock);
+ }
+ spin_unlock(&kthread_create_lock);
+ }
+
+ return 0;
+}
+
+void __init_kthread_worker(struct kthread_worker *worker,
+ const char *name,
+ struct lock_class_key *key)
+{
+ spin_lock_init(&worker->lock);
+ lockdep_set_class_and_name(&worker->lock, key, name);
+ INIT_LIST_HEAD(&worker->work_list);
+ worker->task = NULL;
+}
+EXPORT_SYMBOL_GPL(__init_kthread_worker);
+
+/**
+ * kthread_worker_fn - kthread function to process kthread_worker
+ * @worker_ptr: pointer to initialized kthread_worker
+ *
+ * This function can be used as @threadfn to kthread_create() or
+ * kthread_run() with @worker_ptr argument pointing to an initialized
+ * kthread_worker. The started kthread will process work_list until
+ * the it is stopped with kthread_stop(). A kthread can also call
+ * this function directly after extra initialization.
+ *
+ * Different kthreads can be used for the same kthread_worker as long
+ * as there's only one kthread attached to it at any given time. A
+ * kthread_worker without an attached kthread simply collects queued
+ * kthread_works.
+ */
+int kthread_worker_fn(void *worker_ptr)
+{
+ struct kthread_worker *worker = worker_ptr;
+ struct kthread_work *work;
+
+ WARN_ON(worker->task);
+ worker->task = current;
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irq(&worker->lock);
+ worker->task = NULL;
+ spin_unlock_irq(&worker->lock);
+ return 0;
+ }
+
+ work = NULL;
+ spin_lock_irq(&worker->lock);
+ if (!list_empty(&worker->work_list)) {
+ work = list_first_entry(&worker->work_list,
+ struct kthread_work, node);
+ list_del_init(&work->node);
+ }
+ worker->current_work = work;
+ spin_unlock_irq(&worker->lock);
+
+ if (work) {
+ __set_current_state(TASK_RUNNING);
+ work->func(work);
+ } else if (!freezing(current))
+ schedule();
+
+ try_to_freeze();
+ goto repeat;
+}
+EXPORT_SYMBOL_GPL(kthread_worker_fn);
+
+/* insert @work before @pos in @worker */
+static void insert_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work,
+ struct list_head *pos)
+{
+ lockdep_assert_held(&worker->lock);
+
+ list_add_tail(&work->node, pos);
+ work->worker = worker;
+ if (!worker->current_work && likely(worker->task))
+ wake_up_process(worker->task);
+}
+
+/**
+ * queue_kthread_work - queue a kthread_work
+ * @worker: target kthread_worker
+ * @work: kthread_work to queue
+ *
+ * Queue @work to work processor @task for async execution. @task
+ * must have been created with kthread_worker_create(). Returns %true
+ * if @work was successfully queued, %false if it was already pending.
+ */
+bool queue_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ bool ret = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ if (list_empty(&work->node)) {
+ insert_kthread_work(worker, work, &worker->work_list);
+ ret = true;
+ }
+ spin_unlock_irqrestore(&worker->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_kthread_work);
+
+struct kthread_flush_work {
+ struct kthread_work work;
+ struct completion done;
+};
+
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+ struct kthread_flush_work *fwork =
+ container_of(work, struct kthread_flush_work, work);
+ complete(&fwork->done);
+}
+
+/**
+ * flush_kthread_work - flush a kthread_work
+ * @work: work to flush
+ *
+ * If @work is queued or executing, wait for it to finish execution.
+ */
+void flush_kthread_work(struct kthread_work *work)
+{
+ struct kthread_flush_work fwork = {
+ KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+ COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+ };
+ struct kthread_worker *worker;
+ bool noop = false;
+
+retry:
+ worker = work->worker;
+ if (!worker)
+ return;
+
+ spin_lock_irq(&worker->lock);
+ if (work->worker != worker) {
+ spin_unlock_irq(&worker->lock);
+ goto retry;
+ }
+
+ if (!list_empty(&work->node))
+ insert_kthread_work(worker, &fwork.work, work->node.next);
+ else if (worker->current_work == work)
+ insert_kthread_work(worker, &fwork.work, worker->work_list.next);
+ else
+ noop = true;
+
+ spin_unlock_irq(&worker->lock);
+
+ if (!noop)
+ wait_for_completion(&fwork.done);
+}
+EXPORT_SYMBOL_GPL(flush_kthread_work);
+
+/**
+ * flush_kthread_worker - flush all current works on a kthread_worker
+ * @worker: worker to flush
+ *
+ * Wait until all currently executing or pending works on @worker are
+ * finished.
+ */
+void flush_kthread_worker(struct kthread_worker *worker)
+{
+ struct kthread_flush_work fwork = {
+ KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+ COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+ };
+
+ queue_kthread_work(worker, &fwork.work);
+ wait_for_completion(&fwork.done);
+}
+EXPORT_SYMBOL_GPL(flush_kthread_worker);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 000000000..a02812743
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,292 @@
+/*
+ * latencytop.c: Latency display infrastructure
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
+ * used by the "latencytop" userspace tool. The latency that is tracked is not
+ * the 'traditional' interrupt latency (which is primarily caused by something
+ * else consuming CPU), but instead, it is the latency an application encounters
+ * because the kernel sleeps on its behalf for various reasons.
+ *
+ * This code tracks 2 levels of statistics:
+ * 1) System level latency
+ * 2) Per process latency
+ *
+ * The latency is stored in fixed sized data structures in an accumulated form;
+ * if the "same" latency cause is hit twice, this will be tracked as one entry
+ * in the data structure. Both the count, total accumulated latency and maximum
+ * latency are tracked in this data structure. When the fixed size structure is
+ * full, no new causes are tracked until the buffer is flushed by writing to
+ * the /proc file; the userspace tool does this on a regular basis.
+ *
+ * A latency cause is identified by a stringified backtrace at the point that
+ * the scheduler gets invoked. The userland tool will use this string to
+ * identify the cause of the latency in human readable form.
+ *
+ * The information is exported via /proc/latency_stats and /proc/<pid>/latency.
+ * These files look like this:
+ *
+ * Latency Top version : v0.1
+ * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
+ * | | | |
+ * | | | +----> the stringified backtrace
+ * | | +---------> The maximum latency for this entry in microseconds
+ * | +--------------> The accumulated latency for this entry (microseconds)
+ * +-------------------> The number of times this entry is hit
+ *
+ * (note: the average latency is the accumulated latency divided by the number
+ * of times)
+ */
+
+#include <linux/latencytop.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/stacktrace.h>
+
+static DEFINE_RAW_SPINLOCK(latency_lock);
+
+#define MAXLR 128
+static struct latency_record latency_record[MAXLR];
+
+int latencytop_enabled;
+
+void clear_all_latency_tracing(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (!latencytop_enabled)
+ return;
+
+ raw_spin_lock_irqsave(&latency_lock, flags);
+ memset(&p->latency_record, 0, sizeof(p->latency_record));
+ p->latency_record_count = 0;
+ raw_spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void clear_global_latency_tracing(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&latency_lock, flags);
+ memset(&latency_record, 0, sizeof(latency_record));
+ raw_spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static void __sched
+account_global_scheduler_latency(struct task_struct *tsk,
+ struct latency_record *lat)
+{
+ int firstnonnull = MAXLR + 1;
+ int i;
+
+ if (!latencytop_enabled)
+ return;
+
+ /* skip kernel threads for now */
+ if (!tsk->mm)
+ return;
+
+ for (i = 0; i < MAXLR; i++) {
+ int q, same = 1;
+
+ /* Nothing stored: */
+ if (!latency_record[i].backtrace[0]) {
+ if (firstnonnull > i)
+ firstnonnull = i;
+ continue;
+ }
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ unsigned long record = lat->backtrace[q];
+
+ if (latency_record[i].backtrace[q] != record) {
+ same = 0;
+ break;
+ }
+
+ /* 0 and ULONG_MAX entries mean end of backtrace: */
+ if (record == 0 || record == ULONG_MAX)
+ break;
+ }
+ if (same) {
+ latency_record[i].count++;
+ latency_record[i].time += lat->time;
+ if (lat->time > latency_record[i].max)
+ latency_record[i].max = lat->time;
+ return;
+ }
+ }
+
+ i = firstnonnull;
+ if (i >= MAXLR - 1)
+ return;
+
+ /* Allocted a new one: */
+ memcpy(&latency_record[i], lat, sizeof(struct latency_record));
+}
+
+/*
+ * Iterator to store a backtrace into a latency record entry
+ */
+static inline void store_stacktrace(struct task_struct *tsk,
+ struct latency_record *lat)
+{
+ struct stack_trace trace;
+
+ memset(&trace, 0, sizeof(trace));
+ trace.max_entries = LT_BACKTRACEDEPTH;
+ trace.entries = &lat->backtrace[0];
+ save_stack_trace_tsk(tsk, &trace);
+}
+
+/**
+ * __account_scheduler_latency - record an occurred latency
+ * @tsk - the task struct of the task hitting the latency
+ * @usecs - the duration of the latency in microseconds
+ * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
+ *
+ * This function is the main entry point for recording latency entries
+ * as called by the scheduler.
+ *
+ * This function has a few special cases to deal with normal 'non-latency'
+ * sleeps: specifically, interruptible sleep longer than 5 msec is skipped
+ * since this usually is caused by waiting for events via select() and co.
+ *
+ * Negative latencies (caused by time going backwards) are also explicitly
+ * skipped.
+ */
+void __sched
+__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+{
+ unsigned long flags;
+ int i, q;
+ struct latency_record lat;
+
+ /* Long interruptible waits are generally user requested... */
+ if (inter && usecs > 5000)
+ return;
+
+ /* Negative sleeps are time going backwards */
+ /* Zero-time sleeps are non-interesting */
+ if (usecs <= 0)
+ return;
+
+ memset(&lat, 0, sizeof(lat));
+ lat.count = 1;
+ lat.time = usecs;
+ lat.max = usecs;
+ store_stacktrace(tsk, &lat);
+
+ raw_spin_lock_irqsave(&latency_lock, flags);
+
+ account_global_scheduler_latency(tsk, &lat);
+
+ for (i = 0; i < tsk->latency_record_count; i++) {
+ struct latency_record *mylat;
+ int same = 1;
+
+ mylat = &tsk->latency_record[i];
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ unsigned long record = lat.backtrace[q];
+
+ if (mylat->backtrace[q] != record) {
+ same = 0;
+ break;
+ }
+
+ /* 0 and ULONG_MAX entries mean end of backtrace: */
+ if (record == 0 || record == ULONG_MAX)
+ break;
+ }
+ if (same) {
+ mylat->count++;
+ mylat->time += lat.time;
+ if (lat.time > mylat->max)
+ mylat->max = lat.time;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * short term hack; if we're > 32 we stop; future we recycle:
+ */
+ if (tsk->latency_record_count >= LT_SAVECOUNT)
+ goto out_unlock;
+
+ /* Allocated a new one: */
+ i = tsk->latency_record_count++;
+ memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+static int lstats_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ seq_puts(m, "Latency Top version : v0.1\n");
+
+ for (i = 0; i < MAXLR; i++) {
+ struct latency_record *lr = &latency_record[i];
+
+ if (lr->backtrace[0]) {
+ int q;
+ seq_printf(m, "%i %lu %lu",
+ lr->count, lr->time, lr->max);
+ for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ unsigned long bt = lr->backtrace[q];
+ if (!bt)
+ break;
+ if (bt == ULONG_MAX)
+ break;
+ seq_printf(m, " %ps", (void *)bt);
+ }
+ seq_puts(m, "\n");
+ }
+ }
+ return 0;
+}
+
+static ssize_t
+lstats_write(struct file *file, const char __user *buf, size_t count,
+ loff_t *offs)
+{
+ clear_global_latency_tracing();
+
+ return count;
+}
+
+static int lstats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, lstats_show, NULL);
+}
+
+static const struct file_operations lstats_fops = {
+ .open = lstats_open,
+ .read = seq_read,
+ .write = lstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_lstats_procfs(void)
+{
+ proc_create("latency_stats", 0644, NULL, &lstats_fops);
+ return 0;
+}
+device_initcall(init_lstats_procfs);
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
new file mode 100644
index 000000000..045022557
--- /dev/null
+++ b/kernel/livepatch/Kconfig
@@ -0,0 +1,18 @@
+config HAVE_LIVEPATCH
+ bool
+ help
+ Arch supports kernel live patching
+
+config LIVEPATCH
+ bool "Kernel Live Patching"
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on MODULES
+ depends on SYSFS
+ depends on KALLSYMS_ALL
+ depends on HAVE_LIVEPATCH
+ help
+ Say Y here if you want to support kernel live patching.
+ This option has no runtime impact until a kernel "patch"
+ module uses the interface provided by this option to register
+ a patch, causing calls to patched functions to be redirected
+ to new function code contained in the patch module.
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
new file mode 100644
index 000000000..e8780c090
--- /dev/null
+++ b/kernel/livepatch/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+
+livepatch-objs := core.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
new file mode 100644
index 000000000..9ec555732
--- /dev/null
+++ b/kernel/livepatch/core.c
@@ -0,0 +1,1011 @@
+/*
+ * core.c - Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/ftrace.h>
+#include <linux/list.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+
+/**
+ * struct klp_ops - structure for tracking registered ftrace ops structs
+ *
+ * A single ftrace_ops is shared between all enabled replacement functions
+ * (klp_func structs) which have the same old_addr. This allows the switch
+ * between function versions to happen instantaneously by updating the klp_ops
+ * struct's func_stack list. The winner is the klp_func at the top of the
+ * func_stack (front of the list).
+ *
+ * @node: node for the global klp_ops list
+ * @func_stack: list head for the stack of klp_func's (active func is on top)
+ * @fops: registered ftrace ops struct
+ */
+struct klp_ops {
+ struct list_head node;
+ struct list_head func_stack;
+ struct ftrace_ops fops;
+};
+
+/*
+ * The klp_mutex protects the global lists and state transitions of any
+ * structure reachable from them. References to any structure must be obtained
+ * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
+ * ensure it gets consistent data).
+ */
+static DEFINE_MUTEX(klp_mutex);
+
+static LIST_HEAD(klp_patches);
+static LIST_HEAD(klp_ops);
+
+static struct kobject *klp_root_kobj;
+
+static struct klp_ops *klp_find_ops(unsigned long old_addr)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ list_for_each_entry(ops, &klp_ops, node) {
+ func = list_first_entry(&ops->func_stack, struct klp_func,
+ stack_node);
+ if (func->old_addr == old_addr)
+ return ops;
+ }
+
+ return NULL;
+}
+
+static bool klp_is_module(struct klp_object *obj)
+{
+ return obj->name;
+}
+
+static bool klp_is_object_loaded(struct klp_object *obj)
+{
+ return !obj->name || obj->mod;
+}
+
+/* sets obj->mod if object is not vmlinux and module is found */
+static void klp_find_object_module(struct klp_object *obj)
+{
+ struct module *mod;
+
+ if (!klp_is_module(obj))
+ return;
+
+ mutex_lock(&module_mutex);
+ /*
+ * We do not want to block removal of patched modules and therefore
+ * we do not take a reference here. The patches are removed by
+ * a going module handler instead.
+ */
+ mod = find_module(obj->name);
+ /*
+ * Do not mess work of the module coming and going notifiers.
+ * Note that the patch might still be needed before the going handler
+ * is called. Module functions can be called even in the GOING state
+ * until mod->exit() finishes. This is especially important for
+ * patches that modify semantic of the functions.
+ */
+ if (mod && mod->klp_alive)
+ obj->mod = mod;
+
+ mutex_unlock(&module_mutex);
+}
+
+/* klp_mutex must be held by caller */
+static bool klp_is_patch_registered(struct klp_patch *patch)
+{
+ struct klp_patch *mypatch;
+
+ list_for_each_entry(mypatch, &klp_patches, list)
+ if (mypatch == patch)
+ return true;
+
+ return false;
+}
+
+static bool klp_initialized(void)
+{
+ return klp_root_kobj;
+}
+
+struct klp_find_arg {
+ const char *objname;
+ const char *name;
+ unsigned long addr;
+ /*
+ * If count == 0, the symbol was not found. If count == 1, a unique
+ * match was found and addr is set. If count > 1, there is
+ * unresolvable ambiguity among "count" number of symbols with the same
+ * name in the same object.
+ */
+ unsigned long count;
+};
+
+static int klp_find_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if ((mod && !args->objname) || (!mod && args->objname))
+ return 0;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ if (args->objname && strcmp(args->objname, mod->name))
+ return 0;
+
+ /*
+ * args->addr might be overwritten if another match is found
+ * but klp_find_object_symbol() handles this and only returns the
+ * addr if count == 1.
+ */
+ args->addr = addr;
+ args->count++;
+
+ return 0;
+}
+
+static int klp_find_object_symbol(const char *objname, const char *name,
+ unsigned long *addr)
+{
+ struct klp_find_arg args = {
+ .objname = objname,
+ .name = name,
+ .addr = 0,
+ .count = 0
+ };
+
+ mutex_lock(&module_mutex);
+ kallsyms_on_each_symbol(klp_find_callback, &args);
+ mutex_unlock(&module_mutex);
+
+ if (args.count == 0)
+ pr_err("symbol '%s' not found in symbol table\n", name);
+ else if (args.count > 1)
+ pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
+ args.count, name, objname);
+ else {
+ *addr = args.addr;
+ return 0;
+ }
+
+ *addr = 0;
+ return -EINVAL;
+}
+
+struct klp_verify_args {
+ const char *name;
+ const unsigned long addr;
+};
+
+static int klp_verify_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_verify_args *args = data;
+
+ if (!mod &&
+ !strcmp(args->name, name) &&
+ args->addr == addr)
+ return 1;
+
+ return 0;
+}
+
+static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
+{
+ struct klp_verify_args args = {
+ .name = name,
+ .addr = addr,
+ };
+ int ret;
+
+ mutex_lock(&module_mutex);
+ ret = kallsyms_on_each_symbol(klp_verify_callback, &args);
+ mutex_unlock(&module_mutex);
+
+ if (!ret) {
+ pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
+ name, addr);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int klp_find_verify_func_addr(struct klp_object *obj,
+ struct klp_func *func)
+{
+ int ret;
+
+#if defined(CONFIG_RANDOMIZE_BASE)
+ /* KASLR is enabled, disregard old_addr from user */
+ func->old_addr = 0;
+#endif
+
+ if (!func->old_addr || klp_is_module(obj))
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ &func->old_addr);
+ else
+ ret = klp_verify_vmlinux_symbol(func->old_name,
+ func->old_addr);
+
+ return ret;
+}
+
+/*
+ * external symbols are located outside the parent object (where the parent
+ * object is either vmlinux or the kmod being patched).
+ */
+static int klp_find_external_symbol(struct module *pmod, const char *name,
+ unsigned long *addr)
+{
+ const struct kernel_symbol *sym;
+
+ /* first, check if it's an exported symbol */
+ preempt_disable();
+ sym = find_symbol(name, NULL, NULL, true, true);
+ if (sym) {
+ *addr = sym->value;
+ preempt_enable();
+ return 0;
+ }
+ preempt_enable();
+
+ /* otherwise check if it's in another .o within the patch module */
+ return klp_find_object_symbol(pmod->name, name, addr);
+}
+
+static int klp_write_object_relocations(struct module *pmod,
+ struct klp_object *obj)
+{
+ int ret;
+ struct klp_reloc *reloc;
+
+ if (WARN_ON(!klp_is_object_loaded(obj)))
+ return -EINVAL;
+
+ if (WARN_ON(!obj->relocs))
+ return -EINVAL;
+
+ for (reloc = obj->relocs; reloc->name; reloc++) {
+ if (!klp_is_module(obj)) {
+ ret = klp_verify_vmlinux_symbol(reloc->name,
+ reloc->val);
+ if (ret)
+ return ret;
+ } else {
+ /* module, reloc->val needs to be discovered */
+ if (reloc->external)
+ ret = klp_find_external_symbol(pmod,
+ reloc->name,
+ &reloc->val);
+ else
+ ret = klp_find_object_symbol(obj->mod->name,
+ reloc->name,
+ &reloc->val);
+ if (ret)
+ return ret;
+ }
+ ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
+ reloc->val + reloc->addend);
+ if (ret) {
+ pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
+ reloc->name, reloc->val, ret);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void notrace klp_ftrace_handler(unsigned long ip,
+ unsigned long parent_ip,
+ struct ftrace_ops *fops,
+ struct pt_regs *regs)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ ops = container_of(fops, struct klp_ops, fops);
+
+ rcu_read_lock();
+ func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
+ stack_node);
+ if (WARN_ON_ONCE(!func))
+ goto unlock;
+
+ klp_arch_set_pc(regs, (unsigned long)func->new_func);
+unlock:
+ rcu_read_unlock();
+}
+
+static void klp_disable_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+
+ WARN_ON(func->state != KLP_ENABLED);
+ WARN_ON(!func->old_addr);
+
+ ops = klp_find_ops(func->old_addr);
+ if (WARN_ON(!ops))
+ return;
+
+ if (list_is_singular(&ops->func_stack)) {
+ WARN_ON(unregister_ftrace_function(&ops->fops));
+ WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
+
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ } else {
+ list_del_rcu(&func->stack_node);
+ }
+
+ func->state = KLP_DISABLED;
+}
+
+static int klp_enable_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+ int ret;
+
+ if (WARN_ON(!func->old_addr))
+ return -EINVAL;
+
+ if (WARN_ON(func->state != KLP_DISABLED))
+ return -EINVAL;
+
+ ops = klp_find_ops(func->old_addr);
+ if (!ops) {
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ ops->fops.func = klp_ftrace_handler;
+ ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
+ FTRACE_OPS_FL_DYNAMIC |
+ FTRACE_OPS_FL_IPMODIFY;
+
+ list_add(&ops->node, &klp_ops);
+
+ INIT_LIST_HEAD(&ops->func_stack);
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+
+ ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
+ if (ret) {
+ pr_err("failed to set ftrace filter for function '%s' (%d)\n",
+ func->old_name, ret);
+ goto err;
+ }
+
+ ret = register_ftrace_function(&ops->fops);
+ if (ret) {
+ pr_err("failed to register ftrace handler for function '%s' (%d)\n",
+ func->old_name, ret);
+ ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
+ goto err;
+ }
+
+
+ } else {
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+ }
+
+ func->state = KLP_ENABLED;
+
+ return 0;
+
+err:
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ return ret;
+}
+
+static void klp_disable_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ for (func = obj->funcs; func->old_name; func++)
+ if (func->state == KLP_ENABLED)
+ klp_disable_func(func);
+
+ obj->state = KLP_DISABLED;
+}
+
+static int klp_enable_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (WARN_ON(obj->state != KLP_DISABLED))
+ return -EINVAL;
+
+ if (WARN_ON(!klp_is_object_loaded(obj)))
+ return -EINVAL;
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_enable_func(func);
+ if (ret) {
+ klp_disable_object(obj);
+ return ret;
+ }
+ }
+ obj->state = KLP_ENABLED;
+
+ return 0;
+}
+
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+
+ /* enforce stacking: only the last enabled patch can be disabled */
+ if (!list_is_last(&patch->list, &klp_patches) &&
+ list_next_entry(patch, list)->state == KLP_ENABLED)
+ return -EBUSY;
+
+ pr_notice("disabling patch '%s'\n", patch->mod->name);
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (obj->state == KLP_ENABLED)
+ klp_disable_object(obj);
+ }
+
+ patch->state = KLP_DISABLED;
+
+ return 0;
+}
+
+/**
+ * klp_disable_patch() - disables a registered patch
+ * @patch: The registered, enabled patch to be disabled
+ *
+ * Unregisters the patched functions from ftrace.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_disable_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (patch->state == KLP_DISABLED) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = __klp_disable_patch(patch);
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_disable_patch);
+
+static int __klp_enable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ if (WARN_ON(patch->state != KLP_DISABLED))
+ return -EINVAL;
+
+ /* enforce stacking: only the first disabled patch can be enabled */
+ if (patch->list.prev != &klp_patches &&
+ list_prev_entry(patch, list)->state == KLP_DISABLED)
+ return -EBUSY;
+
+ pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
+ add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+
+ pr_notice("enabling patch '%s'\n", patch->mod->name);
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+
+ ret = klp_enable_object(obj);
+ if (ret)
+ goto unregister;
+ }
+
+ patch->state = KLP_ENABLED;
+
+ return 0;
+
+unregister:
+ WARN_ON(__klp_disable_patch(patch));
+ return ret;
+}
+
+/**
+ * klp_enable_patch() - enables a registered patch
+ * @patch: The registered, disabled patch to be enabled
+ *
+ * Performs the needed symbol lookups and code relocations,
+ * then registers the patched functions with ftrace.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_enable_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = __klp_enable_patch(patch);
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_enable_patch);
+
+/*
+ * Sysfs Interface
+ *
+ * /sys/kernel/livepatch
+ * /sys/kernel/livepatch/<patch>
+ * /sys/kernel/livepatch/<patch>/enabled
+ * /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/<func>
+ */
+
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct klp_patch *patch;
+ int ret;
+ unsigned long val;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return -EINVAL;
+
+ if (val != KLP_DISABLED && val != KLP_ENABLED)
+ return -EINVAL;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+
+ mutex_lock(&klp_mutex);
+
+ if (val == patch->state) {
+ /* already in requested state */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (val == KLP_ENABLED) {
+ ret = __klp_enable_patch(patch);
+ if (ret)
+ goto err;
+ } else {
+ ret = __klp_disable_patch(patch);
+ if (ret)
+ goto err;
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return count;
+
+err:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
+
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
+}
+
+static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
+static struct attribute *klp_patch_attrs[] = {
+ &enabled_kobj_attr.attr,
+ NULL
+};
+
+static void klp_kobj_release_patch(struct kobject *kobj)
+{
+ /*
+ * Once we have a consistency model we'll need to module_put() the
+ * patch module here. See klp_register_patch() for more details.
+ */
+}
+
+static struct kobj_type klp_ktype_patch = {
+ .release = klp_kobj_release_patch,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_attrs = klp_patch_attrs,
+};
+
+static void klp_kobj_release_func(struct kobject *kobj)
+{
+}
+
+static struct kobj_type klp_ktype_func = {
+ .release = klp_kobj_release_func,
+ .sysfs_ops = &kobj_sysfs_ops,
+};
+
+/*
+ * Free all functions' kobjects in the array up to some limit. When limit is
+ * NULL, all kobjects are freed.
+ */
+static void klp_free_funcs_limited(struct klp_object *obj,
+ struct klp_func *limit)
+{
+ struct klp_func *func;
+
+ for (func = obj->funcs; func->old_name && func != limit; func++)
+ kobject_put(&func->kobj);
+}
+
+/* Clean up when a patched object is unloaded */
+static void klp_free_object_loaded(struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ obj->mod = NULL;
+
+ for (func = obj->funcs; func->old_name; func++)
+ func->old_addr = 0;
+}
+
+/*
+ * Free all objects' kobjects in the array up to some limit. When limit is
+ * NULL, all kobjects are freed.
+ */
+static void klp_free_objects_limited(struct klp_patch *patch,
+ struct klp_object *limit)
+{
+ struct klp_object *obj;
+
+ for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
+ klp_free_funcs_limited(obj, NULL);
+ kobject_put(obj->kobj);
+ }
+}
+
+static void klp_free_patch(struct klp_patch *patch)
+{
+ klp_free_objects_limited(patch, NULL);
+ if (!list_empty(&patch->list))
+ list_del(&patch->list);
+ kobject_put(&patch->kobj);
+}
+
+static int klp_init_func(struct klp_object *obj, struct klp_func *func)
+{
+ INIT_LIST_HEAD(&func->stack_node);
+ func->state = KLP_DISABLED;
+
+ return kobject_init_and_add(&func->kobj, &klp_ktype_func,
+ obj->kobj, "%s", func->old_name);
+}
+
+/* parts of the initialization that is done only when the object is loaded */
+static int klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (obj->relocs) {
+ ret = klp_write_object_relocations(patch->mod, obj);
+ if (ret)
+ return ret;
+ }
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_find_verify_func_addr(obj, func);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+ const char *name;
+
+ if (!obj->funcs)
+ return -EINVAL;
+
+ obj->state = KLP_DISABLED;
+ obj->mod = NULL;
+
+ klp_find_object_module(obj);
+
+ name = klp_is_module(obj) ? obj->name : "vmlinux";
+ obj->kobj = kobject_create_and_add(name, &patch->kobj);
+ if (!obj->kobj)
+ return -ENOMEM;
+
+ for (func = obj->funcs; func->old_name; func++) {
+ ret = klp_init_func(obj, func);
+ if (ret)
+ goto free;
+ }
+
+ if (klp_is_object_loaded(obj)) {
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret)
+ goto free;
+ }
+
+ return 0;
+
+free:
+ klp_free_funcs_limited(obj, func);
+ kobject_put(obj->kobj);
+ return ret;
+}
+
+static int klp_init_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ if (!patch->objs)
+ return -EINVAL;
+
+ mutex_lock(&klp_mutex);
+
+ patch->state = KLP_DISABLED;
+
+ ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
+ klp_root_kobj, "%s", patch->mod->name);
+ if (ret)
+ goto unlock;
+
+ for (obj = patch->objs; obj->funcs; obj++) {
+ ret = klp_init_object(patch, obj);
+ if (ret)
+ goto free;
+ }
+
+ list_add_tail(&patch->list, &klp_patches);
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+
+free:
+ klp_free_objects_limited(patch, obj);
+ kobject_put(&patch->kobj);
+unlock:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+
+/**
+ * klp_unregister_patch() - unregisters a patch
+ * @patch: Disabled patch to be unregistered
+ *
+ * Frees the data structures and removes the sysfs interface.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_unregister_patch(struct klp_patch *patch)
+{
+ int ret = 0;
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_registered(patch)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (patch->state == KLP_ENABLED) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ klp_free_patch(patch);
+
+out:
+ mutex_unlock(&klp_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_unregister_patch);
+
+/**
+ * klp_register_patch() - registers a patch
+ * @patch: Patch to be registered
+ *
+ * Initializes the data structure associated with the patch and
+ * creates the sysfs interface.
+ *
+ * Return: 0 on success, otherwise error
+ */
+int klp_register_patch(struct klp_patch *patch)
+{
+ int ret;
+
+ if (!klp_initialized())
+ return -ENODEV;
+
+ if (!patch || !patch->mod)
+ return -EINVAL;
+
+ /*
+ * A reference is taken on the patch module to prevent it from being
+ * unloaded. Right now, we don't allow patch modules to unload since
+ * there is currently no method to determine if a thread is still
+ * running in the patched code contained in the patch module once
+ * the ftrace registration is successful.
+ */
+ if (!try_module_get(patch->mod))
+ return -ENODEV;
+
+ ret = klp_init_patch(patch);
+ if (ret)
+ module_put(patch->mod);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_register_patch);
+
+static void klp_module_notify_coming(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct module *pmod = patch->mod;
+ struct module *mod = obj->mod;
+ int ret;
+
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret)
+ goto err;
+
+ if (patch->state == KLP_DISABLED)
+ return;
+
+ pr_notice("applying patch '%s' to loading module '%s'\n",
+ pmod->name, mod->name);
+
+ ret = klp_enable_object(obj);
+ if (!ret)
+ return;
+
+err:
+ pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+ pmod->name, mod->name, ret);
+}
+
+static void klp_module_notify_going(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ struct module *pmod = patch->mod;
+ struct module *mod = obj->mod;
+
+ if (patch->state == KLP_DISABLED)
+ goto disabled;
+
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ pmod->name, mod->name);
+
+ klp_disable_object(obj);
+
+disabled:
+ klp_free_object_loaded(obj);
+}
+
+static int klp_module_notify(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct module *mod = data;
+ struct klp_patch *patch;
+ struct klp_object *obj;
+
+ if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
+ return 0;
+
+ mutex_lock(&klp_mutex);
+
+ /*
+ * Each module has to know that the notifier has been called.
+ * We never know what module will get patched by a new patch.
+ */
+ if (action == MODULE_STATE_COMING)
+ mod->klp_alive = true;
+ else /* MODULE_STATE_GOING */
+ mod->klp_alive = false;
+
+ list_for_each_entry(patch, &klp_patches, list) {
+ for (obj = patch->objs; obj->funcs; obj++) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
+
+ if (action == MODULE_STATE_COMING) {
+ obj->mod = mod;
+ klp_module_notify_coming(patch, obj);
+ } else /* MODULE_STATE_GOING */
+ klp_module_notify_going(patch, obj);
+
+ break;
+ }
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+}
+
+static struct notifier_block klp_module_nb = {
+ .notifier_call = klp_module_notify,
+ .priority = INT_MIN+1, /* called late but before ftrace notifier */
+};
+
+static int klp_init(void)
+{
+ int ret;
+
+ ret = klp_check_compiler_support();
+ if (ret) {
+ pr_info("Your compiler is too old; turning off.\n");
+ return -EINVAL;
+ }
+
+ ret = register_module_notifier(&klp_module_nb);
+ if (ret)
+ return ret;
+
+ klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
+ if (!klp_root_kobj) {
+ ret = -ENOMEM;
+ goto unregister;
+ }
+
+ return 0;
+
+unregister:
+ unregister_module_notifier(&klp_module_nb);
+ return ret;
+}
+
+module_init(klp_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
new file mode 100644
index 000000000..de7a416cc
--- /dev/null
+++ b/kernel/locking/Makefile
@@ -0,0 +1,29 @@
+
+obj-y += mutex.o semaphore.o rwsem.o
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
+endif
+
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
+obj-$(CONFIG_SMP) += spinlock.o
+obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
+obj-$(CONFIG_SMP) += lglock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
+obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
new file mode 100644
index 000000000..86ae2aebf
--- /dev/null
+++ b/kernel/locking/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there)
+ * Could be added though, just undo lg_lock_init
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+ LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+ arch_spinlock_t *lock;
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ lock = this_cpu_ptr(lg->lock);
+ arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+ arch_spinlock_t *lock;
+
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock = this_cpu_ptr(lg->lock);
+ arch_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+ arch_spinlock_t *lock;
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ lock = per_cpu_ptr(lg->lock, cpu);
+ arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+ arch_spinlock_t *lock;
+
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ lock = per_cpu_ptr(lg->lock, cpu);
+ arch_spin_unlock(lock);
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+ int i;
+
+ preempt_disable();
+ lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ for_each_possible_cpu(i) {
+ arch_spinlock_t *lock;
+ lock = per_cpu_ptr(lg->lock, i);
+ arch_spin_lock(lock);
+ }
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+ int i;
+
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ for_each_possible_cpu(i) {
+ arch_spinlock_t *lock;
+ lock = per_cpu_ptr(lg->lock, i);
+ arch_spin_unlock(lock);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
new file mode 100644
index 000000000..aaeae885d
--- /dev/null
+++ b/kernel/locking/lockdep.c
@@ -0,0 +1,4304 @@
+/*
+ * kernel/lockdep.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * this code maps all the lock dependencies as they occur in a live kernel
+ * and will warn about the following classes of locking bugs:
+ *
+ * - lock inversion scenarios
+ * - circular lock dependencies
+ * - hardirq/softirq safe/unsafe locking bugs
+ *
+ * Bugs are reported even if the current locking scenario does not cause
+ * any deadlock at this point.
+ *
+ * I.e. if anytime in the past two locks were taken in a different order,
+ * even if it happened for another task, even if those were different
+ * locks (but of the same class as this lock), this code will detect it.
+ *
+ * Thanks to Arjan van de Ven for coming up with the initial idea of
+ * mapping lock dependencies runtime.
+ */
+#define DISABLE_BRANCH_PROFILING
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/stacktrace.h>
+#include <linux/debug_locks.h>
+#include <linux/irqflags.h>
+#include <linux/utsname.h>
+#include <linux/hash.h>
+#include <linux/ftrace.h>
+#include <linux/stringify.h>
+#include <linux/bitops.h>
+#include <linux/gfp.h>
+#include <linux/kmemcheck.h>
+
+#include <asm/sections.h>
+
+#include "lockdep_internals.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
+#ifdef CONFIG_PROVE_LOCKING
+int prove_locking = 1;
+module_param(prove_locking, int, 0644);
+#else
+#define prove_locking 0
+#endif
+
+#ifdef CONFIG_LOCK_STAT
+int lock_stat = 1;
+module_param(lock_stat, int, 0644);
+#else
+#define lock_stat 0
+#endif
+
+/*
+ * lockdep_lock: protects the lockdep graph, the hashes and the
+ * class/list/hash allocators.
+ *
+ * This is one of the rare exceptions where it's justified
+ * to use a raw spinlock - we really dont want the spinlock
+ * code to recurse back into the lockdep code...
+ */
+static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+static int graph_lock(void)
+{
+ arch_spin_lock(&lockdep_lock);
+ /*
+ * Make sure that if another CPU detected a bug while
+ * walking the graph we dont change it (while the other
+ * CPU is busy printing out stuff with the graph lock
+ * dropped already)
+ */
+ if (!debug_locks) {
+ arch_spin_unlock(&lockdep_lock);
+ return 0;
+ }
+ /* prevent any recursions within lockdep from causing deadlocks */
+ current->lockdep_recursion++;
+ return 1;
+}
+
+static inline int graph_unlock(void)
+{
+ if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
+ /*
+ * The lockdep graph lock isn't locked while we expect it to
+ * be, we're confused now, bye!
+ */
+ return DEBUG_LOCKS_WARN_ON(1);
+ }
+
+ current->lockdep_recursion--;
+ arch_spin_unlock(&lockdep_lock);
+ return 0;
+}
+
+/*
+ * Turn lock debugging off and return with 0 if it was off already,
+ * and also release the graph lock:
+ */
+static inline int debug_locks_off_graph_unlock(void)
+{
+ int ret = debug_locks_off();
+
+ arch_spin_unlock(&lockdep_lock);
+
+ return ret;
+}
+
+static int lockdep_initialized;
+
+unsigned long nr_list_entries;
+static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+
+/*
+ * All data structures here are protected by the global debug_lock.
+ *
+ * Mutex key structs only get allocated, once during bootup, and never
+ * get freed - this significantly simplifies the debugging code.
+ */
+unsigned long nr_lock_classes;
+static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+ if (!hlock->class_idx) {
+ /*
+ * Someone passed in garbage, we give up.
+ */
+ DEBUG_LOCKS_WARN_ON(1);
+ return NULL;
+ }
+ return lock_classes + hlock->class_idx - 1;
+}
+
+#ifdef CONFIG_LOCK_STAT
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
+ cpu_lock_stats);
+
+static inline u64 lockstat_clock(void)
+{
+ return local_clock();
+}
+
+static int lock_point(unsigned long points[], unsigned long ip)
+{
+ int i;
+
+ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+ if (points[i] == 0) {
+ points[i] = ip;
+ break;
+ }
+ if (points[i] == ip)
+ break;
+ }
+
+ return i;
+}
+
+static void lock_time_inc(struct lock_time *lt, u64 time)
+{
+ if (time > lt->max)
+ lt->max = time;
+
+ if (time < lt->min || !lt->nr)
+ lt->min = time;
+
+ lt->total += time;
+ lt->nr++;
+}
+
+static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
+{
+ if (!src->nr)
+ return;
+
+ if (src->max > dst->max)
+ dst->max = src->max;
+
+ if (src->min < dst->min || !dst->nr)
+ dst->min = src->min;
+
+ dst->total += src->total;
+ dst->nr += src->nr;
+}
+
+struct lock_class_stats lock_stats(struct lock_class *class)
+{
+ struct lock_class_stats stats;
+ int cpu, i;
+
+ memset(&stats, 0, sizeof(struct lock_class_stats));
+ for_each_possible_cpu(cpu) {
+ struct lock_class_stats *pcs =
+ &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
+
+ for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
+ stats.contention_point[i] += pcs->contention_point[i];
+
+ for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+ stats.contending_point[i] += pcs->contending_point[i];
+
+ lock_time_add(&pcs->read_waittime, &stats.read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+
+ lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+
+ for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
+ stats.bounces[i] += pcs->bounces[i];
+ }
+
+ return stats;
+}
+
+void clear_lock_stats(struct lock_class *class)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct lock_class_stats *cpu_stats =
+ &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
+
+ memset(cpu_stats, 0, sizeof(struct lock_class_stats));
+ }
+ memset(class->contention_point, 0, sizeof(class->contention_point));
+ memset(class->contending_point, 0, sizeof(class->contending_point));
+}
+
+static struct lock_class_stats *get_lock_stats(struct lock_class *class)
+{
+ return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
+}
+
+static void put_lock_stats(struct lock_class_stats *stats)
+{
+ put_cpu_var(cpu_lock_stats);
+}
+
+static void lock_release_holdtime(struct held_lock *hlock)
+{
+ struct lock_class_stats *stats;
+ u64 holdtime;
+
+ if (!lock_stat)
+ return;
+
+ holdtime = lockstat_clock() - hlock->holdtime_stamp;
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (hlock->read)
+ lock_time_inc(&stats->read_holdtime, holdtime);
+ else
+ lock_time_inc(&stats->write_holdtime, holdtime);
+ put_lock_stats(stats);
+}
+#else
+static inline void lock_release_holdtime(struct held_lock *hlock)
+{
+}
+#endif
+
+/*
+ * We keep a global list of all lock classes. The list only grows,
+ * never shrinks. The list is only accessed with the lockdep
+ * spinlock lock held.
+ */
+LIST_HEAD(all_lock_classes);
+
+/*
+ * The lockdep classes are in a hash-table as well, for fast lookup:
+ */
+#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
+#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
+#define classhashentry(key) (classhash_table + __classhashfn((key)))
+
+static struct list_head classhash_table[CLASSHASH_SIZE];
+
+/*
+ * We put the lock dependency chains into a hash-table as well, to cache
+ * their existence:
+ */
+#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
+#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
+#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
+#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
+
+static struct list_head chainhash_table[CHAINHASH_SIZE];
+
+/*
+ * The hash key of the lock dependency chains is a hash itself too:
+ * it's a hash of all locks taken up to that lock, including that lock.
+ * It's a 64-bit hash, because it's important for the keys to be
+ * unique.
+ */
+#define iterate_chain_key(key1, key2) \
+ (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
+ (key2))
+
+void lockdep_off(void)
+{
+ current->lockdep_recursion++;
+}
+EXPORT_SYMBOL(lockdep_off);
+
+void lockdep_on(void)
+{
+ current->lockdep_recursion--;
+}
+EXPORT_SYMBOL(lockdep_on);
+
+/*
+ * Debugging switches:
+ */
+
+#define VERBOSE 0
+#define VERY_VERBOSE 0
+
+#if VERBOSE
+# define HARDIRQ_VERBOSE 1
+# define SOFTIRQ_VERBOSE 1
+# define RECLAIM_VERBOSE 1
+#else
+# define HARDIRQ_VERBOSE 0
+# define SOFTIRQ_VERBOSE 0
+# define RECLAIM_VERBOSE 0
+#endif
+
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+/*
+ * Quick filtering for interesting events:
+ */
+static int class_filter(struct lock_class *class)
+{
+#if 0
+ /* Example */
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "lockname"))
+ return 1;
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "&struct->lockfield"))
+ return 1;
+#endif
+ /* Filter everything else. 1 would be to allow everything else */
+ return 0;
+}
+#endif
+
+static int verbose(struct lock_class *class)
+{
+#if VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+unsigned long nr_stack_trace_entries;
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
+static void print_lockdep_off(const char *bug_msg)
+{
+ printk(KERN_DEBUG "%s\n", bug_msg);
+ printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+#ifdef CONFIG_LOCK_STAT
+ printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+#endif
+}
+
+static int save_trace(struct stack_trace *trace)
+{
+ trace->nr_entries = 0;
+ trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->entries = stack_trace + nr_stack_trace_entries;
+
+ trace->skip = 3;
+
+ save_stack_trace(trace);
+
+ /*
+ * Some daft arches put -1 at the end to indicate its a full trace.
+ *
+ * <rant> this is buggy anyway, since it takes a whole extra entry so a
+ * complete trace that maxes out the entries provided will be reported
+ * as incomplete, friggin useless </rant>
+ */
+ if (trace->nr_entries != 0 &&
+ trace->entries[trace->nr_entries-1] == ULONG_MAX)
+ trace->nr_entries--;
+
+ trace->max_entries = trace->nr_entries;
+
+ nr_stack_trace_entries += trace->nr_entries;
+
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
+ dump_stack();
+
+ return 0;
+ }
+
+ return 1;
+}
+
+unsigned int nr_hardirq_chains;
+unsigned int nr_softirq_chains;
+unsigned int nr_process_chains;
+unsigned int max_lockdep_depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * We cannot printk in early bootup code. Not even early_printk()
+ * might work. So we mark any initialization errors and printk
+ * about it later on, in lockdep_info().
+ */
+static int lockdep_init_error;
+static const char *lock_init_error;
+static unsigned long lockdep_init_trace_data[20];
+static struct stack_trace lockdep_init_trace = {
+ .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
+ .entries = lockdep_init_trace_data,
+};
+
+/*
+ * Various lockdep statistics:
+ */
+DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
+#endif
+
+/*
+ * Locking printouts:
+ */
+
+#define __USAGE(__STATE) \
+ [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \
+ [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \
+ [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
+ [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
+
+static const char *usage_str[] =
+{
+#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ [LOCK_USED] = "INITIAL USE",
+};
+
+const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+{
+ return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
+}
+
+static inline unsigned long lock_flag(enum lock_usage_bit bit)
+{
+ return 1UL << bit;
+}
+
+static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
+{
+ char c = '.';
+
+ if (class->usage_mask & lock_flag(bit + 2))
+ c = '+';
+ if (class->usage_mask & lock_flag(bit)) {
+ c = '-';
+ if (class->usage_mask & lock_flag(bit + 2))
+ c = '?';
+ }
+
+ return c;
+}
+
+void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
+{
+ int i = 0;
+
+#define LOCKDEP_STATE(__STATE) \
+ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \
+ usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+
+ usage[i] = '\0';
+}
+
+static void __print_lock_name(struct lock_class *class)
+{
+ char str[KSYM_NAME_LEN];
+ const char *name;
+
+ name = class->name;
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ printk("%s", name);
+ } else {
+ printk("%s", name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ if (class->subclass)
+ printk("/%d", class->subclass);
+ }
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+ char usage[LOCK_USAGE_CHARS];
+
+ get_usage_chars(class, usage);
+
+ printk(" (");
+ __print_lock_name(class);
+ printk("){%s}", usage);
+}
+
+static void print_lockdep_cache(struct lockdep_map *lock)
+{
+ const char *name;
+ char str[KSYM_NAME_LEN];
+
+ name = lock->name;
+ if (!name)
+ name = __get_key_name(lock->key->subkeys, str);
+
+ printk("%s", name);
+}
+
+static void print_lock(struct held_lock *hlock)
+{
+ /*
+ * We can be called locklessly through debug_show_all_locks() so be
+ * extra careful, the hlock might have been released and cleared.
+ */
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
+ barrier();
+
+ if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ printk("<RELEASED>\n");
+ return;
+ }
+
+ print_lock_name(lock_classes + class_idx - 1);
+ printk(", at: ");
+ print_ip_sym(hlock->acquire_ip);
+}
+
+static void lockdep_print_held_locks(struct task_struct *curr)
+{
+ int i, depth = curr->lockdep_depth;
+
+ if (!depth) {
+ printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+ return;
+ }
+ printk("%d lock%s held by %s/%d:\n",
+ depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
+
+ for (i = 0; i < depth; i++) {
+ printk(" #%d: ", i);
+ print_lock(curr->held_locks + i);
+ }
+}
+
+static void print_kernel_ident(void)
+{
+ printk("%s %.*s %s\n", init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version,
+ print_tainted());
+}
+
+static int very_verbose(struct lock_class *class)
+{
+#if VERY_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+/*
+ * Is this the address of a static object:
+ */
+#ifdef __KERNEL__
+static int static_obj(void *obj)
+{
+ unsigned long start = (unsigned long) &_stext,
+ end = (unsigned long) &_end,
+ addr = (unsigned long) obj;
+
+ /*
+ * static variable?
+ */
+ if ((addr >= start) && (addr < end))
+ return 1;
+
+ if (arch_is_kernel_data(addr))
+ return 1;
+
+ /*
+ * in-kernel percpu var?
+ */
+ if (is_kernel_percpu_address(addr))
+ return 1;
+
+ /*
+ * module static or percpu var?
+ */
+ return is_module_address(addr) || is_module_percpu_address(addr);
+}
+#endif
+
+/*
+ * To make lock name printouts unique, we calculate a unique
+ * class->name_version generation counter:
+ */
+static int count_matching_names(struct lock_class *new_class)
+{
+ struct lock_class *class;
+ int count = 0;
+
+ if (!new_class->name)
+ return 0;
+
+ list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
+ if (new_class->key - new_class->subclass == class->key)
+ return class->name_version;
+ if (class->name && !strcmp(class->name, new_class->name))
+ count = max(count, class->name_version);
+ }
+
+ return count + 1;
+}
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * If the architecture calls into lockdep before initializing
+ * the hashes then we'll warn about it later. (we cannot printk
+ * right now)
+ */
+ if (unlikely(!lockdep_initialized)) {
+ lockdep_init();
+ lockdep_init_error = 1;
+ lock_init_error = lock->name;
+ save_stack_trace(&lockdep_init_trace);
+ }
+#endif
+
+ if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ debug_locks_off();
+ printk(KERN_ERR
+ "BUG: looking up invalid subclass: %u\n", subclass);
+ printk(KERN_ERR
+ "turning off the locking correctness validator.\n");
+ dump_stack();
+ return NULL;
+ }
+
+ /*
+ * Static locks do not have their class-keys yet - for them the key
+ * is the lock object itself:
+ */
+ if (unlikely(!lock->key))
+ lock->key = (void *)lock;
+
+ /*
+ * NOTE: the class-key must be unique. For dynamic locks, a static
+ * lock_class_key variable is passed in through the mutex_init()
+ * (or spin_lock_init()) call - which acts as the key. For static
+ * locks we use the lock object itself as the key.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) >
+ sizeof(struct lockdep_map));
+
+ key = lock->key->subkeys + subclass;
+
+ hash_head = classhashentry(key);
+
+ /*
+ * We do an RCU walk of the hash, see lockdep_free_key_range().
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return NULL;
+
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ if (class->key == key) {
+ /*
+ * Huh! same key, different name? Did someone trample
+ * on some memory? We're most confused.
+ */
+ WARN_ON_ONCE(class->name != lock->name);
+ return class;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ class = look_up_lock_class(lock, subclass);
+ if (likely(class))
+ goto out_set_class_cache;
+
+ /*
+ * Debug-check: all keys must be persistent!
+ */
+ if (!static_obj(lock->key)) {
+ debug_locks_off();
+ printk("INFO: trying to register non-static key.\n");
+ printk("the code is fine but needs lockdep annotation.\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+
+ return NULL;
+ }
+
+ key = lock->key->subkeys + subclass;
+ hash_head = classhashentry(key);
+
+ if (!graph_lock()) {
+ return NULL;
+ }
+ /*
+ * We have to do the hash-walk again, to avoid races
+ * with another CPU:
+ */
+ list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ if (class->key == key)
+ goto out_unlock_set;
+ }
+
+ /*
+ * Allocate a new key from the static array, and add it to
+ * the hash:
+ */
+ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ if (!debug_locks_off_graph_unlock()) {
+ return NULL;
+ }
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
+ dump_stack();
+ return NULL;
+ }
+ class = lock_classes + nr_lock_classes++;
+ debug_atomic_inc(nr_unused_locks);
+ class->key = key;
+ class->name = lock->name;
+ class->subclass = subclass;
+ INIT_LIST_HEAD(&class->lock_entry);
+ INIT_LIST_HEAD(&class->locks_before);
+ INIT_LIST_HEAD(&class->locks_after);
+ class->name_version = count_matching_names(class);
+ /*
+ * We use RCU's safe list-add method to make
+ * parallel walking of the hash-list safe:
+ */
+ list_add_tail_rcu(&class->hash_entry, hash_head);
+ /*
+ * Add it to the global list of classes:
+ */
+ list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
+
+ if (verbose(class)) {
+ graph_unlock();
+
+ printk("\nnew class %p: %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+
+ if (!graph_lock()) {
+ return NULL;
+ }
+ }
+out_unlock_set:
+ graph_unlock();
+
+out_set_class_cache:
+ if (!subclass || force)
+ lock->class_cache[0] = class;
+ else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+ lock->class_cache[subclass] = class;
+
+ /*
+ * Hash collision, did we smoke some? We found a class with a matching
+ * hash but the subclass -- which is hashed in -- didn't match.
+ */
+ if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
+ return NULL;
+
+ return class;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+/*
+ * Allocate a lockdep entry. (assumes the graph_lock held, returns
+ * with NULL on failure)
+ */
+static struct lock_list *alloc_list_entry(void)
+{
+ if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ if (!debug_locks_off_graph_unlock())
+ return NULL;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
+ dump_stack();
+ return NULL;
+ }
+ return list_entries + nr_list_entries++;
+}
+
+/*
+ * Add a new dependency to the head of the list:
+ */
+static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
+ struct list_head *head, unsigned long ip,
+ int distance, struct stack_trace *trace)
+{
+ struct lock_list *entry;
+ /*
+ * Lock not present yet - get a new dependency struct and
+ * add it to the list:
+ */
+ entry = alloc_list_entry();
+ if (!entry)
+ return 0;
+
+ entry->class = this;
+ entry->distance = distance;
+ entry->trace = *trace;
+ /*
+ * Both allocation and removal are done under the graph lock; but
+ * iteration is under RCU-sched; see look_up_lock_class() and
+ * lockdep_free_key_range().
+ */
+ list_add_tail_rcu(&entry->entry, head);
+
+ return 1;
+}
+
+/*
+ * For good efficiency of modular, we use power of 2
+ */
+#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
+
+/*
+ * The circular_queue and helpers is used to implement the
+ * breadth-first search(BFS)algorithem, by which we can build
+ * the shortest path from the next lock to be acquired to the
+ * previous held lock if there is a circular between them.
+ */
+struct circular_queue {
+ unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ unsigned int front, rear;
+};
+
+static struct circular_queue lock_cq;
+
+unsigned int max_bfs_queue_depth;
+
+static unsigned int lockdep_dependency_gen_id;
+
+static inline void __cq_init(struct circular_queue *cq)
+{
+ cq->front = cq->rear = 0;
+ lockdep_dependency_gen_id++;
+}
+
+static inline int __cq_empty(struct circular_queue *cq)
+{
+ return (cq->front == cq->rear);
+}
+
+static inline int __cq_full(struct circular_queue *cq)
+{
+ return ((cq->rear + 1) & CQ_MASK) == cq->front;
+}
+
+static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+{
+ if (__cq_full(cq))
+ return -1;
+
+ cq->element[cq->rear] = elem;
+ cq->rear = (cq->rear + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+{
+ if (__cq_empty(cq))
+ return -1;
+
+ *elem = cq->element[cq->front];
+ cq->front = (cq->front + 1) & CQ_MASK;
+ return 0;
+}
+
+static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
+{
+ return (cq->rear - cq->front) & CQ_MASK;
+}
+
+static inline void mark_lock_accessed(struct lock_list *lock,
+ struct lock_list *parent)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ lock->parent = parent;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
+
+static inline unsigned long lock_accessed(struct lock_list *lock)
+{
+ unsigned long nr;
+
+ nr = lock - list_entries;
+ WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ return lock->class->dep_gen_id == lockdep_dependency_gen_id;
+}
+
+static inline struct lock_list *get_lock_parent(struct lock_list *child)
+{
+ return child->parent;
+}
+
+static inline int get_lock_depth(struct lock_list *child)
+{
+ int depth = 0;
+ struct lock_list *parent;
+
+ while ((parent = get_lock_parent(child))) {
+ child = parent;
+ depth++;
+ }
+ return depth;
+}
+
+static int __bfs(struct lock_list *source_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int forward)
+{
+ struct lock_list *entry;
+ struct list_head *head;
+ struct circular_queue *cq = &lock_cq;
+ int ret = 1;
+
+ if (match(source_entry, data)) {
+ *target_entry = source_entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (forward)
+ head = &source_entry->class->locks_after;
+ else
+ head = &source_entry->class->locks_before;
+
+ if (list_empty(head))
+ goto exit;
+
+ __cq_init(cq);
+ __cq_enqueue(cq, (unsigned long)source_entry);
+
+ while (!__cq_empty(cq)) {
+ struct lock_list *lock;
+
+ __cq_dequeue(cq, (unsigned long *)&lock);
+
+ if (!lock->class) {
+ ret = -2;
+ goto exit;
+ }
+
+ if (forward)
+ head = &lock->class->locks_after;
+ else
+ head = &lock->class->locks_before;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ list_for_each_entry_rcu(entry, head, entry) {
+ if (!lock_accessed(entry)) {
+ unsigned int cq_depth;
+ mark_lock_accessed(entry, lock);
+ if (match(entry, data)) {
+ *target_entry = entry;
+ ret = 0;
+ goto exit;
+ }
+
+ if (__cq_enqueue(cq, (unsigned long)entry)) {
+ ret = -1;
+ goto exit;
+ }
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
+ }
+ }
+ }
+exit:
+ return ret;
+}
+
+static inline int __bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 1);
+
+}
+
+static inline int __bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ int (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
+{
+ return __bfs(src_entry, data, match, target_entry, 0);
+
+}
+
+/*
+ * Recursive, forwards-direction lock-dependency checking, used for
+ * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
+ * checking.
+ */
+
+/*
+ * Print a dependency chain entry (this is only done when a deadlock
+ * has been detected):
+ */
+static noinline int
+print_circular_bug_entry(struct lock_list *target, int depth)
+{
+ if (debug_locks_silent)
+ return 0;
+ printk("\n-> #%u", depth);
+ print_lock_name(target->class);
+ printk(":\n");
+ print_stack_trace(&target->trace, 6);
+
+ return 0;
+}
+
+static void
+print_circular_lock_scenario(struct held_lock *src,
+ struct held_lock *tgt,
+ struct lock_list *prt)
+{
+ struct lock_class *source = hlock_class(src);
+ struct lock_class *target = hlock_class(tgt);
+ struct lock_class *parent = prt->class;
+
+ /*
+ * A direct locking problem where unsafe_class lock is taken
+ * directly by safe_class lock, then all we need to show
+ * is the deadlock scenario, as it is obvious that the
+ * unsafe lock is taken under the safe lock.
+ *
+ * But if there is a chain instead, where the safe lock takes
+ * an intermediate lock (middle_class) where this lock is
+ * not the same as the safe lock, then the lock chain is
+ * used to describe the problem. Otherwise we would need
+ * to show a different CPU case for each link in the chain
+ * from the safe_class lock to the unsafe_class lock.
+ */
+ if (parent != source) {
+ printk("Chain exists of:\n ");
+ __print_lock_name(source);
+ printk(" --> ");
+ __print_lock_name(parent);
+ printk(" --> ");
+ __print_lock_name(target);
+ printk("\n\n");
+ }
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
+/*
+ * When a circular dependency is detected, print the
+ * header first:
+ */
+static noinline int
+print_circular_bug_header(struct lock_list *entry, unsigned int depth,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
+{
+ struct task_struct *curr = current;
+
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("======================================================\n");
+ printk("[ INFO: possible circular locking dependency detected ]\n");
+ print_kernel_ident();
+ printk("-------------------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(check_src);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(check_tgt);
+ printk("\nwhich lock already depends on the new lock.\n\n");
+ printk("\nthe existing dependency chain (in reverse order) is:\n");
+
+ print_circular_bug_entry(entry, depth);
+
+ return 0;
+}
+
+static inline int class_equal(struct lock_list *entry, void *data)
+{
+ return entry->class == data;
+}
+
+static noinline int print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
+{
+ struct task_struct *curr = current;
+ struct lock_list *parent;
+ struct lock_list *first_parent;
+ int depth;
+
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ if (!save_trace(&this->trace))
+ return 0;
+
+ depth = get_lock_depth(target);
+
+ print_circular_bug_header(target, depth, check_src, check_tgt);
+
+ parent = get_lock_parent(target);
+ first_parent = parent;
+
+ while (parent) {
+ print_circular_bug_entry(parent, --depth);
+ parent = get_lock_parent(parent);
+ }
+
+ printk("\nother info that might help us debug this:\n\n");
+ print_circular_lock_scenario(check_src, check_tgt,
+ first_parent);
+
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static noinline int print_bfs_bug(int ret)
+{
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ /*
+ * Breadth-first-search failed, graph got corrupted?
+ */
+ WARN(1, "lockdep bfs error:%d\n", ret);
+
+ return 0;
+}
+
+static int noop_count(struct lock_list *entry, void *data)
+{
+ (*(unsigned long *)data)++;
+ return 0;
+}
+
+static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
+{
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
+
+ __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+
+ return count;
+}
+unsigned long lockdep_count_forward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
+
+ local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_forward_deps(&this);
+ arch_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
+{
+ unsigned long count = 0;
+ struct lock_list *uninitialized_var(target_entry);
+
+ __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+
+ return count;
+}
+
+unsigned long lockdep_count_backward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+ struct lock_list this;
+
+ this.parent = NULL;
+ this.class = class;
+
+ local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_backward_deps(&this);
+ arch_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/*
+ * Prove that the dependency graph starting at <entry> can not
+ * lead to <target>. Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct lock_list *root, struct lock_class *target,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_cyclic_checks);
+
+ result = __bfs_forwards(root, target, class_equal, target_entry);
+
+ return result;
+}
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+/*
+ * Forwards and backwards subgraph searching, for the purposes of
+ * proving that two subgraphs can be connected by a new dependency
+ * without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ */
+
+static inline int usage_match(struct lock_list *entry, void *bit)
+{
+ return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+}
+
+
+
+/*
+ * Find a node in the forwards-direction dependency sub-graph starting
+ * at @root->class that matches @bit.
+ *
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
+ *
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
+ */
+static int
+find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_find_usage_forwards_checks);
+
+ result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+
+ return result;
+}
+
+/*
+ * Find a node in the backwards-direction dependency sub-graph starting
+ * at @root->class that matches @bit.
+ *
+ * Return 0 if such a node exists in the subgraph, and put that node
+ * into *@target_entry.
+ *
+ * Return 1 otherwise and keep *@target_entry unchanged.
+ * Return <0 on error.
+ */
+static int
+find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+ struct lock_list **target_entry)
+{
+ int result;
+
+ debug_atomic_inc(nr_find_usage_backwards_checks);
+
+ result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
+
+ return result;
+}
+
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+ int bit;
+
+ printk("%*s->", depth, "");
+ print_lock_name(class);
+ printk(" ops: %lu", class->ops);
+ printk(" {\n");
+
+ for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ if (class->usage_mask & (1 << bit)) {
+ int len = depth;
+
+ len += printk("%*s %s", depth, "", usage_str[bit]);
+ len += printk(" at:\n");
+ print_stack_trace(class->usage_traces + bit, len);
+ }
+ }
+ printk("%*s }\n", depth, "");
+
+ printk("%*s ... key at: ",depth,"");
+ print_ip_sym((unsigned long)class->key);
+}
+
+/*
+ * printk the shortest lock dependencies from @start to @end in reverse order:
+ */
+static void __used
+print_shortest_lock_dependencies(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ printk("%*s ... acquired at:\n", depth, "");
+ print_stack_trace(&entry->trace, 2);
+ printk("\n");
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+
+ return;
+}
+
+static void
+print_irq_lock_scenario(struct lock_list *safe_entry,
+ struct lock_list *unsafe_entry,
+ struct lock_class *prev_class,
+ struct lock_class *next_class)
+{
+ struct lock_class *safe_class = safe_entry->class;
+ struct lock_class *unsafe_class = unsafe_entry->class;
+ struct lock_class *middle_class = prev_class;
+
+ if (middle_class == safe_class)
+ middle_class = next_class;
+
+ /*
+ * A direct locking problem where unsafe_class lock is taken
+ * directly by safe_class lock, then all we need to show
+ * is the deadlock scenario, as it is obvious that the
+ * unsafe lock is taken under the safe lock.
+ *
+ * But if there is a chain instead, where the safe lock takes
+ * an intermediate lock (middle_class) where this lock is
+ * not the same as the safe lock, then the lock chain is
+ * used to describe the problem. Otherwise we would need
+ * to show a different CPU case for each link in the chain
+ * from the safe_class lock to the unsafe_class lock.
+ */
+ if (middle_class != unsafe_class) {
+ printk("Chain exists of:\n ");
+ __print_lock_name(safe_class);
+ printk(" --> ");
+ __print_lock_name(middle_class);
+ printk(" --> ");
+ __print_lock_name(unsafe_class);
+ printk("\n\n");
+ }
+
+ printk(" Possible interrupt unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(unsafe_class);
+ printk(");\n");
+ printk(" local_irq_disable();\n");
+ printk(" lock(");
+ __print_lock_name(safe_class);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(middle_class);
+ printk(");\n");
+ printk(" <Interrupt>\n");
+ printk(" lock(");
+ __print_lock_name(safe_class);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
+static int
+print_bad_irq_dependency(struct task_struct *curr,
+ struct lock_list *prev_root,
+ struct lock_list *next_root,
+ struct lock_list *backwards_entry,
+ struct lock_list *forwards_entry,
+ struct held_lock *prev,
+ struct held_lock *next,
+ enum lock_usage_bit bit1,
+ enum lock_usage_bit bit2,
+ const char *irqclass)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("======================================================\n");
+ printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ irqclass, irqclass);
+ print_kernel_ident();
+ printk("------------------------------------------------------\n");
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ curr->comm, task_pid_nr(curr),
+ curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
+ curr->hardirqs_enabled,
+ curr->softirqs_enabled);
+ print_lock(next);
+
+ printk("\nand this task is already holding:\n");
+ print_lock(prev);
+ printk("which would create a new lock dependency:\n");
+ print_lock_name(hlock_class(prev));
+ printk(" ->");
+ print_lock_name(hlock_class(next));
+ printk("\n");
+
+ printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ irqclass);
+ print_lock_name(backwards_entry->class);
+ printk("\n... which became %s-irq-safe at:\n", irqclass);
+
+ print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
+
+ printk("\nto a %s-irq-unsafe lock:\n", irqclass);
+ print_lock_name(forwards_entry->class);
+ printk("\n... which became %s-irq-unsafe at:\n", irqclass);
+ printk("...");
+
+ print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
+
+ printk("\nother info that might help us debug this:\n\n");
+ print_irq_lock_scenario(backwards_entry, forwards_entry,
+ hlock_class(prev), hlock_class(next));
+
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe dependencies between %s-irq-safe lock", irqclass);
+ printk(" and the holding lock:\n");
+ if (!save_trace(&prev_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(backwards_entry, prev_root);
+
+ printk("\nthe dependencies between the lock to be acquired");
+ printk(" and %s-irq-unsafe lock:\n", irqclass);
+ if (!save_trace(&next_root->trace))
+ return 0;
+ print_shortest_lock_dependencies(forwards_entry, next_root);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int
+check_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit_backwards,
+ enum lock_usage_bit bit_forwards, const char *irqclass)
+{
+ int ret;
+ struct lock_list this, that;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *uninitialized_var(target_entry1);
+
+ this.parent = NULL;
+
+ this.class = hlock_class(prev);
+ ret = find_usage_backwards(&this, bit_backwards, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ that.parent = NULL;
+ that.class = hlock_class(next);
+ ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ return print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ bit_backwards, bit_forwards, irqclass);
+}
+
+static const char *state_names[] = {
+#define LOCKDEP_STATE(__STATE) \
+ __stringify(__STATE),
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static const char *state_rnames[] = {
+#define LOCKDEP_STATE(__STATE) \
+ __stringify(__STATE)"-READ",
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline const char *state_name(enum lock_usage_bit bit)
+{
+ return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+}
+
+static int exclusive_bit(int new_bit)
+{
+ /*
+ * USED_IN
+ * USED_IN_READ
+ * ENABLED
+ * ENABLED_READ
+ *
+ * bit 0 - write/read
+ * bit 1 - used_in/enabled
+ * bit 2+ state
+ */
+
+ int state = new_bit & ~3;
+ int dir = new_bit & 2;
+
+ /*
+ * keep state, bit flip the direction and strip read.
+ */
+ return state | (dir ^ 2);
+}
+
+static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit)
+{
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, bit,
+ exclusive_bit(bit), state_name(bit)))
+ return 0;
+
+ bit++; /* _READ */
+
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe-read
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, bit,
+ exclusive_bit(bit), state_name(bit)))
+ return 0;
+
+ return 1;
+}
+
+static int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+#define LOCKDEP_STATE(__STATE) \
+ if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
+ return 0;
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+
+ return 1;
+}
+
+static void inc_chains(void)
+{
+ if (current->hardirq_context)
+ nr_hardirq_chains++;
+ else {
+ if (current->softirq_context)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
+ }
+}
+
+#else
+
+static inline int
+check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ return 1;
+}
+
+static inline void inc_chains(void)
+{
+ nr_process_chains++;
+}
+
+#endif
+
+static void
+print_deadlock_scenario(struct held_lock *nxt,
+ struct held_lock *prv)
+{
+ struct lock_class *next = hlock_class(nxt);
+ struct lock_class *prev = hlock_class(prv);
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0\n");
+ printk(" ----\n");
+ printk(" lock(");
+ __print_lock_name(prev);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(next);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ printk(" May be due to missing lock nesting notation\n\n");
+}
+
+static int
+print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("=============================================\n");
+ printk("[ INFO: possible recursive locking detected ]\n");
+ print_kernel_ident();
+ printk("---------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(next);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(prev);
+
+ printk("\nother info that might help us debug this:\n");
+ print_deadlock_scenario(next, prev);
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Check whether we are holding such a class already.
+ *
+ * (Note that this has to be done separately, because the graph cannot
+ * detect such classes of deadlocks.)
+ *
+ * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ */
+static int
+check_deadlock(struct task_struct *curr, struct held_lock *next,
+ struct lockdep_map *next_instance, int read)
+{
+ struct held_lock *prev;
+ struct held_lock *nest = NULL;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ prev = curr->held_locks + i;
+
+ if (prev->instance == next->nest_lock)
+ nest = prev;
+
+ if (hlock_class(prev) != hlock_class(next))
+ continue;
+
+ /*
+ * Allow read-after-read recursion of the same
+ * lock class (i.e. read_lock(lock)+read_lock(lock)):
+ */
+ if ((read == 2) && prev->read)
+ return 2;
+
+ /*
+ * We're holding the nest_lock, which serializes this lock's
+ * nesting behaviour.
+ */
+ if (nest)
+ return 2;
+
+ return print_deadlock_bug(curr, prev, next);
+ }
+ return 1;
+}
+
+/*
+ * There was a chain-cache miss, and we are about to add a new dependency
+ * to a previous lock. We recursively validate the following rules:
+ *
+ * - would the adding of the <prev> -> <next> dependency create a
+ * circular dependency in the graph? [== circular deadlock]
+ *
+ * - does the new prev->next dependency connect any hardirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * hardirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with hardirq contexts]
+ *
+ * - does the new prev->next dependency connect any softirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * softirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with softirq contexts]
+ *
+ * any of these scenarios could lead to a deadlock.
+ *
+ * Then if all the validations pass, we add the forwards and backwards
+ * dependency.
+ */
+static int
+check_prev_add(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, int distance, int trylock_loop)
+{
+ struct lock_list *entry;
+ int ret;
+ struct lock_list this;
+ struct lock_list *uninitialized_var(target_entry);
+ /*
+ * Static variable, serialized by the graph_lock().
+ *
+ * We use this static variable to save the stack trace in case
+ * we call into this function multiple times due to encountering
+ * trylocks in the held lock stack.
+ */
+ static struct stack_trace trace;
+
+ /*
+ * Prove that the new <prev> -> <next> dependency would not
+ * create a circular dependency in the graph. (We do this by
+ * forward-recursing into the graph starting at <next>, and
+ * checking whether we can reach <prev>.)
+ *
+ * We are using global variables to control the recursion, to
+ * keep the stackframe size of the recursive functions low:
+ */
+ this.class = hlock_class(next);
+ this.parent = NULL;
+ ret = check_noncircular(&this, hlock_class(prev), &target_entry);
+ if (unlikely(!ret))
+ return print_circular_bug(&this, target_entry, next, prev);
+ else if (unlikely(ret < 0))
+ return print_bfs_bug(ret);
+
+ if (!check_prev_add_irq(curr, prev, next))
+ return 0;
+
+ /*
+ * For recursive read-locks we do all the dependency checks,
+ * but we dont store read-triggered dependencies (only
+ * write-triggered dependencies). This ensures that only the
+ * write-side dependencies matter, and that if for example a
+ * write-lock never takes any other locks, then the reads are
+ * equivalent to a NOP.
+ */
+ if (next->read == 2 || prev->read == 2)
+ return 1;
+ /*
+ * Is the <prev> -> <next> dependency already present?
+ *
+ * (this may occur even though this is a new chain: consider
+ * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
+ * chains - the second one will be new, but L1 already has
+ * L2 added to its dependency list, due to the first chain.)
+ */
+ list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+ if (entry->class == hlock_class(next)) {
+ if (distance == 1)
+ entry->distance = 1;
+ return 2;
+ }
+ }
+
+ if (!trylock_loop && !save_trace(&trace))
+ return 0;
+
+ /*
+ * Ok, all validations passed, add the new lock
+ * to the previous lock's dependency list:
+ */
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ &hlock_class(prev)->locks_after,
+ next->acquire_ip, distance, &trace);
+
+ if (!ret)
+ return 0;
+
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ &hlock_class(next)->locks_before,
+ next->acquire_ip, distance, &trace);
+ if (!ret)
+ return 0;
+
+ /*
+ * Debugging printouts:
+ */
+ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+ graph_unlock();
+ printk("\n new dependency: ");
+ print_lock_name(hlock_class(prev));
+ printk(" => ");
+ print_lock_name(hlock_class(next));
+ printk("\n");
+ dump_stack();
+ return graph_lock();
+ }
+ return 1;
+}
+
+/*
+ * Add the dependency to all directly-previous locks that are 'relevant'.
+ * The ones that are relevant are (in increasing distance from curr):
+ * all consecutive trylock entries and the final non-trylock entry - or
+ * the end of this context's lock-chain - whichever comes first.
+ */
+static int
+check_prevs_add(struct task_struct *curr, struct held_lock *next)
+{
+ int depth = curr->lockdep_depth;
+ int trylock_loop = 0;
+ struct held_lock *hlock;
+
+ /*
+ * Debugging checks.
+ *
+ * Depth must not be zero for a non-head lock:
+ */
+ if (!depth)
+ goto out_bug;
+ /*
+ * At least two relevant locks must exist for this
+ * to be a head:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ goto out_bug;
+
+ for (;;) {
+ int distance = curr->lockdep_depth - depth + 1;
+ hlock = curr->held_locks + depth - 1;
+ /*
+ * Only non-recursive-read entries get new dependencies
+ * added:
+ */
+ if (hlock->read != 2 && hlock->check) {
+ if (!check_prev_add(curr, hlock, next,
+ distance, trylock_loop))
+ return 0;
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
+ depth--;
+ /*
+ * End of lock-stack?
+ */
+ if (!depth)
+ break;
+ /*
+ * Stop the search if we cross into another context:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ break;
+ trylock_loop = 1;
+ }
+ return 1;
+out_bug:
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ /*
+ * Clearly we all shouldn't be here, but since we made it we
+ * can reliable say we messed up our state. See the above two
+ * gotos for reasons why we could possibly end up here.
+ */
+ WARN_ON(1);
+
+ return 0;
+}
+
+unsigned long nr_lock_chains;
+struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+int nr_chain_hlocks;
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+{
+ return lock_classes + chain_hlocks[chain->base + i];
+}
+
+/*
+ * Look up a dependency chain. If the key is not present yet then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct list_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+ struct held_lock *hlock_curr;
+ int i, j;
+
+ /*
+ * We might need to take the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ list_for_each_entry_rcu(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+cache_hit:
+ debug_atomic_inc(chain_lookup_hits);
+ if (very_verbose(class))
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ return 0;
+ }
+ }
+ if (very_verbose(class))
+ printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
+ if (!graph_lock())
+ return 0;
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ list_for_each_entry(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ graph_unlock();
+ goto cache_hit;
+ }
+ }
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
+ dump_stack();
+ return 0;
+ }
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ chain->irq_context = hlock->irq_context;
+ /* Find the first held_lock of current chain */
+ for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+ hlock_curr = curr->held_locks + i;
+ if (hlock_curr->irq_context != hlock->irq_context)
+ break;
+ }
+ i++;
+ chain->depth = curr->lockdep_depth + 1 - i;
+ if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = nr_chain_hlocks;
+ nr_chain_hlocks += chain->depth;
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ int lock_id = curr->held_locks[i].class_idx - 1;
+ chain_hlocks[chain->base + j] = lock_id;
+ }
+ chain_hlocks[chain->base + j] = class - lock_classes;
+ }
+ list_add_tail_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(chain_lookup_misses);
+ inc_chains();
+
+ return 1;
+}
+
+static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
+ struct held_lock *hlock, int chain_head, u64 chain_key)
+{
+ /*
+ * Trylock needs to maintain the stack of held locks, but it
+ * does not add new dependencies, because trylock can be done
+ * in any order.
+ *
+ * We look up the chain_key and do the O(N^2) check and update of
+ * the dependencies only if this is a new dependency chain.
+ * (If lookup_chain_cache() returns with 1 it acquires
+ * graph_lock for us)
+ */
+ if (!hlock->trylock && hlock->check &&
+ lookup_chain_cache(curr, hlock, chain_key)) {
+ /*
+ * Check whether last held lock:
+ *
+ * - is irq-safe, if this lock is irq-unsafe
+ * - is softirq-safe, if this lock is hardirq-unsafe
+ *
+ * And check whether the new lock's dependency graph
+ * could lead back to the previous lock.
+ *
+ * any of these scenarios could lead to a deadlock. If
+ * All validations
+ */
+ int ret = check_deadlock(curr, hlock, lock, hlock->read);
+
+ if (!ret)
+ return 0;
+ /*
+ * Mark recursive read, as we jump over it when
+ * building dependencies (just like we jump over
+ * trylock entries):
+ */
+ if (ret == 2)
+ hlock->read = 2;
+ /*
+ * Add dependency only if this lock is not the head
+ * of the chain, and if it's not a secondary read-lock:
+ */
+ if (!chain_head && ret != 2)
+ if (!check_prevs_add(curr, hlock))
+ return 0;
+ graph_unlock();
+ } else
+ /* after lookup_chain_cache(): */
+ if (unlikely(!debug_locks))
+ return 0;
+
+ return 1;
+}
+#else
+static inline int validate_chain(struct task_struct *curr,
+ struct lockdep_map *lock, struct held_lock *hlock,
+ int chain_head, u64 chain_key)
+{
+ return 1;
+}
+#endif
+
+/*
+ * We are building curr_chain_key incrementally, so double-check
+ * it from scratch, to make sure that it's done correctly:
+ */
+static void check_chain_key(struct task_struct *curr)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ struct held_lock *hlock, *prev_hlock = NULL;
+ unsigned int i, id;
+ u64 chain_key = 0;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+ if (chain_key != hlock->prev_chain_key) {
+ debug_locks_off();
+ /*
+ * We got mighty confused, our chain keys don't match
+ * with what we expect, someone trample on our task state?
+ */
+ WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)hlock->prev_chain_key);
+ return;
+ }
+ id = hlock->class_idx - 1;
+ /*
+ * Whoops ran out of static storage again?
+ */
+ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ return;
+
+ if (prev_hlock && (prev_hlock->irq_context !=
+ hlock->irq_context))
+ chain_key = 0;
+ chain_key = iterate_chain_key(chain_key, id);
+ prev_hlock = hlock;
+ }
+ if (chain_key != curr->curr_chain_key) {
+ debug_locks_off();
+ /*
+ * More smoking hash instead of calculating it, damn see these
+ * numbers float.. I bet that a pink elephant stepped on my memory.
+ */
+ WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)curr->curr_chain_key);
+ }
+#endif
+}
+
+static void
+print_usage_bug_scenario(struct held_lock *lock)
+{
+ struct lock_class *class = hlock_class(lock);
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0\n");
+ printk(" ----\n");
+ printk(" lock(");
+ __print_lock_name(class);
+ printk(");\n");
+ printk(" <Interrupt>\n");
+ printk(" lock(");
+ __print_lock_name(class);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
+static int
+print_usage_bug(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
+{
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("=================================\n");
+ printk("[ INFO: inconsistent lock state ]\n");
+ print_kernel_ident();
+ printk("---------------------------------\n");
+
+ printk("inconsistent {%s} -> {%s} usage.\n",
+ usage_str[prev_bit], usage_str[new_bit]);
+
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ curr->comm, task_pid_nr(curr),
+ trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ trace_hardirqs_enabled(curr),
+ trace_softirqs_enabled(curr));
+ print_lock(this);
+
+ printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+ print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+
+ print_irqtrace_events(curr);
+ printk("\nother info that might help us debug this:\n");
+ print_usage_bug_scenario(this);
+
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Print out an error if an invalid bit is set:
+ */
+static inline int
+valid_state(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
+{
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
+ return print_usage_bug(curr, this, bad_bit, new_bit);
+ return 1;
+}
+
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+
+/*
+ * print irq inversion bug:
+ */
+static int
+print_irq_inversion_bug(struct task_struct *curr,
+ struct lock_list *root, struct lock_list *other,
+ struct held_lock *this, int forwards,
+ const char *irqclass)
+{
+ struct lock_list *entry = other;
+ struct lock_list *middle = NULL;
+ int depth;
+
+ if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("=========================================================\n");
+ printk("[ INFO: possible irq lock inversion dependency detected ]\n");
+ print_kernel_ident();
+ printk("---------------------------------------------------------\n");
+ printk("%s/%d just changed the state of lock:\n",
+ curr->comm, task_pid_nr(curr));
+ print_lock(this);
+ if (forwards)
+ printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
+ else
+ printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
+ print_lock_name(other->class);
+ printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+
+ printk("\nother info that might help us debug this:\n");
+
+ /* Find a middle lock (if one exists) */
+ depth = get_lock_depth(other);
+ do {
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+ middle = entry;
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && entry != root && (depth >= 0));
+ if (forwards)
+ print_irq_lock_scenario(root, other,
+ middle ? middle->class : root->class, other->class);
+ else
+ print_irq_lock_scenario(other, root,
+ middle ? middle->class : other->class, root->class);
+
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+ if (!save_trace(&root->trace))
+ return 0;
+ print_shortest_lock_dependencies(other, root);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Prove that in the forwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_forwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_forwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
+}
+
+/*
+ * Prove that in the backwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_backwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+ struct lock_list root;
+ struct lock_list *uninitialized_var(target_entry);
+
+ root.parent = NULL;
+ root.class = hlock_class(this);
+ ret = find_usage_backwards(&root, bit, &target_entry);
+ if (ret < 0)
+ return print_bfs_bug(ret);
+ if (ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, irqclass);
+}
+
+void print_irqtrace_events(struct task_struct *curr)
+{
+ printk("irq event stamp: %u\n", curr->irq_events);
+ printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
+ print_ip_sym(curr->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
+ print_ip_sym(curr->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
+ print_ip_sym(curr->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
+ print_ip_sym(curr->softirq_disable_ip);
+}
+
+static int HARDIRQ_verbose(struct lock_class *class)
+{
+#if HARDIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+static int SOFTIRQ_verbose(struct lock_class *class)
+{
+#if SOFTIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+static int RECLAIM_FS_verbose(struct lock_class *class)
+{
+#if RECLAIM_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+#define STRICT_READ_CHECKS 1
+
+static int (*state_verbose_f[])(struct lock_class *class) = {
+#define LOCKDEP_STATE(__STATE) \
+ __STATE##_verbose,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+static inline int state_verbose(enum lock_usage_bit bit,
+ struct lock_class *class)
+{
+ return state_verbose_f[bit >> 2](class);
+}
+
+typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
+ enum lock_usage_bit bit, const char *name);
+
+static int
+mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ int excl_bit = exclusive_bit(new_bit);
+ int read = new_bit & 1;
+ int dir = new_bit & 2;
+
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ *
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ check_usage_f usage = dir ?
+ check_usage_backwards : check_usage_forwards;
+
+ /*
+ * Validate that this particular lock does not have conflicting
+ * usage states.
+ */
+ if (!valid_state(curr, this, new_bit, excl_bit))
+ return 0;
+
+ /*
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
+ */
+ if ((!read || !dir || STRICT_READ_CHECKS) &&
+ !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ return 0;
+
+ /*
+ * Check for read in write conflicts
+ */
+ if (!read) {
+ if (!valid_state(curr, this, new_bit, excl_bit + 1))
+ return 0;
+
+ if (STRICT_READ_CHECKS &&
+ !usage(curr, this, excl_bit + 1,
+ state_name(new_bit + 1)))
+ return 0;
+ }
+
+ if (state_verbose(new_bit, hlock_class(this)))
+ return 2;
+
+ return 1;
+}
+
+enum mark_type {
+#define LOCKDEP_STATE(__STATE) __STATE,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+};
+
+/*
+ * Mark all held locks with a usage bit:
+ */
+static int
+mark_held_locks(struct task_struct *curr, enum mark_type mark)
+{
+ enum lock_usage_bit usage_bit;
+ struct held_lock *hlock;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ usage_bit = 2 + (mark << 2); /* ENABLED */
+ if (hlock->read)
+ usage_bit += 1; /* READ */
+
+ BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+
+ if (!hlock->check)
+ continue;
+
+ if (!mark_lock(curr, hlock, usage_bit))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Hardirqs will be enabled:
+ */
+static void __trace_hardirqs_on_caller(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ /* we'll do an OFF -> ON transition: */
+ curr->hardirqs_enabled = 1;
+
+ /*
+ * We are going to turn hardirqs on, so set the
+ * usage bit for all held locks:
+ */
+ if (!mark_held_locks(curr, HARDIRQ))
+ return;
+ /*
+ * If we have softirqs enabled, then set the usage
+ * bit for all held locks. (disabled hardirqs prevented
+ * this bit from being set before)
+ */
+ if (curr->softirqs_enabled)
+ if (!mark_held_locks(curr, SOFTIRQ))
+ return;
+
+ curr->hardirq_enable_ip = ip;
+ curr->hardirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(hardirqs_on_events);
+}
+
+__visible void trace_hardirqs_on_caller(unsigned long ip)
+{
+ time_hardirqs_on(CALLER_ADDR0, ip);
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (unlikely(current->hardirqs_enabled)) {
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but losing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
+ return;
+ }
+
+ /*
+ * We're enabling irqs and according to our state above irqs weren't
+ * already enabled, yet we find the hardware thinks they are in fact
+ * enabled.. someone messed up their IRQ state tracing.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ /*
+ * See the fine text that goes along with this variable definition.
+ */
+ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+ return;
+
+ /*
+ * Can't allow enabling interrupts while in an interrupt handler,
+ * that's general bad form and such. Recursion, limited stack etc..
+ */
+ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ return;
+
+ current->lockdep_recursion = 1;
+ __trace_hardirqs_on_caller(ip);
+ current->lockdep_recursion = 0;
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void trace_hardirqs_on(void)
+{
+ trace_hardirqs_on_caller(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+/*
+ * Hardirqs were disabled:
+ */
+__visible void trace_hardirqs_off_caller(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ time_hardirqs_off(CALLER_ADDR0, ip);
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ /*
+ * So we're supposed to get called after you mask local IRQs, but for
+ * some reason the hardware doesn't quite think you did a proper job.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->hardirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->hardirqs_enabled = 0;
+ curr->hardirq_disable_ip = ip;
+ curr->hardirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(hardirqs_off_events);
+ } else
+ debug_atomic_inc(redundant_hardirqs_off);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+void trace_hardirqs_off(void)
+{
+ trace_hardirqs_off_caller(CALLER_ADDR0);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+/*
+ * Softirqs will be enabled:
+ */
+void trace_softirqs_on(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ /*
+ * We fancy IRQs being disabled here, see softirq.c, avoids
+ * funny state and nesting things.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ debug_atomic_inc(redundant_softirqs_on);
+ return;
+ }
+
+ current->lockdep_recursion = 1;
+ /*
+ * We'll do an OFF -> ON transition:
+ */
+ curr->softirqs_enabled = 1;
+ curr->softirq_enable_ip = ip;
+ curr->softirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(softirqs_on_events);
+ /*
+ * We are going to turn softirqs on, so set the
+ * usage bit for all held locks, if hardirqs are
+ * enabled too:
+ */
+ if (curr->hardirqs_enabled)
+ mark_held_locks(curr, SOFTIRQ);
+ current->lockdep_recursion = 0;
+}
+
+/*
+ * Softirqs were disabled:
+ */
+void trace_softirqs_off(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ /*
+ * We fancy IRQs being disabled here, see softirq.c
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->softirqs_enabled = 0;
+ curr->softirq_disable_ip = ip;
+ curr->softirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(softirqs_off_events);
+ /*
+ * Whoops, we wanted softirqs off, so why aren't they?
+ */
+ DEBUG_LOCKS_WARN_ON(!softirq_count());
+ } else
+ debug_atomic_inc(redundant_softirqs_off);
+}
+
+static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_WAIT))
+ return;
+
+ /* this guy won't enter reclaim */
+ if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return;
+
+ /*
+ * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
+ */
+ if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
+ return;
+
+ mark_held_locks(curr, RECLAIM_FS);
+}
+
+static void check_flags(unsigned long flags);
+
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lockdep_trace_alloc(gfp_mask, flags);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+{
+ /*
+ * If non-trylock use in a hardirq or softirq context, then
+ * mark the lock as used in these contexts:
+ */
+ if (!hlock->trylock) {
+ if (hlock->read) {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_HARDIRQ_READ))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_SOFTIRQ_READ))
+ return 0;
+ } else {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+ }
+ }
+ if (!hlock->hardirqs_off) {
+ if (hlock->read) {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQ_READ))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQ_READ))
+ return 0;
+ } else {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQ))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQ))
+ return 0;
+ }
+ }
+
+ /*
+ * We reuse the irq context infrastructure more broadly as a general
+ * context checking code. This tests GFP_FS recursion (a lock taken
+ * during reclaim for a GFP_FS allocation is held over a GFP_FS
+ * allocation).
+ */
+ if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
+ if (hlock->read) {
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
+ return 0;
+ } else {
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ unsigned int depth = curr->lockdep_depth;
+
+ /*
+ * Keep track of points where we cross into an interrupt context:
+ */
+ hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
+ curr->softirq_context;
+ if (depth) {
+ struct held_lock *prev_hlock;
+
+ prev_hlock = curr->held_locks + depth-1;
+ /*
+ * If we cross into another context, reset the
+ * hash key (this also prevents the checking and the
+ * adding of the dependency to 'prev'):
+ */
+ if (prev_hlock->irq_context != hlock->irq_context)
+ return 1;
+ }
+ return 0;
+}
+
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+static inline
+int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
+ return 1;
+}
+
+static inline int mark_irqflags(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 1;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+}
+
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+/*
+ * Mark a lock with a usage bit, and validate the state transition:
+ */
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
+{
+ unsigned int new_mask = 1 << new_bit, ret = 1;
+
+ /*
+ * If already set then do not dirty the cacheline,
+ * nor do any checks:
+ */
+ if (likely(hlock_class(this)->usage_mask & new_mask))
+ return 1;
+
+ if (!graph_lock())
+ return 0;
+ /*
+ * Make sure we didn't race:
+ */
+ if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
+ graph_unlock();
+ return 1;
+ }
+
+ hlock_class(this)->usage_mask |= new_mask;
+
+ if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+ return 0;
+
+ switch (new_bit) {
+#define LOCKDEP_STATE(__STATE) \
+ case LOCK_USED_IN_##__STATE: \
+ case LOCK_USED_IN_##__STATE##_READ: \
+ case LOCK_ENABLED_##__STATE: \
+ case LOCK_ENABLED_##__STATE##_READ:
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
+ return 0;
+ break;
+ case LOCK_USED:
+ debug_atomic_dec(nr_unused_locks);
+ break;
+ default:
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+ WARN_ON(1);
+ return 0;
+ }
+
+ graph_unlock();
+
+ /*
+ * We must printk outside of the graph_lock:
+ */
+ if (ret == 2) {
+ printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
+ print_lock(this);
+ print_irqtrace_events(curr);
+ dump_stack();
+ }
+
+ return ret;
+}
+
+/*
+ * Initialize a lock instance's lock-class mapping info:
+ */
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass)
+{
+ int i;
+
+ kmemcheck_mark_initialized(lock, sizeof(*lock));
+
+ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+ lock->class_cache[i] = NULL;
+
+#ifdef CONFIG_LOCK_STAT
+ lock->cpu = raw_smp_processor_id();
+#endif
+
+ /*
+ * Can't be having no nameless bastards around this place!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!name)) {
+ lock->name = "NULL";
+ return;
+ }
+
+ lock->name = name;
+
+ /*
+ * No key, no joy, we need to hash something.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!key))
+ return;
+ /*
+ * Sanity check, the lock-class key must be persistent:
+ */
+ if (!static_obj(key)) {
+ printk("BUG: key %p not in .data!\n", key);
+ /*
+ * What it says above ^^^^^, I suggest you read it.
+ */
+ DEBUG_LOCKS_WARN_ON(1);
+ return;
+ }
+ lock->key = key;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (subclass) {
+ unsigned long flags;
+
+ if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ register_lock_class(lock, subclass, 1);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL_GPL(lockdep_init_map);
+
+struct lock_class_key __lockdep_no_validate__;
+EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
+
+static int
+print_lock_nested_lock_not_held(struct task_struct *curr,
+ struct held_lock *hlock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("==================================\n");
+ printk("[ BUG: Nested lock was not taken ]\n");
+ print_kernel_ident();
+ printk("----------------------------------\n");
+
+ printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ print_lock(hlock);
+
+ printk("\nbut this task is not holding:\n");
+ printk("%s\n", hlock->nest_lock->name);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int __lock_is_held(struct lockdep_map *lock);
+
+/*
+ * This gets called for every mutex_lock*()/spin_lock*() operation.
+ * We maintain the dependency maps and validate the locking attempt:
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, int hardirqs_off,
+ struct lockdep_map *nest_lock, unsigned long ip,
+ int references)
+{
+ struct task_struct *curr = current;
+ struct lock_class *class = NULL;
+ struct held_lock *hlock;
+ unsigned int depth, id;
+ int chain_head = 0;
+ int class_idx;
+ u64 chain_key;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ /*
+ * Lockdep should run with IRQs disabled, otherwise we could
+ * get an interrupt which would want to take locks, which would
+ * end up in lockdep and have you got a head-ache already?
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ check = 0;
+
+ if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+ class = lock->class_cache[subclass];
+ /*
+ * Not cached?
+ */
+ if (unlikely(!class)) {
+ class = register_lock_class(lock, subclass, 0);
+ if (!class)
+ return 0;
+ }
+ atomic_inc((atomic_t *)&class->ops);
+ if (very_verbose(class)) {
+ printk("\nacquire class [%p] %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+ }
+
+ /*
+ * Add the lock to the list of currently held locks.
+ * (we dont increase the depth just yet, up until the
+ * dependency checks are done)
+ */
+ depth = curr->lockdep_depth;
+ /*
+ * Ran out of static storage for our per-task lock stack again have we?
+ */
+ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+ return 0;
+
+ class_idx = class - lock_classes + 1;
+
+ if (depth) {
+ hlock = curr->held_locks + depth - 1;
+ if (hlock->class_idx == class_idx && nest_lock) {
+ if (hlock->references)
+ hlock->references++;
+ else
+ hlock->references = 2;
+
+ return 1;
+ }
+ }
+
+ hlock = curr->held_locks + depth;
+ /*
+ * Plain impossible, we just registered it and checked it weren't no
+ * NULL like.. I bet this mushroom I ate was good!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+ hlock->class_idx = class_idx;
+ hlock->acquire_ip = ip;
+ hlock->instance = lock;
+ hlock->nest_lock = nest_lock;
+ hlock->trylock = trylock;
+ hlock->read = read;
+ hlock->check = check;
+ hlock->hardirqs_off = !!hardirqs_off;
+ hlock->references = references;
+#ifdef CONFIG_LOCK_STAT
+ hlock->waittime_stamp = 0;
+ hlock->holdtime_stamp = lockstat_clock();
+#endif
+
+ if (check && !mark_irqflags(curr, hlock))
+ return 0;
+
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
+ /*
+ * Calculate the chain hash: it's the combined hash of all the
+ * lock keys along the dependency chain. We save the hash value
+ * at every step so that we can get the current hash easily
+ * after unlock. The chain hash is then used to cache dependency
+ * results.
+ *
+ * The 'key ID' is what is the most compact key value to drive
+ * the hash, not class->key.
+ */
+ id = class - lock_classes;
+ /*
+ * Whoops, we did it again.. ran straight out of our static allocation.
+ */
+ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ return 0;
+
+ chain_key = curr->curr_chain_key;
+ if (!depth) {
+ /*
+ * How can we have a chain hash when we ain't got no keys?!
+ */
+ if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ return 0;
+ chain_head = 1;
+ }
+
+ hlock->prev_chain_key = chain_key;
+ if (separate_irq_context(curr, hlock)) {
+ chain_key = 0;
+ chain_head = 1;
+ }
+ chain_key = iterate_chain_key(chain_key, id);
+
+ if (nest_lock && !__lock_is_held(nest_lock))
+ return print_lock_nested_lock_not_held(curr, hlock, ip);
+
+ if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ return 0;
+
+ curr->curr_chain_key = chain_key;
+ curr->lockdep_depth++;
+ check_chain_key(curr);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ if (unlikely(!debug_locks))
+ return 0;
+#endif
+ if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+ debug_locks_off();
+ print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+ printk(KERN_DEBUG "depth: %i max: %lu!\n",
+ curr->lockdep_depth, MAX_LOCK_DEPTH);
+
+ lockdep_print_held_locks(current);
+ debug_show_all_locks();
+ dump_stack();
+
+ return 0;
+ }
+
+ if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+ max_lockdep_depth = curr->lockdep_depth;
+
+ return 1;
+}
+
+static int
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("=====================================\n");
+ printk("[ BUG: bad unlock balance detected! ]\n");
+ print_kernel_ident();
+ printk("-------------------------------------\n");
+ printk("%s/%d is trying to release lock (",
+ curr->comm, task_pid_nr(curr));
+ print_lockdep_cache(lock);
+ printk(") at:\n");
+ print_ip_sym(ip);
+ printk("but there are no more locks to release!\n");
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Common debugging checks for both nested and non-nested unlock:
+ */
+static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (unlikely(!debug_locks))
+ return 0;
+ /*
+ * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (curr->lockdep_depth <= 0)
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ return 1;
+}
+
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+{
+ if (hlock->instance == lock)
+ return 1;
+
+ if (hlock->references) {
+ struct lock_class *class = lock->class_cache[0];
+
+ if (!class)
+ class = look_up_lock_class(lock, 0);
+
+ /*
+ * If look_up_lock_class() failed to find a class, we're trying
+ * to test if we hold a lock that has never yet been acquired.
+ * Clearly if the lock hasn't been acquired _ever_, we're not
+ * holding it either, so report failure.
+ */
+ if (!class)
+ return 0;
+
+ /*
+ * References, but not a lock we're actually ref-counting?
+ * State got messed up, follow the sites that change ->references
+ * and try to make sense of it.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
+ return 0;
+
+ if (hlock->class_idx == class - lock_classes + 1)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int
+__lock_set_class(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, unsigned int subclass,
+ unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class *class;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ /*
+ * This function is about (re)setting the class of a held lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (match_held_lock(hlock, lock))
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+found_it:
+ lockdep_init_map(lock, name, key, 0);
+ class = register_lock_class(lock, subclass, 0);
+ hlock->class_idx = class - lock_classes + 1;
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references))
+ return 0;
+ }
+
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks in a
+ * potentially non-nested (out of order) manner. This is a
+ * relatively rare operation, as all the unlock APIs default
+ * to nested mode (which uses lock_release()):
+ */
+static int
+lock_release_non_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock, *prev_hlock;
+ unsigned int depth;
+ int i;
+
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
+ depth = curr->lockdep_depth;
+ /*
+ * So we're all set to release this lock.. wait what lock? We don't
+ * own any locks, you've been drinking again?
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (match_held_lock(hlock, lock))
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+found_it:
+ if (hlock->instance == lock)
+ lock_release_holdtime(hlock);
+
+ if (hlock->references) {
+ hlock->references--;
+ if (hlock->references) {
+ /*
+ * We had, and after removing one, still have
+ * references, the current lock stack is still
+ * valid. We're done!
+ */
+ return 1;
+ }
+ }
+
+ /*
+ * We have the right lock to unlock, 'hlock' points to it.
+ * Now we remove it from the stack, and add back the other
+ * entries (if any), recalculating the hash along the way:
+ */
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (i++; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references))
+ return 0;
+ }
+
+ /*
+ * We had N bottles of beer on the wall, we drank one, but now
+ * there's not N-1 bottles of beer left on the wall...
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+ return 0;
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static int lock_release_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock;
+ unsigned int depth;
+
+ /*
+ * Pop off the top of the lock stack:
+ */
+ depth = curr->lockdep_depth - 1;
+ hlock = curr->held_locks + depth;
+
+ /*
+ * Is the unlock non-nested:
+ */
+ if (hlock->instance != lock || hlock->references)
+ return lock_release_non_nested(curr, lock, ip);
+ curr->lockdep_depth--;
+
+ /*
+ * No more locks, but somehow we've got hash left over, who left it?
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
+ return 0;
+
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ lock_release_holdtime(hlock);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ hlock->prev_chain_key = 0;
+ hlock->class_idx = 0;
+ hlock->acquire_ip = 0;
+ hlock->irq_context = 0;
+#endif
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static void
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (!check_unlock(curr, lock, ip))
+ return;
+
+ if (nested) {
+ if (!lock_release_nested(curr, lock, ip))
+ return;
+ } else {
+ if (!lock_release_non_nested(curr, lock, ip))
+ return;
+ }
+
+ check_chain_key(curr);
+}
+
+static int __lock_is_held(struct lockdep_map *lock)
+{
+ struct task_struct *curr = current;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock))
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
+ defined(CONFIG_TRACE_IRQFLAGS)
+ if (!debug_locks)
+ return;
+
+ if (irqs_disabled_flags(flags)) {
+ if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ printk("possible reason: unannotated irqs-off.\n");
+ }
+ } else {
+ if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ printk("possible reason: unannotated irqs-on.\n");
+ }
+ }
+
+ /*
+ * We dont accurately track softirq state in e.g.
+ * hardirq contexts (such as on 4KSTACKS), so only
+ * check if not in hardirq contexts:
+ */
+ if (!hardirq_count()) {
+ if (softirq_count()) {
+ /* like the above, but with softirqs */
+ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
+ } else {
+ /* lick the above, does it taste good? */
+ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+ }
+ }
+
+ if (!debug_locks)
+ print_irqtrace_events(current);
+#endif
+}
+
+void lock_set_class(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, unsigned int subclass,
+ unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_set_class(lock, name, key, subclass, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_set_class);
+
+/*
+ * We are not always called with irqs disabled - do that here,
+ * and also avoid lockdep recursion:
+ */
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check,
+ struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+ __lock_acquire(lock, subclass, trylock, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquire);
+
+void lock_release(struct lockdep_map *lock, int nested,
+ unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ trace_lock_release(lock, ip);
+ __lock_release(lock, nested, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_release);
+
+int lock_is_held(struct lockdep_map *lock)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (unlikely(current->lockdep_recursion))
+ return 1; /* avoid false negative lockdep_assert_held() */
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ ret = __lock_is_held(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(lock_is_held);
+
+void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
+{
+ current->lockdep_reclaim_gfp = gfp_mask;
+}
+
+void lockdep_clear_current_reclaim_state(void)
+{
+ current->lockdep_reclaim_gfp = 0;
+}
+
+#ifdef CONFIG_LOCK_STAT
+static int
+print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("=================================\n");
+ printk("[ BUG: bad contention detected! ]\n");
+ print_kernel_ident();
+ printk("---------------------------------\n");
+ printk("%s/%d is trying to contend lock (",
+ curr->comm, task_pid_nr(curr));
+ print_lockdep_cache(lock);
+ printk(") at:\n");
+ print_ip_sym(ip);
+ printk("but there are no locks held!\n");
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static void
+__lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class_stats *stats;
+ unsigned int depth;
+ int i, contention_point, contending_point;
+
+ depth = curr->lockdep_depth;
+ /*
+ * Whee, we contended on this lock, except it seems we're not
+ * actually trying to acquire anything much at all..
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (match_held_lock(hlock, lock))
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ print_lock_contention_bug(curr, lock, ip);
+ return;
+
+found_it:
+ if (hlock->instance != lock)
+ return;
+
+ hlock->waittime_stamp = lockstat_clock();
+
+ contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+ contending_point = lock_point(hlock_class(hlock)->contending_point,
+ lock->ip);
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (contention_point < LOCKSTAT_POINTS)
+ stats->contention_point[contention_point]++;
+ if (contending_point < LOCKSTAT_POINTS)
+ stats->contending_point[contending_point]++;
+ if (lock->cpu != smp_processor_id())
+ stats->bounces[bounce_contended + !!hlock->read]++;
+ put_lock_stats(stats);
+}
+
+static void
+__lock_acquired(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class_stats *stats;
+ unsigned int depth;
+ u64 now, waittime = 0;
+ int i, cpu;
+
+ depth = curr->lockdep_depth;
+ /*
+ * Yay, we acquired ownership of this lock we didn't try to
+ * acquire, how the heck did that happen?
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (match_held_lock(hlock, lock))
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ print_lock_contention_bug(curr, lock, _RET_IP_);
+ return;
+
+found_it:
+ if (hlock->instance != lock)
+ return;
+
+ cpu = smp_processor_id();
+ if (hlock->waittime_stamp) {
+ now = lockstat_clock();
+ waittime = now - hlock->waittime_stamp;
+ hlock->holdtime_stamp = now;
+ }
+
+ trace_lock_acquired(lock, ip);
+
+ stats = get_lock_stats(hlock_class(hlock));
+ if (waittime) {
+ if (hlock->read)
+ lock_time_inc(&stats->read_waittime, waittime);
+ else
+ lock_time_inc(&stats->write_waittime, waittime);
+ }
+ if (lock->cpu != cpu)
+ stats->bounces[bounce_acquired + !!hlock->read]++;
+ put_lock_stats(stats);
+
+ lock->cpu = cpu;
+ lock->ip = ip;
+}
+
+void lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lock_stat))
+ return;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ trace_lock_contended(lock, ip);
+ __lock_contended(lock, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_contended);
+
+void lock_acquired(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lock_stat))
+ return;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lock_acquired(lock, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquired);
+#endif
+
+/*
+ * Used by the testsuite, sanitize the validator state
+ * after a simulated failure:
+ */
+
+void lockdep_reset(void)
+{
+ unsigned long flags;
+ int i;
+
+ raw_local_irq_save(flags);
+ current->curr_chain_key = 0;
+ current->lockdep_depth = 0;
+ current->lockdep_recursion = 0;
+ memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
+ nr_hardirq_chains = 0;
+ nr_softirq_chains = 0;
+ nr_process_chains = 0;
+ debug_locks = 1;
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_LIST_HEAD(chainhash_table + i);
+ raw_local_irq_restore(flags);
+}
+
+static void zap_class(struct lock_class *class)
+{
+ int i;
+
+ /*
+ * Remove all dependencies this lock is
+ * involved in:
+ */
+ for (i = 0; i < nr_list_entries; i++) {
+ if (list_entries[i].class == class)
+ list_del_rcu(&list_entries[i].entry);
+ }
+ /*
+ * Unhash the class and remove it from the all_lock_classes list:
+ */
+ list_del_rcu(&class->hash_entry);
+ list_del_rcu(&class->lock_entry);
+
+ RCU_INIT_POINTER(class->key, NULL);
+ RCU_INIT_POINTER(class->name, NULL);
+}
+
+static inline int within(const void *addr, void *start, unsigned long size)
+{
+ return addr >= start && addr < start + size;
+}
+
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one sync_sched() before getting here, so we're guaranteed
+ * nobody will look up these exact classes -- they're properly dead but still
+ * allocated.
+ */
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ struct lock_class *class;
+ struct list_head *head;
+ unsigned long flags;
+ int i;
+ int locked;
+
+ raw_local_irq_save(flags);
+ locked = graph_lock();
+
+ /*
+ * Unhash all classes that were created by this module:
+ */
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_rcu(class, head, hash_entry) {
+ if (within(class->key, start, size))
+ zap_class(class);
+ else if (within(class->name, start, size))
+ zap_class(class);
+ }
+ }
+
+ if (locked)
+ graph_unlock();
+ raw_local_irq_restore(flags);
+
+ /*
+ * Wait for any possible iterators from look_up_lock_class() to pass
+ * before continuing to free the memory they refer to.
+ *
+ * sync_sched() is sufficient because the read-side is IRQ disable.
+ */
+ synchronize_sched();
+
+ /*
+ * XXX at this point we could return the resources to the pool;
+ * instead we leak them. We would need to change to bitmap allocators
+ * instead of the linear allocators we have now.
+ */
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ struct lock_class *class;
+ struct list_head *head;
+ unsigned long flags;
+ int i, j;
+ int locked;
+
+ raw_local_irq_save(flags);
+
+ /*
+ * Remove all classes this lock might have:
+ */
+ for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
+ /*
+ * If the class exists we look it up and zap it:
+ */
+ class = look_up_lock_class(lock, j);
+ if (class)
+ zap_class(class);
+ }
+ /*
+ * Debug check: in the end all mapped classes should
+ * be gone.
+ */
+ locked = graph_lock();
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_rcu(class, head, hash_entry) {
+ int match = 0;
+
+ for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+ match |= class == lock->class_cache[j];
+
+ if (unlikely(match)) {
+ if (debug_locks_off_graph_unlock()) {
+ /*
+ * We all just reset everything, how did it match?
+ */
+ WARN_ON(1);
+ }
+ goto out_restore;
+ }
+ }
+ }
+ if (locked)
+ graph_unlock();
+
+out_restore:
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_init(void)
+{
+ int i;
+
+ /*
+ * Some architectures have their own start_kernel()
+ * code which calls lockdep_init(), while we also
+ * call lockdep_init() from the start_kernel() itself,
+ * and we want to initialize the hashes only once:
+ */
+ if (lockdep_initialized)
+ return;
+
+ for (i = 0; i < CLASSHASH_SIZE; i++)
+ INIT_LIST_HEAD(classhash_table + i);
+
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_LIST_HEAD(chainhash_table + i);
+
+ lockdep_initialized = 1;
+}
+
+void __init lockdep_info(void)
+{
+ printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+ printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+ printk(" memory used by lock dependency info: %lu kB\n",
+ (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
+ sizeof(struct list_head) * CLASSHASH_SIZE +
+ sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
+ sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
+ sizeof(struct list_head) * CHAINHASH_SIZE
+#ifdef CONFIG_PROVE_LOCKING
+ + sizeof(struct circular_queue)
+#endif
+ ) / 1024
+ );
+
+ printk(" per task-struct memory footprint: %lu bytes\n",
+ sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ if (lockdep_init_error) {
+ printk("WARNING: lockdep init error! lock-%s was acquired"
+ "before lockdep_init\n", lock_init_error);
+ printk("Call stack leading to lockdep invocation was:\n");
+ print_stack_trace(&lockdep_init_trace, 0);
+ }
+#endif
+}
+
+static void
+print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
+ const void *mem_to, struct held_lock *hlock)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n");
+ printk("=========================\n");
+ printk("[ BUG: held lock freed! ]\n");
+ print_kernel_ident();
+ printk("-------------------------\n");
+ printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
+ print_lock(hlock);
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+static inline int not_in_range(const void* mem_from, unsigned long mem_len,
+ const void* lock_from, unsigned long lock_len)
+{
+ return lock_from + lock_len <= mem_from ||
+ mem_from + mem_len <= lock_from;
+}
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a lock
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range of <from> to <to>:
+ */
+void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned long flags;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ local_irq_save(flags);
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ if (not_in_range(mem_from, mem_len, hlock->instance,
+ sizeof(*hlock->instance)))
+ continue;
+
+ print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock);
+ break;
+ }
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
+
+static void print_held_locks_bug(void)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n");
+ printk("=====================================\n");
+ printk("[ BUG: %s/%d still has locks held! ]\n",
+ current->comm, task_pid_nr(current));
+ print_kernel_ident();
+ printk("-------------------------------------\n");
+ lockdep_print_held_locks(current);
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+void debug_check_no_locks_held(void)
+{
+ if (unlikely(current->lockdep_depth > 0))
+ print_held_locks_bug();
+}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
+
+#ifdef __KERNEL__
+void debug_show_all_locks(void)
+{
+ struct task_struct *g, *p;
+ int count = 10;
+ int unlock = 1;
+
+ if (unlikely(!debug_locks)) {
+ printk("INFO: lockdep is turned off.\n");
+ return;
+ }
+ printk("\nShowing all locks held in the system:\n");
+
+ /*
+ * Here we try to get the tasklist_lock as hard as possible,
+ * if not successful after 2 seconds we ignore it (but keep
+ * trying). This is to enable a debug printout even if a
+ * tasklist_lock-holding task deadlocks or crashes.
+ */
+retry:
+ if (!read_trylock(&tasklist_lock)) {
+ if (count == 10)
+ printk("hm, tasklist_lock locked, retrying... ");
+ if (count) {
+ count--;
+ printk(" #%d", 10-count);
+ mdelay(200);
+ goto retry;
+ }
+ printk(" ignoring it.\n");
+ unlock = 0;
+ } else {
+ if (count != 10)
+ printk(KERN_CONT " locked it.\n");
+ }
+
+ do_each_thread(g, p) {
+ /*
+ * It's not reliable to print a task's held locks
+ * if it's not sleeping (or if it's not the current
+ * task):
+ */
+ if (p->state == TASK_RUNNING && p != current)
+ continue;
+ if (p->lockdep_depth)
+ lockdep_print_held_locks(p);
+ if (!unlock)
+ if (read_trylock(&tasklist_lock))
+ unlock = 1;
+ } while_each_thread(g, p);
+
+ printk("\n");
+ printk("=============================================\n\n");
+
+ if (unlock)
+ read_unlock(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(debug_show_all_locks);
+#endif
+
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void debug_show_held_locks(struct task_struct *task)
+{
+ if (unlikely(!debug_locks)) {
+ printk("INFO: lockdep is turned off.\n");
+ return;
+ }
+ lockdep_print_held_locks(task);
+}
+EXPORT_SYMBOL_GPL(debug_show_held_locks);
+
+asmlinkage __visible void lockdep_sys_exit(void)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(curr->lockdep_depth)) {
+ if (!debug_locks_off())
+ return;
+ printk("\n");
+ printk("================================================\n");
+ printk("[ BUG: lock held when returning to user space! ]\n");
+ print_kernel_ident();
+ printk("------------------------------------------------\n");
+ printk("%s/%d is leaving the kernel with locks still held!\n",
+ curr->comm, curr->pid);
+ lockdep_print_held_locks(curr);
+ }
+}
+
+void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
+{
+ struct task_struct *curr = current;
+
+#ifndef CONFIG_PROVE_RCU_REPEATEDLY
+ if (!debug_locks_off())
+ return;
+#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
+ /* Note: the following can be executed concurrently, so be careful. */
+ printk("\n");
+ printk("===============================\n");
+ printk("[ INFO: suspicious RCU usage. ]\n");
+ print_kernel_ident();
+ printk("-------------------------------\n");
+ printk("%s:%d %s!\n", file, line, s);
+ printk("\nother info that might help us debug this:\n\n");
+ printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ !rcu_lockdep_current_cpu_online()
+ ? "RCU used illegally from offline CPU!\n"
+ : !rcu_is_watching()
+ ? "RCU used illegally from idle CPU!\n"
+ : "",
+ rcu_scheduler_active, debug_locks);
+
+ /*
+ * If a CPU is in the RCU-free window in idle (ie: in the section
+ * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * considers that CPU to be in an "extended quiescent state",
+ * which means that RCU will be completely ignoring that CPU.
+ * Therefore, rcu_read_lock() and friends have absolutely no
+ * effect on a CPU running in that state. In other words, even if
+ * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
+ * delete data structures out from under it. RCU really has no
+ * choice here: we need to keep an RCU-free window in idle where
+ * the CPU may possibly enter into low power mode. This way we can
+ * notice an extended quiescent state to other CPUs that started a grace
+ * period. Otherwise we would delay any grace period as long as we run
+ * in the idle task.
+ *
+ * So complain bitterly if someone does call rcu_read_lock(),
+ * rcu_read_lock_bh() and so on from extended quiescent states.
+ */
+ if (!rcu_is_watching())
+ printk("RCU used illegally from extended quiescent state!\n");
+
+ lockdep_print_held_locks(curr);
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
new file mode 100644
index 000000000..51c4b24b6
--- /dev/null
+++ b/kernel/locking/lockdep_internals.h
@@ -0,0 +1,170 @@
+/*
+ * kernel/lockdep_internals.h
+ *
+ * Runtime locking correctness validator
+ *
+ * lockdep subsystem internal functions and variables.
+ */
+
+/*
+ * Lock-class usage-state bits:
+ */
+enum lock_usage_bit {
+#define LOCKDEP_STATE(__STATE) \
+ LOCK_USED_IN_##__STATE, \
+ LOCK_USED_IN_##__STATE##_READ, \
+ LOCK_ENABLED_##__STATE, \
+ LOCK_ENABLED_##__STATE##_READ,
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ LOCK_USED,
+ LOCK_USAGE_STATES
+};
+
+/*
+ * Usage-state bitmasks:
+ */
+#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
+
+enum {
+#define LOCKDEP_STATE(__STATE) \
+ __LOCKF(USED_IN_##__STATE) \
+ __LOCKF(USED_IN_##__STATE##_READ) \
+ __LOCKF(ENABLED_##__STATE) \
+ __LOCKF(ENABLED_##__STATE##_READ)
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+ __LOCKF(USED)
+};
+
+#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
+#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+
+#define LOCKF_ENABLED_IRQ_READ \
+ (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
+#define LOCKF_USED_IN_IRQ_READ \
+ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+
+/*
+ * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
+ * we track.
+ *
+ * We use the per-lock dependency maps in two ways: we grow it by adding
+ * every to-be-taken lock to all currently held lock's own dependency
+ * table (if it's not there yet), and we check it for lock order
+ * conflicts and deadlocks.
+ */
+#define MAX_LOCKDEP_ENTRIES 32768UL
+
+#define MAX_LOCKDEP_CHAINS_BITS 16
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the hash_lock.
+ */
+#define MAX_STACK_TRACE_ENTRIES 524288UL
+
+extern struct list_head all_lock_classes;
+extern struct lock_chain lock_chains[];
+
+#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+
+extern void get_usage_chars(struct lock_class *class,
+ char usage[LOCK_USAGE_CHARS]);
+
+extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
+
+extern unsigned long nr_lock_classes;
+extern unsigned long nr_list_entries;
+extern unsigned long nr_lock_chains;
+extern int nr_chain_hlocks;
+extern unsigned long nr_stack_trace_entries;
+
+extern unsigned int nr_hardirq_chains;
+extern unsigned int nr_softirq_chains;
+extern unsigned int nr_process_chains;
+extern unsigned int max_lockdep_depth;
+extern unsigned int max_recursion_depth;
+
+extern unsigned int max_bfs_queue_depth;
+
+#ifdef CONFIG_PROVE_LOCKING
+extern unsigned long lockdep_count_forward_deps(struct lock_class *);
+extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+ return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+
+#include <asm/local.h>
+/*
+ * Various lockdep statistics.
+ * We want them per cpu as they are often accessed in fast path
+ * and we want to avoid too much cache bouncing.
+ */
+struct lockdep_stats {
+ int chain_lookup_hits;
+ int chain_lookup_misses;
+ int hardirqs_on_events;
+ int hardirqs_off_events;
+ int redundant_hardirqs_on;
+ int redundant_hardirqs_off;
+ int softirqs_on_events;
+ int softirqs_off_events;
+ int redundant_softirqs_on;
+ int redundant_softirqs_off;
+ int nr_unused_locks;
+ int nr_cyclic_checks;
+ int nr_cyclic_check_recursions;
+ int nr_find_usage_forwards_checks;
+ int nr_find_usage_forwards_recursions;
+ int nr_find_usage_backwards_checks;
+ int nr_find_usage_backwards_recursions;
+};
+
+DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
+
+#define __debug_atomic_inc(ptr) \
+ this_cpu_inc(lockdep_stats.ptr);
+
+#define debug_atomic_inc(ptr) { \
+ WARN_ON_ONCE(!irqs_disabled()); \
+ __this_cpu_inc(lockdep_stats.ptr); \
+}
+
+#define debug_atomic_dec(ptr) { \
+ WARN_ON_ONCE(!irqs_disabled()); \
+ __this_cpu_dec(lockdep_stats.ptr); \
+}
+
+#define debug_atomic_read(ptr) ({ \
+ struct lockdep_stats *__cpu_lockdep_stats; \
+ unsigned long long __total = 0; \
+ int __cpu; \
+ for_each_possible_cpu(__cpu) { \
+ __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
+ __total += __cpu_lockdep_stats->ptr; \
+ } \
+ __total; \
+})
+#else
+# define __debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
new file mode 100644
index 000000000..d83d798be
--- /dev/null
+++ b/kernel/locking/lockdep_proc.c
@@ -0,0 +1,695 @@
+/*
+ * kernel/lockdep_proc.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Code for /proc/lockdep and /proc/lockdep_stats:
+ *
+ */
+#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+
+#include "lockdep_internals.h"
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &all_lock_classes, pos);
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ return seq_list_start_head(&all_lock_classes, *pos);
+}
+
+static void l_stop(struct seq_file *m, void *v)
+{
+}
+
+static void print_name(struct seq_file *m, struct lock_class *class)
+{
+ char str[KSYM_NAME_LEN];
+ const char *name = class->name;
+
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ seq_printf(m, "%s", name);
+ } else{
+ seq_printf(m, "%s", name);
+ if (class->name_version > 1)
+ seq_printf(m, "#%d", class->name_version);
+ if (class->subclass)
+ seq_printf(m, "/%d", class->subclass);
+ }
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_list *entry;
+ char usage[LOCK_USAGE_CHARS];
+
+ if (v == &all_lock_classes) {
+ seq_printf(m, "all lock classes:\n");
+ return 0;
+ }
+
+ seq_printf(m, "%p", class->key);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ seq_printf(m, " OPS:%8ld", class->ops);
+#endif
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
+#endif
+
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
+
+ seq_printf(m, ": ");
+ print_name(m, class);
+ seq_puts(m, "\n");
+
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations lockdep_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int lockdep_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lockdep_ops);
+}
+
+static const struct file_operations proc_lockdep_operations = {
+ .open = lockdep_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#ifdef CONFIG_PROVE_LOCKING
+static void *lc_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ if (*pos - 1 < nr_lock_chains)
+ return lock_chains + (*pos - 1);
+
+ return NULL;
+}
+
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return lc_start(m, pos);
+}
+
+static void lc_stop(struct seq_file *m, void *v)
+{
+}
+
+static int lc_show(struct seq_file *m, void *v)
+{
+ struct lock_chain *chain = v;
+ struct lock_class *class;
+ int i;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(m, "all lock chains:\n");
+ return 0;
+ }
+
+ seq_printf(m, "irq_context: %d\n", chain->irq_context);
+
+ for (i = 0; i < chain->depth; i++) {
+ class = lock_chain_get_class(chain, i);
+ if (!class->key)
+ continue;
+
+ seq_printf(m, "[%p] ", class->key);
+ print_name(m, class);
+ seq_puts(m, "\n");
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations lockdep_chains_ops = {
+ .start = lc_start,
+ .next = lc_next,
+ .stop = lc_stop,
+ .show = lc_show,
+};
+
+static int lockdep_chains_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lockdep_chains_ops);
+}
+
+static const struct file_operations proc_lockdep_chains_operations = {
+ .open = lockdep_chains_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* CONFIG_PROVE_LOCKING */
+
+static void lockdep_stats_debug_show(struct seq_file *m)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
+ hi2 = debug_atomic_read(hardirqs_off_events),
+ hr1 = debug_atomic_read(redundant_hardirqs_on),
+ hr2 = debug_atomic_read(redundant_hardirqs_off),
+ si1 = debug_atomic_read(softirqs_on_events),
+ si2 = debug_atomic_read(softirqs_off_events),
+ sr1 = debug_atomic_read(redundant_softirqs_on),
+ sr2 = debug_atomic_read(redundant_softirqs_off);
+
+ seq_printf(m, " chain lookup misses: %11llu\n",
+ debug_atomic_read(chain_lookup_misses));
+ seq_printf(m, " chain lookup hits: %11llu\n",
+ debug_atomic_read(chain_lookup_hits));
+ seq_printf(m, " cyclic checks: %11llu\n",
+ debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " find-mask forwards checks: %11llu\n",
+ debug_atomic_read(nr_find_usage_forwards_checks));
+ seq_printf(m, " find-mask backwards checks: %11llu\n",
+ debug_atomic_read(nr_find_usage_backwards_checks));
+
+ seq_printf(m, " hardirq on events: %11llu\n", hi1);
+ seq_printf(m, " hardirq off events: %11llu\n", hi2);
+ seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
+ seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
+ seq_printf(m, " softirq on events: %11llu\n", si1);
+ seq_printf(m, " softirq off events: %11llu\n", si2);
+ seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
+ seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
+#endif
+}
+
+static int lockdep_stats_show(struct seq_file *m, void *v)
+{
+ struct lock_class *class;
+ unsigned long nr_unused = 0, nr_uncategorized = 0,
+ nr_irq_safe = 0, nr_irq_unsafe = 0,
+ nr_softirq_safe = 0, nr_softirq_unsafe = 0,
+ nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
+ nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
+ nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
+ nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
+ sum_forward_deps = 0;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+
+ if (class->usage_mask == 0)
+ nr_unused++;
+ if (class->usage_mask == LOCKF_USED)
+ nr_uncategorized++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ)
+ nr_irq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQ)
+ nr_irq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ nr_softirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
+ nr_softirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ nr_hardirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
+ nr_hardirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
+ nr_irq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
+ nr_irq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
+ nr_softirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
+ nr_softirq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
+ nr_hardirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
+ nr_hardirq_read_unsafe++;
+
+#ifdef CONFIG_PROVE_LOCKING
+ sum_forward_deps += lockdep_count_forward_deps(class);
+#endif
+ }
+#ifdef CONFIG_DEBUG_LOCKDEP
+ DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
+#endif
+ seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
+ nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
+ nr_list_entries, MAX_LOCKDEP_ENTRIES);
+ seq_printf(m, " indirect dependencies: %11lu\n",
+ sum_forward_deps);
+
+ /*
+ * Total number of dependencies:
+ *
+ * All irq-safe locks may nest inside irq-unsafe locks,
+ * plus all the other known dependencies:
+ */
+ seq_printf(m, " all direct dependencies: %11lu\n",
+ nr_irq_unsafe * nr_irq_safe +
+ nr_hardirq_unsafe * nr_hardirq_safe +
+ nr_list_entries);
+
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
+ nr_lock_chains, MAX_LOCKDEP_CHAINS);
+ seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
+ nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ seq_printf(m, " in-hardirq chains: %11u\n",
+ nr_hardirq_chains);
+ seq_printf(m, " in-softirq chains: %11u\n",
+ nr_softirq_chains);
+#endif
+ seq_printf(m, " in-process chains: %11u\n",
+ nr_process_chains);
+ seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
+ nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+ seq_printf(m, " combined max dependencies: %11u\n",
+ (nr_hardirq_chains + 1) *
+ (nr_softirq_chains + 1) *
+ (nr_process_chains + 1)
+ );
+ seq_printf(m, " hardirq-safe locks: %11lu\n",
+ nr_hardirq_safe);
+ seq_printf(m, " hardirq-unsafe locks: %11lu\n",
+ nr_hardirq_unsafe);
+ seq_printf(m, " softirq-safe locks: %11lu\n",
+ nr_softirq_safe);
+ seq_printf(m, " softirq-unsafe locks: %11lu\n",
+ nr_softirq_unsafe);
+ seq_printf(m, " irq-safe locks: %11lu\n",
+ nr_irq_safe);
+ seq_printf(m, " irq-unsafe locks: %11lu\n",
+ nr_irq_unsafe);
+
+ seq_printf(m, " hardirq-read-safe locks: %11lu\n",
+ nr_hardirq_read_safe);
+ seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
+ nr_hardirq_read_unsafe);
+ seq_printf(m, " softirq-read-safe locks: %11lu\n",
+ nr_softirq_read_safe);
+ seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
+ nr_softirq_read_unsafe);
+ seq_printf(m, " irq-read-safe locks: %11lu\n",
+ nr_irq_read_safe);
+ seq_printf(m, " irq-read-unsafe locks: %11lu\n",
+ nr_irq_read_unsafe);
+
+ seq_printf(m, " uncategorized locks: %11lu\n",
+ nr_uncategorized);
+ seq_printf(m, " unused locks: %11lu\n",
+ nr_unused);
+ seq_printf(m, " max locking depth: %11u\n",
+ max_lockdep_depth);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " max bfs queue depth: %11u\n",
+ max_bfs_queue_depth);
+#endif
+ lockdep_stats_debug_show(m);
+ seq_printf(m, " debug_locks: %11u\n",
+ debug_locks);
+
+ return 0;
+}
+
+static int lockdep_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lockdep_stats_show, NULL);
+}
+
+static const struct file_operations proc_lockdep_stats_operations = {
+ .open = lockdep_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_LOCK_STAT
+
+struct lock_stat_data {
+ struct lock_class *class;
+ struct lock_class_stats stats;
+};
+
+struct lock_stat_seq {
+ struct lock_stat_data *iter_end;
+ struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
+};
+
+/*
+ * sort on absolute number of contentions
+ */
+static int lock_stat_cmp(const void *l, const void *r)
+{
+ const struct lock_stat_data *dl = l, *dr = r;
+ unsigned long nl, nr;
+
+ nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr;
+ nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr;
+
+ return nr - nl;
+}
+
+static void seq_line(struct seq_file *m, char c, int offset, int length)
+{
+ int i;
+
+ for (i = 0; i < offset; i++)
+ seq_puts(m, " ");
+ for (i = 0; i < length; i++)
+ seq_printf(m, "%c", c);
+ seq_puts(m, "\n");
+}
+
+static void snprint_time(char *buf, size_t bufsiz, s64 nr)
+{
+ s64 div;
+ s32 rem;
+
+ nr += 5; /* for display rounding */
+ div = div_s64_rem(nr, 1000, &rem);
+ snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
+}
+
+static void seq_time(struct seq_file *m, s64 time)
+{
+ char num[15];
+
+ snprint_time(num, sizeof(num), time);
+ seq_printf(m, " %14s", num);
+}
+
+static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
+{
+ seq_printf(m, "%14lu", lt->nr);
+ seq_time(m, lt->min);
+ seq_time(m, lt->max);
+ seq_time(m, lt->total);
+ seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+}
+
+static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
+{
+ struct lockdep_subclass_key *ckey;
+ struct lock_class_stats *stats;
+ struct lock_class *class;
+ const char *cname;
+ int i, namelen;
+ char name[39];
+
+ class = data->class;
+ stats = &data->stats;
+
+ namelen = 38;
+ if (class->name_version > 1)
+ namelen -= 2; /* XXX truncates versions > 9 */
+ if (class->subclass)
+ namelen -= 2;
+
+ rcu_read_lock_sched();
+ cname = rcu_dereference_sched(class->name);
+ ckey = rcu_dereference_sched(class->key);
+
+ if (!cname && !ckey) {
+ rcu_read_unlock_sched();
+ return;
+
+ } else if (!cname) {
+ char str[KSYM_NAME_LEN];
+ const char *key_name;
+
+ key_name = __get_key_name(ckey, str);
+ snprintf(name, namelen, "%s", key_name);
+ } else {
+ snprintf(name, namelen, "%s", cname);
+ }
+ rcu_read_unlock_sched();
+
+ namelen = strlen(name);
+ if (class->name_version > 1) {
+ snprintf(name+namelen, 3, "#%d", class->name_version);
+ namelen += 2;
+ }
+ if (class->subclass) {
+ snprintf(name+namelen, 3, "/%d", class->subclass);
+ namelen += 2;
+ }
+
+ if (stats->write_holdtime.nr) {
+ if (stats->read_holdtime.nr)
+ seq_printf(m, "%38s-W:", name);
+ else
+ seq_printf(m, "%40s:", name);
+
+ seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]);
+ seq_lock_time(m, &stats->write_waittime);
+ seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]);
+ seq_lock_time(m, &stats->write_holdtime);
+ seq_puts(m, "\n");
+ }
+
+ if (stats->read_holdtime.nr) {
+ seq_printf(m, "%38s-R:", name);
+ seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]);
+ seq_lock_time(m, &stats->read_waittime);
+ seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]);
+ seq_lock_time(m, &stats->read_holdtime);
+ seq_puts(m, "\n");
+ }
+
+ if (stats->read_waittime.nr + stats->write_waittime.nr == 0)
+ return;
+
+ if (stats->read_holdtime.nr)
+ namelen += 2;
+
+ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+ char ip[32];
+
+ if (class->contention_point[i] == 0)
+ break;
+
+ if (!i)
+ seq_line(m, '-', 40-namelen, namelen);
+
+ snprintf(ip, sizeof(ip), "[<%p>]",
+ (void *)class->contention_point[i]);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contention_point[i],
+ ip, (void *)class->contention_point[i]);
+ }
+ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+ char ip[32];
+
+ if (class->contending_point[i] == 0)
+ break;
+
+ if (!i)
+ seq_line(m, '-', 40-namelen, namelen);
+
+ snprintf(ip, sizeof(ip), "[<%p>]",
+ (void *)class->contending_point[i]);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contending_point[i],
+ ip, (void *)class->contending_point[i]);
+ }
+ if (i) {
+ seq_puts(m, "\n");
+ seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));
+ seq_puts(m, "\n");
+ }
+}
+
+static void seq_header(struct seq_file *m)
+{
+ seq_puts(m, "lock_stat version 0.4\n");
+
+ if (unlikely(!debug_locks))
+ seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
+
+ seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
+ seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "
+ "%14s %14s\n",
+ "class name",
+ "con-bounces",
+ "contentions",
+ "waittime-min",
+ "waittime-max",
+ "waittime-total",
+ "waittime-avg",
+ "acq-bounces",
+ "acquisitions",
+ "holdtime-min",
+ "holdtime-max",
+ "holdtime-total",
+ "holdtime-avg");
+ seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));
+ seq_printf(m, "\n");
+}
+
+static void *ls_start(struct seq_file *m, loff_t *pos)
+{
+ struct lock_stat_seq *data = m->private;
+ struct lock_stat_data *iter;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ iter = data->stats + (*pos - 1);
+ if (iter >= data->iter_end)
+ iter = NULL;
+
+ return iter;
+}
+
+static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ls_start(m, pos);
+}
+
+static void ls_stop(struct seq_file *m, void *v)
+{
+}
+
+static int ls_show(struct seq_file *m, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_header(m);
+ else
+ seq_stats(m, v);
+
+ return 0;
+}
+
+static const struct seq_operations lockstat_ops = {
+ .start = ls_start,
+ .next = ls_next,
+ .stop = ls_stop,
+ .show = ls_show,
+};
+
+static int lock_stat_open(struct inode *inode, struct file *file)
+{
+ int res;
+ struct lock_class *class;
+ struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq));
+
+ if (!data)
+ return -ENOMEM;
+
+ res = seq_open(file, &lockstat_ops);
+ if (!res) {
+ struct lock_stat_data *iter = data->stats;
+ struct seq_file *m = file->private_data;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iter->class = class;
+ iter->stats = lock_stats(class);
+ iter++;
+ }
+ data->iter_end = iter;
+
+ sort(data->stats, data->iter_end - data->stats,
+ sizeof(struct lock_stat_data),
+ lock_stat_cmp, NULL);
+
+ m->private = data;
+ } else
+ vfree(data);
+
+ return res;
+}
+
+static ssize_t lock_stat_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct lock_class *class;
+ char c;
+
+ if (count) {
+ if (get_user(c, buf))
+ return -EFAULT;
+
+ if (c != '0')
+ return count;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry)
+ clear_lock_stats(class);
+ }
+ return count;
+}
+
+static int lock_stat_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+
+ vfree(seq->private);
+ return seq_release(inode, file);
+}
+
+static const struct file_operations proc_lock_stat_operations = {
+ .open = lock_stat_open,
+ .write = lock_stat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = lock_stat_release,
+};
+#endif /* CONFIG_LOCK_STAT */
+
+static int __init lockdep_proc_init(void)
+{
+ proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+#ifdef CONFIG_PROVE_LOCKING
+ proc_create("lockdep_chains", S_IRUSR, NULL,
+ &proc_lockdep_chains_operations);
+#endif
+ proc_create("lockdep_stats", S_IRUSR, NULL,
+ &proc_lockdep_stats_operations);
+
+#ifdef CONFIG_LOCK_STAT
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
+ &proc_lock_stat_operations);
+#endif
+
+ return 0;
+}
+
+__initcall(lockdep_proc_init);
+
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
new file mode 100644
index 000000000..995b0cc2b
--- /dev/null
+++ b/kernel/locking/lockdep_states.h
@@ -0,0 +1,9 @@
+/*
+ * Lockdep states,
+ *
+ * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
+ * you add one, or come up with a nice dynamic solution.
+ */
+LOCKDEP_STATE(HARDIRQ)
+LOCKDEP_STATE(SOFTIRQ)
+LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 000000000..ec8cce259
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,815 @@
+/*
+ * Module-based torture test facility for locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/spinlock.h>
+#include <linux/rwlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/moduleparam.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+torture_param(int, nwriters_stress, -1,
+ "Number of write-locking stress-test threads");
+torture_param(int, nreaders_stress, -1,
+ "Number of read-locking stress-test threads");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3,
+ "Number of jiffies between shuffles, 0=disable");
+torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "spin_lock";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type,
+ "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+
+static struct task_struct *stats_task;
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+
+static bool lock_is_write_held;
+static bool lock_is_read_held;
+
+struct lock_stress_stats {
+ long n_lock_fail;
+ long n_lock_acquired;
+};
+
+#if defined(MODULE)
+#define LOCKTORTURE_RUNNABLE_INIT 1
+#else
+#define LOCKTORTURE_RUNNABLE_INIT 0
+#endif
+int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+module_param(torture_runnable, int, 0444);
+MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
+
+/* Forward reference. */
+static void lock_torture_cleanup(void);
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+struct lock_torture_ops {
+ void (*init)(void);
+ int (*writelock)(void);
+ void (*write_delay)(struct torture_random_state *trsp);
+ void (*writeunlock)(void);
+ int (*readlock)(void);
+ void (*read_delay)(struct torture_random_state *trsp);
+ void (*readunlock)(void);
+ unsigned long flags;
+ const char *name;
+};
+
+struct lock_torture_cxt {
+ int nrealwriters_stress;
+ int nrealreaders_stress;
+ bool debug_lock;
+ atomic_t n_lock_torture_errors;
+ struct lock_torture_ops *cur_ops;
+ struct lock_stress_stats *lwsa; /* writer statistics */
+ struct lock_stress_stats *lrsa; /* reader statistics */
+};
+static struct lock_torture_cxt cxt = { 0, 0, false,
+ ATOMIC_INIT(0),
+ NULL, NULL};
+/*
+ * Definitions for lock torture testing.
+ */
+
+static int torture_lock_busted_write_lock(void)
+{
+ return 0; /* BUGGY, do not use in real life!!! */
+}
+
+static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_us = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_lock_busted_write_unlock(void)
+{
+ /* BUGGY, do not use in real life!!! */
+}
+
+static struct lock_torture_ops lock_busted_ops = {
+ .writelock = torture_lock_busted_write_lock,
+ .write_delay = torture_lock_busted_write_delay,
+ .writeunlock = torture_lock_busted_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "lock_busted"
+};
+
+static DEFINE_SPINLOCK(torture_spinlock);
+
+static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+{
+ spin_lock(&torture_spinlock);
+ return 0;
+}
+
+static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_us = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+{
+ spin_unlock(&torture_spinlock);
+}
+
+static struct lock_torture_ops spin_lock_ops = {
+ .writelock = torture_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "spin_lock"
+};
+
+static int torture_spin_lock_write_lock_irq(void)
+__acquires(torture_spinlock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&torture_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_lock_spin_write_unlock_irq(void)
+__releases(torture_spinlock)
+{
+ spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops spin_lock_irq_ops = {
+ .writelock = torture_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_lock_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "spin_lock_irq"
+};
+
+static DEFINE_RWLOCK(torture_rwlock);
+
+static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+{
+ write_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+{
+ write_unlock(&torture_rwlock);
+}
+
+static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+{
+ read_lock(&torture_rwlock);
+ return 0;
+}
+
+static void torture_rwlock_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 10;
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ else
+ udelay(shortdelay_us);
+}
+
+static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+{
+ read_unlock(&torture_rwlock);
+}
+
+static struct lock_torture_ops rw_lock_ops = {
+ .writelock = torture_rwlock_write_lock,
+ .write_delay = torture_rwlock_write_delay,
+ .writeunlock = torture_rwlock_write_unlock,
+ .readlock = torture_rwlock_read_lock,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock,
+ .name = "rw_lock"
+};
+
+static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_write_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&torture_rwlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_rwlock_read_unlock_irq(void)
+__releases(torture_rwlock)
+{
+ write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops rw_lock_irq_ops = {
+ .writelock = torture_rwlock_write_lock_irq,
+ .write_delay = torture_rwlock_write_delay,
+ .writeunlock = torture_rwlock_write_unlock_irq,
+ .readlock = torture_rwlock_read_lock_irq,
+ .read_delay = torture_rwlock_read_delay,
+ .readunlock = torture_rwlock_read_unlock_irq,
+ .name = "rw_lock_irq"
+};
+
+static DEFINE_MUTEX(torture_mutex);
+
+static int torture_mutex_lock(void) __acquires(torture_mutex)
+{
+ mutex_lock(&torture_mutex);
+ return 0;
+}
+
+static void torture_mutex_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 5);
+ else
+ mdelay(longdelay_ms / 5);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_mutex_unlock(void) __releases(torture_mutex)
+{
+ mutex_unlock(&torture_mutex);
+}
+
+static struct lock_torture_ops mutex_lock_ops = {
+ .writelock = torture_mutex_lock,
+ .write_delay = torture_mutex_delay,
+ .writeunlock = torture_mutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "mutex_lock"
+};
+
+static DECLARE_RWSEM(torture_rwsem);
+static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+{
+ down_write(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 10);
+ else
+ mdelay(longdelay_ms / 10);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+{
+ up_write(&torture_rwsem);
+}
+
+static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+{
+ down_read(&torture_rwsem);
+ return 0;
+}
+
+static void torture_rwsem_read_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_ms = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms * 2);
+ else
+ mdelay(longdelay_ms / 2);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+{
+ up_read(&torture_rwsem);
+}
+
+static struct lock_torture_ops rwsem_lock_ops = {
+ .writelock = torture_rwsem_down_write,
+ .write_delay = torture_rwsem_write_delay,
+ .writeunlock = torture_rwsem_up_write,
+ .readlock = torture_rwsem_down_read,
+ .read_delay = torture_rwsem_read_delay,
+ .readunlock = torture_rwsem_up_read,
+ .name = "rwsem_lock"
+};
+
+/*
+ * Lock torture writer kthread. Repeatedly acquires and releases
+ * the lock, checking for duplicate acquisitions.
+ */
+static int lock_torture_writer(void *arg)
+{
+ struct lock_stress_stats *lwsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_writer task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+
+ cxt.cur_ops->writelock();
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = 1;
+ if (WARN_ON_ONCE(lock_is_read_held))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ cxt.cur_ops->write_delay(&rand);
+ lock_is_write_held = 0;
+ cxt.cur_ops->writeunlock();
+
+ stutter_wait("lock_torture_writer");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_writer");
+ return 0;
+}
+
+/*
+ * Lock torture reader kthread. Repeatedly acquires and releases
+ * the reader lock.
+ */
+static int lock_torture_reader(void *arg)
+{
+ struct lock_stress_stats *lrsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
+
+ cxt.cur_ops->readlock();
+ lock_is_read_held = 1;
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lrsp->n_lock_fail++; /* rare, but... */
+
+ lrsp->n_lock_acquired++;
+ cxt.cur_ops->read_delay(&rand);
+ lock_is_read_held = 0;
+ cxt.cur_ops->readunlock();
+
+ stutter_wait("lock_torture_reader");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_reader");
+ return 0;
+}
+
+/*
+ * Create an lock-torture-statistics message in the specified buffer.
+ */
+static void __torture_print_stats(char *page,
+ struct lock_stress_stats *statp, bool write)
+{
+ bool fail = 0;
+ int i, n_stress;
+ long max = 0;
+ long min = statp[0].n_lock_acquired;
+ long long sum = 0;
+
+ n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
+ for (i = 0; i < n_stress; i++) {
+ if (statp[i].n_lock_fail)
+ fail = true;
+ sum += statp[i].n_lock_acquired;
+ if (max < statp[i].n_lock_fail)
+ max = statp[i].n_lock_fail;
+ if (min > statp[i].n_lock_fail)
+ min = statp[i].n_lock_fail;
+ }
+ page += sprintf(page,
+ "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ write ? "Writes" : "Reads ",
+ sum, max, min, max / 2 > min ? "???" : "",
+ fail, fail ? "!!!" : "");
+ if (fail)
+ atomic_inc(&cxt.n_lock_torture_errors);
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only one
+ * call to this function at a given time!!! This is normally accomplished
+ * by relying on the module system to only have one copy of the module
+ * loaded, and then by giving the lock_torture_stats kthread full control
+ * (or the init/cleanup functions when lock_torture_stats thread is not
+ * running).
+ */
+static void lock_torture_stats_print(void)
+{
+ int size = cxt.nrealwriters_stress * 200 + 8192;
+ char *buf;
+
+ if (cxt.cur_ops->readlock)
+ size += cxt.nrealreaders_stress * 200 + 8192;
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+
+ __torture_print_stats(buf, cxt.lwsa, true);
+ pr_alert("%s", buf);
+ kfree(buf);
+
+ if (cxt.cur_ops->readlock) {
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+
+ __torture_print_stats(buf, cxt.lrsa, false);
+ pr_alert("%s", buf);
+ kfree(buf);
+ }
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int lock_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("lock_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ lock_torture_stats_print();
+ torture_shutdown_absorb("lock_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_stats");
+ return 0;
+}
+
+static inline void
+lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
+ const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, cxt.debug_lock ? " [debug]": "",
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
+ verbose, shuffle_interval, stutter, shutdown_secs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void lock_torture_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (writer_tasks) {
+ for (i = 0; i < cxt.nrealwriters_stress; i++)
+ torture_stop_kthread(lock_torture_writer,
+ writer_tasks[i]);
+ kfree(writer_tasks);
+ writer_tasks = NULL;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < cxt.nrealreaders_stress; i++)
+ torture_stop_kthread(lock_torture_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ reader_tasks = NULL;
+ }
+
+ torture_stop_kthread(lock_torture_stats, stats_task);
+ lock_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&cxt.n_lock_torture_errors))
+ lock_torture_print_module_parms(cxt.cur_ops,
+ "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ lock_torture_print_module_parms(cxt.cur_ops,
+ "End of test: LOCK_HOTPLUG");
+ else
+ lock_torture_print_module_parms(cxt.cur_ops,
+ "End of test: SUCCESS");
+ torture_cleanup_end();
+}
+
+static int __init lock_torture_init(void)
+{
+ int i, j;
+ int firsterr = 0;
+ static struct lock_torture_ops *torture_ops[] = {
+ &lock_busted_ops,
+ &spin_lock_ops, &spin_lock_irq_ops,
+ &rw_lock_ops, &rw_lock_irq_ops,
+ &mutex_lock_ops,
+ &rwsem_lock_ops,
+ };
+
+ if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ return -EBUSY;
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cxt.cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cxt.cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("lock-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("lock-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ torture_init_end();
+ return -EINVAL;
+ }
+ if (cxt.cur_ops->init)
+ cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nwriters_stress >= 0)
+ cxt.nrealwriters_stress = nwriters_stress;
+ else
+ cxt.nrealwriters_stress = 2 * num_online_cpus();
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ if (strncmp(torture_type, "mutex", 5) == 0)
+ cxt.debug_lock = true;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+ if ((strncmp(torture_type, "spin", 4) == 0) ||
+ (strncmp(torture_type, "rw_lock", 7) == 0))
+ cxt.debug_lock = true;
+#endif
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ lock_is_write_held = 0;
+ cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
+ }
+
+ if (cxt.cur_ops->readlock) {
+ if (nreaders_stress >= 0)
+ cxt.nrealreaders_stress = nreaders_stress;
+ else {
+ /*
+ * By default distribute evenly the number of
+ * readers and writers. We still run the same number
+ * of threads as the writer-only locks default.
+ */
+ if (nwriters_stress < 0) /* user doesn't care */
+ cxt.nrealwriters_stress = num_online_cpus();
+ cxt.nrealreaders_stress = cxt.nrealwriters_stress;
+ }
+
+ lock_is_read_held = 0;
+ cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
+ }
+ lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
+
+ /* Prepare torture context. */
+ if (onoff_interval > 0) {
+ firsterr = torture_onoff_init(onoff_holdoff * HZ,
+ onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shuffle_interval > 0) {
+ firsterr = torture_shuffle_init(shuffle_interval);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shutdown_secs > 0) {
+ firsterr = torture_shutdown_init(shutdown_secs,
+ lock_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter);
+ if (firsterr)
+ goto unwind;
+ }
+
+ writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ if (cxt.cur_ops->readlock) {
+ reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ }
+
+ /*
+ * Create the kthreads and start torturing (oh, those poor little locks).
+ *
+ * TODO: Note that we interleave writers with readers, giving writers a
+ * slight advantage, by creating its kthread first. This can be modified
+ * for very specific needs, or even let the user choose the policy, if
+ * ever wanted.
+ */
+ for (i = 0, j = 0; i < cxt.nrealwriters_stress ||
+ j < cxt.nrealreaders_stress; i++, j++) {
+ if (i >= cxt.nrealwriters_stress)
+ goto create_reader;
+
+ /* Create writer. */
+ firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i]);
+ if (firsterr)
+ goto unwind;
+
+ create_reader:
+ if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
+ continue;
+ /* Create reader. */
+ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
+ reader_tasks[j]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(lock_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ lock_torture_cleanup();
+ return firsterr;
+}
+
+module_init(lock_torture_init);
+module_exit(lock_torture_cleanup);
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000..75e114bdf
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,111 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+#include <asm/mcs_spinlock.h>
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_mcs_spin_lock_contended(l) \
+do { \
+ while (!(smp_load_acquire(l))) \
+ cpu_relax_lowlatency(); \
+} while (0)
+#endif
+
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section has been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l) \
+ smp_store_release((l), 1)
+#endif
+
+/*
+ * Note: the smp_load_acquire/smp_store_release pair is not
+ * sufficient to form a full memory barrier across
+ * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
+ * For applications that need a full barrier across multiple cpus
+ * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
+ * used after mcs_lock.
+ */
+
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ */
+static inline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /*
+ * Lock acquired, don't need to set node->locked to 1. Threads
+ * only spin on its own node->locked value for lock acquisition.
+ * However, since this thread can immediately acquire the lock
+ * and does not proceed to spin on its own node->locked, this
+ * value won't be used. If a debug mode is needed to
+ * audit lock status, then set node->locked value here.
+ */
+ return;
+ }
+ WRITE_ONCE(prev->next, node);
+
+ /* Wait until the lock holder passes the lock down. */
+ arch_mcs_spin_lock_contended(&node->locked);
+}
+
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = READ_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (likely(cmpxchg(lock, node, NULL) == node))
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax_lowlatency();
+ }
+
+ /* Pass lock to next waiter. */
+ arch_mcs_spin_unlock_contended(&next->locked);
+}
+
+#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
new file mode 100644
index 000000000..3ef373600
--- /dev/null
+++ b/kernel/locking/mutex-debug.c
@@ -0,0 +1,120 @@
+/*
+ * kernel/mutex-debug.c
+ *
+ * Debugging code for mutexes
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * lock debugging, locking tree, deadlock detection started by:
+ *
+ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Released under the General Public License (GPL).
+ */
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+
+#include "mutex-debug.h"
+
+/*
+ * Must be called with lock->wait_lock held.
+ */
+void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
+ waiter->magic = waiter;
+ INIT_LIST_HEAD(&waiter->list);
+}
+
+void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
+ DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+}
+
+void debug_mutex_free_waiter(struct mutex_waiter *waiter)
+{
+ DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
+ memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
+}
+
+void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct thread_info *ti)
+{
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+
+ /* Mark the current thread as blocked on the lock: */
+ ti->task->blocked_on = waiter;
+}
+
+void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct thread_info *ti)
+{
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+ DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
+ DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
+ ti->task->blocked_on = NULL;
+
+ list_del_init(&waiter->list);
+ waiter->task = NULL;
+}
+
+void debug_mutex_unlock(struct mutex *lock)
+{
+ if (likely(debug_locks)) {
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+
+ if (!lock->owner)
+ DEBUG_LOCKS_WARN_ON(!lock->owner);
+ else
+ DEBUG_LOCKS_WARN_ON(lock->owner != current);
+
+ DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+ }
+
+ /*
+ * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
+ * mutexes so that we can do it here after we've verified state.
+ */
+ mutex_clear_owner(lock);
+ atomic_set(&lock->count, 1);
+}
+
+void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->magic = lock;
+}
+
+/***
+ * mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void mutex_destroy(struct mutex *lock)
+{
+ DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
+ lock->magic = NULL;
+}
+
+EXPORT_SYMBOL_GPL(mutex_destroy);
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
new file mode 100644
index 000000000..0799fd3e4
--- /dev/null
+++ b/kernel/locking/mutex-debug.h
@@ -0,0 +1,55 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal declarations,
+ * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
+ * More details are in kernel/mutex-debug.c.
+ */
+
+/*
+ * This must be called with lock->wait_lock held.
+ */
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct thread_info *ti);
+extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct thread_info *ti);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
+
+static inline void mutex_set_owner(struct mutex *lock)
+{
+ lock->owner = current;
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+ lock->owner = NULL;
+}
+
+#define spin_lock_mutex(lock, flags) \
+ do { \
+ struct mutex *l = container_of(lock, struct mutex, wait_lock); \
+ \
+ DEBUG_LOCKS_WARN_ON(in_interrupt()); \
+ local_irq_save(flags); \
+ arch_spin_lock(&(lock)->rlock.raw_lock);\
+ DEBUG_LOCKS_WARN_ON(l->magic != l); \
+ } while (0)
+
+#define spin_unlock_mutex(lock, flags) \
+ do { \
+ arch_spin_unlock(&(lock)->rlock.raw_lock); \
+ local_irq_restore(flags); \
+ preempt_check_resched(); \
+ } while (0)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
new file mode 100644
index 000000000..4cccea6b8
--- /dev/null
+++ b/kernel/locking/mutex.c
@@ -0,0 +1,972 @@
+/*
+ * kernel/locking/mutex.c
+ *
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
+ * David Howells for suggestions and improvements.
+ *
+ * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
+ * from the -rt tree, where it was originally implemented for rtmutexes
+ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
+ * and Sven Dietrich.
+ *
+ * Also see Documentation/locking/mutex-design.txt.
+ */
+#include <linux/mutex.h>
+#include <linux/ww_mutex.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/osq_lock.h>
+
+/*
+ * In the DEBUG case we are using the "NULL fastpath" for mutexes,
+ * which forces all calls into the slowpath:
+ */
+#ifdef CONFIG_DEBUG_MUTEXES
+# include "mutex-debug.h"
+# include <asm-generic/mutex-null.h>
+/*
+ * Must be 0 for the debug case so we do not do the unlock outside of the
+ * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
+ * case.
+ */
+# undef __mutex_slowpath_needs_to_unlock
+# define __mutex_slowpath_needs_to_unlock() 0
+#else
+# include "mutex.h"
+# include <asm/mutex.h>
+#endif
+
+void
+__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ atomic_set(&lock->count, 1);
+ spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+ mutex_clear_owner(lock);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ osq_lock_init(&lock->osq);
+#endif
+
+ debug_mutex_init(lock, name, key);
+}
+
+EXPORT_SYMBOL(__mutex_init);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * We split the mutex lock/unlock logic into separate fastpath and
+ * slowpath functions, to reduce the register pressure on the fastpath.
+ * We also put the fastpath first in the kernel image, to make sure the
+ * branch is predicted by the CPU as default-untaken.
+ */
+__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
+
+/**
+ * mutex_lock - acquire the mutex
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex exclusively for this task. If the mutex is not
+ * available right now, it will sleep until it can get it.
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. Recursive locking is not allowed. The task
+ * may not exit without first unlocking the mutex. Also, kernel
+ * memory where the mutex resides must not be freed with
+ * the mutex still locked. The mutex must first be initialized
+ * (or statically defined) before it can be locked. memset()-ing
+ * the mutex to 0 is not allowed.
+ *
+ * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ * checks that will enforce the restrictions and will also do
+ * deadlock debugging. )
+ *
+ * This function is similar to (but not equivalent to) down().
+ */
+void __sched mutex_lock(struct mutex *lock)
+{
+ might_sleep();
+ /*
+ * The locking fastpath is the 1->0 transition from
+ * 'unlocked' into 'locked' state.
+ */
+ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+ mutex_set_owner(lock);
+}
+
+EXPORT_SYMBOL(mutex_lock);
+#endif
+
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+ struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+}
+
+/*
+ * After acquiring lock with fastpath or when we lost out in contested
+ * slowpath, set ctx and wake up any waiters so they can recheck.
+ *
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ unsigned long flags;
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+
+ lock->ctx = ctx;
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the atomic read is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Check if lock is contended, if not there is nobody to wake up
+ */
+ if (likely(atomic_read(&lock->base.count) == 0))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, wake up everyone in this case,
+ * so they can see the new lock->ctx.
+ */
+ spin_lock_mutex(&lock->base.wait_lock, flags);
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
+ spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
+
+/*
+ * After acquiring lock in the slowpath set ctx and wake up any
+ * waiters so they can recheck.
+ *
+ * Callers must hold the mutex wait_lock.
+ */
+static __always_inline void
+ww_mutex_set_context_slowpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+ lock->ctx = ctx;
+
+ /*
+ * Give any possible sleeping processes the chance to wake up,
+ * so they can recheck if they have to back off.
+ */
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
+}
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+static noinline
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ bool ret = true;
+
+ rcu_read_lock();
+ while (lock->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking lock->owner still matches owner. If that fails,
+ * owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (!owner->on_cpu || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ cpu_relax_lowlatency();
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+ struct task_struct *owner;
+ int retval = 1;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = READ_ONCE(lock->owner);
+ if (owner)
+ retval = owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * if lock->owner is not set, the mutex owner may have just acquired
+ * it and not set the owner yet or the mutex has been released.
+ */
+ return retval;
+}
+
+/*
+ * Atomically try to take the lock when it is available
+ */
+static inline bool mutex_try_to_acquire(struct mutex *lock)
+{
+ return !mutex_is_locked(lock) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1);
+}
+
+/*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
+ *
+ * Returns true when the lock was taken, otherwise false, indicating
+ * that we need to jump to the slowpath and sleep.
+ */
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ struct task_struct *task = current;
+
+ if (!mutex_can_spin_on_owner(lock))
+ goto done;
+
+ /*
+ * In order to avoid a stampede of mutex spinners trying to
+ * acquire the mutex all at once, the spinners need to take a
+ * MCS (queued) lock first before spinning on the owner field.
+ */
+ if (!osq_lock(&lock->osq))
+ goto done;
+
+ while (true) {
+ struct task_struct *owner;
+
+ if (use_ww_ctx && ww_ctx->acquired > 0) {
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ /*
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ */
+ if (READ_ONCE(ww->ctx))
+ break;
+ }
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = READ_ONCE(lock->owner);
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
+
+ /* Try to acquire the mutex if it is unlocked. */
+ if (mutex_try_to_acquire(lock)) {
+ lock_acquired(&lock->dep_map, ip);
+
+ if (use_ww_ctx) {
+ struct ww_mutex *ww;
+ ww = container_of(lock, struct ww_mutex, base);
+
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ }
+
+ mutex_set_owner(lock);
+ osq_unlock(&lock->osq);
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+
+ osq_unlock(&lock->osq);
+done:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched()) {
+ /*
+ * We _should_ have TASK_RUNNING here, but just in case
+ * we do not, make it so, otherwise we might get stuck.
+ */
+ __set_current_state(TASK_RUNNING);
+ schedule_preempt_disabled();
+ }
+
+ return false;
+}
+#else
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ return false;
+}
+#endif
+
+__visible __used noinline
+void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+
+/**
+ * mutex_unlock - release the mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a not locked mutex is not allowed.
+ *
+ * This function is similar to (but not equivalent to) up().
+ */
+void __sched mutex_unlock(struct mutex *lock)
+{
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+#ifndef CONFIG_DEBUG_MUTEXES
+ /*
+ * When debugging is enabled we must not clear the owner before time,
+ * the slow path will always be taken, and that clears the owner field
+ * after verifying that it was indeed current.
+ */
+ mutex_clear_owner(lock);
+#endif
+ __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+}
+
+EXPORT_SYMBOL(mutex_unlock);
+
+/**
+ * ww_mutex_unlock - release the w/w mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously with any of the
+ * ww_mutex_lock* functions (with or without an acquire context). It is
+ * forbidden to release the locks after releasing the acquire context.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a unlocked mutex is not allowed.
+ */
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+ if (lock->ctx) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+
+#ifndef CONFIG_DEBUG_MUTEXES
+ /*
+ * When debugging is enabled we must not clear the owner before time,
+ * the slow path will always be taken, and that clears the owner field
+ * after verifying that it was indeed current.
+ */
+ mutex_clear_owner(&lock->base);
+#endif
+ __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
+
+static inline int __sched
+__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+
+ if (!hold_ctx)
+ return 0;
+
+ if (unlikely(ctx == hold_ctx))
+ return -EALREADY;
+
+ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
+ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
+ ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+/*
+ * Lock a mutex (possibly interruptible), slowpath:
+ */
+static __always_inline int __sched
+__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ struct task_struct *task = current;
+ struct mutex_waiter waiter;
+ unsigned long flags;
+ int ret;
+
+ preempt_disable();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+
+ if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+ /* got the lock, yay! */
+ preempt_enable();
+ return 0;
+ }
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+
+ /*
+ * Once more, try to acquire the lock. Only try-lock the mutex if
+ * it is unlocked to reduce unnecessary xchg() operations.
+ */
+ if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ goto skip_wait;
+
+ debug_mutex_lock_common(lock, &waiter);
+ debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+
+ /* add waiting tasks to the end of the waitqueue (FIFO): */
+ list_add_tail(&waiter.list, &lock->wait_list);
+ waiter.task = task;
+
+ lock_contended(&lock->dep_map, ip);
+
+ for (;;) {
+ /*
+ * Lets try to take the lock again - this is needed even if
+ * we get here for the first time (shortly after failing to
+ * acquire the lock), to make sure that we get a wakeup once
+ * it's unlocked. Later on, if we sleep, this is the
+ * operation that gives us the lock. We xchg it to -1, so
+ * that when we release the lock, we properly wake up the
+ * other waiters. We only attempt the xchg if the count is
+ * non-negative in order to avoid unnecessary xchg operations:
+ */
+ if (atomic_read(&lock->count) >= 0 &&
+ (atomic_xchg(&lock->count, -1) == 1))
+ break;
+
+ /*
+ * got a signal? (This code gets eliminated in the
+ * TASK_UNINTERRUPTIBLE case.)
+ */
+ if (unlikely(signal_pending_state(state, task))) {
+ ret = -EINTR;
+ goto err;
+ }
+
+ if (use_ww_ctx && ww_ctx->acquired > 0) {
+ ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
+ if (ret)
+ goto err;
+ }
+
+ __set_task_state(task, state);
+
+ /* didn't get the lock, go to sleep: */
+ spin_unlock_mutex(&lock->wait_lock, flags);
+ schedule_preempt_disabled();
+ spin_lock_mutex(&lock->wait_lock, flags);
+ }
+ __set_task_state(task, TASK_RUNNING);
+
+ mutex_remove_waiter(lock, &waiter, current_thread_info());
+ /* set it to 0 if there are no waiters left: */
+ if (likely(list_empty(&lock->wait_list)))
+ atomic_set(&lock->count, 0);
+ debug_mutex_free_waiter(&waiter);
+
+skip_wait:
+ /* got the lock - cleanup and rejoice! */
+ lock_acquired(&lock->dep_map, ip);
+ mutex_set_owner(lock);
+
+ if (use_ww_ctx) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ ww_mutex_set_context_slowpath(ww, ww_ctx);
+ }
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+ preempt_enable();
+ return 0;
+
+err:
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ spin_unlock_mutex(&lock->wait_lock, flags);
+ debug_mutex_free_waiter(&waiter);
+ mutex_release(&lock->dep_map, 1, ip);
+ preempt_enable();
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched
+mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched
+_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ 0, nest, _RET_IP_, NULL, 0);
+}
+
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched
+mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_KILLABLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+static inline int
+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+ unsigned tmp;
+
+ if (ctx->deadlock_inject_countdown-- == 0) {
+ tmp = ctx->deadlock_inject_interval;
+ if (tmp > UINT_MAX/4)
+ tmp = UINT_MAX;
+ else
+ tmp = tmp*2 + tmp + tmp/2;
+
+ ctx->deadlock_inject_interval = tmp;
+ ctx->deadlock_inject_countdown = tmp;
+ ctx->contending_lock = lock;
+
+ ww_mutex_unlock(lock);
+
+ return -EDEADLK;
+ }
+#endif
+
+ return 0;
+}
+
+int __sched
+__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
+ 0, &ctx->dep_map, _RET_IP_, ctx, 1);
+ if (!ret && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+
+int __sched
+__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
+ 0, &ctx->dep_map, _RET_IP_, ctx, 1);
+
+ if (!ret && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+
+#endif
+
+/*
+ * Release the lock, slowpath:
+ */
+static inline void
+__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
+{
+ unsigned long flags;
+
+ /*
+ * As a performance measurement, release the lock before doing other
+ * wakeup related duties to follow. This allows other tasks to acquire
+ * the lock sooner, while still handling cleanups in past unlock calls.
+ * This can be done as we do not enforce strict equivalence between the
+ * mutex counter and wait_list.
+ *
+ *
+ * Some architectures leave the lock unlocked in the fastpath failure
+ * case, others need to leave it locked. In the later case we have to
+ * unlock it here - as the lock counter is currently 0 or negative.
+ */
+ if (__mutex_slowpath_needs_to_unlock())
+ atomic_set(&lock->count, 1);
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+ mutex_release(&lock->dep_map, nested, _RET_IP_);
+ debug_mutex_unlock(lock);
+
+ if (!list_empty(&lock->wait_list)) {
+ /* get the first entry from the wait-list: */
+ struct mutex_waiter *waiter =
+ list_entry(lock->wait_list.next,
+ struct mutex_waiter, list);
+
+ debug_mutex_wake_waiter(lock, waiter);
+
+ wake_up_process(waiter->task);
+ }
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+}
+
+/*
+ * Release the lock, slowpath:
+ */
+__visible void
+__mutex_unlock_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ __mutex_unlock_common_slowpath(lock, 1);
+}
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Here come the less common (and hence less performance-critical) APIs:
+ * mutex_lock_interruptible() and mutex_trylock().
+ */
+static noinline int __sched
+__mutex_lock_killable_slowpath(struct mutex *lock);
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(struct mutex *lock);
+
+/**
+ * mutex_lock_interruptible - acquire the mutex, interruptible
+ * @lock: the mutex to be acquired
+ *
+ * Lock the mutex like mutex_lock(), and return 0 if the mutex has
+ * been acquired or sleep until the mutex becomes available. If a
+ * signal arrives while waiting for the lock then this function
+ * returns -EINTR.
+ *
+ * This function is similar to (but not equivalent to) down_interruptible().
+ */
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_fastpath_lock_retval(&lock->count);
+ if (likely(!ret)) {
+ mutex_set_owner(lock);
+ return 0;
+ } else
+ return __mutex_lock_interruptible_slowpath(lock);
+}
+
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ might_sleep();
+ ret = __mutex_fastpath_lock_retval(&lock->count);
+ if (likely(!ret)) {
+ mutex_set_owner(lock);
+ return 0;
+ } else
+ return __mutex_lock_killable_slowpath(lock);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+__visible void __sched
+__mutex_lock_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
+ NULL, _RET_IP_, NULL, 0);
+}
+
+static noinline int __sched
+__mutex_lock_killable_slowpath(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0,
+ NULL, _RET_IP_, NULL, 0);
+}
+
+static noinline int __sched
+__mutex_lock_interruptible_slowpath(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
+ NULL, _RET_IP_, NULL, 0);
+}
+
+static noinline int __sched
+__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
+ NULL, _RET_IP_, ctx, 1);
+}
+
+static noinline int __sched
+__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
+ NULL, _RET_IP_, ctx, 1);
+}
+
+#endif
+
+/*
+ * Spinlock based trylock, we take the spinlock and check whether we
+ * can get the lock:
+ */
+static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
+{
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+ unsigned long flags;
+ int prev;
+
+ /* No need to trylock if the mutex is locked. */
+ if (mutex_is_locked(lock))
+ return 0;
+
+ spin_lock_mutex(&lock->wait_lock, flags);
+
+ prev = atomic_xchg(&lock->count, -1);
+ if (likely(prev == 1)) {
+ mutex_set_owner(lock);
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
+
+ /* Set it back to 0 if there are no waiters: */
+ if (likely(list_empty(&lock->wait_list)))
+ atomic_set(&lock->count, 0);
+
+ spin_unlock_mutex(&lock->wait_lock, flags);
+
+ return prev == 1;
+}
+
+/**
+ * mutex_trylock - try to acquire the mutex, without waiting
+ * @lock: the mutex to be acquired
+ *
+ * Try to acquire the mutex atomically. Returns 1 if the mutex
+ * has been acquired successfully, and 0 on contention.
+ *
+ * NOTE: this function follows the spin_trylock() convention, so
+ * it is negated from the down_trylock() return values! Be careful
+ * about this when converting semaphore users to mutexes.
+ *
+ * This function must not be used in interrupt context. The
+ * mutex must be released by the same task that acquired it.
+ */
+int __sched mutex_trylock(struct mutex *lock)
+{
+ int ret;
+
+ ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
+ if (ret)
+ mutex_set_owner(lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(mutex_trylock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+int __sched
+__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+
+ ret = __mutex_fastpath_lock_retval(&lock->base.count);
+
+ if (likely(!ret)) {
+ ww_mutex_set_context_fastpath(lock, ctx);
+ mutex_set_owner(&lock->base);
+ } else
+ ret = __ww_mutex_lock_slowpath(lock, ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__ww_mutex_lock);
+
+int __sched
+__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ int ret;
+
+ might_sleep();
+
+ ret = __mutex_fastpath_lock_retval(&lock->base.count);
+
+ if (likely(!ret)) {
+ ww_mutex_set_context_fastpath(lock, ctx);
+ mutex_set_owner(&lock->base);
+ } else
+ ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
+
+#endif
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+ /* dec if we can't possibly hit 0 */
+ if (atomic_add_unless(cnt, -1, 1))
+ return 0;
+ /* we might hit 0, so take the lock */
+ mutex_lock(lock);
+ if (!atomic_dec_and_test(cnt)) {
+ /* when we actually did the dec, we didn't hit 0 */
+ mutex_unlock(lock);
+ return 0;
+ }
+ /* we hit 0, and we hold the lock */
+ return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
new file mode 100644
index 000000000..5cda39760
--- /dev/null
+++ b/kernel/locking/mutex.h
@@ -0,0 +1,48 @@
+/*
+ * Mutexes: blocking mutual exclusion locks
+ *
+ * started by Ingo Molnar:
+ *
+ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains mutex debugging related internal prototypes, for the
+ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
+ */
+
+#define spin_lock_mutex(lock, flags) \
+ do { spin_lock(lock); (void)(flags); } while (0)
+#define spin_unlock_mutex(lock, flags) \
+ do { spin_unlock(lock); (void)(flags); } while (0)
+#define mutex_remove_waiter(lock, waiter, ti) \
+ __list_del((waiter)->list.prev, (waiter)->list.next)
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+static inline void mutex_set_owner(struct mutex *lock)
+{
+ lock->owner = current;
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+ lock->owner = NULL;
+}
+#else
+static inline void mutex_set_owner(struct mutex *lock)
+{
+}
+
+static inline void mutex_clear_owner(struct mutex *lock)
+{
+}
+#endif
+
+#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+#define debug_mutex_free_waiter(waiter) do { } while (0)
+#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+#define debug_mutex_unlock(lock) do { } while (0)
+#define debug_mutex_init(lock, name, key) do { } while (0)
+
+static inline void
+debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+}
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
new file mode 100644
index 000000000..dc85ee23a
--- /dev/null
+++ b/kernel/locking/osq_lock.c
@@ -0,0 +1,203 @@
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/osq_lock.h>
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ */
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
+{
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
+
+ for (;;) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
+ /*
+ * We were the last queued, we moved @lock back. @prev
+ * will now observe @lock and will complete its
+ * unlock()/unqueue().
+ */
+ break;
+ }
+
+ /*
+ * We must xchg() the @node->next value, because if we were to
+ * leave it in, a concurrent unlock()/unqueue() from
+ * @node->next might complete Step-A and think its @prev is
+ * still valid.
+ *
+ * If the concurrent unlock()/unqueue() wins the race, we'll
+ * wait for either @lock to point to us, through its Step-B, or
+ * wait for a new @node->next from its Step-C.
+ */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next)
+ break;
+ }
+
+ cpu_relax_lowlatency();
+ }
+
+ return next;
+}
+
+bool osq_lock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ node->locked = 0;
+ node->next = NULL;
+ node->cpu = curr;
+
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
+ return true;
+
+ prev = decode_cpu(old);
+ node->prev = prev;
+ WRITE_ONCE(prev->next, node);
+
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ while (!READ_ONCE(node->locked)) {
+ /*
+ * If we need to reschedule bail... so we can block.
+ */
+ if (need_resched())
+ goto unqueue;
+
+ cpu_relax_lowlatency();
+ }
+ return true;
+
+unqueue:
+ /*
+ * Step - A -- stabilize @prev
+ *
+ * Undo our @prev->next assignment; this will make @prev's
+ * unlock()/unqueue() wait for a next pointer since @lock points to us
+ * (or later).
+ */
+
+ for (;;) {
+ if (prev->next == node &&
+ cmpxchg(&prev->next, node, NULL) == node)
+ break;
+
+ /*
+ * We can only fail the cmpxchg() racing against an unlock(),
+ * in which case we should observe @node->locked becomming
+ * true.
+ */
+ if (smp_load_acquire(&node->locked))
+ return true;
+
+ cpu_relax_lowlatency();
+
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ prev = READ_ONCE(node->prev);
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+
+ next = osq_wait_next(lock, node, prev);
+ if (!next)
+ return false;
+
+ /*
+ * Step - C -- unlink
+ *
+ * @prev is stable because its still waiting for a new @prev->next
+ * pointer, @next is stable because our @node->next pointer is NULL and
+ * it will wait in Step-A.
+ */
+
+ WRITE_ONCE(next->prev, prev);
+ WRITE_ONCE(prev->next, next);
+
+ return false;
+}
+
+void osq_unlock(struct optimistic_spin_queue *lock)
+{
+ struct optimistic_spin_node *node, *next;
+ int curr = encode_cpu(smp_processor_id());
+
+ /*
+ * Fast path for the uncontended case.
+ */
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
+ return;
+
+ /*
+ * Second most likely case.
+ */
+ node = this_cpu_ptr(&osq_node);
+ next = xchg(&node->next, NULL);
+ if (next) {
+ WRITE_ONCE(next->locked, 1);
+ return;
+ }
+
+ next = osq_wait_next(lock, node, NULL);
+ if (next)
+ WRITE_ONCE(next->locked, 1);
+}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
new file mode 100644
index 000000000..652a8ee8e
--- /dev/null
+++ b/kernel/locking/percpu-rwsem.c
@@ -0,0 +1,165 @@
+#include <linux/atomic.h>
+#include <linux/rwsem.h>
+#include <linux/percpu.h>
+#include <linux/wait.h>
+#include <linux/lockdep.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+ const char *name, struct lock_class_key *rwsem_key)
+{
+ brw->fast_read_ctr = alloc_percpu(int);
+ if (unlikely(!brw->fast_read_ctr))
+ return -ENOMEM;
+
+ /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
+ __init_rwsem(&brw->rw_sem, name, rwsem_key);
+ atomic_set(&brw->write_ctr, 0);
+ atomic_set(&brw->slow_read_ctr, 0);
+ init_waitqueue_head(&brw->write_waitq);
+ return 0;
+}
+
+void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+{
+ free_percpu(brw->fast_read_ctr);
+ brw->fast_read_ctr = NULL; /* catch use after free bugs */
+}
+
+/*
+ * This is the fast-path for down_read/up_read, it only needs to ensure
+ * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
+ * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
+ * serialize with the preempt-disabled section below.
+ *
+ * The nontrivial part is that we should guarantee acquire/release semantics
+ * in case when
+ *
+ * R_W: down_write() comes after up_read(), the writer should see all
+ * changes done by the reader
+ * or
+ * W_R: down_read() comes after up_write(), the reader should see all
+ * changes done by the writer
+ *
+ * If this helper fails the callers rely on the normal rw_semaphore and
+ * atomic_dec_and_test(), so in this case we have the necessary barriers.
+ *
+ * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
+ * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
+ * reader inside the critical section. See the comments in down_write and
+ * up_write below.
+ */
+static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+{
+ bool success = false;
+
+ preempt_disable();
+ if (likely(!atomic_read(&brw->write_ctr))) {
+ __this_cpu_add(*brw->fast_read_ctr, val);
+ success = true;
+ }
+ preempt_enable();
+
+ return success;
+}
+
+/*
+ * Like the normal down_read() this is not recursive, the writer can
+ * come after the first percpu_down_read() and create the deadlock.
+ *
+ * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
+ * percpu_up_read() does rwsem_release(). This pairs with the usage
+ * of ->rw_sem in percpu_down/up_write().
+ */
+void percpu_down_read(struct percpu_rw_semaphore *brw)
+{
+ might_sleep();
+ if (likely(update_fast_ctr(brw, +1))) {
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+ return;
+ }
+
+ down_read(&brw->rw_sem);
+ atomic_inc(&brw->slow_read_ctr);
+ /* avoid up_read()->rwsem_release() */
+ __up_read(&brw->rw_sem);
+}
+
+void percpu_up_read(struct percpu_rw_semaphore *brw)
+{
+ rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+
+ if (likely(update_fast_ctr(brw, -1)))
+ return;
+
+ /* false-positive is possible but harmless */
+ if (atomic_dec_and_test(&brw->slow_read_ctr))
+ wake_up_all(&brw->write_waitq);
+}
+
+static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+{
+ unsigned int sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ sum += per_cpu(*brw->fast_read_ctr, cpu);
+ per_cpu(*brw->fast_read_ctr, cpu) = 0;
+ }
+
+ return sum;
+}
+
+/*
+ * A writer increments ->write_ctr to force the readers to switch to the
+ * slow mode, note the atomic_read() check in update_fast_ctr().
+ *
+ * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
+ * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
+ * counter it represents the number of active readers.
+ *
+ * Finally the writer takes ->rw_sem for writing and blocks the new readers,
+ * then waits until the slow counter becomes zero.
+ */
+void percpu_down_write(struct percpu_rw_semaphore *brw)
+{
+ /* tell update_fast_ctr() there is a pending writer */
+ atomic_inc(&brw->write_ctr);
+ /*
+ * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
+ * so that update_fast_ctr() can't succeed.
+ *
+ * 2. Ensures we see the result of every previous this_cpu_add() in
+ * update_fast_ctr().
+ *
+ * 3. Ensures that if any reader has exited its critical section via
+ * fast-path, it executes a full memory barrier before we return.
+ * See R_W case in the comment above update_fast_ctr().
+ */
+ synchronize_sched_expedited();
+
+ /* exclude other writers, and block the new readers completely */
+ down_write(&brw->rw_sem);
+
+ /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
+ atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+
+ /* wait for all readers to complete their percpu_up_read() */
+ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+}
+
+void percpu_up_write(struct percpu_rw_semaphore *brw)
+{
+ /* release the lock, but the readers can't use the fast-path */
+ up_write(&brw->rw_sem);
+ /*
+ * Insert the barrier before the next fast-path in down_read,
+ * see W_R case in the comment above update_fast_ctr().
+ */
+ synchronize_sched_expedited();
+ /* the last writer unblocks update_fast_ctr() */
+ atomic_dec(&brw->write_ctr);
+}
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 000000000..f956ede7f
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,132 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/qrwlock.h>
+
+/**
+ * rspin_until_writer_unlock - inc reader count & spin until writer is gone
+ * @lock : Pointer to queue rwlock structure
+ * @writer: Current queue rwlock writer status byte
+ *
+ * In interrupt context or at the head of the queue, the reader will just
+ * increment the reader count & wait until the writer releases the lock.
+ */
+static __always_inline void
+rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
+{
+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+ cpu_relax_lowlatency();
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ }
+}
+
+/**
+ * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ */
+void queue_read_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /*
+ * Readers come here when they cannot get the lock without waiting
+ */
+ if (unlikely(in_interrupt())) {
+ /*
+ * Readers in interrupt context will spin until the lock is
+ * available without waiting in the queue.
+ */
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ rspin_until_writer_unlock(lock, cnts);
+ return;
+ }
+ atomic_sub(_QR_BIAS, &lock->cnts);
+
+ /*
+ * Put the reader into the wait queue
+ */
+ arch_spin_lock(&lock->lock);
+
+ /*
+ * At the head of the wait queue now, wait until the writer state
+ * goes to 0 and then try to increment the reader count and get
+ * the lock. It is possible that an incoming writer may steal the
+ * lock in the interim, so it is necessary to check the writer byte
+ * to make sure that the write lock isn't taken.
+ */
+ while (atomic_read(&lock->cnts) & _QW_WMASK)
+ cpu_relax_lowlatency();
+
+ cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ rspin_until_writer_unlock(lock, cnts);
+
+ /*
+ * Signal the next one in queue to become queue head
+ */
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_read_lock_slowpath);
+
+/**
+ * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+void queue_write_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /* Put the writer into the wait queue */
+ arch_spin_lock(&lock->lock);
+
+ /* Try to acquire the lock directly if no reader is present */
+ if (!atomic_read(&lock->cnts) &&
+ (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ goto unlock;
+
+ /*
+ * Set the waiting flag to notify readers that a writer is pending,
+ * or wait for a previous writer to go away.
+ */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if (!(cnts & _QW_WMASK) &&
+ (atomic_cmpxchg(&lock->cnts, cnts,
+ cnts | _QW_WAITING) == cnts))
+ break;
+
+ cpu_relax_lowlatency();
+ }
+
+ /* When no more readers, set the locked flag */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if ((cnts == _QW_WAITING) &&
+ (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
+ break;
+
+ cpu_relax_lowlatency();
+ }
+unlock:
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_write_lock_slowpath);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
new file mode 100644
index 000000000..62b6cee8e
--- /dev/null
+++ b/kernel/locking/rtmutex-debug.c
@@ -0,0 +1,184 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Copyright (C) 2006 Esben Nielsen
+ * Copyright (C) 2006 Kihon Technologies Inc.,
+ * Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/fs.h>
+#include <linux/debug_locks.h>
+
+#include "rtmutex_common.h"
+
+static void printk_task(struct task_struct *p)
+{
+ if (p)
+ printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
+ else
+ printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+ if (lock->name)
+ printk(" [%p] {%s}\n",
+ lock, lock->name);
+ else
+ printk(" [%p] {%s:%d}\n",
+ lock, lock->file, lock->line);
+
+ if (print_owner && rt_mutex_owner(lock)) {
+ printk(".. ->owner: %p\n", lock->owner);
+ printk(".. held by: ");
+ printk_task(rt_mutex_owner(lock));
+ printk("\n");
+ }
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *act_waiter,
+ struct rt_mutex *lock)
+{
+ struct task_struct *task;
+
+ if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
+ return;
+
+ task = rt_mutex_owner(act_waiter->lock);
+ if (task && task != current) {
+ act_waiter->deadlock_task_pid = get_pid(task_pid(task));
+ act_waiter->deadlock_lock = lock;
+ }
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+ struct task_struct *task;
+
+ if (!waiter->deadlock_lock || !debug_locks)
+ return;
+
+ rcu_read_lock();
+ task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
+ if (!task) {
+ rcu_read_unlock();
+ return;
+ }
+
+ if (!debug_locks_off()) {
+ rcu_read_unlock();
+ return;
+ }
+
+ printk("\n============================================\n");
+ printk( "[ BUG: circular locking deadlock detected! ]\n");
+ printk("%s\n", print_tainted());
+ printk( "--------------------------------------------\n");
+ printk("%s/%d is deadlocking current task %s/%d\n\n",
+ task->comm, task_pid_nr(task),
+ current->comm, task_pid_nr(current));
+
+ printk("\n1) %s/%d is trying to acquire this lock:\n",
+ current->comm, task_pid_nr(current));
+ printk_lock(waiter->lock, 1);
+
+ printk("\n2) %s/%d is blocked on this lock:\n",
+ task->comm, task_pid_nr(task));
+ printk_lock(waiter->deadlock_lock, 1);
+
+ debug_show_held_locks(current);
+ debug_show_held_locks(task);
+
+ printk("\n%s/%d's [blocked] stackdump:\n\n",
+ task->comm, task_pid_nr(task));
+ show_stack(task, NULL);
+ printk("\n%s/%d's [current] stackdump:\n\n",
+ current->comm, task_pid_nr(current));
+ dump_stack();
+ debug_show_all_locks();
+ rcu_read_unlock();
+
+ printk("[ turning off deadlock detection."
+ "Please report this trace. ]\n\n");
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock)
+{
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
+}
+
+void
+debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
+{
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ memset(waiter, 0x11, sizeof(*waiter));
+ waiter->deadlock_task_pid = NULL;
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ put_pid(waiter->deadlock_task_pid);
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lock->name = name;
+}
+
+void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
new file mode 100644
index 000000000..d0519c343
--- /dev/null
+++ b/kernel/locking/rtmutex-debug.h
@@ -0,0 +1,39 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+ struct task_struct *powner);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w) \
+ do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk walk)
+{
+ return (waiter != NULL);
+}
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ debug_rt_mutex_print_deadlock(w);
+}
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
new file mode 100644
index 000000000..1d96dd0d9
--- /dev/null
+++ b/kernel/locking/rtmutex-tester.c
@@ -0,0 +1,420 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/device.h>
+#include <linux/kthread.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/freezer.h>
+#include <linux/stat.h>
+
+#include "rtmutex.h"
+
+#define MAX_RT_TEST_THREADS 8
+#define MAX_RT_TEST_MUTEXES 8
+
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data {
+ int opcode;
+ int opdata;
+ int mutexes[MAX_RT_TEST_MUTEXES];
+ int event;
+ struct device dev;
+};
+
+static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
+static struct task_struct *threads[MAX_RT_TEST_THREADS];
+static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
+
+enum test_opcodes {
+ RTTEST_NOP = 0,
+ RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
+ RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
+ RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
+ RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
+ RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
+ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
+ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
+ /* 9, 10 - reserved for BKL commemoration */
+ RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
+ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
+ RTTEST_RESET = 99, /* 99 Reset all pending operations */
+};
+
+static int handle_op(struct test_thread_data *td, int lockwakeup)
+{
+ int i, id, ret = -EINVAL;
+
+ switch(td->opcode) {
+
+ case RTTEST_NOP:
+ return 0;
+
+ case RTTEST_LOCKCONT:
+ td->mutexes[td->opdata] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return 0;
+
+ case RTTEST_RESET:
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
+ if (td->mutexes[i] == 4) {
+ rt_mutex_unlock(&mutexes[i]);
+ td->mutexes[i] = 0;
+ }
+ }
+ return 0;
+
+ case RTTEST_RESETEVENT:
+ atomic_set(&rttest_event, 0);
+ return 0;
+
+ default:
+ if (lockwakeup)
+ return ret;
+ }
+
+ switch(td->opcode) {
+
+ case RTTEST_LOCK:
+ case RTTEST_LOCKNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_lock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 4;
+ return 0;
+
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKINTNOWAIT:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+ return ret;
+
+ td->mutexes[id] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = ret ? 0 : 4;
+ return ret ? -EINTR : 0;
+
+ case RTTEST_UNLOCK:
+ id = td->opdata;
+ if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
+ return ret;
+
+ td->event = atomic_add_return(1, &rttest_event);
+ rt_mutex_unlock(&mutexes[id]);
+ td->event = atomic_add_return(1, &rttest_event);
+ td->mutexes[id] = 0;
+ return 0;
+
+ default:
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Schedule replacement for rtsem_down(). Only called for threads with
+ * PF_MUTEX_TESTER set.
+ *
+ * This allows us to have finegrained control over the event flow.
+ *
+ */
+void schedule_rt_mutex_test(struct rt_mutex *mutex)
+{
+ int tid, op, dat;
+ struct test_thread_data *td;
+
+ /* We have to lookup the task */
+ for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
+ if (threads[tid] == current)
+ break;
+ }
+
+ BUG_ON(tid == MAX_RT_TEST_THREADS);
+
+ td = &thread_data[tid];
+
+ op = td->opcode;
+ dat = td->opdata;
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ break;
+
+ if (td->mutexes[dat] != 1)
+ break;
+
+ td->mutexes[dat] = 2;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ default:
+ break;
+ }
+
+ schedule();
+
+
+ switch (op) {
+ case RTTEST_LOCK:
+ case RTTEST_LOCKINT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 3;
+ td->event = atomic_add_return(1, &rttest_event);
+ break;
+
+ case RTTEST_LOCKNOWAIT:
+ case RTTEST_LOCKINTNOWAIT:
+ if (mutex != &mutexes[dat])
+ return;
+
+ if (td->mutexes[dat] != 2)
+ return;
+
+ td->mutexes[dat] = 1;
+ td->event = atomic_add_return(1, &rttest_event);
+ return;
+
+ default:
+ return;
+ }
+
+ td->opcode = 0;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ int ret;
+
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 1);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (td->opcode == RTTEST_LOCKCONT)
+ break;
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+ }
+
+ /* Restore previous command and data */
+ td->opcode = op;
+ td->opdata = dat;
+}
+
+static int test_func(void *data)
+{
+ struct test_thread_data *td = data;
+ int ret;
+
+ current->flags |= PF_MUTEX_TESTER;
+ set_freezable();
+ allow_signal(SIGHUP);
+
+ for(;;) {
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (td->opcode > 0) {
+ set_current_state(TASK_RUNNING);
+ ret = handle_op(td, 0);
+ set_current_state(TASK_INTERRUPTIBLE);
+ td->opcode = ret;
+ }
+
+ /* Wait for the next command to be executed */
+ schedule();
+ try_to_freeze();
+
+ if (signal_pending(current))
+ flush_signals(current);
+
+ if(kthread_should_stop())
+ break;
+ }
+ return 0;
+}
+
+/**
+ * sysfs_test_command - interface for test commands
+ * @dev: thread reference
+ * @buf: command for actual step
+ * @count: length of buffer
+ *
+ * command syntax:
+ *
+ * opcode:data
+ */
+static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct sched_param schedpar;
+ struct test_thread_data *td;
+ char cmdbuf[32];
+ int op, dat, tid, ret;
+
+ td = container_of(dev, struct test_thread_data, dev);
+ tid = td->dev.id;
+
+ /* strings from sysfs write are not 0 terminated! */
+ if (count >= sizeof(cmdbuf))
+ return -EINVAL;
+
+ /* strip of \n: */
+ if (buf[count-1] == '\n')
+ count--;
+ if (count < 1)
+ return -EINVAL;
+
+ memcpy(cmdbuf, buf, count);
+ cmdbuf[count] = 0;
+
+ if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
+ return -EINVAL;
+
+ switch (op) {
+ case RTTEST_SCHEDOT:
+ schedpar.sched_priority = 0;
+ ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
+ if (ret)
+ return ret;
+ set_user_nice(current, 0);
+ break;
+
+ case RTTEST_SCHEDRT:
+ schedpar.sched_priority = dat;
+ ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
+ if (ret)
+ return ret;
+ break;
+
+ case RTTEST_SIGNAL:
+ send_sig(SIGHUP, threads[tid], 0);
+ break;
+
+ default:
+ if (td->opcode > 0)
+ return -EBUSY;
+ td->opdata = dat;
+ td->opcode = op;
+ wake_up_process(threads[tid]);
+ }
+
+ return count;
+}
+
+/**
+ * sysfs_test_status - sysfs interface for rt tester
+ * @dev: thread to query
+ * @buf: char buffer to be filled with thread status info
+ */
+static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct test_thread_data *td;
+ struct task_struct *tsk;
+ char *curr = buf;
+ int i;
+
+ td = container_of(dev, struct test_thread_data, dev);
+ tsk = threads[td->dev.id];
+
+ spin_lock(&rttest_lock);
+
+ curr += sprintf(curr,
+ "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
+ td->opcode, td->event, tsk->state,
+ (MAX_RT_PRIO - 1) - tsk->prio,
+ (MAX_RT_PRIO - 1) - tsk->normal_prio,
+ tsk->pi_blocked_on);
+
+ for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
+ curr += sprintf(curr, "%d", td->mutexes[i]);
+
+ spin_unlock(&rttest_lock);
+
+ curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
+ mutexes[td->dev.id].owner);
+
+ return curr - buf;
+}
+
+static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
+
+static struct bus_type rttest_subsys = {
+ .name = "rttest",
+ .dev_name = "rttest",
+};
+
+static int init_test_thread(int id)
+{
+ thread_data[id].dev.bus = &rttest_subsys;
+ thread_data[id].dev.id = id;
+
+ threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
+ if (IS_ERR(threads[id]))
+ return PTR_ERR(threads[id]);
+
+ return device_register(&thread_data[id].dev);
+}
+
+static int init_rttest(void)
+{
+ int ret, i;
+
+ spin_lock_init(&rttest_lock);
+
+ for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
+ rt_mutex_init(&mutexes[i]);
+
+ ret = subsys_system_register(&rttest_subsys, NULL);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
+ ret = init_test_thread(i);
+ if (ret)
+ break;
+ ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
+ if (ret)
+ break;
+ ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
+ if (ret)
+ break;
+ }
+
+ printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
+
+ return ret;
+}
+
+device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
new file mode 100644
index 000000000..b025295f4
--- /dev/null
+++ b/kernel/locking/rtmutex.c
@@ -0,0 +1,1648 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ * Copyright (C) 2006 Esben Nielsen
+ *
+ * See Documentation/locking/rt-mutex-design.txt for details.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0
+ * is used to keep track of the "lock has waiters" state.
+ *
+ * owner bit0
+ * NULL 0 lock is free (fast acquire possible)
+ * NULL 1 lock is free and has waiters and the top waiter
+ * is going to take the lock*
+ * taskpointer 0 lock is held (fast release possible)
+ * taskpointer 1 lock is held and has waiters**
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 of lock->owner is 0.
+ *
+ * (*) It also can be a transitional state when grabbing the lock
+ * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+ * we need to set the bit0 before looking at the lock, and the owner may be
+ * NULL in this small time, hence this can be a transitional state.
+ *
+ * (**) There is a small time when bit 0 is set but there are no
+ * waiters. This can happen when grabbing the lock in the slow path.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to
+ * set this bit before looking at the lock.
+ */
+
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+{
+ unsigned long val = (unsigned long)owner;
+
+ if (rt_mutex_has_waiters(lock))
+ val |= RT_MUTEX_HAS_WAITERS;
+
+ lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+
+ clear_rt_mutex_waiters(lock);
+ raw_spin_unlock(&lock->wait_lock);
+ /*
+ * If a new waiter comes in between the unlock and the cmpxchg
+ * we have two situations:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * cmpxchg(p, owner, 0) == owner
+ * mark_rt_mutex_waiters(lock);
+ * acquire(lock);
+ * or:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * mark_rt_mutex_waiters(lock);
+ *
+ * cmpxchg(p, owner, 0) != owner
+ * enqueue_waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * wake waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * acquire(lock);
+ */
+ return rt_mutex_cmpxchg(lock, owner, NULL);
+}
+
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ lock->owner = NULL;
+ raw_spin_unlock(&lock->wait_lock);
+ return true;
+}
+#endif
+
+static inline int
+rt_mutex_waiter_less(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio < right->prio)
+ return 1;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 1 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return (left->task->dl.deadline < right->task->dl.deadline);
+
+ return 0;
+}
+
+static void
+rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+ struct rb_node **link = &lock->waiters.rb_node;
+ struct rb_node *parent = NULL;
+ struct rt_mutex_waiter *entry;
+ int leftmost = 1;
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
+ if (rt_mutex_waiter_less(waiter, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ lock->waiters_leftmost = &waiter->tree_entry;
+
+ rb_link_node(&waiter->tree_entry, parent, link);
+ rb_insert_color(&waiter->tree_entry, &lock->waiters);
+}
+
+static void
+rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+{
+ if (RB_EMPTY_NODE(&waiter->tree_entry))
+ return;
+
+ if (lock->waiters_leftmost == &waiter->tree_entry)
+ lock->waiters_leftmost = rb_next(&waiter->tree_entry);
+
+ rb_erase(&waiter->tree_entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+}
+
+static void
+rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+ struct rb_node **link = &task->pi_waiters.rb_node;
+ struct rb_node *parent = NULL;
+ struct rt_mutex_waiter *entry;
+ int leftmost = 1;
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
+ if (rt_mutex_waiter_less(waiter, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ task->pi_waiters_leftmost = &waiter->pi_tree_entry;
+
+ rb_link_node(&waiter->pi_tree_entry, parent, link);
+ rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+}
+
+static void
+rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+{
+ if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ return;
+
+ if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
+ task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
+
+ rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+}
+
+/*
+ * Calculate task priority from the waiter tree priority
+ *
+ * Return task->normal_prio when the waiter tree is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+ if (likely(!task_has_pi_waiters(task)))
+ return task->normal_prio;
+
+ return min(task_top_pi_waiter(task)->prio,
+ task->normal_prio);
+}
+
+struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
+{
+ if (likely(!task_has_pi_waiters(task)))
+ return NULL;
+
+ return task_top_pi_waiter(task)->task;
+}
+
+/*
+ * Called by sched_setscheduler() to get the priority which will be
+ * effective after the change.
+ */
+int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
+{
+ if (!task_has_pi_waiters(task))
+ return newprio;
+
+ if (task_top_pi_waiter(task)->task->prio <= newprio)
+ return task_top_pi_waiter(task)->task->prio;
+ return newprio;
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+ int prio = rt_mutex_getprio(task);
+
+ if (task->prio != prio || dl_prio(prio))
+ rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * Deadlock detection is conditional:
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
+ * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
+ * conducted independent of the detect argument.
+ *
+ * If the waiter argument is NULL this indicates the deboost path and
+ * deadlock detection is disabled independent of the detect argument
+ * and the config settings.
+ */
+static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
+{
+ /*
+ * This is just a wrapper function for the following call,
+ * because debug_rt_mutex_detect_deadlock() smells like a magic
+ * debug feature and I wanted to keep the cond function in the
+ * main source file along with the comments instead of having
+ * two of the same in the headers.
+ */
+ return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+{
+ return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+}
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ *
+ * @task: the task owning the mutex (owner) for which a chain walk is
+ * probably needed
+ * @chwalk: do we have to carry out deadlock detection?
+ * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
+ * things for a task that has just got its priority adjusted, and
+ * is waiting on a mutex)
+ * @next_lock: the mutex on which the owner of @orig_lock was blocked before
+ * we dropped its pi_lock. Is never dereferenced, only used for
+ * comparison to detect lock chain changes.
+ * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
+ * its priority to the mutex owner (can be NULL in the case
+ * depicted above or if the top waiter is gone away and we are
+ * actually deboosting the owner)
+ * @top_task: the current top waiter
+ *
+ * Returns 0 or -EDEADLK.
+ *
+ * Chain walk basics and protection scope
+ *
+ * [R] refcount on task
+ * [P] task->pi_lock held
+ * [L] rtmutex->wait_lock held
+ *
+ * Step Description Protected by
+ * function arguments:
+ * @task [R]
+ * @orig_lock if != NULL @top_task is blocked on it
+ * @next_lock Unprotected. Cannot be
+ * dereferenced. Only used for
+ * comparison.
+ * @orig_waiter if != NULL @top_task is blocked on it
+ * @top_task current, or in case of proxy
+ * locking protected by calling
+ * code
+ * again:
+ * loop_sanity_check();
+ * retry:
+ * [1] lock(task->pi_lock); [R] acquire [P]
+ * [2] waiter = task->pi_blocked_on; [P]
+ * [3] check_exit_conditions_1(); [P]
+ * [4] lock = waiter->lock; [P]
+ * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
+ * unlock(task->pi_lock); release [P]
+ * goto retry;
+ * }
+ * [6] check_exit_conditions_2(); [P] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
+ * [8] unlock(task->pi_lock); release [P]
+ * put_task_struct(task); release [R]
+ * [9] check_exit_conditions_3(); [L]
+ * [10] task = owner(lock); [L]
+ * get_task_struct(task); [L] acquire [R]
+ * lock(task->pi_lock); [L] acquire [P]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
+ * [12] check_exit_conditions_4(); [P] + [L]
+ * [13] unlock(task->pi_lock); release [P]
+ * unlock(lock->wait_lock); release [L]
+ * goto again;
+ */
+static int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex *next_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
+{
+ struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+ struct rt_mutex_waiter *prerequeue_top_waiter;
+ int ret = 0, depth = 0;
+ struct rt_mutex *lock;
+ bool detect_deadlock;
+ unsigned long flags;
+ bool requeue = true;
+
+ detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
+
+ /*
+ * The (de)boosting is a step by step approach with a lot of
+ * pitfalls. We want this to be preemptible and we want hold a
+ * maximum of two locks per step. So we have to check
+ * carefully whether things change under us.
+ */
+ again:
+ /*
+ * We limit the lock chain length for each invocation.
+ */
+ if (++depth > max_lock_depth) {
+ static int prev_max;
+
+ /*
+ * Print this only once. If the admin changes the limit,
+ * print a new message when reaching the limit again.
+ */
+ if (prev_max != max_lock_depth) {
+ prev_max = max_lock_depth;
+ printk(KERN_WARNING "Maximum lock depth %d reached "
+ "task: %s (%d)\n", max_lock_depth,
+ top_task->comm, task_pid_nr(top_task));
+ }
+ put_task_struct(task);
+
+ return -EDEADLK;
+ }
+
+ /*
+ * We are fully preemptible here and only hold the refcount on
+ * @task. So everything can have changed under us since the
+ * caller or our own code below (goto retry/again) dropped all
+ * locks.
+ */
+ retry:
+ /*
+ * [1] Task cannot go away as we did a get_task() before !
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /*
+ * [2] Get the waiter on which @task is blocked on.
+ */
+ waiter = task->pi_blocked_on;
+
+ /*
+ * [3] check_exit_conditions_1() protected by task->pi_lock.
+ */
+
+ /*
+ * Check whether the end of the boosting chain has been
+ * reached or the state of the chain has changed while we
+ * dropped the locks.
+ */
+ if (!waiter)
+ goto out_unlock_pi;
+
+ /*
+ * Check the orig_waiter state. After we dropped the locks,
+ * the previous owner of the lock might have released the lock.
+ */
+ if (orig_waiter && !rt_mutex_owner(orig_lock))
+ goto out_unlock_pi;
+
+ /*
+ * We dropped all locks after taking a refcount on @task, so
+ * the task might have moved on in the lock chain or even left
+ * the chain completely and blocks now on an unrelated lock or
+ * on @orig_lock.
+ *
+ * We stored the lock on which @task was blocked in @next_lock,
+ * so we can detect the chain change.
+ */
+ if (next_lock != waiter->lock)
+ goto out_unlock_pi;
+
+ /*
+ * Drop out, when the task has no waiters. Note,
+ * top_waiter can be NULL, when we are in the deboosting
+ * mode!
+ */
+ if (top_waiter) {
+ if (!task_has_pi_waiters(task))
+ goto out_unlock_pi;
+ /*
+ * If deadlock detection is off, we stop here if we
+ * are not the top pi waiter of the task. If deadlock
+ * detection is enabled we continue, but stop the
+ * requeueing in the chain walk.
+ */
+ if (top_waiter != task_top_pi_waiter(task)) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+ }
+
+ /*
+ * If the waiter priority is the same as the task priority
+ * then there is no further priority adjustment necessary. If
+ * deadlock detection is off, we stop the chain walk. If its
+ * enabled we continue, but stop the requeueing in the chain
+ * walk.
+ */
+ if (waiter->prio == task->prio) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+
+ /*
+ * [4] Get the next lock
+ */
+ lock = waiter->lock;
+ /*
+ * [5] We need to trylock here as we are holding task->pi_lock,
+ * which is the reverse lock order versus the other rtmutex
+ * operations.
+ */
+ if (!raw_spin_trylock(&lock->wait_lock)) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ cpu_relax();
+ goto retry;
+ }
+
+ /*
+ * [6] check_exit_conditions_2() protected by task->pi_lock and
+ * lock->wait_lock.
+ *
+ * Deadlock detection. If the lock is the same as the original
+ * lock which caused us to walk the lock chain or if the
+ * current lock is owned by the task which initiated the chain
+ * walk, we detected a deadlock.
+ */
+ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+ debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
+ raw_spin_unlock(&lock->wait_lock);
+ ret = -EDEADLK;
+ goto out_unlock_pi;
+ }
+
+ /*
+ * If we just follow the lock chain for deadlock detection, no
+ * need to do all the requeue operations. To avoid a truckload
+ * of conditionals around the various places below, just do the
+ * minimum chain walk checks.
+ */
+ if (!requeue) {
+ /*
+ * No requeue[7] here. Just release @task [8]
+ */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ * If there is no owner of the lock, end of chain.
+ */
+ if (!rt_mutex_owner(lock)) {
+ raw_spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ /* [10] Grab the next task, i.e. owner of @lock */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /*
+ * No requeue [11] here. We just do deadlock detection.
+ *
+ * [12] Store whether owner is blocked
+ * itself. Decision is made after dropping the locks
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Get the top waiter for the next iteration
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop locks */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&lock->wait_lock);
+
+ /* If owner is not blocked, end of chain. */
+ if (!next_lock)
+ goto out_put_task;
+ goto again;
+ }
+
+ /*
+ * Store the current top waiter before doing the requeue
+ * operation on @lock. We need it for the boost/deboost
+ * decision below.
+ */
+ prerequeue_top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [7] Requeue the waiter in the lock waiter list. */
+ rt_mutex_dequeue(lock, waiter);
+ waiter->prio = task->prio;
+ rt_mutex_enqueue(lock, waiter);
+
+ /* [8] Release the task */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ *
+ * We must abort the chain walk if there is no lock owner even
+ * in the dead lock detection case, as we have nothing to
+ * follow here. This is the end of the chain we are walking.
+ */
+ if (!rt_mutex_owner(lock)) {
+ /*
+ * If the requeue [7] above changed the top waiter,
+ * then we need to wake the new top waiter up to try
+ * to get the lock.
+ */
+ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
+ wake_up_process(rt_mutex_top_waiter(lock)->task);
+ raw_spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ /* [10] Grab the next task, i.e. the owner of @lock */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /* [11] requeue the pi waiters if necessary */
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ /*
+ * The waiter became the new top (highest priority)
+ * waiter on the lock. Replace the previous top waiter
+ * in the owner tasks pi waiters list with this waiter
+ * and adjust the priority of the owner.
+ */
+ rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ rt_mutex_enqueue_pi(task, waiter);
+ __rt_mutex_adjust_prio(task);
+
+ } else if (prerequeue_top_waiter == waiter) {
+ /*
+ * The waiter was the top waiter on the lock, but is
+ * no longer the top prority waiter. Replace waiter in
+ * the owner tasks pi waiters list with the new top
+ * (highest priority) waiter and adjust the priority
+ * of the owner.
+ * The new top waiter is stored in @waiter so that
+ * @waiter == @top_waiter evaluates to true below and
+ * we continue to deboost the rest of the chain.
+ */
+ rt_mutex_dequeue_pi(task, waiter);
+ waiter = rt_mutex_top_waiter(lock);
+ rt_mutex_enqueue_pi(task, waiter);
+ __rt_mutex_adjust_prio(task);
+ } else {
+ /*
+ * Nothing changed. No need to do any priority
+ * adjustment.
+ */
+ }
+
+ /*
+ * [12] check_exit_conditions_4() protected by task->pi_lock
+ * and lock->wait_lock. The actual decisions are made after we
+ * dropped the locks.
+ *
+ * Check whether the task which owns the current lock is pi
+ * blocked itself. If yes we store a pointer to the lock for
+ * the lock chain change detection above. After we dropped
+ * task->pi_lock next_lock cannot be dereferenced anymore.
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Store the top waiter of @lock for the end of chain walk
+ * decision below.
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop the locks */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&lock->wait_lock);
+
+ /*
+ * Make the actual exit decisions [12], based on the stored
+ * values.
+ *
+ * We reached the end of the lock chain. Stop right here. No
+ * point to go back just to figure that out.
+ */
+ if (!next_lock)
+ goto out_put_task;
+
+ /*
+ * If the current waiter is not the top waiter on the lock,
+ * then we can stop the chain walk here if we are not in full
+ * deadlock detection mode.
+ */
+ if (!detect_deadlock && waiter != top_waiter)
+ goto out_put_task;
+
+ goto again;
+
+ out_unlock_pi:
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ out_put_task:
+ put_task_struct(task);
+
+ return ret;
+}
+
+/*
+ * Try to take an rt-mutex
+ *
+ * Must be called with lock->wait_lock held.
+ *
+ * @lock: The lock to be acquired.
+ * @task: The task which wants to acquire the lock
+ * @waiter: The waiter that is queued to the lock's wait list if the
+ * callsite called task_blocked_on_lock(), otherwise NULL
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
+{
+ unsigned long flags;
+
+ /*
+ * Before testing whether we can acquire @lock, we set the
+ * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
+ * other tasks which try to modify @lock into the slow path
+ * and they serialize on @lock->wait_lock.
+ *
+ * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
+ * as explained at the top of this file if and only if:
+ *
+ * - There is a lock owner. The caller must fixup the
+ * transient state if it does a trylock or leaves the lock
+ * function due to a signal or timeout.
+ *
+ * - @task acquires the lock and there are no other
+ * waiters. This is undone in rt_mutex_set_owner(@task) at
+ * the end of this function.
+ */
+ mark_rt_mutex_waiters(lock);
+
+ /*
+ * If @lock has an owner, give up.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+
+ /*
+ * If @waiter != NULL, @task has already enqueued the waiter
+ * into @lock waiter list. If @waiter == NULL then this is a
+ * trylock attempt.
+ */
+ if (waiter) {
+ /*
+ * If waiter is not the highest priority waiter of
+ * @lock, give up.
+ */
+ if (waiter != rt_mutex_top_waiter(lock))
+ return 0;
+
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters list.
+ */
+ rt_mutex_dequeue(lock, waiter);
+
+ } else {
+ /*
+ * If the lock has waiters already we check whether @task is
+ * eligible to take over the lock.
+ *
+ * If there are no other waiters, @task can acquire
+ * the lock. @task->pi_blocked_on is NULL, so it does
+ * not need to be dequeued.
+ */
+ if (rt_mutex_has_waiters(lock)) {
+ /*
+ * If @task->prio is greater than or equal to
+ * the top waiter priority (kernel view),
+ * @task lost.
+ */
+ if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ return 0;
+
+ /*
+ * The current top waiter stays enqueued. We
+ * don't have to change anything in the lock
+ * waiters order.
+ */
+ } else {
+ /*
+ * No waiters. Take the lock without the
+ * pi_lock dance.@task->pi_blocked_on is NULL
+ * and we have no waiters to enqueue in @task
+ * pi waiters list.
+ */
+ goto takeit;
+ }
+ }
+
+ /*
+ * Clear @task->pi_blocked_on. Requires protection by
+ * @task->pi_lock. Redundant operation for the @waiter == NULL
+ * case, but conditionals are more expensive than a redundant
+ * store.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ task->pi_blocked_on = NULL;
+ /*
+ * Finish the lock acquisition. @task is the new owner. If
+ * other waiters exist we have to insert the highest priority
+ * waiter into @task->pi_waiters list.
+ */
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+takeit:
+ /* We got the lock. */
+ debug_rt_mutex_lock(lock);
+
+ /*
+ * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
+ * are still waiters or clears it.
+ */
+ rt_mutex_set_owner(lock, task);
+
+ rt_mutex_deadlock_account_lock(lock, task);
+
+ return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ enum rtmutex_chainwalk chwalk)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex_waiter *top_waiter = waiter;
+ struct rt_mutex *next_lock;
+ int chain_walk = 0, res;
+ unsigned long flags;
+
+ /*
+ * Early deadlock detection. We really don't want the task to
+ * enqueue on itself just to untangle the mess later. It's not
+ * only an optimization. We drop the locks, so another waiter
+ * can come in before the chain walk detects the deadlock. So
+ * the other will detect the deadlock and return -EDEADLOCK,
+ * which is wrong, as the other waiter is not in a deadlock
+ * situation.
+ */
+ if (owner == task)
+ return -EDEADLK;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ __rt_mutex_adjust_prio(task);
+ waiter->task = task;
+ waiter->lock = lock;
+ waiter->prio = task->prio;
+
+ /* Get the top priority waiter on the lock */
+ if (rt_mutex_has_waiters(lock))
+ top_waiter = rt_mutex_top_waiter(lock);
+ rt_mutex_enqueue(lock, waiter);
+
+ task->pi_blocked_on = waiter;
+
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ if (!owner)
+ return 0;
+
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ if (waiter == rt_mutex_top_waiter(lock)) {
+ rt_mutex_dequeue_pi(owner, top_waiter);
+ rt_mutex_enqueue_pi(owner, waiter);
+
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on)
+ chain_walk = 1;
+ } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
+ chain_walk = 1;
+ }
+
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
+
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ /*
+ * Even if full deadlock detection is on, if the owner is not
+ * blocked itself, we can avoid finding this out in the chain
+ * walk.
+ */
+ if (!chain_walk || !next_lock)
+ return 0;
+
+ /*
+ * The owner can't disappear while holding a lock,
+ * so the owner struct is protected by wait_lock.
+ * Gets dropped in rt_mutex_adjust_prio_chain()!
+ */
+ get_task_struct(owner);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
+ next_lock, waiter, task);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ return res;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current tasks pi waiter list and
+ * wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *waiter;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+
+ waiter = rt_mutex_top_waiter(lock);
+
+ /*
+ * Remove it from current->pi_waiters. We do not adjust a
+ * possible priority boost right now. We execute wakeup in the
+ * boosted mode and go back to normal after releasing
+ * lock->wait_lock.
+ */
+ rt_mutex_dequeue_pi(current, waiter);
+
+ /*
+ * As we are waking up the top waiter, and the waiter stays
+ * queued on the lock until it gets the lock, this lock
+ * obviously has waiters. Just set the bit here and this has
+ * the added benefit of forcing all new tasks into the
+ * slow path making sure no task of lower priority than
+ * the top waiter can steal this lock.
+ */
+ lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
+
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ /*
+ * It's safe to dereference waiter as it cannot go away as
+ * long as we hold lock->wait_lock. The waiter task needs to
+ * acquire it in order to dequeue the waiter.
+ */
+ wake_up_process(waiter->task);
+}
+
+/*
+ * Remove a waiter from a lock and give up
+ *
+ * Must be called with lock->wait_lock held and
+ * have just failed to try_to_take_rt_mutex().
+ */
+static void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+ struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+ rt_mutex_dequeue(lock, waiter);
+ current->pi_blocked_on = NULL;
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
+ return;
+
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
+
+ rt_mutex_dequeue_pi(owner, waiter);
+
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
+
+ __rt_mutex_adjust_prio(owner);
+
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
+
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+
+ /*
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
+ */
+ if (!next_lock)
+ return;
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(owner);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
+
+ raw_spin_lock(&lock->wait_lock);
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || (waiter->prio == task->prio &&
+ !dl_prio(task->prio))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+ * @state: the state the task should block in (TASK_INTERRUPTIBLE
+ * or TASK_UNINTERRUPTIBLE)
+ * @timeout: the pre-initialized and started timer, or NULL for none
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * lock->wait_lock must be held by the caller.
+ */
+static int __sched
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret = 0;
+
+ for (;;) {
+ /* Try to acquire the lock: */
+ if (try_to_take_rt_mutex(lock, current, waiter))
+ break;
+
+ /*
+ * TASK_INTERRUPTIBLE checks for signals and
+ * timeout. Ignored otherwise.
+ */
+ if (unlikely(state == TASK_INTERRUPTIBLE)) {
+ /* Signal pending? */
+ if (signal_pending(current))
+ ret = -EINTR;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ if (ret)
+ break;
+ }
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ schedule_rt_mutex(lock);
+
+ raw_spin_lock(&lock->wait_lock);
+ set_current_state(state);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ return ret;
+}
+
+static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
+{
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ /*
+ * Yell lowdly and stop the task right here.
+ */
+ rt_mutex_print_deadlock(w);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk)
+{
+ struct rt_mutex_waiter waiter;
+ int ret = 0;
+
+ debug_rt_mutex_init_waiter(&waiter);
+ RB_CLEAR_NODE(&waiter.pi_tree_entry);
+ RB_CLEAR_NODE(&waiter.tree_entry);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ /* Try to acquire the lock again: */
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ raw_spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ set_current_state(state);
+
+ /* Setup the timer, when timeout != NULL */
+ if (unlikely(timeout)) {
+ hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
+
+ ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
+
+ if (likely(!ret))
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
+
+ if (unlikely(ret)) {
+ __set_current_state(TASK_RUNNING);
+ if (rt_mutex_has_waiters(lock))
+ remove_waiter(lock, &waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, &waiter);
+ }
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit
+ * unconditionally. We might have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ /* Remove pending timer: */
+ if (unlikely(timeout))
+ hrtimer_cancel(&timeout->timer);
+
+ debug_rt_mutex_free_waiter(&waiter);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+
+ /*
+ * The mutex has currently no owner. Lock the wait lock and
+ * try to acquire the lock.
+ */
+ raw_spin_lock(&lock->wait_lock);
+
+ ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+ raw_spin_lock(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ rt_mutex_deadlock_account_unlock(current);
+
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock) == true)
+ return;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock(&lock->wait_lock);
+ }
+
+ /*
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ */
+ wakeup_next_waiter(lock);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ /* Undo pi boosting if necessary: */
+ rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk))
+{
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
+ int (*slowfn)(struct rt_mutex *lock, int state,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk))
+{
+ if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+ likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 0;
+ } else
+ return slowfn(lock, state, timeout, chwalk);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ rt_mutex_deadlock_account_lock(lock, current);
+ return 1;
+ }
+ return slowfn(lock);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ rt_mutex_deadlock_account_unlock(current);
+ else
+ slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ might_sleep();
+
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ might_sleep();
+
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/*
+ * Futex variant with full deadlock detection.
+ */
+int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ RT_MUTEX_FULL_CHAINWALK,
+ rt_mutex_slowlock);
+}
+
+/**
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ * the timeout structure is provided
+ * by the caller
+ *
+ * @lock: the rt_mutex to be locked
+ * @timeout: timeout structure or NULL (no timeout)
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ * -ETIMEDOUT when the timeout expired
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ RT_MUTEX_MIN_CHAINWALK,
+ rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/**
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+ WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing of a locked rt lock is not allowed
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+ lock->owner = NULL;
+ raw_spin_lock_init(&lock->wait_lock);
+ lock->waiters = RB_ROOT;
+ lock->waiters_leftmost = NULL;
+
+ debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ __rt_mutex_init(lock, NULL);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner);
+ rt_mutex_set_owner(lock, proxy_owner);
+ rt_mutex_deadlock_account_lock(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL);
+ rt_mutex_deadlock_account_unlock(proxy_owner);
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL)) {
+ raw_spin_unlock(&lock->wait_lock);
+ return 1;
+ }
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+ RT_MUTEX_FULL_CHAINWALK);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+ if (!rt_mutex_has_waiters(lock))
+ return NULL;
+
+ return rt_mutex_top_waiter(lock)->task;
+}
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Complete the lock acquisition started our behalf by another thread.
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock(&lock->wait_lock);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ raw_spin_unlock(&lock->wait_lock);
+
+ return ret;
+}
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
new file mode 100644
index 000000000..c4060584c
--- /dev/null
+++ b/kernel/locking/rtmutex.h
@@ -0,0 +1,36 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define rt_mutex_deadlock_check(l) (0)
+#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
+#define debug_rt_mutex_init_waiter(w) do { } while (0)
+#define debug_rt_mutex_free_waiter(w) do { } while (0)
+#define debug_rt_mutex_lock(l) do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
+#define debug_rt_mutex_unlock(l) do { } while (0)
+#define debug_rt_mutex_init(m, n) do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
+#define debug_rt_mutex_print_deadlock(w) do { } while (0)
+#define debug_rt_mutex_reset_waiter(w) do { } while (0)
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ WARN(1, "rtmutex deadlock detected\n");
+}
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
+ enum rtmutex_chainwalk walk)
+{
+ return walk == RT_MUTEX_FULL_CHAINWALK;
+}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
new file mode 100644
index 000000000..855212501
--- /dev/null
+++ b/kernel/locking/rtmutex_common.h
@@ -0,0 +1,141 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * The rtmutex in kernel tester is independent of rtmutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock) \
+ do { \
+ if (!(current->flags & PF_MUTEX_TESTER)) \
+ schedule(); \
+ else \
+ schedule_rt_mutex_test(_lock); \
+ } while (0)
+
+#else
+# define schedule_rt_mutex(_lock) schedule()
+#endif
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack on of the blocked task.
+ *
+ * @tree_entry: pi node to enqueue into the mutex waiters tree
+ * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @task: task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+ struct rb_node tree_entry;
+ struct rb_node pi_tree_entry;
+ struct task_struct *task;
+ struct rt_mutex *lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ unsigned long ip;
+ struct pid *deadlock_task_pid;
+ struct rt_mutex *deadlock_lock;
+#endif
+ int prio;
+};
+
+/*
+ * Various helpers to access the waiters-tree:
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return !RB_EMPTY_ROOT(&lock->waiters);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+ struct rt_mutex_waiter *w;
+
+ w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
+ tree_entry);
+ BUG_ON(w->lock != lock);
+
+ return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+ return !RB_EMPTY_ROOT(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+ return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
+ pi_tree_entry);
+}
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_HAS_WAITERS 1UL
+#define RT_MUTEX_OWNER_MASKALL 1UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+ return (struct task_struct *)
+ ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+/*
+ * Constants for rt mutex functions which have a selectable deadlock
+ * detection.
+ *
+ * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
+ * no further PI adjustments to be made.
+ *
+ * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
+ * walk of the lock chain.
+ */
+enum rtmutex_chainwalk {
+ RT_MUTEX_MIN_CHAINWALK,
+ RT_MUTEX_FULL_CHAINWALK,
+};
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+#endif
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
new file mode 100644
index 000000000..3a5048572
--- /dev/null
+++ b/kernel/locking/rwsem-spinlock.c
@@ -0,0 +1,303 @@
+/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
+ * generic spinlock implementation
+ *
+ * Copyright (c) 2001 David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
+ */
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+};
+
+int rwsem_is_locked(struct rw_semaphore *sem)
+{
+ int ret = 1;
+ unsigned long flags;
+
+ if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
+ ret = (sem->count != 0);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rwsem_is_locked);
+
+/*
+ * initialise the semaphore
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ sem->count = 0;
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here, then:
+ * - the 'active count' _reached_ zero
+ * - the 'waiting count' is non-zero
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only woken if wakewrite is non-zero
+ */
+static inline struct rw_semaphore *
+__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
+{
+ struct rwsem_waiter *waiter;
+ struct task_struct *tsk;
+ int woken;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wakewrite)
+ /* Wake up a writer. Note that we do not grant it the
+ * lock - it will have to acquire it when it runs. */
+ wake_up_process(waiter->task);
+ goto out;
+ }
+
+ /* grant an infinite number of read locks to the front of the queue */
+ woken = 0;
+ do {
+ struct list_head *next = waiter->list.next;
+
+ list_del(&waiter->list);
+ tsk = waiter->task;
+ /*
+ * Make sure we do not wakeup the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
+ smp_mb();
+ waiter->task = NULL;
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ woken++;
+ if (next == &sem->wait_list)
+ break;
+ waiter = list_entry(next, struct rwsem_waiter, list);
+ } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
+
+ sem->count += woken;
+
+ out:
+ return sem;
+}
+
+/*
+ * wake a single writer
+ */
+static inline struct rw_semaphore *
+__rwsem_wake_one_writer(struct rw_semaphore *sem)
+{
+ struct rwsem_waiter *waiter;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+ wake_up_process(waiter->task);
+
+ return sem;
+}
+
+/*
+ * get a read lock on the semaphore
+ */
+void __sched __down_read(struct rw_semaphore *sem)
+{
+ struct rwsem_waiter waiter;
+ struct task_struct *tsk;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->count++;
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ goto out;
+ }
+
+ tsk = current;
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+
+ /* set up my own style of waitqueue */
+ waiter.task = tsk;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ get_task_struct(tsk);
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we don't need to touch the semaphore struct anymore */
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ /* wait to be given the lock */
+ for (;;) {
+ if (!waiter.task)
+ break;
+ schedule();
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ }
+
+ __set_task_state(tsk, TASK_RUNNING);
+ out:
+ ;
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int __down_read_trylock(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ int ret = 0;
+
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->count++;
+ ret = 1;
+ }
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * get a write lock on the semaphore
+ */
+void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ struct rwsem_waiter waiter;
+ struct task_struct *tsk;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ /* set up my own style of waitqueue */
+ tsk = current;
+ waiter.task = tsk;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* wait for someone to release the lock */
+ for (;;) {
+ /*
+ * That is the key to support write lock stealing: allows the
+ * task already on CPU to get the lock soon rather than put
+ * itself into sleep and waiting for system woke it or someone
+ * else in the head of the wait list up.
+ */
+ if (sem->count == 0)
+ break;
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ schedule();
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+ }
+ /* got the lock */
+ sem->count = -1;
+ list_del(&waiter.list);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+void __sched __down_write(struct rw_semaphore *sem)
+{
+ __down_write_nested(sem, 0);
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int __down_write_trylock(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (sem->count == 0) {
+ /* got the lock */
+ sem->count = -1;
+ ret = 1;
+ }
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
+}
+
+/*
+ * release a read lock on the semaphore
+ */
+void __up_read(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (--sem->count == 0 && !list_empty(&sem->wait_list))
+ sem = __rwsem_wake_one_writer(sem);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+/*
+ * release a write lock on the semaphore
+ */
+void __up_write(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ sem->count = 0;
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, 1);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - just wake up any readers at the front of the queue
+ */
+void __downgrade_write(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ sem->count = 1;
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, 0);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+}
+
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
new file mode 100644
index 000000000..3417d0172
--- /dev/null
+++ b/kernel/locking/rwsem-xadd.c
@@ -0,0 +1,531 @@
+/* rwsem.c: R/W semaphores: contention handling functions
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from arch/i386/kernel/semaphore.c
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
+ */
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/sched/rt.h>
+#include <linux/osq_lock.h>
+
+#include "rwsem.h"
+
+/*
+ * Guide to the rw_semaphore's count field for common values.
+ * (32-bit case illustrated, similar for 64-bit)
+ *
+ * 0x0000000X (1) X readers active or attempting lock, no writer waiting
+ * X = #active_readers + #readers attempting to lock
+ * (X*ACTIVE_BIAS)
+ *
+ * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
+ * attempting to read lock or write lock.
+ *
+ * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
+ * X = #active readers + # readers attempting lock
+ * (X*ACTIVE_BIAS + WAITING_BIAS)
+ * (2) 1 writer attempting lock, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ * (3) 1 writer active, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
+ * (WAITING_BIAS + ACTIVE_BIAS)
+ * (2) 1 writer active or attempting lock, no waiters for lock
+ * (ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0000 (1) There are writers or readers queued but none active
+ * or in the process of attempting lock.
+ * (WAITING_BIAS)
+ * Note: writer can attempt to steal lock for this count by adding
+ * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
+ *
+ * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
+ * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
+ *
+ * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
+ * the count becomes more than 0 for successful lock acquisition,
+ * i.e. the case where there are only readers or nobody has lock.
+ * (1st and 2nd case above).
+ *
+ * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
+ * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
+ * acquisition (i.e. nobody else has lock or attempts lock). If
+ * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
+ * are only waiters but none active (5th case above), and attempt to
+ * steal the lock.
+ *
+ */
+
+/*
+ * Initialize an rwsem:
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ sem->count = RWSEM_UNLOCKED_VALUE;
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ sem->owner = NULL;
+ osq_lock_init(&sem->osq);
+#endif
+}
+
+EXPORT_SYMBOL(__init_rwsem);
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+};
+
+enum rwsem_wake_type {
+ RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
+ RWSEM_WAKE_READERS, /* Wake readers only */
+ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
+};
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then:
+ * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
+ * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
+ * - there must be someone on the queue
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only woken if downgrading is false
+ */
+static struct rw_semaphore *
+__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+{
+ struct rwsem_waiter *waiter;
+ struct task_struct *tsk;
+ struct list_head *next;
+ long oldcount, woken, loop, adjustment;
+
+ waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY)
+ /* Wake writer at the front of the queue, but do not
+ * grant it the lock yet as we want other writers
+ * to be able to steal it. Readers, on the other hand,
+ * will block as they will notice the queued writer.
+ */
+ wake_up_process(waiter->task);
+ goto out;
+ }
+
+ /* Writers might steal the lock before we grant it to the next reader.
+ * We prefer to do the first reader grant before counting readers
+ * so we can bail out early if a writer stole the lock.
+ */
+ adjustment = 0;
+ if (wake_type != RWSEM_WAKE_READ_OWNED) {
+ adjustment = RWSEM_ACTIVE_READ_BIAS;
+ try_reader_grant:
+ oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+ if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
+ /* A writer stole the lock. Undo our reader grant. */
+ if (rwsem_atomic_update(-adjustment, sem) &
+ RWSEM_ACTIVE_MASK)
+ goto out;
+ /* Last active locker left. Retry waking readers. */
+ goto try_reader_grant;
+ }
+ }
+
+ /* Grant an infinite number of read locks to the readers at the front
+ * of the queue. Note we increment the 'active part' of the count by
+ * the number of readers before waking any processes up.
+ */
+ woken = 0;
+ do {
+ woken++;
+
+ if (waiter->list.next == &sem->wait_list)
+ break;
+
+ waiter = list_entry(waiter->list.next,
+ struct rwsem_waiter, list);
+
+ } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
+
+ adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+ if (waiter->type != RWSEM_WAITING_FOR_WRITE)
+ /* hit end of list above */
+ adjustment -= RWSEM_WAITING_BIAS;
+
+ if (adjustment)
+ rwsem_atomic_add(adjustment, sem);
+
+ next = sem->wait_list.next;
+ loop = woken;
+ do {
+ waiter = list_entry(next, struct rwsem_waiter, list);
+ next = waiter->list.next;
+ tsk = waiter->task;
+ /*
+ * Make sure we do not wakeup the next reader before
+ * setting the nil condition to grant the next reader;
+ * otherwise we could miss the wakeup on the other
+ * side and end up sleeping again. See the pairing
+ * in rwsem_down_read_failed().
+ */
+ smp_mb();
+ waiter->task = NULL;
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ } while (--loop);
+
+ sem->wait_list.next = next;
+ next->prev = &sem->wait_list;
+
+ out:
+ return sem;
+}
+
+/*
+ * Wait for the read lock to be granted
+ */
+__visible
+struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+ long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
+ struct rwsem_waiter waiter;
+ struct task_struct *tsk = current;
+
+ /* set up my own style of waitqueue */
+ waiter.task = tsk;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ get_task_struct(tsk);
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list))
+ adjustment += RWSEM_WAITING_BIAS;
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ count = rwsem_atomic_update(adjustment, sem);
+
+ /* If there are no active locks, wake the front queued process(es).
+ *
+ * If there are no writers and we are first in the queue,
+ * wake our own waiter to join the existing active readers !
+ */
+ if (count == RWSEM_WAITING_BIAS ||
+ (count > RWSEM_WAITING_BIAS &&
+ adjustment != -RWSEM_ACTIVE_READ_BIAS))
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ /* wait to be given the lock */
+ while (true) {
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (!waiter.task)
+ break;
+ schedule();
+ }
+
+ __set_task_state(tsk, TASK_RUNNING);
+ return sem;
+}
+EXPORT_SYMBOL(rwsem_down_read_failed);
+
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ /*
+ * Try acquiring the write lock. Check count first in order
+ * to reduce unnecessary expensive cmpxchg() operations.
+ */
+ if (count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long old, count = READ_ONCE(sem->count);
+
+ while (true) {
+ if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+ return false;
+
+ old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ if (old == count) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ count = old;
+ }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool ret = true;
+
+ if (need_resched())
+ return false;
+
+ rcu_read_lock();
+ owner = READ_ONCE(sem->owner);
+ if (!owner) {
+ long count = READ_ONCE(sem->count);
+ /*
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath with the lock being active, then there is a possibility
+ * reader(s) may have the lock. To be safe, bail spinning in these
+ * situations.
+ */
+ if (count & RWSEM_ACTIVE_MASK)
+ ret = false;
+ goto done;
+ }
+
+ ret = owner->on_cpu;
+done:
+ rcu_read_unlock();
+ return ret;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+ long count;
+
+ rcu_read_lock();
+ while (sem->owner == owner) {
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ /* abort spinning when need_resched or owner is not running */
+ if (!owner->on_cpu || need_resched()) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ cpu_relax_lowlatency();
+ }
+ rcu_read_unlock();
+
+ if (READ_ONCE(sem->owner))
+ return true; /* new owner, continue spinning */
+
+ /*
+ * When the owner is not set, the lock could be free or
+ * held by readers. Check the counter to verify the
+ * state.
+ */
+ count = READ_ONCE(sem->count);
+ return (count == 0 || count == RWSEM_WAITING_BIAS);
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ while (true) {
+ owner = READ_ONCE(sem->owner);
+ if (owner && !rwsem_spin_on_owner(sem, owner))
+ break;
+
+ /* wait_lock will be acquired if write_lock is obtained */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+__visible
+struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
+{
+ long count;
+ bool waiting = true; /* any queued threads before us */
+ struct rwsem_waiter waiter;
+
+ /* undo write bias from down_write operation, stop active locking */
+ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ return sem;
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
+ if (list_empty(&sem->wait_list))
+ waiting = false;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ if (waiting) {
+ count = READ_ONCE(sem->count);
+
+ /*
+ * If there were already threads queued before us and there are
+ * no active writers, the lock must be read owned; so we try to
+ * wake any read locks that were queued ahead of us.
+ */
+ if (count > RWSEM_WAITING_BIAS)
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+
+ } else
+ count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+
+ /* wait until we successfully acquire the lock */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (true) {
+ if (rwsem_try_write_lock(count, sem))
+ break;
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ /* Block until there are no active lockers. */
+ do {
+ schedule();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ list_del(&waiter.list);
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ return sem;
+}
+EXPORT_SYMBOL(rwsem_down_write_failed);
+
+/*
+ * handle waking up a waiter on the semaphore
+ * - up_read/up_write has decremented the active part of count if we come here
+ */
+__visible
+struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ /* do nothing if list empty */
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return sem;
+}
+EXPORT_SYMBOL(rwsem_wake);
+
+/*
+ * downgrade a write lock into a read lock
+ * - caller incremented waiting part of count and discovered it still negative
+ * - just wake up any readers at the front of the queue
+ */
+__visible
+struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ /* do nothing if list empty */
+ if (!list_empty(&sem->wait_list))
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return sem;
+}
+EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
new file mode 100644
index 000000000..205be0ce3
--- /dev/null
+++ b/kernel/locking/rwsem.c
@@ -0,0 +1,166 @@
+/* kernel/rwsem.c: R/W semaphores, public implementation
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from asm-i386/semaphore.h
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/rwsem.h>
+#include <linux/atomic.h>
+
+#include "rwsem.h"
+
+/*
+ * lock for reading
+ */
+void __sched down_read(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+}
+
+EXPORT_SYMBOL(down_read);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_read_trylock(sem);
+
+ if (ret == 1)
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+EXPORT_SYMBOL(down_read_trylock);
+
+/*
+ * lock for writing
+ */
+void __sched down_write(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
+}
+
+EXPORT_SYMBOL(down_write);
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_write_trylock(sem);
+
+ if (ret == 1) {
+ rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
+ return ret;
+}
+
+EXPORT_SYMBOL(down_write_trylock);
+
+/*
+ * release a read lock
+ */
+void up_read(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read);
+
+/*
+ * release a write lock
+ */
+void up_write(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ rwsem_clear_owner(sem);
+ __up_write(sem);
+}
+
+EXPORT_SYMBOL(up_write);
+
+/*
+ * downgrade write lock to read lock
+ */
+void downgrade_write(struct rw_semaphore *sem)
+{
+ /*
+ * lockdep: a downgraded write will live on as a write
+ * dependency.
+ */
+ rwsem_clear_owner(sem);
+ __downgrade_write(sem);
+}
+
+EXPORT_SYMBOL(downgrade_write);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void down_read_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+}
+
+EXPORT_SYMBOL(down_read_nested);
+
+void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+{
+ might_sleep();
+ rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
+}
+
+EXPORT_SYMBOL(_down_write_nest_lock);
+
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
+void down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
+}
+
+EXPORT_SYMBOL(down_write_nested);
+
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
+#endif
+
+
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
new file mode 100644
index 000000000..870ed9a5b
--- /dev/null
+++ b/kernel/locking/rwsem.h
@@ -0,0 +1,20 @@
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
new file mode 100644
index 000000000..b8120abe5
--- /dev/null
+++ b/kernel/locking/semaphore.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * Distributed under the terms of the GNU GPL, version 2
+ *
+ * This file implements counting semaphores.
+ * A counting semaphore may be acquired 'n' times before sleeping.
+ * See mutex.c for single-acquisition sleeping locks which enforce
+ * rules which allow code to be debugged more easily.
+ */
+
+/*
+ * Some notes on the implementation:
+ *
+ * The spinlock controls access to the other members of the semaphore.
+ * down_trylock() and up() can be called from interrupt context, so we
+ * have to disable interrupts when taking the lock. It turns out various
+ * parts of the kernel expect to be able to use down() on a semaphore in
+ * interrupt context when they know it will succeed, so we have to use
+ * irqsave variants for down(), down_interruptible() and down_killable()
+ * too.
+ *
+ * The ->count variable represents how many more tasks can acquire this
+ * semaphore. If it's zero, there may be tasks waiting on the wait_list.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/ftrace.h>
+
+static noinline void __down(struct semaphore *sem);
+static noinline int __down_interruptible(struct semaphore *sem);
+static noinline int __down_killable(struct semaphore *sem);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
+static noinline void __up(struct semaphore *sem);
+
+/**
+ * down - acquire the semaphore
+ * @sem: the semaphore to be acquired
+ *
+ * Acquires the semaphore. If no more tasks are allowed to acquire the
+ * semaphore, calling this function will put the task to sleep until the
+ * semaphore is released.
+ *
+ * Use of this function is deprecated, please use down_interruptible() or
+ * down_killable() instead.
+ */
+void down(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ __down(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(down);
+
+/**
+ * down_interruptible - acquire the semaphore unless interrupted
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a signal, this function will return -EINTR.
+ * If the semaphore is successfully acquired, this function returns 0.
+ */
+int down_interruptible(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_interruptible(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_interruptible);
+
+/**
+ * down_killable - acquire the semaphore unless killed
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a fatal signal, this function will return
+ * -EINTR. If the semaphore is successfully acquired, this function returns
+ * 0.
+ */
+int down_killable(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_killable(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_killable);
+
+/**
+ * down_trylock - try to acquire the semaphore, without waiting
+ * @sem: the semaphore to be acquired
+ *
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
+ * been acquired successfully or 1 if it it cannot be acquired.
+ *
+ * NOTE: This return value is inverted from both spin_trylock and
+ * mutex_trylock! Be careful about this when converting code.
+ *
+ * Unlike mutex_trylock, this function can be used from interrupt context,
+ * and the semaphore can be released by any task or interrupt.
+ */
+int down_trylock(struct semaphore *sem)
+{
+ unsigned long flags;
+ int count;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ count = sem->count - 1;
+ if (likely(count >= 0))
+ sem->count = count;
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return (count < 0);
+}
+EXPORT_SYMBOL(down_trylock);
+
+/**
+ * down_timeout - acquire the semaphore within a specified time
+ * @sem: the semaphore to be acquired
+ * @timeout: how long to wait before failing
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the semaphore is not released within the specified number of jiffies,
+ * this function returns -ETIME. It returns 0 if the semaphore was acquired.
+ */
+int down_timeout(struct semaphore *sem, long timeout)
+{
+ unsigned long flags;
+ int result = 0;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_timeout(sem, timeout);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_timeout);
+
+/**
+ * up - release the semaphore
+ * @sem: the semaphore to release
+ *
+ * Release the semaphore. Unlike mutexes, up() may be called from any
+ * context and even by tasks which have never called down().
+ */
+void up(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&sem->lock, flags);
+ if (likely(list_empty(&sem->wait_list)))
+ sem->count++;
+ else
+ __up(sem);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(up);
+
+/* Functions for the contended case */
+
+struct semaphore_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ bool up;
+};
+
+/*
+ * Because this function is inlined, the 'state' parameter will be
+ * constant, and thus optimised away by the compiler. Likewise the
+ * 'timeout' parameter for the cases without timeouts.
+ */
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ struct task_struct *task = current;
+ struct semaphore_waiter waiter;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+ waiter.task = task;
+ waiter.up = false;
+
+ for (;;) {
+ if (signal_pending_state(state, task))
+ goto interrupted;
+ if (unlikely(timeout <= 0))
+ goto timed_out;
+ __set_task_state(task, state);
+ raw_spin_unlock_irq(&sem->lock);
+ timeout = schedule_timeout(timeout);
+ raw_spin_lock_irq(&sem->lock);
+ if (waiter.up)
+ return 0;
+ }
+
+ timed_out:
+ list_del(&waiter.list);
+ return -ETIME;
+
+ interrupted:
+ list_del(&waiter.list);
+ return -EINTR;
+}
+
+static noinline void __sched __down(struct semaphore *sem)
+{
+ __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_interruptible(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_killable(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
+{
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
+}
+
+static noinline void __sched __up(struct semaphore *sem)
+{
+ struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
+ struct semaphore_waiter, list);
+ list_del(&waiter->list);
+ waiter->up = true;
+ wake_up_process(waiter->task);
+}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
new file mode 100644
index 000000000..db3ccb1dd
--- /dev/null
+++ b/kernel/locking/spinlock.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright (2004) Linus Torvalds
+ *
+ * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
+ *
+ * Copyright (2004, 2005) Ingo Molnar
+ *
+ * This file contains the spinlock/rwlock implementations for the
+ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
+ *
+ * Note that some architectures have special knowledge about the
+ * stack frames of these functions in their profile_pc. If you
+ * change anything significant here that could change the stack
+ * frame contact the architecture maintainers.
+ */
+
+#include <linux/linkage.h>
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/export.h>
+
+/*
+ * If lockdep is enabled then we use the non-preemption spin-ops
+ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * not re-enabled during lock-acquire (which the preempt-spin-ops do):
+ */
+#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
+/*
+ * The __lock_function inlines are taken from
+ * include/linux/spinlock_api_smp.h
+ */
+#else
+#define raw_read_can_lock(l) read_can_lock(l)
+#define raw_write_can_lock(l) write_can_lock(l)
+
+/*
+ * Some architectures can relax in favour of the CPU owning the lock.
+ */
+#ifndef arch_read_relax
+# define arch_read_relax(l) cpu_relax()
+#endif
+#ifndef arch_write_relax
+# define arch_write_relax(l) cpu_relax()
+#endif
+#ifndef arch_spin_relax
+# define arch_spin_relax(l) cpu_relax()
+#endif
+
+/*
+ * We build the __lock_function inlines here. They are too large for
+ * inlining all over the place, but here is only one user per function
+ * which embedds them into the calling _lock_function below.
+ *
+ * This could be a long-held lock. We both prepare to spin for a long
+ * time (making _this_ CPU preemptable if possible), and we also signal
+ * towards that other CPU that it should break the lock ASAP.
+ */
+#define BUILD_LOCK_OPS(op, locktype) \
+void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
+{ \
+ for (;;) { \
+ preempt_disable(); \
+ if (likely(do_raw_##op##_trylock(lock))) \
+ break; \
+ preempt_enable(); \
+ \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ arch_##op##_relax(&lock->raw_lock); \
+ } \
+ (lock)->break_lock = 0; \
+} \
+ \
+unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
+{ \
+ unsigned long flags; \
+ \
+ for (;;) { \
+ preempt_disable(); \
+ local_irq_save(flags); \
+ if (likely(do_raw_##op##_trylock(lock))) \
+ break; \
+ local_irq_restore(flags); \
+ preempt_enable(); \
+ \
+ if (!(lock)->break_lock) \
+ (lock)->break_lock = 1; \
+ while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ arch_##op##_relax(&lock->raw_lock); \
+ } \
+ (lock)->break_lock = 0; \
+ return flags; \
+} \
+ \
+void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
+{ \
+ _raw_##op##_lock_irqsave(lock); \
+} \
+ \
+void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
+{ \
+ unsigned long flags; \
+ \
+ /* */ \
+ /* Careful: we must exclude softirqs too, hence the */ \
+ /* irq-disabling. We use the generic preemption-aware */ \
+ /* function: */ \
+ /**/ \
+ flags = _raw_##op##_lock_irqsave(lock); \
+ local_bh_disable(); \
+ local_irq_restore(flags); \
+} \
+
+/*
+ * Build preemption-friendly versions of the following
+ * lock-spinning functions:
+ *
+ * __[spin|read|write]_lock()
+ * __[spin|read|write]_lock_irq()
+ * __[spin|read|write]_lock_irqsave()
+ * __[spin|read|write]_lock_bh()
+ */
+BUILD_LOCK_OPS(spin, raw_spinlock);
+BUILD_LOCK_OPS(read, rwlock);
+BUILD_LOCK_OPS(write, rwlock);
+
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return __raw_spin_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_trylock);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
+int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+{
+ return __raw_spin_trylock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_spin_trylock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+{
+ __raw_spin_lock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
+unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+{
+ return __raw_spin_lock_irqsave(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_irqsave);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+{
+ __raw_spin_lock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_LOCK_BH
+void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+{
+ __raw_spin_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh);
+#endif
+
+#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+{
+ __raw_spin_unlock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_unlock);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
+void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+{
+ __raw_spin_unlock_irqrestore(lock, flags);
+}
+EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+{
+ __raw_spin_unlock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_spin_unlock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
+void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+{
+ __raw_spin_unlock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_spin_unlock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_READ_TRYLOCK
+int __lockfunc _raw_read_trylock(rwlock_t *lock)
+{
+ return __raw_read_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_read_trylock);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK
+void __lockfunc _raw_read_lock(rwlock_t *lock)
+{
+ __raw_read_lock(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
+unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+{
+ return __raw_read_lock_irqsave(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock_irqsave);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK_IRQ
+void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+{
+ __raw_read_lock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_READ_LOCK_BH
+void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+{
+ __raw_read_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_read_lock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK
+void __lockfunc _raw_read_unlock(rwlock_t *lock)
+{
+ __raw_read_unlock(lock);
+}
+EXPORT_SYMBOL(_raw_read_unlock);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
+void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+ __raw_read_unlock_irqrestore(lock, flags);
+}
+EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
+void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+{
+ __raw_read_unlock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_read_unlock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_READ_UNLOCK_BH
+void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+{
+ __raw_read_unlock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_read_unlock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_TRYLOCK
+int __lockfunc _raw_write_trylock(rwlock_t *lock)
+{
+ return __raw_write_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_write_trylock);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK
+void __lockfunc _raw_write_lock(rwlock_t *lock)
+{
+ __raw_write_lock(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
+unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+{
+ return __raw_write_lock_irqsave(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_irqsave);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
+void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+{
+ __raw_write_lock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_LOCK_BH
+void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+{
+ __raw_write_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_bh);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK
+void __lockfunc _raw_write_unlock(rwlock_t *lock)
+{
+ __raw_write_unlock(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
+void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+ __raw_write_unlock_irqrestore(lock, flags);
+}
+EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
+void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+{
+ __raw_write_unlock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock_irq);
+#endif
+
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
+void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+{
+ __raw_write_unlock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock_bh);
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
+{
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_nested);
+
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
+
+unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
+ int subclass)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
+ do_raw_spin_lock_flags, &flags);
+ return flags;
+}
+EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
+
+void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ preempt_disable();
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
+
+#endif
+
+notrace int in_lock_functions(unsigned long addr)
+{
+ /* Linker adds these: start and end of __lockfunc functions */
+ extern char __lock_text_start[], __lock_text_end[];
+
+ return addr >= (unsigned long)__lock_text_start
+ && addr < (unsigned long)__lock_text_end;
+}
+EXPORT_SYMBOL(in_lock_functions);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
new file mode 100644
index 000000000..0374a596c
--- /dev/null
+++ b/kernel/locking/spinlock_debug.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2005, Red Hat, Inc., Ingo Molnar
+ * Released under the General Public License (GPL).
+ *
+ * This file contains the spinlock/rwlock implementations for
+ * DEBUG_SPINLOCK.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/nmi.h>
+#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+
+void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+ lock->magic = SPINLOCK_MAGIC;
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+EXPORT_SYMBOL(__raw_spin_lock_init);
+
+void __rwlock_init(rwlock_t *lock, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+ lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
+ lock->magic = RWLOCK_MAGIC;
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+EXPORT_SYMBOL(__rwlock_init);
+
+static void spin_dump(raw_spinlock_t *lock, const char *msg)
+{
+ struct task_struct *owner = NULL;
+
+ if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
+ owner = lock->owner;
+ printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
+ msg, raw_smp_processor_id(),
+ current->comm, task_pid_nr(current));
+ printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
+ ".owner_cpu: %d\n",
+ lock, lock->magic,
+ owner ? owner->comm : "<none>",
+ owner ? task_pid_nr(owner) : -1,
+ lock->owner_cpu);
+ dump_stack();
+}
+
+static void spin_bug(raw_spinlock_t *lock, const char *msg)
+{
+ if (!debug_locks_off())
+ return;
+
+ spin_dump(lock, msg);
+}
+
+#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg)
+
+static inline void
+debug_spin_lock_before(raw_spinlock_t *lock)
+{
+ SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(lock->owner == current, lock, "recursion");
+ SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ lock, "cpu recursion");
+}
+
+static inline void debug_spin_lock_after(raw_spinlock_t *lock)
+{
+ lock->owner_cpu = raw_smp_processor_id();
+ lock->owner = current;
+}
+
+static inline void debug_spin_unlock(raw_spinlock_t *lock)
+{
+ SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked");
+ SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
+ SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+ lock, "wrong CPU");
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+static void __spin_lock_debug(raw_spinlock_t *lock)
+{
+ u64 i;
+ u64 loops = loops_per_jiffy * HZ;
+
+ for (i = 0; i < loops; i++) {
+ if (arch_spin_trylock(&lock->raw_lock))
+ return;
+ __delay(1);
+ }
+ /* lockup suspected: */
+ spin_dump(lock, "lockup suspected");
+#ifdef CONFIG_SMP
+ trigger_all_cpu_backtrace();
+#endif
+
+ /*
+ * The trylock above was causing a livelock. Give the lower level arch
+ * specific lock code a chance to acquire the lock. We have already
+ * printed a warning/backtrace at this point. The non-debug arch
+ * specific code might actually succeed in acquiring the lock. If it is
+ * not successful, the end-result is the same - there is no forward
+ * progress.
+ */
+ arch_spin_lock(&lock->raw_lock);
+}
+
+void do_raw_spin_lock(raw_spinlock_t *lock)
+{
+ debug_spin_lock_before(lock);
+ if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
+ __spin_lock_debug(lock);
+ debug_spin_lock_after(lock);
+}
+
+int do_raw_spin_trylock(raw_spinlock_t *lock)
+{
+ int ret = arch_spin_trylock(&lock->raw_lock);
+
+ if (ret)
+ debug_spin_lock_after(lock);
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ SPIN_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_spin_unlock(raw_spinlock_t *lock)
+{
+ debug_spin_unlock(lock);
+ arch_spin_unlock(&lock->raw_lock);
+}
+
+static void rwlock_bug(rwlock_t *lock, const char *msg)
+{
+ if (!debug_locks_off())
+ return;
+
+ printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n",
+ msg, raw_smp_processor_id(), current->comm,
+ task_pid_nr(current), lock);
+ dump_stack();
+}
+
+#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
+
+#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
+static void __read_lock_debug(rwlock_t *lock)
+{
+ u64 i;
+ u64 loops = loops_per_jiffy * HZ;
+ int print_once = 1;
+
+ for (;;) {
+ for (i = 0; i < loops; i++) {
+ if (arch_read_trylock(&lock->raw_lock))
+ return;
+ __delay(1);
+ }
+ /* lockup suspected: */
+ if (print_once) {
+ print_once = 0;
+ printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
+ "%s/%d, %p\n",
+ raw_smp_processor_id(), current->comm,
+ current->pid, lock);
+ dump_stack();
+ }
+ }
+}
+#endif
+
+void do_raw_read_lock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ arch_read_lock(&lock->raw_lock);
+}
+
+int do_raw_read_trylock(rwlock_t *lock)
+{
+ int ret = arch_read_trylock(&lock->raw_lock);
+
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_read_unlock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ arch_read_unlock(&lock->raw_lock);
+}
+
+static inline void debug_write_lock_before(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
+ RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ lock, "cpu recursion");
+}
+
+static inline void debug_write_lock_after(rwlock_t *lock)
+{
+ lock->owner_cpu = raw_smp_processor_id();
+ lock->owner = current;
+}
+
+static inline void debug_write_unlock(rwlock_t *lock)
+{
+ RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
+ RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
+ RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
+ lock, "wrong CPU");
+ lock->owner = SPINLOCK_OWNER_INIT;
+ lock->owner_cpu = -1;
+}
+
+#if 0 /* This can cause lockups */
+static void __write_lock_debug(rwlock_t *lock)
+{
+ u64 i;
+ u64 loops = loops_per_jiffy * HZ;
+ int print_once = 1;
+
+ for (;;) {
+ for (i = 0; i < loops; i++) {
+ if (arch_write_trylock(&lock->raw_lock))
+ return;
+ __delay(1);
+ }
+ /* lockup suspected: */
+ if (print_once) {
+ print_once = 0;
+ printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
+ "%s/%d, %p\n",
+ raw_smp_processor_id(), current->comm,
+ current->pid, lock);
+ dump_stack();
+ }
+ }
+}
+#endif
+
+void do_raw_write_lock(rwlock_t *lock)
+{
+ debug_write_lock_before(lock);
+ arch_write_lock(&lock->raw_lock);
+ debug_write_lock_after(lock);
+}
+
+int do_raw_write_trylock(rwlock_t *lock)
+{
+ int ret = arch_write_trylock(&lock->raw_lock);
+
+ if (ret)
+ debug_write_lock_after(lock);
+#ifndef CONFIG_SMP
+ /*
+ * Must not happen on UP:
+ */
+ RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP");
+#endif
+ return ret;
+}
+
+void do_raw_write_unlock(rwlock_t *lock)
+{
+ debug_write_unlock(lock);
+ arch_write_unlock(&lock->raw_lock);
+}
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
new file mode 100644
index 000000000..915e123a4
--- /dev/null
+++ b/kernel/module-internal.h
@@ -0,0 +1,12 @@
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
new file mode 100644
index 000000000..cfc9e843a
--- /dev/null
+++ b/kernel/module.c
@@ -0,0 +1,3917 @@
+/*
+ Copyright (C) 2002 Richard Henderson
+ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/export.h>
+#include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/string.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <asm/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/mmu_context.h>
+#include <linux/license.h>
+#include <asm/sections.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+#include <linux/async.h>
+#include <linux/percpu.h>
+#include <linux/kmemleak.h>
+#include <linux/jump_label.h>
+#include <linux/pfn.h>
+#include <linux/bsearch.h>
+#include <uapi/linux/module.h>
+#include "module-internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ */
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+# define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
+# define debug_align(X) (X)
+#endif
+
+/*
+ * Given BASE and SIZE this macro calculates the number of pages the
+ * memory regions occupies
+ */
+#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
+ (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
+ PFN_DOWN((unsigned long)BASE) + 1) \
+ : (0UL))
+
+/* If this is set, the section belongs in the init part of the module */
+#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
+
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable with preempt_disable),
+ * 2) module_use links,
+ * 3) module_addr_min/module_addr_max.
+ * (delete and add uses RCU list operations). */
+DEFINE_MUTEX(module_mutex);
+EXPORT_SYMBOL_GPL(module_mutex);
+static LIST_HEAD(modules);
+#ifdef CONFIG_KGDB_KDB
+struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+#endif /* CONFIG_KGDB_KDB */
+
+#ifdef CONFIG_MODULE_SIG
+#ifdef CONFIG_MODULE_SIG_FORCE
+static bool sig_enforce = true;
+#else
+static bool sig_enforce = false;
+
+static int param_set_bool_enable_only(const char *val,
+ const struct kernel_param *kp)
+{
+ int err;
+ bool test;
+ struct kernel_param dummy_kp = *kp;
+
+ dummy_kp.arg = &test;
+
+ err = param_set_bool(val, &dummy_kp);
+ if (err)
+ return err;
+
+ /* Don't let them unset it once it's set! */
+ if (!test && sig_enforce)
+ return -EROFS;
+
+ if (test)
+ sig_enforce = true;
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_bool_enable_only = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = param_set_bool_enable_only,
+ .get = param_get_bool,
+};
+#define param_check_bool_enable_only param_check_bool
+
+module_param(sig_enforce, bool_enable_only, 0644);
+#endif /* !CONFIG_MODULE_SIG_FORCE */
+#endif /* CONFIG_MODULE_SIG */
+
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+core_param(nomodule, modules_disabled, bint, 0);
+
+/* Waiting for a module to finish initializing? */
+static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+
+/* Bounds of module allocation, for speeding __module_address.
+ * Protected by module_mutex. */
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
+int register_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+struct load_info {
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long symoffs, stroffs;
+ struct _ddebug *debug;
+ unsigned int num_debug;
+ bool sig_ok;
+ struct {
+ unsigned int sym, str, mod, vers, info, pcpu;
+ } index;
+};
+
+/* We require a truly strong try_module_get(): 0 means failure due to
+ ongoing or failed initialization etc. */
+static inline int strong_try_module_get(struct module *mod)
+{
+ BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
+ if (mod && mod->state == MODULE_STATE_COMING)
+ return -EBUSY;
+ if (try_module_get(mod))
+ return 0;
+ else
+ return -ENOENT;
+}
+
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
+{
+ add_taint(flag, lockdep_ok);
+ mod->taints |= (1U << flag);
+}
+
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit. nfsd and lockd use this.
+ */
+void __module_put_and_exit(struct module *mod, long code)
+{
+ module_put(mod);
+ do_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_exit);
+
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(const struct load_info *info, const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ /* Alloc bit cleared means "ignore it." */
+ if ((shdr->sh_flags & SHF_ALLOC)
+ && strcmp(info->secstrings + shdr->sh_name, name) == 0)
+ return i;
+ }
+ return 0;
+}
+
+/* Find a module section, or NULL. */
+static void *section_addr(const struct load_info *info, const char *name)
+{
+ /* Section 0 has sh_addr 0. */
+ return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
+}
+
+/* Find a module section, or NULL. Fill in number of "objects" in section. */
+static void *section_objs(const struct load_info *info,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_sec(info, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
+}
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
+extern const unsigned long __start___kcrctab[];
+extern const unsigned long __start___kcrctab_gpl[];
+extern const unsigned long __start___kcrctab_gpl_future[];
+#ifdef CONFIG_UNUSED_SYMBOLS
+extern const struct kernel_symbol __start___ksymtab_unused[];
+extern const struct kernel_symbol __stop___ksymtab_unused[];
+extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
+extern const unsigned long __start___kcrctab_unused[];
+extern const unsigned long __start___kcrctab_unused_gpl[];
+#endif
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
+#endif
+
+static bool each_symbol_in_section(const struct symsearch *arr,
+ unsigned int arrsize,
+ struct module *owner,
+ bool (*fn)(const struct symsearch *syms,
+ struct module *owner,
+ void *data),
+ void *data)
+{
+ unsigned int j;
+
+ for (j = 0; j < arrsize; j++) {
+ if (fn(&arr[j], owner, data))
+ return true;
+ }
+
+ return false;
+}
+
+/* Returns true as soon as fn returns true, otherwise false. */
+bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+ struct module *owner,
+ void *data),
+ void *data)
+{
+ struct module *mod;
+ static const struct symsearch arr[] = {
+ { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ NOT_GPL_ONLY, false },
+ { __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ __start___kcrctab_gpl,
+ GPL_ONLY, false },
+ { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
+ __start___kcrctab_gpl_future,
+ WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
+ { __start___ksymtab_unused, __stop___ksymtab_unused,
+ __start___kcrctab_unused,
+ NOT_GPL_ONLY, true },
+ { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
+ __start___kcrctab_unused_gpl,
+ GPL_ONLY, true },
+#endif
+ };
+
+ if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
+ return true;
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ struct symsearch arr[] = {
+ { mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ NOT_GPL_ONLY, false },
+ { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ mod->gpl_crcs,
+ GPL_ONLY, false },
+ { mod->gpl_future_syms,
+ mod->gpl_future_syms + mod->num_gpl_future_syms,
+ mod->gpl_future_crcs,
+ WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
+ { mod->unused_syms,
+ mod->unused_syms + mod->num_unused_syms,
+ mod->unused_crcs,
+ NOT_GPL_ONLY, true },
+ { mod->unused_gpl_syms,
+ mod->unused_gpl_syms + mod->num_unused_gpl_syms,
+ mod->unused_gpl_crcs,
+ GPL_ONLY, true },
+#endif
+ };
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(each_symbol_section);
+
+struct find_symbol_arg {
+ /* Input */
+ const char *name;
+ bool gplok;
+ bool warn;
+
+ /* Output */
+ struct module *owner;
+ const unsigned long *crc;
+ const struct kernel_symbol *sym;
+};
+
+static bool check_symbol(const struct symsearch *syms,
+ struct module *owner,
+ unsigned int symnum, void *data)
+{
+ struct find_symbol_arg *fsa = data;
+
+ if (!fsa->gplok) {
+ if (syms->licence == GPL_ONLY)
+ return false;
+ if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+ pr_warn("Symbol %s is being used by a non-GPL module, "
+ "which will not be allowed in the future\n",
+ fsa->name);
+ }
+ }
+
+#ifdef CONFIG_UNUSED_SYMBOLS
+ if (syms->unused && fsa->warn) {
+ pr_warn("Symbol %s is marked as UNUSED, however this module is "
+ "using it.\n", fsa->name);
+ pr_warn("This symbol will go away in the future.\n");
+ pr_warn("Please evaluate if this is the right api to use and "
+ "if it really is, submit a report to the linux kernel "
+ "mailing list together with submitting your code for "
+ "inclusion.\n");
+ }
+#endif
+
+ fsa->owner = owner;
+ fsa->crc = symversion(syms->crcs, symnum);
+ fsa->sym = &syms->start[symnum];
+ return true;
+}
+
+static int cmp_name(const void *va, const void *vb)
+{
+ const char *a;
+ const struct kernel_symbol *b;
+ a = va; b = vb;
+ return strcmp(a, b->name);
+}
+
+static bool find_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ void *data)
+{
+ struct find_symbol_arg *fsa = data;
+ struct kernel_symbol *sym;
+
+ sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
+ sizeof(struct kernel_symbol), cmp_name);
+
+ if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
+ return true;
+
+ return false;
+}
+
+/* Find a symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it. Needs preempt disabled or module_mutex. */
+const struct kernel_symbol *find_symbol(const char *name,
+ struct module **owner,
+ const unsigned long **crc,
+ bool gplok,
+ bool warn)
+{
+ struct find_symbol_arg fsa;
+
+ fsa.name = name;
+ fsa.gplok = gplok;
+ fsa.warn = warn;
+
+ if (each_symbol_section(find_symbol_in_section, &fsa)) {
+ if (owner)
+ *owner = fsa.owner;
+ if (crc)
+ *crc = fsa.crc;
+ return fsa.sym;
+ }
+
+ pr_debug("Failed to find symbol %s\n", name);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(find_symbol);
+
+/* Search for module by name: must hold module_mutex. */
+static struct module *find_module_all(const char *name, size_t len,
+ bool even_unformed)
+{
+ struct module *mod;
+
+ list_for_each_entry(mod, &modules, list) {
+ if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
+ return mod;
+ }
+ return NULL;
+}
+
+struct module *find_module(const char *name)
+{
+ return find_module_all(name, strlen(name), false);
+}
+EXPORT_SYMBOL_GPL(find_module);
+
+#ifdef CONFIG_SMP
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return mod->percpu;
+}
+
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ unsigned long align = pcpusec->sh_addralign;
+
+ if (!pcpusec->sh_size)
+ return 0;
+
+ if (align > PAGE_SIZE) {
+ pr_warn("%s: per-cpu alignment %li > %li\n",
+ mod->name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
+ }
+
+ mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
+ if (!mod->percpu) {
+ pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, (unsigned long)pcpusec->sh_size);
+ return -ENOMEM;
+ }
+ mod->percpu_size = pcpusec->sh_size;
+ return 0;
+}
+
+static void percpu_modfree(struct module *mod)
+{
+ free_percpu(mod->percpu);
+}
+
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return find_sec(info, ".data..percpu");
+}
+
+static void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+}
+
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+ struct module *mod;
+ unsigned int cpu;
+
+ preempt_disable();
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (!mod->percpu_size)
+ continue;
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(mod->percpu, cpu);
+
+ if ((void *)addr >= start &&
+ (void *)addr < start + mod->percpu_size) {
+ preempt_enable();
+ return true;
+ }
+ }
+ }
+
+ preempt_enable();
+ return false;
+}
+
+#else /* ... !CONFIG_SMP */
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return NULL;
+}
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ /* UP modules shouldn't have this section: ENOMEM isn't quite right */
+ if (info->sechdrs[info->index.pcpu].sh_size != 0)
+ return -ENOMEM;
+ return 0;
+}
+static inline void percpu_modfree(struct module *mod)
+{
+}
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return 0;
+}
+static inline void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ /* pcpusec should be 0, and size of that section should be 0. */
+ BUG_ON(size != 0);
+}
+bool is_module_percpu_address(unsigned long addr)
+{
+ return false;
+}
+
+#endif /* CONFIG_SMP */
+
+#define MODINFO_ATTR(field) \
+static void setup_modinfo_##field(struct module *mod, const char *s) \
+{ \
+ mod->field = kstrdup(s, GFP_KERNEL); \
+} \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
+ struct module_kobject *mk, char *buffer) \
+{ \
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
+} \
+static int modinfo_##field##_exists(struct module *mod) \
+{ \
+ return mod->field != NULL; \
+} \
+static void free_modinfo_##field(struct module *mod) \
+{ \
+ kfree(mod->field); \
+ mod->field = NULL; \
+} \
+static struct module_attribute modinfo_##field = { \
+ .attr = { .name = __stringify(field), .mode = 0444 }, \
+ .show = show_modinfo_##field, \
+ .setup = setup_modinfo_##field, \
+ .test = modinfo_##field##_exists, \
+ .free = free_modinfo_##field, \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static char last_unloaded_module[MODULE_NAME_LEN+1];
+
+#ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
+/* Init the unload section of the module. */
+static int module_unload_init(struct module *mod)
+{
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
+
+ INIT_LIST_HEAD(&mod->source_list);
+ INIT_LIST_HEAD(&mod->target_list);
+
+ /* Hold reference count during initialization. */
+ atomic_inc(&mod->refcnt);
+
+ return 0;
+}
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ list_for_each_entry(use, &b->source_list, source_list) {
+ if (use->source == a) {
+ pr_debug("%s uses %s!\n", a->name, b->name);
+ return 1;
+ }
+ }
+ pr_debug("%s does not use %s!\n", a->name, b->name);
+ return 0;
+}
+
+/*
+ * Module a uses b
+ * - we add 'a' as a "source", 'b' as a "target" of module use
+ * - the module_use is added to the list of 'b' sources (so
+ * 'b' can walk the list to see who sourced them), and of 'a'
+ * targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ pr_debug("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use) {
+ pr_warn("%s: out of memory loading\n", a->name);
+ return -ENOMEM;
+ }
+
+ use->source = a;
+ use->target = b;
+ list_add(&use->source_list, &b->source_list);
+ list_add(&use->target_list, &a->target_list);
+ return 0;
+}
+
+/* Module a uses b: caller needs module_mutex() */
+int ref_module(struct module *a, struct module *b)
+{
+ int err;
+
+ if (b == NULL || already_uses(a, b))
+ return 0;
+
+ /* If module isn't available, we fail. */
+ err = strong_try_module_get(b);
+ if (err)
+ return err;
+
+ err = add_module_usage(a, b);
+ if (err) {
+ module_put(b);
+ return err;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ref_module);
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+ struct module_use *use, *tmp;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+ struct module *i = use->target;
+ pr_debug("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->source_list);
+ list_del(&use->target_list);
+ kfree(use);
+ }
+ mutex_unlock(&module_mutex);
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force_unload(unsigned int flags)
+{
+ int ret = (flags & O_TRUNC);
+ if (ret)
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
+ return ret;
+}
+#else
+static inline int try_force_unload(unsigned int flags)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
+{
+ int ret;
+
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+ /* If it's not unused, quit unless we're forcing. */
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
+ return -EWOULDBLOCK;
+ }
+
+ /* Mark it as dying. */
+ mod->state = MODULE_STATE_GOING;
+
+ return 0;
+}
+
+/**
+ * module_refcount - return the refcount or -1 if unloading
+ *
+ * @mod: the module we're checking
+ *
+ * Returns:
+ * -1 if the module is in the process of unloading
+ * otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
+{
+ return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+ unsigned int, flags)
+{
+ struct module *mod;
+ char name[MODULE_NAME_LEN];
+ int ret, forced = 0;
+
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
+ return -EFAULT;
+ name[MODULE_NAME_LEN-1] = '\0';
+
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ mod = find_module(name);
+ if (!mod) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!list_empty(&mod->source_list)) {
+ /* Other modules depend on us: get rid of them first. */
+ ret = -EWOULDBLOCK;
+ goto out;
+ }
+
+ /* Doing init or already dying? */
+ if (mod->state != MODULE_STATE_LIVE) {
+ /* FIXME: if (force), slam module count damn the torpedoes */
+ pr_debug("%s already dying\n", mod->name);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* If it has an init func, it must have an exit func to unload */
+ if (mod->init && !mod->exit) {
+ forced = try_force_unload(flags);
+ if (!forced) {
+ /* This module can't be removed */
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ /* Stop the machine so refcounts can't move and disable module. */
+ ret = try_stop_module(mod, flags, &forced);
+ if (ret != 0)
+ goto out;
+
+ mutex_unlock(&module_mutex);
+ /* Final destruction now no one is using it. */
+ if (mod->exit != NULL)
+ mod->exit();
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ async_synchronize_full();
+
+ /* Store the name of the last unloaded module for diagnostic purposes */
+ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
+
+ free_module(mod);
+ return 0;
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
+
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %i ", module_refcount(mod));
+
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
+ list_for_each_entry(use, &mod->source_list, source_list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->source->name);
+ }
+
+ if (mod->init != NULL && mod->exit == NULL) {
+ printed_something = 1;
+ seq_puts(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_puts(m, "-");
+}
+
+void __symbol_put(const char *symbol)
+{
+ struct module *owner;
+
+ preempt_disable();
+ if (!find_symbol(symbol, &owner, NULL, true, false))
+ BUG();
+ module_put(owner);
+ preempt_enable();
+}
+EXPORT_SYMBOL(__symbol_put);
+
+/* Note this assumes addr is a function, which it currently always is. */
+void symbol_put_addr(void *addr)
+{
+ struct module *modaddr;
+ unsigned long a = (unsigned long)dereference_function_descriptor(addr);
+
+ if (core_kernel_text(a))
+ return;
+
+ /* module_text_address is safe here: we're supposed to have reference
+ * to module from symbol_get, so it can't go away. */
+ modaddr = __module_text_address(a);
+ BUG_ON(!modaddr);
+ module_put(modaddr);
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%i\n", module_refcount(mk->mod));
+}
+
+static struct module_attribute modinfo_refcnt =
+ __ATTR(refcnt, 0444, show_refcnt, NULL);
+
+void __module_get(struct module *module)
+{
+ if (module) {
+ preempt_disable();
+ atomic_inc(&module->refcnt);
+ trace_module_get(module, _RET_IP_);
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(__module_get);
+
+bool try_module_get(struct module *module)
+{
+ bool ret = true;
+
+ if (module) {
+ preempt_disable();
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
+ trace_module_get(module, _RET_IP_);
+ else
+ ret = false;
+
+ preempt_enable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(try_module_get);
+
+void module_put(struct module *module)
+{
+ int ret;
+
+ if (module) {
+ preempt_disable();
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
+ trace_module_put(module, _RET_IP_);
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(module_put);
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using. */
+ seq_puts(m, " - -");
+}
+
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+int ref_module(struct module *a, struct module *b)
+{
+ return strong_try_module_get(b);
+}
+EXPORT_SYMBOL_GPL(ref_module);
+
+static inline int module_unload_init(struct module *mod)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+static size_t module_flags_taint(struct module *mod, char *buf)
+{
+ size_t l = 0;
+
+ if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
+ buf[l++] = 'P';
+ if (mod->taints & (1 << TAINT_OOT_MODULE))
+ buf[l++] = 'O';
+ if (mod->taints & (1 << TAINT_FORCED_MODULE))
+ buf[l++] = 'F';
+ if (mod->taints & (1 << TAINT_CRAP))
+ buf[l++] = 'C';
+ if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
+ buf[l++] = 'E';
+ /*
+ * TAINT_FORCED_RMMOD: could be added.
+ * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+ * apply to modules.
+ */
+ return l;
+}
+
+static ssize_t show_initstate(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ const char *state = "unknown";
+
+ switch (mk->mod->state) {
+ case MODULE_STATE_LIVE:
+ state = "live";
+ break;
+ case MODULE_STATE_COMING:
+ state = "coming";
+ break;
+ case MODULE_STATE_GOING:
+ state = "going";
+ break;
+ default:
+ BUG();
+ }
+ return sprintf(buffer, "%s\n", state);
+}
+
+static struct module_attribute modinfo_initstate =
+ __ATTR(initstate, 0444, show_initstate, NULL);
+
+static ssize_t store_uevent(struct module_attribute *mattr,
+ struct module_kobject *mk,
+ const char *buffer, size_t count)
+{
+ enum kobject_action action;
+
+ if (kobject_action_type(buffer, count, &action) == 0)
+ kobject_uevent(&mk->kobj, action);
+ return count;
+}
+
+struct module_attribute module_uevent =
+ __ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%u\n", mk->mod->core_size);
+}
+
+static struct module_attribute modinfo_coresize =
+ __ATTR(coresize, 0444, show_coresize, NULL);
+
+static ssize_t show_initsize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%u\n", mk->mod->init_size);
+}
+
+static struct module_attribute modinfo_initsize =
+ __ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ size_t l;
+
+ l = module_flags_taint(mk->mod, buffer);
+ buffer[l++] = '\n';
+ return l;
+}
+
+static struct module_attribute modinfo_taint =
+ __ATTR(taint, 0444, show_taint, NULL);
+
+static struct module_attribute *modinfo_attrs[] = {
+ &module_uevent,
+ &modinfo_version,
+ &modinfo_srcversion,
+ &modinfo_initstate,
+ &modinfo_coresize,
+ &modinfo_initsize,
+ &modinfo_taint,
+#ifdef CONFIG_MODULE_UNLOAD
+ &modinfo_refcnt,
+#endif
+ NULL,
+};
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+static int try_to_force_load(struct module *mod, const char *reason)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+ if (!test_taint(TAINT_FORCED_MODULE))
+ pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
+ return 0;
+#else
+ return -ENOEXEC;
+#endif
+}
+
+#ifdef CONFIG_MODVERSIONS
+/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
+static unsigned long maybe_relocated(unsigned long crc,
+ const struct module *crc_owner)
+{
+#ifdef ARCH_RELOCATES_KCRCTAB
+ if (crc_owner == NULL)
+ return crc - (unsigned long)reloc_start;
+#endif
+ return crc;
+}
+
+static int check_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *symname,
+ struct module *mod,
+ const unsigned long *crc,
+ const struct module *crc_owner)
+{
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
+ versions = (void *) sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ if (versions[i].crc == maybe_relocated(*crc, crc_owner))
+ return 1;
+ pr_debug("Found checksum %lX vs module %lX\n",
+ maybe_relocated(*crc, crc_owner), versions[i].crc);
+ goto bad_version;
+ }
+
+ pr_warn("%s: no symbol version for %s\n", mod->name, symname);
+ return 0;
+
+bad_version:
+ pr_warn("%s: disagrees about version of symbol %s\n",
+ mod->name, symname);
+ return 0;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ struct module *mod)
+{
+ const unsigned long *crc;
+
+ /* Since this should be found in kernel (which can't be removed),
+ * no locking is necessary. */
+ if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
+ &crc, true, false))
+ BUG();
+ return check_version(sechdrs, versindex,
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc,
+ NULL);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+static inline int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
+ return strcmp(amagic, bmagic) == 0;
+}
+#else
+static inline int check_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ const char *symname,
+ struct module *mod,
+ const unsigned long *crc,
+ const struct module *crc_owner)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+ unsigned int versindex,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
+
+/* Resolve a symbol for this module. I.e. if we find one, record usage. */
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+ const struct load_info *info,
+ const char *name,
+ char ownername[])
+{
+ struct module *owner;
+ const struct kernel_symbol *sym;
+ const unsigned long *crc;
+ int err;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ sym = find_symbol(name, &owner, &crc,
+ !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
+ if (!sym)
+ goto unlock;
+
+ if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
+ owner)) {
+ sym = ERR_PTR(-EINVAL);
+ goto getname;
+ }
+
+ err = ref_module(mod, owner);
+ if (err) {
+ sym = ERR_PTR(err);
+ goto getname;
+ }
+
+getname:
+ /* We must make copy under the lock if we failed to get ref. */
+ strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
+unlock:
+ mutex_unlock(&module_mutex);
+ return sym;
+}
+
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+ const struct load_info *info,
+ const char *name)
+{
+ const struct kernel_symbol *ksym;
+ char owner[MODULE_NAME_LEN];
+
+ if (wait_event_interruptible_timeout(module_wq,
+ !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+ || PTR_ERR(ksym) != -EBUSY,
+ 30 * HZ) <= 0) {
+ pr_warn("%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
+ }
+ return ksym;
+}
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_SYSFS
+
+#ifdef CONFIG_KALLSYMS
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+ return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+
+struct module_sect_attr {
+ struct module_attribute mattr;
+ char *name;
+ unsigned long address;
+};
+
+struct module_sect_attrs {
+ struct attribute_group grp;
+ unsigned int nsections;
+ struct module_sect_attr attrs[0];
+};
+
+static ssize_t module_sect_show(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buf)
+{
+ struct module_sect_attr *sattr =
+ container_of(mattr, struct module_sect_attr, mattr);
+ return sprintf(buf, "0x%pK\n", (void *)sattr->address);
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ unsigned int section;
+
+ for (section = 0; section < sect_attrs->nsections; section++)
+ kfree(sect_attrs->attrs[section].name);
+ kfree(sect_attrs);
+}
+
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int nloaded = 0, i, size[2];
+ struct module_sect_attrs *sect_attrs;
+ struct module_sect_attr *sattr;
+ struct attribute **gattr;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
+ nloaded++;
+ size[0] = ALIGN(sizeof(*sect_attrs)
+ + nloaded * sizeof(sect_attrs->attrs[0]),
+ sizeof(sect_attrs->grp.attrs[0]));
+ size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+ sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ if (sect_attrs == NULL)
+ return;
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+
+ sect_attrs->nsections = 0;
+ sattr = &sect_attrs->attrs[0];
+ gattr = &sect_attrs->grp.attrs[0];
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+ if (sect_empty(sec))
+ continue;
+ sattr->address = sec->sh_addr;
+ sattr->name = kstrdup(info->secstrings + sec->sh_name,
+ GFP_KERNEL);
+ if (sattr->name == NULL)
+ goto out;
+ sect_attrs->nsections++;
+ sysfs_attr_init(&sattr->mattr.attr);
+ sattr->mattr.show = module_sect_show;
+ sattr->mattr.store = NULL;
+ sattr->mattr.attr.name = sattr->name;
+ sattr->mattr.attr.mode = S_IRUGO;
+ *(gattr++) = &(sattr++)->mattr.attr;
+ }
+ *gattr = NULL;
+
+ if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return;
+ out:
+ free_sect_attrs(sect_attrs);
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /* We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately. */
+ free_sect_attrs(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+ struct kobject *dir;
+ unsigned int notes;
+ struct bin_attribute attrs[0];
+};
+
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ /*
+ * The caller checked the pos and count against our size.
+ */
+ memcpy(buf, bin_attr->private + pos, count);
+ return count;
+}
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+ unsigned int i)
+{
+ if (notes_attrs->dir) {
+ while (i-- > 0)
+ sysfs_remove_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]);
+ kobject_put(notes_attrs->dir);
+ }
+ kfree(notes_attrs);
+}
+
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int notes, loaded, i;
+ struct module_notes_attrs *notes_attrs;
+ struct bin_attribute *nattr;
+
+ /* failed to create section attributes, so can't create notes */
+ if (!mod->sect_attrs)
+ return;
+
+ /* Count notes sections and allocate structures. */
+ notes = 0;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ (info->sechdrs[i].sh_type == SHT_NOTE))
+ ++notes;
+
+ if (notes == 0)
+ return;
+
+ notes_attrs = kzalloc(sizeof(*notes_attrs)
+ + notes * sizeof(notes_attrs->attrs[0]),
+ GFP_KERNEL);
+ if (notes_attrs == NULL)
+ return;
+
+ notes_attrs->notes = notes;
+ nattr = &notes_attrs->attrs[0];
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
+ continue;
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
+ sysfs_bin_attr_init(nattr);
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
+ nattr->attr.mode = S_IRUGO;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *) info->sechdrs[i].sh_addr;
+ nattr->read = module_notes_read;
+ ++nattr;
+ }
+ ++loaded;
+ }
+
+ notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+ if (!notes_attrs->dir)
+ goto out;
+
+ for (i = 0; i < notes; ++i)
+ if (sysfs_create_bin_file(notes_attrs->dir,
+ &notes_attrs->attrs[i]))
+ goto out;
+
+ mod->notes_attrs = notes_attrs;
+ return;
+
+ out:
+ free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+ if (mod->notes_attrs)
+ free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else
+
+static inline void add_sect_attrs(struct module *mod,
+ const struct load_info *info)
+{
+}
+
+static inline void remove_sect_attrs(struct module *mod)
+{
+}
+
+static inline void add_notes_attrs(struct module *mod,
+ const struct load_info *info)
+{
+}
+
+static inline void remove_notes_attrs(struct module *mod)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+static void add_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+ int nowarn;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ nowarn = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ }
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ struct module_attribute *temp_attr;
+ int error = 0;
+ int i;
+
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (ARRAY_SIZE(modinfo_attrs) + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
+ for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+ if (!attr->test ||
+ (attr->test && attr->test(mod))) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ sysfs_attr_init(&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
+ ++temp_attr;
+ }
+ }
+ return error;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+ if (attr->free)
+ attr->free(mod);
+ }
+ kfree(mod->modinfo_attrs);
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+ int err;
+ struct kobject *kobj;
+
+ if (!module_sysfs_initialized) {
+ pr_err("%s: module sysfs not initialized\n", mod->name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kobj = kset_find_obj(module_kset, mod->name);
+ if (kobj) {
+ pr_err("%s: module is already loaded\n", mod->name);
+ kobject_put(kobj);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mod->mkobj.mod = mod;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ mod->mkobj.kobj.kset = module_kset;
+ err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ "%s", mod->name);
+ if (err)
+ mod_kobject_put(mod);
+
+ /* delay uevent until full sysfs population */
+out:
+ return err;
+}
+
+static int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
+ mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
+ goto out_unreg;
+ }
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg_holders;
+
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg_param;
+
+ add_usage_links(mod);
+ add_sect_attrs(mod, info);
+ add_notes_attrs(mod, info);
+
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+ return 0;
+
+out_unreg_param:
+ module_param_sysfs_remove(mod);
+out_unreg_holders:
+ kobject_put(mod->holders_dir);
+out_unreg:
+ mod_kobject_put(mod);
+out:
+ return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
+ mod_kobject_put(mod);
+}
+
+#else /* !CONFIG_SYSFS */
+
+static int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+}
+
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+}
+
+static void del_usage_links(struct module *mod)
+{
+}
+
+#endif /* CONFIG_SYSFS */
+
+static void mod_sysfs_teardown(struct module *mod)
+{
+ del_usage_links(mod);
+ module_remove_modinfo_attrs(mod);
+ module_param_sysfs_remove(mod);
+ kobject_put(mod->mkobj.drivers_dir);
+ kobject_put(mod->holders_dir);
+ mod_sysfs_fini(mod);
+}
+
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ */
+void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+{
+ unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
+ unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+
+ if (end_pfn > begin_pfn)
+ set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+}
+
+static void set_section_ro_nx(void *base,
+ unsigned long text_size,
+ unsigned long ro_size,
+ unsigned long total_size)
+{
+ /* begin and end PFNs of the current subsection */
+ unsigned long begin_pfn;
+ unsigned long end_pfn;
+
+ /*
+ * Set RO for module text and RO-data:
+ * - Always protect first page.
+ * - Do not protect last partial page.
+ */
+ if (ro_size > 0)
+ set_page_attributes(base, base + ro_size, set_memory_ro);
+
+ /*
+ * Set NX permissions for module data:
+ * - Do not protect first partial page.
+ * - Always protect last page.
+ */
+ if (total_size > text_size) {
+ begin_pfn = PFN_UP((unsigned long)base + text_size);
+ end_pfn = PFN_UP((unsigned long)base + total_size);
+ if (end_pfn > begin_pfn)
+ set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+ }
+}
+
+static void unset_module_core_ro_nx(struct module *mod)
+{
+ set_page_attributes(mod->module_core + mod->core_text_size,
+ mod->module_core + mod->core_size,
+ set_memory_x);
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_ro_size,
+ set_memory_rw);
+}
+
+static void unset_module_init_ro_nx(struct module *mod)
+{
+ set_page_attributes(mod->module_init + mod->init_text_size,
+ mod->module_init + mod->init_size,
+ set_memory_x);
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_ro_size,
+ set_memory_rw);
+}
+
+/* Iterate through all modules and set each module's text as RW */
+void set_all_modules_text_rw(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if ((mod->module_core) && (mod->core_text_size)) {
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_text_size,
+ set_memory_rw);
+ }
+ if ((mod->module_init) && (mod->init_text_size)) {
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_text_size,
+ set_memory_rw);
+ }
+ }
+ mutex_unlock(&module_mutex);
+}
+
+/* Iterate through all modules and set each module's text as RO */
+void set_all_modules_text_ro(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if ((mod->module_core) && (mod->core_text_size)) {
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_text_size,
+ set_memory_ro);
+ }
+ if ((mod->module_init) && (mod->init_text_size)) {
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_text_size,
+ set_memory_ro);
+ }
+ }
+ mutex_unlock(&module_mutex);
+}
+#else
+static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
+static void unset_module_core_ro_nx(struct module *mod) { }
+static void unset_module_init_ro_nx(struct module *mod) { }
+#endif
+
+void __weak module_memfree(void *module_region)
+{
+ vfree(module_region);
+}
+
+void __weak module_arch_cleanup(struct module *mod)
+{
+}
+
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
+/* Free a module, remove from lists, etc. */
+static void free_module(struct module *mod)
+{
+ trace_module_free(mod);
+
+ mod_sysfs_teardown(mod);
+
+ /* We leave it in list to prevent duplicate loads, but make sure
+ * that noone uses it while it's being deconstructed. */
+ mutex_lock(&module_mutex);
+ mod->state = MODULE_STATE_UNFORMED;
+ mutex_unlock(&module_mutex);
+
+ /* Remove dynamic debug info */
+ ddebug_remove_module(mod->name);
+
+ /* Arch-specific cleanup. */
+ module_arch_cleanup(mod);
+
+ /* Module unload stuff */
+ module_unload_free(mod);
+
+ /* Free any allocated parameters. */
+ destroy_params(mod->kp, mod->num_kp);
+
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
+ mutex_unlock(&module_mutex);
+
+ /* This may be NULL, but that's OK */
+ unset_module_init_ro_nx(mod);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
+ kfree(mod->args);
+ percpu_modfree(mod);
+
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
+ /* Finally, free the core (containing the module structure) */
+ unset_module_core_ro_nx(mod);
+ module_memfree(mod->module_core);
+
+#ifdef CONFIG_MPU
+ update_protections(current->mm);
+#endif
+}
+
+void *__symbol_get(const char *symbol)
+{
+ struct module *owner;
+ const struct kernel_symbol *sym;
+
+ preempt_disable();
+ sym = find_symbol(symbol, &owner, NULL, true, true);
+ if (sym && strong_try_module_get(owner))
+ sym = NULL;
+ preempt_enable();
+
+ return sym ? (void *)sym->value : NULL;
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/*
+ * Ensure that an exported symbol [global namespace] does not already exist
+ * in the kernel or in some other module's exported symbol table.
+ *
+ * You must hold the module_mutex.
+ */
+static int verify_export_symbols(struct module *mod)
+{
+ unsigned int i;
+ struct module *owner;
+ const struct kernel_symbol *s;
+ struct {
+ const struct kernel_symbol *sym;
+ unsigned int num;
+ } arr[] = {
+ { mod->syms, mod->num_syms },
+ { mod->gpl_syms, mod->num_gpl_syms },
+ { mod->gpl_future_syms, mod->num_gpl_future_syms },
+#ifdef CONFIG_UNUSED_SYMBOLS
+ { mod->unused_syms, mod->num_unused_syms },
+ { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
+#endif
+ };
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
+ if (find_symbol(s->name, &owner, NULL, true, false)) {
+ pr_err("%s: exports duplicate symbol %s"
+ " (owned by %s)\n",
+ mod->name, s->name, module_name(owner));
+ return -ENOEXEC;
+ }
+ }
+ }
+ return 0;
+}
+
+/* Change all symbols so that st_value encodes the pointer directly. */
+static int simplify_symbols(struct module *mod, const struct load_info *info)
+{
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ Elf_Sym *sym = (void *)symsec->sh_addr;
+ unsigned long secbase;
+ unsigned int i;
+ int ret = 0;
+ const struct kernel_symbol *ksym;
+
+ for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ const char *name = info->strtab + sym[i].st_name;
+
+ switch (sym[i].st_shndx) {
+ case SHN_COMMON:
+ /* Ignore common symbols */
+ if (!strncmp(name, "__gnu_lto", 9))
+ break;
+
+ /* We compiled with -fno-common. These are not
+ supposed to happen. */
+ pr_debug("Common symbol: %s\n", name);
+ pr_warn("%s: please compile with -fno-common\n",
+ mod->name);
+ ret = -ENOEXEC;
+ break;
+
+ case SHN_ABS:
+ /* Don't need to do anything */
+ pr_debug("Absolute symbol: 0x%08lx\n",
+ (long)sym[i].st_value);
+ break;
+
+ case SHN_UNDEF:
+ ksym = resolve_symbol_wait(mod, info, name);
+ /* Ok if resolved. */
+ if (ksym && !IS_ERR(ksym)) {
+ sym[i].st_value = ksym->value;
+ break;
+ }
+
+ /* Ok if weak. */
+ if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ break;
+
+ pr_warn("%s: Unknown symbol %s (err %li)\n",
+ mod->name, name, PTR_ERR(ksym));
+ ret = PTR_ERR(ksym) ?: -ENOENT;
+ break;
+
+ default:
+ /* Divert to percpu allocation if a percpu var. */
+ if (sym[i].st_shndx == info->index.pcpu)
+ secbase = (unsigned long)mod_percpu(mod);
+ else
+ secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
+ sym[i].st_value += secbase;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int apply_relocations(struct module *mod, const struct load_info *info)
+{
+ unsigned int i;
+ int err = 0;
+
+ /* Now do relocations. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ unsigned int infosec = info->sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (infosec >= info->hdr->e_shnum)
+ continue;
+
+ /* Don't bother with non-allocated sections */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (info->sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ else if (info->sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+/* Additional bytes needed by arch in front of individual sections */
+unsigned int __weak arch_mod_section_prepend(struct module *mod,
+ unsigned int section)
+{
+ /* default implementation just returns zero */
+ return 0;
+}
+
+/* Update size with this section: return offset. */
+static long get_offset(struct module *mod, unsigned int *size,
+ Elf_Shdr *sechdr, unsigned int section)
+{
+ long ret;
+
+ *size += arch_mod_section_prepend(mod, section);
+ ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
+ *size = ret + sechdr->sh_size;
+ return ret;
+}
+
+/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ might -- code, read-only data, read-write data, small data. Tally
+ sizes, and place the offsets into sh_entsize fields: high bit means it
+ belongs in init. */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+ static unsigned long const masks[][2] = {
+ /* NOTE: all executable code must be the first section
+ * in this array; otherwise modify the text_size
+ * finder in the two loops below */
+ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+ { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+ };
+ unsigned int m, i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
+
+ pr_debug("Core section allocation order:\n");
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || strstarts(sname, ".init"))
+ continue;
+ s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
+ pr_debug("\t%s\n", sname);
+ }
+ switch (m) {
+ case 0: /* executable */
+ mod->core_size = debug_align(mod->core_size);
+ mod->core_text_size = mod->core_size;
+ break;
+ case 1: /* RO: text and ro-data */
+ mod->core_size = debug_align(mod->core_size);
+ mod->core_ro_size = mod->core_size;
+ break;
+ case 3: /* whole core */
+ mod->core_size = debug_align(mod->core_size);
+ break;
+ }
+ }
+
+ pr_debug("Init section allocation order:\n");
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || !strstarts(sname, ".init"))
+ continue;
+ s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
+ | INIT_OFFSET_MASK);
+ pr_debug("\t%s\n", sname);
+ }
+ switch (m) {
+ case 0: /* executable */
+ mod->init_size = debug_align(mod->init_size);
+ mod->init_text_size = mod->init_size;
+ break;
+ case 1: /* RO: text and ro-data */
+ mod->init_size = debug_align(mod->init_size);
+ mod->init_ro_size = mod->init_size;
+ break;
+ case 3: /* whole init */
+ mod->init_size = debug_align(mod->init_size);
+ break;
+ }
+ }
+}
+
+static void set_license(struct module *mod, const char *license)
+{
+ if (!license)
+ license = "unspecified";
+
+ if (!license_is_gpl_compatible(license)) {
+ if (!test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license '%s' taints kernel.\n",
+ mod->name, license);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+ }
+}
+
+/* Parse tag=value strings from .modinfo section */
+static char *next_string(char *string, unsigned long *secsize)
+{
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
+}
+
+static char *get_modinfo(struct load_info *info, const char *tag)
+{
+ char *p;
+ unsigned int taglen = strlen(tag);
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
+
+ for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
+}
+
+static void setup_modinfo(struct module *mod, struct load_info *info)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->setup)
+ attr->setup(mod, get_modinfo(info, attr->attr.name));
+ }
+}
+
+static void free_modinfo(struct module *mod)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->free)
+ attr->free(mod);
+ }
+}
+
+#ifdef CONFIG_KALLSYMS
+
+/* lookup symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ return bsearch(name, start, stop - start,
+ sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+ const struct module *mod)
+{
+ const struct kernel_symbol *ks;
+ if (!mod)
+ ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
+ else
+ ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+ return ks != NULL && ks->value == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
+ && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
+ return 'n';
+ }
+ return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+ unsigned int shnum)
+{
+ const Elf_Shdr *sec;
+
+ if (src->st_shndx == SHN_UNDEF
+ || src->st_shndx >= shnum
+ || !src->st_name)
+ return false;
+
+ sec = sechdrs + src->st_shndx;
+ if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+ || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+ || (sec->sh_entsize & INIT_OFFSET_MASK))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+static void layout_symtab(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
+ const Elf_Sym *src;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
+
+ /* Put symbol section at end of init part of module. */
+ symsect->sh_flags |= SHF_ALLOC;
+ symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
+ info->index.sym) | INIT_OFFSET_MASK;
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+ src = (void *)info->hdr + symsect->sh_offset;
+ nsrc = symsect->sh_size / sizeof(*src);
+
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = 0; i < nsrc; i++) {
+ if (i == 0 ||
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ strtab_size += strlen(&info->strtab[src[i].st_name])+1;
+ ndst++;
+ }
+ }
+
+ /* Append room for core symbols at end of core part. */
+ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->core_size += strtab_size;
+ mod->core_size = debug_align(mod->core_size);
+
+ /* Put string table section at end of init part of module. */
+ strsect->sh_flags |= SHF_ALLOC;
+ strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
+ info->index.str) | INIT_OFFSET_MASK;
+ mod->init_size = debug_align(mod->init_size);
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+}
+
+static void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+ unsigned int i, ndst;
+ const Elf_Sym *src;
+ Elf_Sym *dst;
+ char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+
+ mod->symtab = (void *)symsec->sh_addr;
+ mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+
+ /* Set types up while we still have access to sections. */
+ for (i = 0; i < mod->num_symtab; i++)
+ mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
+
+ mod->core_symtab = dst = mod->module_core + info->symoffs;
+ mod->core_strtab = s = mod->module_core + info->stroffs;
+ src = mod->symtab;
+ for (ndst = i = 0; i < mod->num_symtab; i++) {
+ if (i == 0 ||
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ dst[ndst] = src[i];
+ dst[ndst++].st_name = s - mod->core_strtab;
+ s += strlcpy(s, &mod->strtab[src[i].st_name],
+ KSYM_NAME_LEN) + 1;
+ }
+ }
+ mod->core_num_syms = ndst;
+}
+#else
+static inline void layout_symtab(struct module *mod, struct load_info *info)
+{
+}
+
+static void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
+{
+ if (!debug)
+ return;
+#ifdef CONFIG_DYNAMIC_DEBUG
+ if (ddebug_add_module(debug, num, debug->modname))
+ pr_err("dynamic debug error adding module: %s\n",
+ debug->modname);
+#endif
+}
+
+static void dynamic_debug_remove(struct _ddebug *debug)
+{
+ if (debug)
+ ddebug_remove_module(debug->modname);
+}
+
+void * __weak module_alloc(unsigned long size)
+{
+ return vmalloc_exec(size);
+}
+
+static void *module_alloc_update_bounds(unsigned long size)
+{
+ void *ret = module_alloc(size);
+
+ if (ret) {
+ mutex_lock(&module_mutex);
+ /* Update module bounds. */
+ if ((unsigned long)ret < module_addr_min)
+ module_addr_min = (unsigned long)ret;
+ if ((unsigned long)ret + size > module_addr_max)
+ module_addr_max = (unsigned long)ret + size;
+ mutex_unlock(&module_mutex);
+ }
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+static void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ unsigned int i;
+
+ /* only scan the sections containing data */
+ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ /* Scan all writable sections that's not executable */
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+ !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+ (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+ info->sechdrs[i].sh_size, GFP_KERNEL);
+ }
+}
+#else
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+}
+#endif
+
+#ifdef CONFIG_MODULE_SIG
+static int module_sig_check(struct load_info *info)
+{
+ int err = -ENOKEY;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const void *mod = info->hdr;
+
+ if (info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, &info->len);
+ }
+
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+
+ /* Not having a signature is only an error if we're strict. */
+ if (err == -ENOKEY && !sig_enforce)
+ err = 0;
+
+ return err;
+}
+#else /* !CONFIG_MODULE_SIG */
+static int module_sig_check(struct load_info *info)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
+static int elf_header_check(struct load_info *info)
+{
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
+ || info->hdr->e_type != ET_REL
+ || !elf_check_arch(info->hdr)
+ || info->hdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (info->hdr->e_shoff >= info->len
+ || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ info->len - info->hdr->e_shoff))
+ return -ENOEXEC;
+
+ return 0;
+}
+
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
+{
+ int err;
+
+ info->len = len;
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ err = security_kernel_module_from_file(NULL);
+ if (err)
+ return err;
+
+ /* Suck in entire file: we'll want most of it. */
+ info->hdr = __vmalloc(info->len,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
+ if (!info->hdr)
+ return -ENOMEM;
+
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
+ vfree(info->hdr);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_fd(int fd, struct load_info *info)
+{
+ struct fd f = fdget(fd);
+ int err;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
+
+ if (!f.file)
+ return -ENOEXEC;
+
+ err = security_kernel_module_from_file(f.file);
+ if (err)
+ goto out;
+
+ err = vfs_getattr(&f.file->f_path, &stat);
+ if (err)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ info->hdr = vmalloc(stat.size);
+ if (!info->hdr) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(info->hdr);
+ err = bytes;
+ goto out;
+ }
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+ info->len = pos;
+
+out:
+ fdput(f);
+ return err;
+}
+
+static void free_copy(struct load_info *info)
+{
+ vfree(info->hdr);
+}
+
+static int rewrite_section_headers(struct load_info *info, int flags)
+{
+ unsigned int i;
+
+ /* This should always be true, but let's be sure. */
+ info->sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ if (shdr->sh_type != SHT_NOBITS
+ && info->len < shdr->sh_offset + shdr->sh_size) {
+ pr_err("Module len %lu truncated\n", info->len);
+ return -ENOEXEC;
+ }
+
+ /* Mark all sections sh_addr with their address in the
+ temporary image. */
+ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
+
+#ifndef CONFIG_MODULE_UNLOAD
+ /* Don't load .exit sections */
+ if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
+ shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
+#endif
+ }
+
+ /* Track but don't keep modinfo and version sections. */
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
+ info->index.info = find_sec(info, ".modinfo");
+ info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ return 0;
+}
+
+/*
+ * Set up our basic convenience variables (pointers to section headers,
+ * search for module section index etc), and do some basic section
+ * verification.
+ *
+ * Return the temporary module pointer (we'll replace it with the final
+ * one when we move the module sections around).
+ */
+static struct module *setup_load_info(struct load_info *info, int flags)
+{
+ unsigned int i;
+ int err;
+ struct module *mod;
+
+ /* Set up the convenience variables */
+ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+ info->secstrings = (void *)info->hdr
+ + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
+
+ err = rewrite_section_headers(info, flags);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Find internal symbols and strings. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
+ info->index.sym = i;
+ info->index.str = info->sechdrs[i].sh_link;
+ info->strtab = (char *)info->hdr
+ + info->sechdrs[info->index.str].sh_offset;
+ break;
+ }
+ }
+
+ info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
+ if (!info->index.mod) {
+ pr_warn("No module found in object\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+ /* This is temporary: point mod into copy of data. */
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+
+ if (info->index.sym == 0) {
+ pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ info->index.pcpu = find_pcpusec(info);
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
+ return ERR_PTR(-ENOEXEC);
+
+ return mod;
+}
+
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+{
+ const char *modmagic = get_modinfo(info, "vermagic");
+ int err;
+
+ if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ modmagic = NULL;
+
+ /* This is allowed: modprobe --force will invalidate it. */
+ if (!modmagic) {
+ err = try_to_force_load(mod, "bad vermagic");
+ if (err)
+ return err;
+ } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
+ pr_err("%s: version magic '%s' should be '%s'\n",
+ mod->name, modmagic, vermagic);
+ return -ENOEXEC;
+ }
+
+ if (!get_modinfo(info, "intree"))
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+
+ if (get_modinfo(info, "staging")) {
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
+ }
+
+ /* Set up license info based on the info section */
+ set_license(mod, get_modinfo(info, "license"));
+
+ return 0;
+}
+
+static int find_module_sections(struct module *mod, struct load_info *info)
+{
+ mod->kp = section_objs(info, "__param",
+ sizeof(*mod->kp), &mod->num_kp);
+ mod->syms = section_objs(info, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(info, "__kcrctab");
+ mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+ mod->gpl_future_syms = section_objs(info,
+ "__ksymtab_gpl_future",
+ sizeof(*mod->gpl_future_syms),
+ &mod->num_gpl_future_syms);
+ mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
+
+#ifdef CONFIG_UNUSED_SYMBOLS
+ mod->unused_syms = section_objs(info, "__ksymtab_unused",
+ sizeof(*mod->unused_syms),
+ &mod->num_unused_syms);
+ mod->unused_crcs = section_addr(info, "__kcrctab_unused");
+ mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
+ sizeof(*mod->unused_gpl_syms),
+ &mod->num_unused_gpl_syms);
+ mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
+#endif
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(info, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ if (!mod->ctors)
+ mod->ctors = section_objs(info, ".init_array",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ else if (find_sec(info, ".init_array")) {
+ /*
+ * This shouldn't happen with same compiler and binutils
+ * building all parts of the module.
+ */
+ pr_warn("%s: has both .ctors and .init_array.\n",
+ mod->name);
+ return -EINVAL;
+ }
+#endif
+
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+ sizeof(*mod->tracepoints_ptrs),
+ &mod->num_tracepoints);
+#endif
+#ifdef HAVE_JUMP_LABEL
+ mod->jump_entries = section_objs(info, "__jump_table",
+ sizeof(*mod->jump_entries),
+ &mod->num_jump_entries);
+#endif
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(info, "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+ mod->trace_enums = section_objs(info, "_ftrace_enum_map",
+ sizeof(*mod->trace_enums),
+ &mod->num_trace_enums);
+#endif
+#ifdef CONFIG_TRACING
+ mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
+ sizeof(*mod->trace_bprintk_fmt_start),
+ &mod->num_trace_bprintk_fmt);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(info, "__mcount_loc",
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
+
+ mod->extable = section_objs(info, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+
+ if (section_addr(info, "__obsparm"))
+ pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
+
+ info->debug = section_objs(info, "__verbose",
+ sizeof(*info->debug), &info->num_debug);
+
+ return 0;
+}
+
+static int move_module(struct module *mod, struct load_info *info)
+{
+ int i;
+ void *ptr;
+
+ /* Do the allocs. */
+ ptr = module_alloc_update_bounds(mod->core_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. Just mark it as not being a
+ * leak.
+ */
+ kmemleak_not_leak(ptr);
+ if (!ptr)
+ return -ENOMEM;
+
+ memset(ptr, 0, mod->core_size);
+ mod->module_core = ptr;
+
+ if (mod->init_size) {
+ ptr = module_alloc_update_bounds(mod->init_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. This block doesn't need to be
+ * scanned as it contains data and code that will be freed
+ * after the module is initialized.
+ */
+ kmemleak_ignore(ptr);
+ if (!ptr) {
+ module_memfree(mod->module_core);
+ return -ENOMEM;
+ }
+ memset(ptr, 0, mod->init_size);
+ mod->module_init = ptr;
+ } else
+ mod->module_init = NULL;
+
+ /* Transfer each section which specifies SHF_ALLOC */
+ pr_debug("final section addresses:\n");
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ void *dest;
+ Elf_Shdr *shdr = &info->sechdrs[i];
+
+ if (!(shdr->sh_flags & SHF_ALLOC))
+ continue;
+
+ if (shdr->sh_entsize & INIT_OFFSET_MASK)
+ dest = mod->module_init
+ + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
+ else
+ dest = mod->module_core + shdr->sh_entsize;
+
+ if (shdr->sh_type != SHT_NOBITS)
+ memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+ /* Update sh_addr to point to copy in image. */
+ shdr->sh_addr = (unsigned long)dest;
+ pr_debug("\t0x%lx %s\n",
+ (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
+ }
+
+ return 0;
+}
+
+static int check_module_license_and_versions(struct module *mod)
+{
+ /*
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
+ */
+ if (strcmp(mod->name, "ndiswrapper") == 0)
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
+
+ /* driverloader was caught wrongly pretending to be under GPL */
+ if (strcmp(mod->name, "driverloader") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+
+#ifdef CONFIG_MODVERSIONS
+ if ((mod->num_syms && !mod->crcs)
+ || (mod->num_gpl_syms && !mod->gpl_crcs)
+ || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
+#ifdef CONFIG_UNUSED_SYMBOLS
+ || (mod->num_unused_syms && !mod->unused_crcs)
+ || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
+#endif
+ ) {
+ return try_to_force_load(mod,
+ "no versions for exported symbols");
+ }
+#endif
+ return 0;
+}
+
+static void flush_module_icache(const struct module *mod)
+{
+ mm_segment_t old_fs;
+
+ /* flush the icache in correct context */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ /*
+ * Flush the instruction cache, since we've played with text.
+ * Do it before processing of module parameters, so the module
+ * can provide parameter accessor functions of its own.
+ */
+ if (mod->module_init)
+ flush_icache_range((unsigned long)mod->module_init,
+ (unsigned long)mod->module_init
+ + mod->init_size);
+ flush_icache_range((unsigned long)mod->module_core,
+ (unsigned long)mod->module_core + mod->core_size);
+
+ set_fs(old_fs);
+}
+
+int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ char *secstrings,
+ struct module *mod)
+{
+ return 0;
+}
+
+static struct module *layout_and_allocate(struct load_info *info, int flags)
+{
+ /* Module within temporary copy. */
+ struct module *mod;
+ int err;
+
+ mod = setup_load_info(info, flags);
+ if (IS_ERR(mod))
+ return mod;
+
+ err = check_modinfo(mod, info, flags);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(info->hdr, info->sechdrs,
+ info->secstrings, mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* We will do a special allocation for per-cpu sections later. */
+ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+ /* Determine total sizes, and put offsets in sh_entsize. For now
+ this is done generically; there doesn't appear to be any
+ special cases for the architectures. */
+ layout_sections(mod, info);
+ layout_symtab(mod, info);
+
+ /* Allocate and move to the final place */
+ err = move_module(mod, info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Module has been copied to its final place now: return it. */
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ kmemleak_load_module(mod, info);
+ return mod;
+}
+
+/* mod is no longer valid after this! */
+static void module_deallocate(struct module *mod, struct load_info *info)
+{
+ percpu_modfree(mod);
+ module_arch_freeing_init(mod);
+ module_memfree(mod->module_init);
+ module_memfree(mod->module_core);
+}
+
+int __weak module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ return 0;
+}
+
+static int post_relocation(struct module *mod, const struct load_info *info)
+{
+ /* Sort exception table now relocations are done. */
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+ /* Copy relocated percpu area over. */
+ percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+ info->sechdrs[info->index.pcpu].sh_size);
+
+ /* Setup kallsyms-specific fields. */
+ add_kallsyms(mod, info);
+
+ /* Arch-specific module finalizing. */
+ return module_finalize(info->hdr, info->sechdrs, mod);
+}
+
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ mod = find_module_all(name, strlen(name), true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
+/* For freeing module_init on success, in case kallsyms traversing */
+struct mod_initfree {
+ struct rcu_head rcu;
+ void *module_init;
+};
+
+static void do_free_init(struct rcu_head *head)
+{
+ struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
+ module_memfree(m->module_init);
+ kfree(m);
+}
+
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
+{
+ int ret = 0;
+ struct mod_initfree *freeinit;
+
+ freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ if (!freeinit) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ freeinit->module_init = mod->module_init;
+
+ /*
+ * We want to find out whether @mod uses async during init. Clear
+ * PF_USED_ASYNC. async_schedule*() will set it.
+ */
+ current->flags &= ~PF_USED_ASYNC;
+
+ do_mod_ctors(mod);
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ goto fail_free_freeinit;
+ }
+ if (ret > 0) {
+ pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+ "follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
+ __func__, mod->name, ret, __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! */
+ mod->state = MODULE_STATE_LIVE;
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_LIVE, mod);
+
+ /*
+ * We need to finish all async code before the module init sequence
+ * is done. This has potential to deadlock. For example, a newly
+ * detected block device can trigger request_module() of the
+ * default iosched from async probing task. Once userland helper
+ * reaches here, async_synchronize_full() will wait on the async
+ * task waiting on request_module() and deadlock.
+ *
+ * This deadlock is avoided by perfomring async_synchronize_full()
+ * iff module init queued any async jobs. This isn't a full
+ * solution as it will deadlock the same if module loading from
+ * async jobs nests more than once; however, due to the various
+ * constraints, this hack seems to be the best option for now.
+ * Please refer to the following thread for details.
+ *
+ * http://thread.gmane.org/gmane.linux.kernel/1420814
+ */
+ if (current->flags & PF_USED_ASYNC)
+ async_synchronize_full();
+
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+ mod->num_symtab = mod->core_num_syms;
+ mod->symtab = mod->core_symtab;
+ mod->strtab = mod->core_strtab;
+#endif
+ unset_module_init_ro_nx(mod);
+ module_arch_freeing_init(mod);
+ mod->module_init = NULL;
+ mod->init_size = 0;
+ mod->init_ro_size = 0;
+ mod->init_text_size = 0;
+ /*
+ * We want to free module_init, but be aware that kallsyms may be
+ * walking this with preempt disabled. In all the failure paths,
+ * we call synchronize_rcu/synchronize_sched, but we don't want
+ * to slow down the success path, so use actual RCU here.
+ */
+ call_rcu(&freeinit->rcu, do_free_init);
+ mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
+
+ return 0;
+
+fail_free_freeinit:
+ kfree(freeinit);
+fail:
+ /* Try to protect us from buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+ return ret;
+}
+
+static int may_init_module(void)
+{
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ return 0;
+}
+
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources. In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+ struct module *old;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+again:
+ mutex_lock(&module_mutex);
+ old = find_module_all(mod->name, strlen(mod->name), true);
+ if (old != NULL) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto out_unlocked;
+ goto again;
+ }
+ err = -EEXIST;
+ goto out;
+ }
+ list_add_rcu(&mod->list, &modules);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+out_unlocked:
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* This relies on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+
+ /* Set RO and NX regions for core */
+ set_section_ro_nx(mod->module_core,
+ mod->core_text_size,
+ mod->core_ro_size,
+ mod->core_size);
+
+ /* Set RO and NX regions for init */
+ set_section_ro_nx(mod->module_init,
+ mod->init_text_size,
+ mod->init_ro_size,
+ mod->init_size);
+
+ /* Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us. */
+ mod->state = MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
+ return 0;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int unknown_module_param_cb(char *param, char *val, const char *modname)
+{
+ /* Check for magic 'dyndbg' arg */
+ int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ if (ret != 0)
+ pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+ return 0;
+}
+
+/* Allocate and load the module: note that size of section 0 is always
+ zero, and we rely on this for optional sections. */
+static int load_module(struct load_info *info, const char __user *uargs,
+ int flags)
+{
+ struct module *mod;
+ long err;
+ char *after_dashes;
+
+ err = module_sig_check(info);
+ if (err)
+ goto free_copy;
+
+ err = elf_header_check(info);
+ if (err)
+ goto free_copy;
+
+ /* Figure out module layout, and allocate all the memory. */
+ mod = layout_and_allocate(info, flags);
+ if (IS_ERR(mod)) {
+ err = PTR_ERR(mod);
+ goto free_copy;
+ }
+
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
+ goto free_module;
+
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ }
+#endif
+
+ /* To avoid stressing percpu allocator, do this once we're unique. */
+ err = percpu_modalloc(mod, info);
+ if (err)
+ goto unlink_mod;
+
+ /* Now module is in final location, initialize linked lists, etc. */
+ err = module_unload_init(mod);
+ if (err)
+ goto unlink_mod;
+
+ /* Now we've got everything in the final locations, we can
+ * find optional sections. */
+ err = find_module_sections(mod, info);
+ if (err)
+ goto free_unload;
+
+ err = check_module_license_and_versions(mod);
+ if (err)
+ goto free_unload;
+
+ /* Set up MODINFO_ATTR fields */
+ setup_modinfo(mod, info);
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = apply_relocations(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = post_relocation(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ flush_module_icache(mod);
+
+ /* Now copy in args */
+ mod->args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(mod->args)) {
+ err = PTR_ERR(mod->args);
+ goto free_arch_cleanup;
+ }
+
+ dynamic_debug_setup(info->debug, info->num_debug);
+
+ /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ ftrace_module_init(mod);
+
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
+ goto ddebug_cleanup;
+
+ /* Module is ready to execute: parsing args may do that. */
+ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, unknown_module_param_cb);
+ if (IS_ERR(after_dashes)) {
+ err = PTR_ERR(after_dashes);
+ goto bug_cleanup;
+ } else if (after_dashes) {
+ pr_warn("%s: parameters '%s' after `--' ignored\n",
+ mod->name, after_dashes);
+ }
+
+ /* Link in to syfs. */
+ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
+ if (err < 0)
+ goto bug_cleanup;
+
+ /* Get rid of temporary copy. */
+ free_copy(info);
+
+ /* Done! */
+ trace_module_load(mod);
+
+ return do_init_module(mod);
+
+ bug_cleanup:
+ /* module_bug_cleanup needs module_mutex protection */
+ mutex_lock(&module_mutex);
+ module_bug_cleanup(mod);
+ mutex_unlock(&module_mutex);
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+
+ /* we can't deallocate the module until we clear memory protection */
+ unset_module_init_ro_nx(mod);
+ unset_module_core_ro_nx(mod);
+
+ ddebug_cleanup:
+ dynamic_debug_remove(info->debug);
+ synchronize_sched();
+ kfree(mod->args);
+ free_arch_cleanup:
+ module_arch_cleanup(mod);
+ free_modinfo:
+ free_modinfo(mod);
+ free_unload:
+ module_unload_free(mod);
+ unlink_mod:
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ wake_up_all(&module_wq);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
+ mutex_unlock(&module_mutex);
+ free_module:
+ /* Free lock-classes; relies on the preceding sync_rcu() */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
+ module_deallocate(mod, info);
+ free_copy:
+ free_copy(info);
+ return err;
+}
+
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+ unsigned long, len, const char __user *, uargs)
+{
+ int err;
+ struct load_info info = { };
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+
+ err = copy_module_from_user(umod, len, &info);
+ if (err)
+ return err;
+
+ return load_module(&info, uargs, 0);
+}
+
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err;
+ struct load_info info = { };
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC))
+ return -EINVAL;
+
+ err = copy_module_from_fd(fd, &info);
+ if (err)
+ return err;
+
+ return load_module(&info, uargs, flags);
+}
+
+static inline int within(unsigned long addr, void *start, unsigned long size)
+{
+ return ((void *)addr >= start && (void *)addr < start + size);
+}
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * This ignores the intensely annoying "mapping symbols" found
+ * in ARM ELF files: $a, $t and $d.
+ */
+static inline int is_arm_mapping_symbol(const char *str)
+{
+ if (str[0] == '.' && str[1] == 'L')
+ return true;
+ return str[0] == '$' && strchr("axtd", str[1])
+ && (str[2] == '\0' || str[2] == '.');
+}
+
+static const char *get_ksymbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval;
+
+ /* At worse, next value is at end of module */
+ if (within_module_init(addr, mod))
+ nextval = (unsigned long)mod->module_init+mod->init_text_size;
+ else
+ nextval = (unsigned long)mod->module_core+mod->core_text_size;
+
+ /* Scan for closest preceding symbol, and next symbol. (ELF
+ starts real symbols at 1). */
+ for (i = 1; i < mod->num_symtab; i++) {
+ if (mod->symtab[i].st_shndx == SHN_UNDEF)
+ continue;
+
+ /* We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim. */
+ if (mod->symtab[i].st_value <= addr
+ && mod->symtab[i].st_value > mod->symtab[best].st_value
+ && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+ && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ best = i;
+ if (mod->symtab[i].st_value > addr
+ && mod->symtab[i].st_value < nextval
+ && *(mod->strtab + mod->symtab[i].st_name) != '\0'
+ && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ nextval = mod->symtab[i].st_value;
+ }
+
+ if (!best)
+ return NULL;
+
+ if (size)
+ *size = nextval - mod->symtab[best].st_value;
+ if (offset)
+ *offset = addr - mod->symtab[best].st_value;
+ return mod->strtab + mod->symtab[best].st_name;
+}
+
+/* For kallsyms to ask for address resolution. NULL means not found. Careful
+ * not to lock to avoid deadlock on oopses, simply disable preemption. */
+const char *module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname,
+ char *namebuf)
+{
+ struct module *mod;
+ const char *ret = NULL;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ if (modname)
+ *modname = mod->name;
+ ret = get_ksymbol(mod, addr, size, offset);
+ break;
+ }
+ }
+ /* Make a copy in here where it's safe */
+ if (ret) {
+ strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
+ ret = namebuf;
+ }
+ preempt_enable();
+ return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+ strlcpy(symname, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, size, offset);
+ if (!sym)
+ goto out;
+ if (modname)
+ strlcpy(modname, mod->name, MODULE_NAME_LEN);
+ if (name)
+ strlcpy(name, sym, KSYM_NAME_LEN);
+ preempt_enable();
+ return 0;
+ }
+ }
+out:
+ preempt_enable();
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
+{
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (symnum < mod->num_symtab) {
+ *value = mod->symtab[symnum].st_value;
+ *type = mod->symtab[symnum].st_info;
+ strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+ KSYM_NAME_LEN);
+ strlcpy(module_name, mod->name, MODULE_NAME_LEN);
+ *exported = is_exported(name, *value, mod);
+ preempt_enable();
+ return 0;
+ }
+ symnum -= mod->num_symtab;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+static unsigned long mod_find_symname(struct module *mod, const char *name)
+{
+ unsigned int i;
+
+ for (i = 0; i < mod->num_symtab; i++)
+ if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
+ mod->symtab[i].st_info != 'U')
+ return mod->symtab[i].st_value;
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+ unsigned long ret = 0;
+
+ /* Don't lock: we're in enough trouble already. */
+ preempt_disable();
+ if ((colon = strchr(name, ':')) != NULL) {
+ if ((mod = find_module_all(name, colon - name, false)) != NULL)
+ ret = mod_find_symname(mod, colon+1);
+ } else {
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if ((ret = mod_find_symname(mod, name)) != 0)
+ break;
+ }
+ }
+ preempt_enable();
+ return ret;
+}
+
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+ struct module *, unsigned long),
+ void *data)
+{
+ struct module *mod;
+ unsigned int i;
+ int ret;
+
+ list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ for (i = 0; i < mod->num_symtab; i++) {
+ ret = fn(data, mod->strtab + mod->symtab[i].st_name,
+ mod, mod->symtab[i].st_value);
+ if (ret != 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+#endif /* CONFIG_KALLSYMS */
+
+static char *module_flags(struct module *mod, char *buf)
+{
+ int bx = 0;
+
+ BUG_ON(mod->state == MODULE_STATE_UNFORMED);
+ if (mod->taints ||
+ mod->state == MODULE_STATE_GOING ||
+ mod->state == MODULE_STATE_COMING) {
+ buf[bx++] = '(';
+ bx += module_flags_taint(mod, buf + bx);
+ /* Show a - for module-is-being-unloaded */
+ if (mod->state == MODULE_STATE_GOING)
+ buf[bx++] = '-';
+ /* Show a + for module-is-being-loaded */
+ if (mod->state == MODULE_STATE_COMING)
+ buf[bx++] = '+';
+ buf[bx++] = ')';
+ }
+ buf[bx] = '\0';
+
+ return buf;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ char buf[8];
+
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
+ seq_printf(m, "%s %u",
+ mod->name, mod->init_size + mod->core_size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ seq_printf(m, " 0x%pK", mod->module_core);
+
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", module_flags(mod, buf));
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+/* Format: modulename size refcount deps address
+
+ Where refcount is a number or -, and deps is a comma-separated list
+ of depends or -.
+*/
+static const struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+static int modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &modules_op);
+}
+
+static const struct file_operations proc_modules_operations = {
+ .open = modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &proc_modules_operations);
+ return 0;
+}
+module_init(proc_modules_init);
+#endif
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+ const struct exception_table_entry *e = NULL;
+ struct module *mod;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (mod->num_exentries == 0)
+ continue;
+
+ e = search_extable(mod->extable,
+ mod->extable + mod->num_exentries - 1,
+ addr);
+ if (e)
+ break;
+ }
+ preempt_enable();
+
+ /* Now, if we found one, we are running inside it now, hence
+ we cannot unload the module, hence no refcnt needed. */
+ return e;
+}
+
+/*
+ * is_module_address - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
+ */
+bool is_module_address(unsigned long addr)
+{
+ bool ret;
+
+ preempt_disable();
+ ret = __module_address(addr) != NULL;
+ preempt_enable();
+
+ return ret;
+}
+
+/*
+ * __module_address - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_address(unsigned long addr)
+{
+ struct module *mod;
+
+ if (addr < module_addr_min || addr > module_addr_max)
+ return NULL;
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod))
+ return mod;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__module_address);
+
+/*
+ * is_module_text_address - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module. See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
+{
+ bool ret;
+
+ preempt_disable();
+ ret = __module_text_address(addr) != NULL;
+ preempt_enable();
+
+ return ret;
+}
+
+/*
+ * __module_text_address - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+ struct module *mod = __module_address(addr);
+ if (mod) {
+ /* Make sure it's within the text section. */
+ if (!within(addr, mod->module_init, mod->init_text_size)
+ && !within(addr, mod->module_core, mod->core_text_size))
+ mod = NULL;
+ }
+ return mod;
+}
+EXPORT_SYMBOL_GPL(__module_text_address);
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+ struct module *mod;
+ char buf[8];
+
+ printk(KERN_DEFAULT "Modules linked in:");
+ /* Most callers should already have preempt disabled, but make sure */
+ preempt_disable();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf));
+ }
+ preempt_enable();
+ if (last_unloaded_module[0])
+ pr_cont(" [last unloaded: %s]", last_unloaded_module);
+ pr_cont("\n");
+}
+
+#ifdef CONFIG_MODVERSIONS
+/* Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module. */
+void module_layout(struct module *mod,
+ struct modversion_info *ver,
+ struct kernel_param *kp,
+ struct kernel_symbol *ks,
+ struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
+#endif
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
new file mode 100644
index 000000000..be5b8fac4
--- /dev/null
+++ b/kernel/module_signing.c
@@ -0,0 +1,250 @@
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <crypto/public_key.h>
+#include <crypto/hash.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+#include "module-internal.h"
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ * - Signer's name
+ * - Key identifier
+ * - Signature data
+ * - Information block
+ */
+struct module_signature {
+ u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
+ u8 hash; /* Digest algorithm [enum hash_algo] */
+ u8 id_type; /* Key identifier type [enum pkey_id_type] */
+ u8 signer_len; /* Length of signer's name */
+ u8 key_id_len; /* Length of key identifier */
+ u8 __pad[3];
+ __be32 sig_len; /* Length of signature data */
+};
+
+/*
+ * Digest the module contents.
+ */
+static struct public_key_signature *mod_make_digest(enum hash_algo hash,
+ const void *mod,
+ unsigned long modlen)
+{
+ struct public_key_signature *pks;
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ size_t digest_size, desc_size;
+ int ret;
+
+ pr_devel("==>%s()\n", __func__);
+
+ /* Allocate the hashing algorithm we're going to need and find out how
+ * big the hash operational data will be.
+ */
+ tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
+ if (IS_ERR(tfm))
+ return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ digest_size = crypto_shash_digestsize(tfm);
+
+ /* We allocate the hash operational data storage on the end of our
+ * context data and the digest output buffer on the end of that.
+ */
+ ret = -ENOMEM;
+ pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+ if (!pks)
+ goto error_no_pks;
+
+ pks->pkey_hash_algo = hash;
+ pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+ pks->digest_size = digest_size;
+
+ desc = (void *)pks + sizeof(*pks);
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto error;
+
+ ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
+ if (ret < 0)
+ goto error;
+
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = ok\n", __func__);
+ return pks;
+
+error:
+ kfree(pks);
+error_no_pks:
+ crypto_free_shash(tfm);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Extract an MPI array from the signature data. This represents the actual
+ * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
+ * size of the MPI in bytes.
+ *
+ * RSA signatures only have one MPI, so currently we only read one.
+ */
+static int mod_extract_mpi_array(struct public_key_signature *pks,
+ const void *data, size_t len)
+{
+ size_t nbytes;
+ MPI mpi;
+
+ if (len < 3)
+ return -EBADMSG;
+ nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
+ data += 2;
+ len -= 2;
+ if (len != nbytes)
+ return -EBADMSG;
+
+ mpi = mpi_read_raw_data(data, nbytes);
+ if (!mpi)
+ return -ENOMEM;
+ pks->mpi[0] = mpi;
+ pks->nr_mpi = 1;
+ return 0;
+}
+
+/*
+ * Request an asymmetric key.
+ */
+static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
+ const u8 *key_id, size_t key_id_len)
+{
+ key_ref_t key;
+ size_t i;
+ char *id, *q;
+
+ pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
+
+ /* Construct an identifier. */
+ id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
+ if (!id)
+ return ERR_PTR(-ENOKEY);
+
+ memcpy(id, signer, signer_len);
+
+ q = id + signer_len;
+ *q++ = ':';
+ *q++ = ' ';
+ for (i = 0; i < key_id_len; i++) {
+ *q++ = hex_asc[*key_id >> 4];
+ *q++ = hex_asc[*key_id++ & 0x0f];
+ }
+
+ *q = 0;
+
+ pr_debug("Look up: \"%s\"\n", id);
+
+ key = keyring_search(make_key_ref(system_trusted_keyring, 1),
+ &key_type_asymmetric, id);
+ if (IS_ERR(key))
+ pr_warn("Request for unknown module key '%s' err %ld\n",
+ id, PTR_ERR(key));
+ kfree(id);
+
+ if (IS_ERR(key)) {
+ switch (PTR_ERR(key)) {
+ /* Hide some search errors */
+ case -EACCES:
+ case -ENOTDIR:
+ case -EAGAIN:
+ return ERR_PTR(-ENOKEY);
+ default:
+ return ERR_CAST(key);
+ }
+ }
+
+ pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
+ return key_ref_to_ptr(key);
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, unsigned long *_modlen)
+{
+ struct public_key_signature *pks;
+ struct module_signature ms;
+ struct key *key;
+ const void *sig;
+ size_t modlen = *_modlen, sig_len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+ modlen -= sizeof(ms);
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ if (sig_len >= modlen)
+ return -EBADMSG;
+ modlen -= sig_len;
+ if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
+ return -EBADMSG;
+ modlen -= (size_t)ms.signer_len + ms.key_id_len;
+
+ *_modlen = modlen;
+ sig = mod + modlen;
+
+ /* For the moment, only support RSA and X.509 identifiers */
+ if (ms.algo != PKEY_ALGO_RSA ||
+ ms.id_type != PKEY_ID_X509)
+ return -ENOPKG;
+
+ if (ms.hash >= PKEY_HASH__LAST ||
+ !hash_algo_name[ms.hash])
+ return -ENOPKG;
+
+ key = request_asymmetric_key(sig, ms.signer_len,
+ sig + ms.signer_len, ms.key_id_len);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ pks = mod_make_digest(ms.hash, mod, modlen);
+ if (IS_ERR(pks)) {
+ ret = PTR_ERR(pks);
+ goto error_put_key;
+ }
+
+ ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
+ sig_len);
+ if (ret < 0)
+ goto error_free_pks;
+
+ ret = verify_signature(key, pks);
+ pr_devel("verify_signature() = %d\n", ret);
+
+error_free_pks:
+ mpi_free(pks->rsa.s);
+ kfree(pks);
+error_put_key:
+ key_put(key);
+ pr_devel("<==%s() = %d\n", __func__, ret);
+ return ret;
+}
diff --git a/kernel/notifier.c b/kernel/notifier.c
new file mode 100644
index 000000000..ae9fc7cc3
--- /dev/null
+++ b/kernel/notifier.c
@@ -0,0 +1,562 @@
+#include <linux/kdebug.h>
+#include <linux/kprobes.h>
+#include <linux/export.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/vmalloc.h>
+#include <linux/reboot.h>
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * at shutdown. This is used to stop any idling DMA operations
+ * and the like.
+ */
+BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+
+/*
+ * Notifier chain core routines. The exported routines below
+ * are layered on top of these, with appropriate locking added.
+ */
+
+static int notifier_chain_register(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if (n->priority > (*nl)->priority)
+ break;
+ nl = &((*nl)->next);
+ }
+ n->next = *nl;
+ rcu_assign_pointer(*nl, n);
+ return 0;
+}
+
+static int notifier_chain_cond_register(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if ((*nl) == n)
+ return 0;
+ if (n->priority > (*nl)->priority)
+ break;
+ nl = &((*nl)->next);
+ }
+ n->next = *nl;
+ rcu_assign_pointer(*nl, n);
+ return 0;
+}
+
+static int notifier_chain_unregister(struct notifier_block **nl,
+ struct notifier_block *n)
+{
+ while ((*nl) != NULL) {
+ if ((*nl) == n) {
+ rcu_assign_pointer(*nl, n->next);
+ return 0;
+ }
+ nl = &((*nl)->next);
+ }
+ return -ENOENT;
+}
+
+/**
+ * notifier_call_chain - Informs the registered notifiers about an event.
+ * @nl: Pointer to head of the blocking notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: Number of notifier functions to be called. Don't care
+ * value of this parameter is -1.
+ * @nr_calls: Records the number of notifications sent. Don't care
+ * value of this field is NULL.
+ * @returns: notifier_call_chain returns the value returned by the
+ * last notifier function called.
+ */
+static int notifier_call_chain(struct notifier_block **nl,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret = NOTIFY_DONE;
+ struct notifier_block *nb, *next_nb;
+
+ nb = rcu_dereference_raw(*nl);
+
+ while (nb && nr_to_call) {
+ next_nb = rcu_dereference_raw(nb->next);
+
+#ifdef CONFIG_DEBUG_NOTIFIERS
+ if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
+ WARN(1, "Invalid notifier called!");
+ nb = next_nb;
+ continue;
+ }
+#endif
+ ret = nb->notifier_call(nb, val, v);
+
+ if (nr_calls)
+ (*nr_calls)++;
+
+ if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+ break;
+ nb = next_nb;
+ nr_to_call--;
+ }
+ return ret;
+}
+NOKPROBE_SYMBOL(notifier_call_chain);
+
+/*
+ * Atomic notifier chain routines. Registration and unregistration
+ * use a spinlock, and call_chain is synchronized by RCU (no locks).
+ */
+
+/**
+ * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an atomic notifier chain.
+ *
+ * Currently always returns zero.
+ */
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_register(&nh->head, n);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
+
+/**
+ * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from an atomic notifier chain.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_unregister(&nh->head, n);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ synchronize_rcu();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+
+/**
+ * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See the comment for notifier_call_chain.
+ * @nr_calls: See the comment for notifier_call_chain.
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in an atomic context, so they must not block.
+ * This routine uses RCU to synchronize with changes to the chain.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
+
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain);
+
+/*
+ * Blocking notifier chain routines. All access to the chain is
+ * synchronized by an rwsem.
+ */
+
+/**
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain.
+ * Must be called in process context.
+ *
+ * Currently always returns zero.
+ */
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call down_write().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_register(&nh->head, n);
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_register(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
+
+/**
+ * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a blocking notifier chain, only if not already
+ * present in the chain.
+ * Must be called in process context.
+ *
+ * Currently always returns zero.
+ */
+int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_cond_register(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
+
+/**
+ * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from a blocking notifier chain.
+ * Must be called from process context.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call down_write().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_unregister(&nh->head, n);
+
+ down_write(&nh->rwsem);
+ ret = notifier_chain_unregister(&nh->head, n);
+ up_write(&nh->rwsem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+
+/**
+ * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain.
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in a process context, so they are allowed to block.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret = NOTIFY_DONE;
+
+ /*
+ * We check the head outside the lock, but if this access is
+ * racy then it does not matter what the result of the test
+ * is, we re-check the list after having taken the lock anyway:
+ */
+ if (rcu_access_pointer(nh->head)) {
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
+ nr_calls);
+ up_read(&nh->rwsem);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
+
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
+
+/*
+ * Raw notifier chain routines. There is no protection;
+ * the caller must provide it. Use at your own risk!
+ */
+
+/**
+ * raw_notifier_chain_register - Add notifier to a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to a raw notifier chain.
+ * All locking must be provided by the caller.
+ *
+ * Currently always returns zero.
+ */
+int raw_notifier_chain_register(struct raw_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return notifier_chain_register(&nh->head, n);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
+
+/**
+ * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from a raw notifier chain.
+ * All locking must be provided by the caller.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return notifier_chain_unregister(&nh->head, n);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+
+/**
+ * __raw_notifier_call_chain - Call functions in a raw notifier chain
+ * @nh: Pointer to head of the raw notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in an undefined context.
+ * All locking must be provided by the caller.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+}
+EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
+
+int raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __raw_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
+
+#ifdef CONFIG_SRCU
+/*
+ * SRCU notifier chain routines. Registration and unregistration
+ * use a mutex, and call_chain is synchronized by SRCU (no locks).
+ */
+
+/**
+ * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an SRCU notifier chain.
+ * Must be called in process context.
+ *
+ * Currently always returns zero.
+ */
+int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call mutex_lock().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_register(&nh->head, n);
+
+ mutex_lock(&nh->mutex);
+ ret = notifier_chain_register(&nh->head, n);
+ mutex_unlock(&nh->mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
+
+/**
+ * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @n: Entry to remove from notifier chain
+ *
+ * Removes a notifier from an SRCU notifier chain.
+ * Must be called from process context.
+ *
+ * Returns zero on success or %-ENOENT on failure.
+ */
+int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
+ struct notifier_block *n)
+{
+ int ret;
+
+ /*
+ * This code gets used during boot-up, when task switching is
+ * not yet working and interrupts must remain disabled. At
+ * such times we must not call mutex_lock().
+ */
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ return notifier_chain_unregister(&nh->head, n);
+
+ mutex_lock(&nh->mutex);
+ ret = notifier_chain_unregister(&nh->head, n);
+ mutex_unlock(&nh->mutex);
+ synchronize_srcu(&nh->srcu);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
+
+/**
+ * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * @nh: Pointer to head of the SRCU notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
+ *
+ * Calls each function in a notifier chain in turn. The functions
+ * run in a process context, so they are allowed to block.
+ *
+ * If the return value of the notifier can be and'ed
+ * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
+ * will return immediately, with the return value of
+ * the notifier function which halted execution.
+ * Otherwise the return value is the return value
+ * of the last notifier function called.
+ */
+int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ int ret;
+ int idx;
+
+ idx = srcu_read_lock(&nh->srcu);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ srcu_read_unlock(&nh->srcu, idx);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
+
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
+}
+EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
+
+/**
+ * srcu_init_notifier_head - Initialize an SRCU notifier head
+ * @nh: Pointer to head of the srcu notifier chain
+ *
+ * Unlike other sorts of notifier heads, SRCU notifier heads require
+ * dynamic initialization. Be sure to call this routine before
+ * calling any of the other SRCU notifier routines for this head.
+ *
+ * If an SRCU notifier head is deallocated, it must first be cleaned
+ * up by calling srcu_cleanup_notifier_head(). Otherwise the head's
+ * per-cpu data (used by the SRCU mechanism) will leak.
+ */
+void srcu_init_notifier_head(struct srcu_notifier_head *nh)
+{
+ mutex_init(&nh->mutex);
+ if (init_srcu_struct(&nh->srcu) < 0)
+ BUG();
+ nh->head = NULL;
+}
+EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
+
+#endif /* CONFIG_SRCU */
+
+static ATOMIC_NOTIFIER_HEAD(die_chain);
+
+int notrace notify_die(enum die_val val, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+ return atomic_notifier_call_chain(&die_chain, val, &args);
+}
+NOKPROBE_SYMBOL(notify_die);
+
+int register_die_notifier(struct notifier_block *nb)
+{
+ vmalloc_sync_all();
+ return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_die_notifier);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
new file mode 100644
index 000000000..49746c81a
--- /dev/null
+++ b/kernel/nsproxy.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (C) 2006 IBM Corporation
+ *
+ * Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * Jun 2006 - namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ */
+
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nsproxy.h>
+#include <linux/init_task.h>
+#include <linux/mnt_namespace.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <net/net_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/proc_ns.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
+
+static struct kmem_cache *nsproxy_cachep;
+
+struct nsproxy init_nsproxy = {
+ .count = ATOMIC_INIT(1),
+ .uts_ns = &init_uts_ns,
+#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
+ .ipc_ns = &init_ipc_ns,
+#endif
+ .mnt_ns = NULL,
+ .pid_ns_for_children = &init_pid_ns,
+#ifdef CONFIG_NET
+ .net_ns = &init_net,
+#endif
+};
+
+static inline struct nsproxy *create_nsproxy(void)
+{
+ struct nsproxy *nsproxy;
+
+ nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
+ if (nsproxy)
+ atomic_set(&nsproxy->count, 1);
+ return nsproxy;
+}
+
+/*
+ * Create new nsproxy and all of its the associated namespaces.
+ * Return the newly created nsproxy. Do not attach this to the task,
+ * leave it to the caller to do proper locking and attach it to task.
+ */
+static struct nsproxy *create_new_namespaces(unsigned long flags,
+ struct task_struct *tsk, struct user_namespace *user_ns,
+ struct fs_struct *new_fs)
+{
+ struct nsproxy *new_nsp;
+ int err;
+
+ new_nsp = create_nsproxy();
+ if (!new_nsp)
+ return ERR_PTR(-ENOMEM);
+
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
+ if (IS_ERR(new_nsp->mnt_ns)) {
+ err = PTR_ERR(new_nsp->mnt_ns);
+ goto out_ns;
+ }
+
+ new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
+ if (IS_ERR(new_nsp->uts_ns)) {
+ err = PTR_ERR(new_nsp->uts_ns);
+ goto out_uts;
+ }
+
+ new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
+ if (IS_ERR(new_nsp->ipc_ns)) {
+ err = PTR_ERR(new_nsp->ipc_ns);
+ goto out_ipc;
+ }
+
+ new_nsp->pid_ns_for_children =
+ copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
+ if (IS_ERR(new_nsp->pid_ns_for_children)) {
+ err = PTR_ERR(new_nsp->pid_ns_for_children);
+ goto out_pid;
+ }
+
+ new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
+ if (IS_ERR(new_nsp->net_ns)) {
+ err = PTR_ERR(new_nsp->net_ns);
+ goto out_net;
+ }
+
+ return new_nsp;
+
+out_net:
+ if (new_nsp->pid_ns_for_children)
+ put_pid_ns(new_nsp->pid_ns_for_children);
+out_pid:
+ if (new_nsp->ipc_ns)
+ put_ipc_ns(new_nsp->ipc_ns);
+out_ipc:
+ if (new_nsp->uts_ns)
+ put_uts_ns(new_nsp->uts_ns);
+out_uts:
+ if (new_nsp->mnt_ns)
+ put_mnt_ns(new_nsp->mnt_ns);
+out_ns:
+ kmem_cache_free(nsproxy_cachep, new_nsp);
+ return ERR_PTR(err);
+}
+
+/*
+ * called from clone. This now handles copy for nsproxy and all
+ * namespaces therein.
+ */
+int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+{
+ struct nsproxy *old_ns = tsk->nsproxy;
+ struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
+ struct nsproxy *new_ns;
+
+ if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWPID | CLONE_NEWNET)))) {
+ get_nsproxy(old_ns);
+ return 0;
+ }
+
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * CLONE_NEWIPC must detach from the undolist: after switching
+ * to a new ipc namespace, the semaphore arrays from the old
+ * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
+ * means share undolist with parent, so we must forbid using
+ * it along with CLONE_NEWIPC.
+ */
+ if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
+ (CLONE_NEWIPC | CLONE_SYSVSEM))
+ return -EINVAL;
+
+ new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
+ if (IS_ERR(new_ns))
+ return PTR_ERR(new_ns);
+
+ tsk->nsproxy = new_ns;
+ return 0;
+}
+
+void free_nsproxy(struct nsproxy *ns)
+{
+ if (ns->mnt_ns)
+ put_mnt_ns(ns->mnt_ns);
+ if (ns->uts_ns)
+ put_uts_ns(ns->uts_ns);
+ if (ns->ipc_ns)
+ put_ipc_ns(ns->ipc_ns);
+ if (ns->pid_ns_for_children)
+ put_pid_ns(ns->pid_ns_for_children);
+ put_net(ns->net_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
+}
+
+/*
+ * Called from unshare. Unshare all the namespaces part of nsproxy.
+ * On success, returns the new nsproxy.
+ */
+int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+ struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
+{
+ struct user_namespace *user_ns;
+ int err = 0;
+
+ if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWPID)))
+ return 0;
+
+ user_ns = new_cred ? new_cred->user_ns : current_user_ns();
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+ new_fs ? new_fs : current->fs);
+ if (IS_ERR(*new_nsp)) {
+ err = PTR_ERR(*new_nsp);
+ goto out;
+ }
+
+out:
+ return err;
+}
+
+void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
+{
+ struct nsproxy *ns;
+
+ might_sleep();
+
+ task_lock(p);
+ ns = p->nsproxy;
+ p->nsproxy = new;
+ task_unlock(p);
+
+ if (ns && atomic_dec_and_test(&ns->count))
+ free_nsproxy(ns);
+}
+
+void exit_task_namespaces(struct task_struct *p)
+{
+ switch_task_namespaces(p, NULL);
+}
+
+SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+{
+ struct task_struct *tsk = current;
+ struct nsproxy *new_nsproxy;
+ struct file *file;
+ struct ns_common *ns;
+ int err;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ err = -EINVAL;
+ ns = get_proc_ns(file_inode(file));
+ if (nstype && (ns->ops->type != nstype))
+ goto out;
+
+ new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ if (IS_ERR(new_nsproxy)) {
+ err = PTR_ERR(new_nsproxy);
+ goto out;
+ }
+
+ err = ns->ops->install(new_nsproxy, ns);
+ if (err) {
+ free_nsproxy(new_nsproxy);
+ goto out;
+ }
+ switch_task_namespaces(tsk, new_nsproxy);
+out:
+ fput(file);
+ return err;
+}
+
+int __init nsproxy_cache_init(void)
+{
+ nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+ return 0;
+}
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000..b38bea9c4
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,1105 @@
+/*
+ * padata.c - generic interface to process data streams in parallel
+ *
+ * See Documentation/padata.txt for an api documentation.
+ *
+ * Copyright (C) 2008, 2009 secunet Security Networks AG
+ * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/export.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/cpu.h>
+#include <linux/padata.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/rcupdate.h>
+
+#define MAX_OBJ_NUM 1000
+
+static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
+{
+ int cpu, target_cpu;
+
+ target_cpu = cpumask_first(pd->cpumask.pcpu);
+ for (cpu = 0; cpu < cpu_index; cpu++)
+ target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
+
+ return target_cpu;
+}
+
+static int padata_cpu_hash(struct parallel_data *pd)
+{
+ unsigned int seq_nr;
+ int cpu_index;
+
+ /*
+ * Hash the sequence numbers to the cpus by taking
+ * seq_nr mod. number of cpus in use.
+ */
+
+ seq_nr = atomic_inc_return(&pd->seq_nr);
+ cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
+
+ return padata_index_to_cpu(pd, cpu_index);
+}
+
+static void padata_parallel_worker(struct work_struct *parallel_work)
+{
+ struct padata_parallel_queue *pqueue;
+ struct parallel_data *pd;
+ struct padata_instance *pinst;
+ LIST_HEAD(local_list);
+
+ local_bh_disable();
+ pqueue = container_of(parallel_work,
+ struct padata_parallel_queue, work);
+ pd = pqueue->pd;
+ pinst = pd->pinst;
+
+ spin_lock(&pqueue->parallel.lock);
+ list_replace_init(&pqueue->parallel.list, &local_list);
+ spin_unlock(&pqueue->parallel.lock);
+
+ while (!list_empty(&local_list)) {
+ struct padata_priv *padata;
+
+ padata = list_entry(local_list.next,
+ struct padata_priv, list);
+
+ list_del_init(&padata->list);
+
+ padata->parallel(padata);
+ }
+
+ local_bh_enable();
+}
+
+/**
+ * padata_do_parallel - padata parallelization function
+ *
+ * @pinst: padata instance
+ * @padata: object to be parallelized
+ * @cb_cpu: cpu the serialization callback function will run on,
+ * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
+ *
+ * The parallelization callback function will run with BHs off.
+ * Note: Every object which is parallelized by padata_do_parallel
+ * must be seen by padata_do_serial.
+ */
+int padata_do_parallel(struct padata_instance *pinst,
+ struct padata_priv *padata, int cb_cpu)
+{
+ int target_cpu, err;
+ struct padata_parallel_queue *queue;
+ struct parallel_data *pd;
+
+ rcu_read_lock_bh();
+
+ pd = rcu_dereference_bh(pinst->pd);
+
+ err = -EINVAL;
+ if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
+ goto out;
+
+ if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
+ goto out;
+
+ err = -EBUSY;
+ if ((pinst->flags & PADATA_RESET))
+ goto out;
+
+ if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
+ goto out;
+
+ err = 0;
+ atomic_inc(&pd->refcnt);
+ padata->pd = pd;
+ padata->cb_cpu = cb_cpu;
+
+ target_cpu = padata_cpu_hash(pd);
+ queue = per_cpu_ptr(pd->pqueue, target_cpu);
+
+ spin_lock(&queue->parallel.lock);
+ list_add_tail(&padata->list, &queue->parallel.list);
+ spin_unlock(&queue->parallel.lock);
+
+ queue_work_on(target_cpu, pinst->wq, &queue->work);
+
+out:
+ rcu_read_unlock_bh();
+
+ return err;
+}
+EXPORT_SYMBOL(padata_do_parallel);
+
+/*
+ * padata_get_next - Get the next object that needs serialization.
+ *
+ * Return values are:
+ *
+ * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ *
+ * NULL, if all percpu reorder queues are empty.
+ *
+ * -EINPROGRESS, if the next object that needs serialization will
+ * be parallel processed by another cpu and is not yet present in
+ * the cpu's reorder queue.
+ *
+ * -ENODATA, if this cpu has to do the parallel processing for
+ * the next object.
+ */
+static struct padata_priv *padata_get_next(struct parallel_data *pd)
+{
+ int cpu, num_cpus;
+ unsigned int next_nr, next_index;
+ struct padata_parallel_queue *next_queue;
+ struct padata_priv *padata;
+ struct padata_list *reorder;
+
+ num_cpus = cpumask_weight(pd->cpumask.pcpu);
+
+ /*
+ * Calculate the percpu reorder queue and the sequence
+ * number of the next object.
+ */
+ next_nr = pd->processed;
+ next_index = next_nr % num_cpus;
+ cpu = padata_index_to_cpu(pd, next_index);
+ next_queue = per_cpu_ptr(pd->pqueue, cpu);
+
+ padata = NULL;
+
+ reorder = &next_queue->reorder;
+
+ if (!list_empty(&reorder->list)) {
+ padata = list_entry(reorder->list.next,
+ struct padata_priv, list);
+
+ spin_lock(&reorder->lock);
+ list_del_init(&padata->list);
+ atomic_dec(&pd->reorder_objects);
+ spin_unlock(&reorder->lock);
+
+ pd->processed++;
+
+ goto out;
+ }
+
+ if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
+ padata = ERR_PTR(-ENODATA);
+ goto out;
+ }
+
+ padata = ERR_PTR(-EINPROGRESS);
+out:
+ return padata;
+}
+
+static void padata_reorder(struct parallel_data *pd)
+{
+ int cb_cpu;
+ struct padata_priv *padata;
+ struct padata_serial_queue *squeue;
+ struct padata_instance *pinst = pd->pinst;
+
+ /*
+ * We need to ensure that only one cpu can work on dequeueing of
+ * the reorder queue the time. Calculating in which percpu reorder
+ * queue the next object will arrive takes some time. A spinlock
+ * would be highly contended. Also it is not clear in which order
+ * the objects arrive to the reorder queues. So a cpu could wait to
+ * get the lock just to notice that there is nothing to do at the
+ * moment. Therefore we use a trylock and let the holder of the lock
+ * care for all the objects enqueued during the holdtime of the lock.
+ */
+ if (!spin_trylock_bh(&pd->lock))
+ return;
+
+ while (1) {
+ padata = padata_get_next(pd);
+
+ /*
+ * All reorder queues are empty, or the next object that needs
+ * serialization is parallel processed by another cpu and is
+ * still on it's way to the cpu's reorder queue, nothing to
+ * do for now.
+ */
+ if (!padata || PTR_ERR(padata) == -EINPROGRESS)
+ break;
+
+ /*
+ * This cpu has to do the parallel processing of the next
+ * object. It's waiting in the cpu's parallelization queue,
+ * so exit immediately.
+ */
+ if (PTR_ERR(padata) == -ENODATA) {
+ del_timer(&pd->timer);
+ spin_unlock_bh(&pd->lock);
+ return;
+ }
+
+ cb_cpu = padata->cb_cpu;
+ squeue = per_cpu_ptr(pd->squeue, cb_cpu);
+
+ spin_lock(&squeue->serial.lock);
+ list_add_tail(&padata->list, &squeue->serial.list);
+ spin_unlock(&squeue->serial.lock);
+
+ queue_work_on(cb_cpu, pinst->wq, &squeue->work);
+ }
+
+ spin_unlock_bh(&pd->lock);
+
+ /*
+ * The next object that needs serialization might have arrived to
+ * the reorder queues in the meantime, we will be called again
+ * from the timer function if no one else cares for it.
+ */
+ if (atomic_read(&pd->reorder_objects)
+ && !(pinst->flags & PADATA_RESET))
+ mod_timer(&pd->timer, jiffies + HZ);
+ else
+ del_timer(&pd->timer);
+
+ return;
+}
+
+static void padata_reorder_timer(unsigned long arg)
+{
+ struct parallel_data *pd = (struct parallel_data *)arg;
+
+ padata_reorder(pd);
+}
+
+static void padata_serial_worker(struct work_struct *serial_work)
+{
+ struct padata_serial_queue *squeue;
+ struct parallel_data *pd;
+ LIST_HEAD(local_list);
+
+ local_bh_disable();
+ squeue = container_of(serial_work, struct padata_serial_queue, work);
+ pd = squeue->pd;
+
+ spin_lock(&squeue->serial.lock);
+ list_replace_init(&squeue->serial.list, &local_list);
+ spin_unlock(&squeue->serial.lock);
+
+ while (!list_empty(&local_list)) {
+ struct padata_priv *padata;
+
+ padata = list_entry(local_list.next,
+ struct padata_priv, list);
+
+ list_del_init(&padata->list);
+
+ padata->serial(padata);
+ atomic_dec(&pd->refcnt);
+ }
+ local_bh_enable();
+}
+
+/**
+ * padata_do_serial - padata serialization function
+ *
+ * @padata: object to be serialized.
+ *
+ * padata_do_serial must be called for every parallelized object.
+ * The serialization callback function will run with BHs off.
+ */
+void padata_do_serial(struct padata_priv *padata)
+{
+ int cpu;
+ struct padata_parallel_queue *pqueue;
+ struct parallel_data *pd;
+
+ pd = padata->pd;
+
+ cpu = get_cpu();
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+ spin_lock(&pqueue->reorder.lock);
+ atomic_inc(&pd->reorder_objects);
+ list_add_tail(&padata->list, &pqueue->reorder.list);
+ spin_unlock(&pqueue->reorder.lock);
+
+ put_cpu();
+
+ padata_reorder(pd);
+}
+EXPORT_SYMBOL(padata_do_serial);
+
+static int padata_setup_cpumasks(struct parallel_data *pd,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
+ free_cpumask_var(pd->cpumask.cbcpu);
+ return -ENOMEM;
+ }
+
+ cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
+ return 0;
+}
+
+static void __padata_list_init(struct padata_list *pd_list)
+{
+ INIT_LIST_HEAD(&pd_list->list);
+ spin_lock_init(&pd_list->lock);
+}
+
+/* Initialize all percpu queues used by serial workers */
+static void padata_init_squeues(struct parallel_data *pd)
+{
+ int cpu;
+ struct padata_serial_queue *squeue;
+
+ for_each_cpu(cpu, pd->cpumask.cbcpu) {
+ squeue = per_cpu_ptr(pd->squeue, cpu);
+ squeue->pd = pd;
+ __padata_list_init(&squeue->serial);
+ INIT_WORK(&squeue->work, padata_serial_worker);
+ }
+}
+
+/* Initialize all percpu queues used by parallel workers */
+static void padata_init_pqueues(struct parallel_data *pd)
+{
+ int cpu_index, cpu;
+ struct padata_parallel_queue *pqueue;
+
+ cpu_index = 0;
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ pqueue->pd = pd;
+ pqueue->cpu_index = cpu_index;
+ cpu_index++;
+
+ __padata_list_init(&pqueue->reorder);
+ __padata_list_init(&pqueue->parallel);
+ INIT_WORK(&pqueue->work, padata_parallel_worker);
+ atomic_set(&pqueue->num_obj, 0);
+ }
+}
+
+/* Allocate and initialize the internal cpumask dependend resources. */
+static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
+ struct parallel_data *pd;
+
+ pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
+ if (!pd)
+ goto err;
+
+ pd->pqueue = alloc_percpu(struct padata_parallel_queue);
+ if (!pd->pqueue)
+ goto err_free_pd;
+
+ pd->squeue = alloc_percpu(struct padata_serial_queue);
+ if (!pd->squeue)
+ goto err_free_pqueue;
+ if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+ goto err_free_squeue;
+
+ padata_init_pqueues(pd);
+ padata_init_squeues(pd);
+ setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+ atomic_set(&pd->seq_nr, -1);
+ atomic_set(&pd->reorder_objects, 0);
+ atomic_set(&pd->refcnt, 0);
+ pd->pinst = pinst;
+ spin_lock_init(&pd->lock);
+
+ return pd;
+
+err_free_squeue:
+ free_percpu(pd->squeue);
+err_free_pqueue:
+ free_percpu(pd->pqueue);
+err_free_pd:
+ kfree(pd);
+err:
+ return NULL;
+}
+
+static void padata_free_pd(struct parallel_data *pd)
+{
+ free_cpumask_var(pd->cpumask.pcpu);
+ free_cpumask_var(pd->cpumask.cbcpu);
+ free_percpu(pd->pqueue);
+ free_percpu(pd->squeue);
+ kfree(pd);
+}
+
+/* Flush all objects out of the padata queues. */
+static void padata_flush_queues(struct parallel_data *pd)
+{
+ int cpu;
+ struct padata_parallel_queue *pqueue;
+ struct padata_serial_queue *squeue;
+
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ flush_work(&pqueue->work);
+ }
+
+ del_timer_sync(&pd->timer);
+
+ if (atomic_read(&pd->reorder_objects))
+ padata_reorder(pd);
+
+ for_each_cpu(cpu, pd->cpumask.cbcpu) {
+ squeue = per_cpu_ptr(pd->squeue, cpu);
+ flush_work(&squeue->work);
+ }
+
+ BUG_ON(atomic_read(&pd->refcnt) != 0);
+}
+
+static void __padata_start(struct padata_instance *pinst)
+{
+ pinst->flags |= PADATA_INIT;
+}
+
+static void __padata_stop(struct padata_instance *pinst)
+{
+ if (!(pinst->flags & PADATA_INIT))
+ return;
+
+ pinst->flags &= ~PADATA_INIT;
+
+ synchronize_rcu();
+
+ get_online_cpus();
+ padata_flush_queues(pinst->pd);
+ put_online_cpus();
+}
+
+/* Replace the internal control structure with a new one. */
+static void padata_replace(struct padata_instance *pinst,
+ struct parallel_data *pd_new)
+{
+ struct parallel_data *pd_old = pinst->pd;
+ int notification_mask = 0;
+
+ pinst->flags |= PADATA_RESET;
+
+ rcu_assign_pointer(pinst->pd, pd_new);
+
+ synchronize_rcu();
+
+ if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
+ notification_mask |= PADATA_CPU_PARALLEL;
+ if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
+ notification_mask |= PADATA_CPU_SERIAL;
+
+ padata_flush_queues(pd_old);
+ padata_free_pd(pd_old);
+
+ if (notification_mask)
+ blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
+ notification_mask,
+ &pd_new->cpumask);
+
+ pinst->flags &= ~PADATA_RESET;
+}
+
+/**
+ * padata_register_cpumask_notifier - Registers a notifier that will be called
+ * if either pcpu or cbcpu or both cpumasks change.
+ *
+ * @pinst: A poineter to padata instance
+ * @nblock: A pointer to notifier block.
+ */
+int padata_register_cpumask_notifier(struct padata_instance *pinst,
+ struct notifier_block *nblock)
+{
+ return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
+ nblock);
+}
+EXPORT_SYMBOL(padata_register_cpumask_notifier);
+
+/**
+ * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
+ * registered earlier using padata_register_cpumask_notifier
+ *
+ * @pinst: A pointer to data instance.
+ * @nlock: A pointer to notifier block.
+ */
+int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
+ struct notifier_block *nblock)
+{
+ return blocking_notifier_chain_unregister(
+ &pinst->cpumask_change_notifier,
+ nblock);
+}
+EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
+
+
+/* If cpumask contains no active cpu, we mark the instance as invalid. */
+static bool padata_validate_cpumask(struct padata_instance *pinst,
+ const struct cpumask *cpumask)
+{
+ if (!cpumask_intersects(cpumask, cpu_online_mask)) {
+ pinst->flags |= PADATA_INVALID;
+ return false;
+ }
+
+ pinst->flags &= ~PADATA_INVALID;
+ return true;
+}
+
+static int __padata_set_cpumasks(struct padata_instance *pinst,
+ cpumask_var_t pcpumask,
+ cpumask_var_t cbcpumask)
+{
+ int valid;
+ struct parallel_data *pd;
+
+ valid = padata_validate_cpumask(pinst, pcpumask);
+ if (!valid) {
+ __padata_stop(pinst);
+ goto out_replace;
+ }
+
+ valid = padata_validate_cpumask(pinst, cbcpumask);
+ if (!valid)
+ __padata_stop(pinst);
+
+out_replace:
+ pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+ if (!pd)
+ return -ENOMEM;
+
+ cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+ cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+
+ padata_replace(pinst, pd);
+
+ if (valid)
+ __padata_start(pinst);
+
+ return 0;
+}
+
+/**
+ * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
+ * one is used by parallel workers and the second one
+ * by the wokers doing serialization.
+ *
+ * @pinst: padata instance
+ * @pcpumask: the cpumask to use for parallel workers
+ * @cbcpumask: the cpumsak to use for serial workers
+ */
+int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
+ cpumask_var_t cbcpumask)
+{
+ int err;
+
+ mutex_lock(&pinst->lock);
+ get_online_cpus();
+
+ err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
+
+ put_online_cpus();
+ mutex_unlock(&pinst->lock);
+
+ return err;
+
+}
+EXPORT_SYMBOL(padata_set_cpumasks);
+
+/**
+ * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
+ * equivalent to @cpumask.
+ *
+ * @pinst: padata instance
+ * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
+ * to parallel and serial cpumasks respectively.
+ * @cpumask: the cpumask to use
+ */
+int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
+ cpumask_var_t cpumask)
+{
+ struct cpumask *serial_mask, *parallel_mask;
+ int err = -EINVAL;
+
+ mutex_lock(&pinst->lock);
+ get_online_cpus();
+
+ switch (cpumask_type) {
+ case PADATA_CPU_PARALLEL:
+ serial_mask = pinst->cpumask.cbcpu;
+ parallel_mask = cpumask;
+ break;
+ case PADATA_CPU_SERIAL:
+ parallel_mask = pinst->cpumask.pcpu;
+ serial_mask = cpumask;
+ break;
+ default:
+ goto out;
+ }
+
+ err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
+
+out:
+ put_online_cpus();
+ mutex_unlock(&pinst->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(padata_set_cpumask);
+
+static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
+{
+ struct parallel_data *pd;
+
+ if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+ pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+ pinst->cpumask.cbcpu);
+ if (!pd)
+ return -ENOMEM;
+
+ padata_replace(pinst, pd);
+
+ if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
+ padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
+ __padata_start(pinst);
+ }
+
+ return 0;
+}
+
+ /**
+ * padata_add_cpu - add a cpu to one or both(parallel and serial)
+ * padata cpumasks.
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to add
+ * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
+ * The @mask may be any combination of the following flags:
+ * PADATA_CPU_SERIAL - serial cpumask
+ * PADATA_CPU_PARALLEL - parallel cpumask
+ */
+
+int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
+{
+ int err;
+
+ if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+ return -EINVAL;
+
+ mutex_lock(&pinst->lock);
+
+ get_online_cpus();
+ if (mask & PADATA_CPU_SERIAL)
+ cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
+ if (mask & PADATA_CPU_PARALLEL)
+ cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
+
+ err = __padata_add_cpu(pinst, cpu);
+ put_online_cpus();
+
+ mutex_unlock(&pinst->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(padata_add_cpu);
+
+static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
+{
+ struct parallel_data *pd = NULL;
+
+ if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+
+ if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
+ !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
+ __padata_stop(pinst);
+
+ pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+ pinst->cpumask.cbcpu);
+ if (!pd)
+ return -ENOMEM;
+
+ padata_replace(pinst, pd);
+
+ cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
+ cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
+ }
+
+ return 0;
+}
+
+ /**
+ * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
+ * padata cpumasks.
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to remove
+ * @mask: bitmask specifying from which cpumask @cpu should be removed
+ * The @mask may be any combination of the following flags:
+ * PADATA_CPU_SERIAL - serial cpumask
+ * PADATA_CPU_PARALLEL - parallel cpumask
+ */
+int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
+{
+ int err;
+
+ if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+ return -EINVAL;
+
+ mutex_lock(&pinst->lock);
+
+ get_online_cpus();
+ if (mask & PADATA_CPU_SERIAL)
+ cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
+ if (mask & PADATA_CPU_PARALLEL)
+ cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
+
+ err = __padata_remove_cpu(pinst, cpu);
+ put_online_cpus();
+
+ mutex_unlock(&pinst->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(padata_remove_cpu);
+
+/**
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ */
+int padata_start(struct padata_instance *pinst)
+{
+ int err = 0;
+
+ mutex_lock(&pinst->lock);
+
+ if (pinst->flags & PADATA_INVALID)
+ err =-EINVAL;
+
+ __padata_start(pinst);
+
+ mutex_unlock(&pinst->lock);
+
+ return err;
+}
+EXPORT_SYMBOL(padata_start);
+
+/**
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+ mutex_lock(&pinst->lock);
+ __padata_stop(pinst);
+ mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
+{
+ return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
+ cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
+}
+
+
+static int padata_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ int err;
+ struct padata_instance *pinst;
+ int cpu = (unsigned long)hcpu;
+
+ pinst = container_of(nfb, struct padata_instance, cpu_notifier);
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ if (!pinst_has_cpu(pinst, cpu))
+ break;
+ mutex_lock(&pinst->lock);
+ err = __padata_add_cpu(pinst, cpu);
+ mutex_unlock(&pinst->lock);
+ if (err)
+ return notifier_from_errno(err);
+ break;
+
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ if (!pinst_has_cpu(pinst, cpu))
+ break;
+ mutex_lock(&pinst->lock);
+ err = __padata_remove_cpu(pinst, cpu);
+ mutex_unlock(&pinst->lock);
+ if (err)
+ return notifier_from_errno(err);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+#endif
+
+static void __padata_free(struct padata_instance *pinst)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ unregister_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
+ padata_stop(pinst);
+ padata_free_pd(pinst->pd);
+ free_cpumask_var(pinst->cpumask.pcpu);
+ free_cpumask_var(pinst->cpumask.cbcpu);
+ kfree(pinst);
+}
+
+#define kobj2pinst(_kobj) \
+ container_of(_kobj, struct padata_instance, kobj)
+#define attr2pentry(_attr) \
+ container_of(_attr, struct padata_sysfs_entry, attr)
+
+static void padata_sysfs_release(struct kobject *kobj)
+{
+ struct padata_instance *pinst = kobj2pinst(kobj);
+ __padata_free(pinst);
+}
+
+struct padata_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
+ ssize_t (*store)(struct padata_instance *, struct attribute *,
+ const char *, size_t);
+};
+
+static ssize_t show_cpumask(struct padata_instance *pinst,
+ struct attribute *attr, char *buf)
+{
+ struct cpumask *cpumask;
+ ssize_t len;
+
+ mutex_lock(&pinst->lock);
+ if (!strcmp(attr->name, "serial_cpumask"))
+ cpumask = pinst->cpumask.cbcpu;
+ else
+ cpumask = pinst->cpumask.pcpu;
+
+ len = snprintf(buf, PAGE_SIZE, "%*pb\n",
+ nr_cpu_ids, cpumask_bits(cpumask));
+ mutex_unlock(&pinst->lock);
+ return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t store_cpumask(struct padata_instance *pinst,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ cpumask_var_t new_cpumask;
+ ssize_t ret;
+ int mask_type;
+
+ if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
+ nr_cpumask_bits);
+ if (ret < 0)
+ goto out;
+
+ mask_type = !strcmp(attr->name, "serial_cpumask") ?
+ PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
+ ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
+ if (!ret)
+ ret = count;
+
+out:
+ free_cpumask_var(new_cpumask);
+ return ret;
+}
+
+#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
+ static struct padata_sysfs_entry _name##_attr = \
+ __ATTR(_name, 0644, _show_name, _store_name)
+#define PADATA_ATTR_RO(_name, _show_name) \
+ static struct padata_sysfs_entry _name##_attr = \
+ __ATTR(_name, 0400, _show_name, NULL)
+
+PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
+PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
+
+/*
+ * Padata sysfs provides the following objects:
+ * serial_cpumask [RW] - cpumask for serial workers
+ * parallel_cpumask [RW] - cpumask for parallel workers
+ */
+static struct attribute *padata_default_attrs[] = {
+ &serial_cpumask_attr.attr,
+ &parallel_cpumask_attr.attr,
+ NULL,
+};
+
+static ssize_t padata_sysfs_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct padata_instance *pinst;
+ struct padata_sysfs_entry *pentry;
+ ssize_t ret = -EIO;
+
+ pinst = kobj2pinst(kobj);
+ pentry = attr2pentry(attr);
+ if (pentry->show)
+ ret = pentry->show(pinst, attr, buf);
+
+ return ret;
+}
+
+static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct padata_instance *pinst;
+ struct padata_sysfs_entry *pentry;
+ ssize_t ret = -EIO;
+
+ pinst = kobj2pinst(kobj);
+ pentry = attr2pentry(attr);
+ if (pentry->show)
+ ret = pentry->store(pinst, attr, buf, count);
+
+ return ret;
+}
+
+static const struct sysfs_ops padata_sysfs_ops = {
+ .show = padata_sysfs_show,
+ .store = padata_sysfs_store,
+};
+
+static struct kobj_type padata_attr_type = {
+ .sysfs_ops = &padata_sysfs_ops,
+ .default_attrs = padata_default_attrs,
+ .release = padata_sysfs_release,
+};
+
+/**
+ * padata_alloc_possible - Allocate and initialize padata instance.
+ * Use the cpu_possible_mask for serial and
+ * parallel workers.
+ *
+ * @wq: workqueue to use for the allocated padata instance
+ */
+struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
+{
+ return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+}
+EXPORT_SYMBOL(padata_alloc_possible);
+
+/**
+ * padata_alloc - allocate and initialize a padata instance and specify
+ * cpumasks for serial and parallel workers.
+ *
+ * @wq: workqueue to use for the allocated padata instance
+ * @pcpumask: cpumask that will be used for padata parallelization
+ * @cbcpumask: cpumask that will be used for padata serialization
+ */
+struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
+ struct padata_instance *pinst;
+ struct parallel_data *pd = NULL;
+
+ pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
+ if (!pinst)
+ goto err;
+
+ get_online_cpus();
+ if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ goto err_free_inst;
+ if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
+ free_cpumask_var(pinst->cpumask.pcpu);
+ goto err_free_inst;
+ }
+ if (!padata_validate_cpumask(pinst, pcpumask) ||
+ !padata_validate_cpumask(pinst, cbcpumask))
+ goto err_free_masks;
+
+ pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+ if (!pd)
+ goto err_free_masks;
+
+ rcu_assign_pointer(pinst->pd, pd);
+
+ pinst->wq = wq;
+
+ cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+ cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+
+ pinst->flags = 0;
+
+ put_online_cpus();
+
+ BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
+ kobject_init(&pinst->kobj, &padata_attr_type);
+ mutex_init(&pinst->lock);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ pinst->cpu_notifier.notifier_call = padata_cpu_callback;
+ pinst->cpu_notifier.priority = 0;
+ register_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
+ return pinst;
+
+err_free_masks:
+ free_cpumask_var(pinst->cpumask.pcpu);
+ free_cpumask_var(pinst->cpumask.cbcpu);
+err_free_inst:
+ kfree(pinst);
+ put_online_cpus();
+err:
+ return NULL;
+}
+EXPORT_SYMBOL(padata_alloc);
+
+/**
+ * padata_free - free a padata instance
+ *
+ * @padata_inst: padata instance to free
+ */
+void padata_free(struct padata_instance *pinst)
+{
+ kobject_put(&pinst->kobj);
+}
+EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
new file mode 100644
index 000000000..8136ad76e
--- /dev/null
+++ b/kernel/panic.c
@@ -0,0 +1,519 @@
+/*
+ * linux/kernel/panic.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * This function is used through-out the kernel (including mm and fs)
+ * to indicate a major problem.
+ */
+#include <linux/debug_locks.h>
+#include <linux/interrupt.h>
+#include <linux/kmsg_dump.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/ftrace.h>
+#include <linux/reboot.h>
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/nmi.h>
+
+#define PANIC_TIMER_STEP 100
+#define PANIC_BLINK_SPD 18
+
+int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
+static unsigned long tainted_mask;
+static int pause_on_oops;
+static int pause_on_oops_flag;
+static DEFINE_SPINLOCK(pause_on_oops_lock);
+static bool crash_kexec_post_notifiers;
+int panic_on_warn __read_mostly;
+
+int panic_timeout = CONFIG_PANIC_TIMEOUT;
+EXPORT_SYMBOL_GPL(panic_timeout);
+
+ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
+
+EXPORT_SYMBOL(panic_notifier_list);
+
+static long no_blink(int state)
+{
+ return 0;
+}
+
+/* Returns how long it waited in ms */
+long (*panic_blink)(int state);
+EXPORT_SYMBOL(panic_blink);
+
+/*
+ * Stop ourself in panic -- architecture code may override this
+ */
+void __weak panic_smp_self_stop(void)
+{
+ while (1)
+ cpu_relax();
+}
+
+/**
+ * panic - halt the system
+ * @fmt: The text string to print
+ *
+ * Display a message, then perform cleanups.
+ *
+ * This function never returns.
+ */
+void panic(const char *fmt, ...)
+{
+ static DEFINE_SPINLOCK(panic_lock);
+ static char buf[1024];
+ va_list args;
+ long i, i_next = 0;
+ int state = 0;
+
+ /*
+ * Disable local interrupts. This will prevent panic_smp_self_stop
+ * from deadlocking the first cpu that invokes the panic, since
+ * there is nothing to prevent an interrupt handler (that runs
+ * after the panic_lock is acquired) from invoking panic again.
+ */
+ local_irq_disable();
+
+ /*
+ * It's possible to come here directly from a panic-assertion and
+ * not have preempt disabled. Some functions called from here want
+ * preempt to be disabled. No point enabling it later though...
+ *
+ * Only one CPU is allowed to execute the panic code from here. For
+ * multiple parallel invocations of panic, all other CPUs either
+ * stop themself or will wait until they are stopped by the 1st CPU
+ * with smp_send_stop().
+ */
+ if (!spin_trylock(&panic_lock))
+ panic_smp_self_stop();
+
+ console_verbose();
+ bust_spinlocks(1);
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ pr_emerg("Kernel panic - not syncing: %s\n", buf);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ /*
+ * Avoid nested stack-dumping if a panic occurs during oops processing
+ */
+ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ dump_stack();
+#endif
+
+ /*
+ * If we have crashed and we have a crash kernel loaded let it handle
+ * everything else.
+ * If we want to run this after calling panic_notifiers, pass
+ * the "crash_kexec_post_notifiers" option to the kernel.
+ */
+ if (!crash_kexec_post_notifiers)
+ crash_kexec(NULL);
+
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a panic
+ * situation.
+ */
+ smp_send_stop();
+
+ /*
+ * Run any panic handlers, including those that might need to
+ * add information to the kmsg dump output.
+ */
+ atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
+
+ kmsg_dump(KMSG_DUMP_PANIC);
+
+ /*
+ * If you doubt kdump always works fine in any situation,
+ * "crash_kexec_post_notifiers" offers you a chance to run
+ * panic_notifiers and dumping kmsg before kdump.
+ * Note: since some panic_notifiers can make crashed kernel
+ * more unstable, it can increase risks of the kdump failure too.
+ */
+ crash_kexec(NULL);
+
+ bust_spinlocks(0);
+
+ if (!panic_blink)
+ panic_blink = no_blink;
+
+ if (panic_timeout > 0) {
+ /*
+ * Delay timeout seconds before rebooting the machine.
+ * We can't use the "normal" timers since we just panicked.
+ */
+ pr_emerg("Rebooting in %d seconds..", panic_timeout);
+
+ for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
+ touch_nmi_watchdog();
+ if (i >= i_next) {
+ i += panic_blink(state ^= 1);
+ i_next = i + 3600 / PANIC_BLINK_SPD;
+ }
+ mdelay(PANIC_TIMER_STEP);
+ }
+ }
+ if (panic_timeout != 0) {
+ /*
+ * This will not be a clean reboot, with everything
+ * shutting down. But if there is a chance of
+ * rebooting the system it will be rebooted.
+ */
+ emergency_restart();
+ }
+#ifdef __sparc__
+ {
+ extern int stop_a_enabled;
+ /* Make sure the user can actually press Stop-A (L1-A) */
+ stop_a_enabled = 1;
+ pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
+ }
+#endif
+#if defined(CONFIG_S390)
+ {
+ unsigned long caller;
+
+ caller = (unsigned long)__builtin_return_address(0);
+ disabled_wait(caller);
+ }
+#endif
+ pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
+ local_irq_enable();
+ for (i = 0; ; i += PANIC_TIMER_STEP) {
+ touch_softlockup_watchdog();
+ if (i >= i_next) {
+ i += panic_blink(state ^= 1);
+ i_next = i + 3600 / PANIC_BLINK_SPD;
+ }
+ mdelay(PANIC_TIMER_STEP);
+ }
+}
+
+EXPORT_SYMBOL(panic);
+
+
+struct tnt {
+ u8 bit;
+ char true;
+ char false;
+};
+
+static const struct tnt tnts[] = {
+ { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
+ { TAINT_FORCED_MODULE, 'F', ' ' },
+ { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
+ { TAINT_FORCED_RMMOD, 'R', ' ' },
+ { TAINT_MACHINE_CHECK, 'M', ' ' },
+ { TAINT_BAD_PAGE, 'B', ' ' },
+ { TAINT_USER, 'U', ' ' },
+ { TAINT_DIE, 'D', ' ' },
+ { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
+ { TAINT_WARN, 'W', ' ' },
+ { TAINT_CRAP, 'C', ' ' },
+ { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
+ { TAINT_OOT_MODULE, 'O', ' ' },
+ { TAINT_UNSIGNED_MODULE, 'E', ' ' },
+ { TAINT_SOFTLOCKUP, 'L', ' ' },
+ { TAINT_LIVEPATCH, 'K', ' ' },
+};
+
+/**
+ * print_tainted - return a string to represent the kernel taint state.
+ *
+ * 'P' - Proprietary module has been loaded.
+ * 'F' - Module has been forcibly loaded.
+ * 'S' - SMP with CPUs not designed for SMP.
+ * 'R' - User forced a module unload.
+ * 'M' - System experienced a machine check exception.
+ * 'B' - System has hit bad_page.
+ * 'U' - Userspace-defined naughtiness.
+ * 'D' - Kernel has oopsed before
+ * 'A' - ACPI table overridden.
+ * 'W' - Taint on warning.
+ * 'C' - modules from drivers/staging are loaded.
+ * 'I' - Working around severe firmware bug.
+ * 'O' - Out-of-tree module has been loaded.
+ * 'E' - Unsigned module has been loaded.
+ * 'L' - A soft lockup has previously occurred.
+ * 'K' - Kernel has been live patched.
+ *
+ * The string is overwritten by the next call to print_tainted().
+ */
+const char *print_tainted(void)
+{
+ static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
+
+ if (tainted_mask) {
+ char *s;
+ int i;
+
+ s = buf + sprintf(buf, "Tainted: ");
+ for (i = 0; i < ARRAY_SIZE(tnts); i++) {
+ const struct tnt *t = &tnts[i];
+ *s++ = test_bit(t->bit, &tainted_mask) ?
+ t->true : t->false;
+ }
+ *s = 0;
+ } else
+ snprintf(buf, sizeof(buf), "Not tainted");
+
+ return buf;
+}
+
+int test_taint(unsigned flag)
+{
+ return test_bit(flag, &tainted_mask);
+}
+EXPORT_SYMBOL(test_taint);
+
+unsigned long get_taint(void)
+{
+ return tainted_mask;
+}
+
+/**
+ * add_taint: add a taint flag if not already set.
+ * @flag: one of the TAINT_* constants.
+ * @lockdep_ok: whether lock debugging is still OK.
+ *
+ * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for
+ * some notewortht-but-not-corrupting cases, it can be set to true.
+ */
+void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
+{
+ if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
+ pr_warn("Disabling lock debugging due to kernel taint\n");
+
+ set_bit(flag, &tainted_mask);
+}
+EXPORT_SYMBOL(add_taint);
+
+static void spin_msec(int msecs)
+{
+ int i;
+
+ for (i = 0; i < msecs; i++) {
+ touch_nmi_watchdog();
+ mdelay(1);
+ }
+}
+
+/*
+ * It just happens that oops_enter() and oops_exit() are identically
+ * implemented...
+ */
+static void do_oops_enter_exit(void)
+{
+ unsigned long flags;
+ static int spin_counter;
+
+ if (!pause_on_oops)
+ return;
+
+ spin_lock_irqsave(&pause_on_oops_lock, flags);
+ if (pause_on_oops_flag == 0) {
+ /* This CPU may now print the oops message */
+ pause_on_oops_flag = 1;
+ } else {
+ /* We need to stall this CPU */
+ if (!spin_counter) {
+ /* This CPU gets to do the counting */
+ spin_counter = pause_on_oops;
+ do {
+ spin_unlock(&pause_on_oops_lock);
+ spin_msec(MSEC_PER_SEC);
+ spin_lock(&pause_on_oops_lock);
+ } while (--spin_counter);
+ pause_on_oops_flag = 0;
+ } else {
+ /* This CPU waits for a different one */
+ while (spin_counter) {
+ spin_unlock(&pause_on_oops_lock);
+ spin_msec(1);
+ spin_lock(&pause_on_oops_lock);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&pause_on_oops_lock, flags);
+}
+
+/*
+ * Return true if the calling CPU is allowed to print oops-related info.
+ * This is a bit racy..
+ */
+int oops_may_print(void)
+{
+ return pause_on_oops_flag == 0;
+}
+
+/*
+ * Called when the architecture enters its oops handler, before it prints
+ * anything. If this is the first CPU to oops, and it's oopsing the first
+ * time then let it proceed.
+ *
+ * This is all enabled by the pause_on_oops kernel boot option. We do all
+ * this to ensure that oopses don't scroll off the screen. It has the
+ * side-effect of preventing later-oopsing CPUs from mucking up the display,
+ * too.
+ *
+ * It turns out that the CPU which is allowed to print ends up pausing for
+ * the right duration, whereas all the other CPUs pause for twice as long:
+ * once in oops_enter(), once in oops_exit().
+ */
+void oops_enter(void)
+{
+ tracing_off();
+ /* can't trust the integrity of the kernel anymore: */
+ debug_locks_off();
+ do_oops_enter_exit();
+}
+
+/*
+ * 64-bit random ID for oopses:
+ */
+static u64 oops_id;
+
+static int init_oops_id(void)
+{
+ if (!oops_id)
+ get_random_bytes(&oops_id, sizeof(oops_id));
+ else
+ oops_id++;
+
+ return 0;
+}
+late_initcall(init_oops_id);
+
+void print_oops_end_marker(void)
+{
+ init_oops_id();
+ pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
+}
+
+/*
+ * Called when the architecture exits its oops handler, after printing
+ * everything.
+ */
+void oops_exit(void)
+{
+ do_oops_enter_exit();
+ print_oops_end_marker();
+ kmsg_dump(KMSG_DUMP_OOPS);
+}
+
+#ifdef WANT_WARN_ON_SLOWPATH
+struct slowpath_args {
+ const char *fmt;
+ va_list args;
+};
+
+static void warn_slowpath_common(const char *file, int line, void *caller,
+ unsigned taint, struct slowpath_args *args)
+{
+ disable_trace_on_warning();
+
+ pr_warn("------------[ cut here ]------------\n");
+ pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
+ raw_smp_processor_id(), current->pid, file, line, caller);
+
+ if (args)
+ vprintk(args->fmt, args->args);
+
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ panic("panic_on_warn set ...\n");
+ }
+
+ print_modules();
+ dump_stack();
+ print_oops_end_marker();
+ /* Just a warning, don't kill lockdep. */
+ add_taint(taint, LOCKDEP_STILL_OK);
+}
+
+void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+{
+ struct slowpath_args args;
+
+ args.fmt = fmt;
+ va_start(args.args, fmt);
+ warn_slowpath_common(file, line, __builtin_return_address(0),
+ TAINT_WARN, &args);
+ va_end(args.args);
+}
+EXPORT_SYMBOL(warn_slowpath_fmt);
+
+void warn_slowpath_fmt_taint(const char *file, int line,
+ unsigned taint, const char *fmt, ...)
+{
+ struct slowpath_args args;
+
+ args.fmt = fmt;
+ va_start(args.args, fmt);
+ warn_slowpath_common(file, line, __builtin_return_address(0),
+ taint, &args);
+ va_end(args.args);
+}
+EXPORT_SYMBOL(warn_slowpath_fmt_taint);
+
+void warn_slowpath_null(const char *file, int line)
+{
+ warn_slowpath_common(file, line, __builtin_return_address(0),
+ TAINT_WARN, NULL);
+}
+EXPORT_SYMBOL(warn_slowpath_null);
+#endif
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+
+/*
+ * Called when gcc's -fstack-protector feature is used, and
+ * gcc detects corruption of the on-stack canary value
+ */
+__visible void __stack_chk_fail(void)
+{
+ panic("stack-protector: Kernel stack is corrupted in: %p\n",
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__stack_chk_fail);
+
+#endif
+
+core_param(panic, panic_timeout, int, 0644);
+core_param(pause_on_oops, pause_on_oops, int, 0644);
+core_param(panic_on_warn, panic_on_warn, int, 0644);
+
+static int __init setup_crash_kexec_post_notifiers(char *s)
+{
+ crash_kexec_post_notifiers = true;
+ return 0;
+}
+early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
diff --git a/kernel/params.c b/kernel/params.c
new file mode 100644
index 000000000..a22d6a759
--- /dev/null
+++ b/kernel/params.c
@@ -0,0 +1,954 @@
+/* Helpers for initial module or kernel cmdline parsing
+ Copyright (C) 2001 Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+
+/* Protects all parameters, and incidentally kmalloced_param list. */
+static DEFINE_MUTEX(param_lock);
+
+/* This just allows us to keep track of which parameters are kmalloced. */
+struct kmalloced_param {
+ struct list_head list;
+ char val[];
+};
+static LIST_HEAD(kmalloced_params);
+
+static void *kmalloc_parameter(unsigned int size)
+{
+ struct kmalloced_param *p;
+
+ p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ list_add(&p->list, &kmalloced_params);
+ return p->val;
+}
+
+/* Does nothing if parameter wasn't kmalloced above. */
+static void maybe_kfree_parameter(void *param)
+{
+ struct kmalloced_param *p;
+
+ list_for_each_entry(p, &kmalloced_params, list) {
+ if (p->val == param) {
+ list_del(&p->list);
+ kfree(p);
+ break;
+ }
+ }
+}
+
+static char dash2underscore(char c)
+{
+ if (c == '-')
+ return '_';
+ return c;
+}
+
+bool parameqn(const char *a, const char *b, size_t n)
+{
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ if (dash2underscore(a[i]) != dash2underscore(b[i]))
+ return false;
+ }
+ return true;
+}
+
+bool parameq(const char *a, const char *b)
+{
+ return parameqn(a, b, strlen(a)+1);
+}
+
+static void param_check_unsafe(const struct kernel_param *kp)
+{
+ if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
+ pr_warn("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
+}
+
+static int parse_one(char *param,
+ char *val,
+ const char *doing,
+ const struct kernel_param *params,
+ unsigned num_params,
+ s16 min_level,
+ s16 max_level,
+ int (*handle_unknown)(char *param, char *val,
+ const char *doing))
+{
+ unsigned int i;
+ int err;
+
+ /* Find parameter */
+ for (i = 0; i < num_params; i++) {
+ if (parameq(param, params[i].name)) {
+ if (params[i].level < min_level
+ || params[i].level > max_level)
+ return 0;
+ /* No one handled NULL, so do it here. */
+ if (!val &&
+ !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
+ return -EINVAL;
+ pr_debug("handling %s with %p\n", param,
+ params[i].ops->set);
+ mutex_lock(&param_lock);
+ param_check_unsafe(&params[i]);
+ err = params[i].ops->set(val, &params[i]);
+ mutex_unlock(&param_lock);
+ return err;
+ }
+ }
+
+ if (handle_unknown) {
+ pr_debug("doing %s: %s='%s'\n", doing, param, val);
+ return handle_unknown(param, val, doing);
+ }
+
+ pr_debug("Unknown argument '%s'\n", param);
+ return -ENOENT;
+}
+
+/* You can use " around spaces, but can't escape ". */
+/* Hyphens and underscores equivalent in parameter names. */
+static char *next_arg(char *args, char **param, char **val)
+{
+ unsigned int i, equals = 0;
+ int in_quote = 0, quoted = 0;
+ char *next;
+
+ if (*args == '"') {
+ args++;
+ in_quote = 1;
+ quoted = 1;
+ }
+
+ for (i = 0; args[i]; i++) {
+ if (isspace(args[i]) && !in_quote)
+ break;
+ if (equals == 0) {
+ if (args[i] == '=')
+ equals = i;
+ }
+ if (args[i] == '"')
+ in_quote = !in_quote;
+ }
+
+ *param = args;
+ if (!equals)
+ *val = NULL;
+ else {
+ args[equals] = '\0';
+ *val = args + equals + 1;
+
+ /* Don't include quotes in value. */
+ if (**val == '"') {
+ (*val)++;
+ if (args[i-1] == '"')
+ args[i-1] = '\0';
+ }
+ }
+ if (quoted && args[i-1] == '"')
+ args[i-1] = '\0';
+
+ if (args[i]) {
+ args[i] = '\0';
+ next = args + i + 1;
+ } else
+ next = args + i;
+
+ /* Chew up trailing spaces. */
+ return skip_spaces(next);
+}
+
+/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
+char *parse_args(const char *doing,
+ char *args,
+ const struct kernel_param *params,
+ unsigned num,
+ s16 min_level,
+ s16 max_level,
+ int (*unknown)(char *param, char *val, const char *doing))
+{
+ char *param, *val;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ if (*args)
+ pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args);
+
+ while (*args) {
+ int ret;
+ int irq_was_disabled;
+
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0)
+ return args;
+ irq_was_disabled = irqs_disabled();
+ ret = parse_one(param, val, doing, params, num,
+ min_level, max_level, unknown);
+ if (irq_was_disabled && !irqs_disabled())
+ pr_warn("%s: option '%s' enabled irq's!\n",
+ doing, param);
+
+ switch (ret) {
+ case -ENOENT:
+ pr_err("%s: Unknown parameter `%s'\n", doing, param);
+ return ERR_PTR(ret);
+ case -ENOSPC:
+ pr_err("%s: `%s' too large for parameter `%s'\n",
+ doing, val ?: "", param);
+ return ERR_PTR(ret);
+ case 0:
+ break;
+ default:
+ pr_err("%s: `%s' invalid for parameter `%s'\n",
+ doing, val ?: "", param);
+ return ERR_PTR(ret);
+ }
+ }
+
+ /* All parsed OK. */
+ return NULL;
+}
+
+/* Lazy bastard, eh? */
+#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \
+ int param_set_##name(const char *val, const struct kernel_param *kp) \
+ { \
+ return strtolfn(val, 0, (type *)kp->arg); \
+ } \
+ int param_get_##name(char *buffer, const struct kernel_param *kp) \
+ { \
+ return scnprintf(buffer, PAGE_SIZE, format, \
+ *((type *)kp->arg)); \
+ } \
+ struct kernel_param_ops param_ops_##name = { \
+ .set = param_set_##name, \
+ .get = param_get_##name, \
+ }; \
+ EXPORT_SYMBOL(param_set_##name); \
+ EXPORT_SYMBOL(param_get_##name); \
+ EXPORT_SYMBOL(param_ops_##name)
+
+
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+
+int param_set_charp(const char *val, const struct kernel_param *kp)
+{
+ if (strlen(val) > 1024) {
+ pr_err("%s: string parameter too long\n", kp->name);
+ return -ENOSPC;
+ }
+
+ maybe_kfree_parameter(*(char **)kp->arg);
+
+ /* This is a hack. We can't kmalloc in early boot, and we
+ * don't need to; this mangled commandline is preserved. */
+ if (slab_is_available()) {
+ *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
+ if (!*(char **)kp->arg)
+ return -ENOMEM;
+ strcpy(*(char **)kp->arg, val);
+ } else
+ *(const char **)kp->arg = val;
+
+ return 0;
+}
+EXPORT_SYMBOL(param_set_charp);
+
+int param_get_charp(char *buffer, const struct kernel_param *kp)
+{
+ return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+}
+EXPORT_SYMBOL(param_get_charp);
+
+static void param_free_charp(void *arg)
+{
+ maybe_kfree_parameter(*((char **)arg));
+}
+
+struct kernel_param_ops param_ops_charp = {
+ .set = param_set_charp,
+ .get = param_get_charp,
+ .free = param_free_charp,
+};
+EXPORT_SYMBOL(param_ops_charp);
+
+/* Actually could be a bool or an int, for historical reasons. */
+int param_set_bool(const char *val, const struct kernel_param *kp)
+{
+ /* No equals means "set"... */
+ if (!val) val = "1";
+
+ /* One of =[yYnN01] */
+ return strtobool(val, kp->arg);
+}
+EXPORT_SYMBOL(param_set_bool);
+
+int param_get_bool(char *buffer, const struct kernel_param *kp)
+{
+ /* Y and N chosen as being relatively non-coder friendly */
+ return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+}
+EXPORT_SYMBOL(param_get_bool);
+
+struct kernel_param_ops param_ops_bool = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = param_set_bool,
+ .get = param_get_bool,
+};
+EXPORT_SYMBOL(param_ops_bool);
+
+/* This one must be bool. */
+int param_set_invbool(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ bool boolval;
+ struct kernel_param dummy;
+
+ dummy.arg = &boolval;
+ ret = param_set_bool(val, &dummy);
+ if (ret == 0)
+ *(bool *)kp->arg = !boolval;
+ return ret;
+}
+EXPORT_SYMBOL(param_set_invbool);
+
+int param_get_invbool(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+}
+EXPORT_SYMBOL(param_get_invbool);
+
+struct kernel_param_ops param_ops_invbool = {
+ .set = param_set_invbool,
+ .get = param_get_invbool,
+};
+EXPORT_SYMBOL(param_ops_invbool);
+
+int param_set_bint(const char *val, const struct kernel_param *kp)
+{
+ struct kernel_param boolkp;
+ bool v;
+ int ret;
+
+ /* Match bool exactly, by re-using it. */
+ boolkp = *kp;
+ boolkp.arg = &v;
+
+ ret = param_set_bool(val, &boolkp);
+ if (ret == 0)
+ *(int *)kp->arg = v;
+ return ret;
+}
+EXPORT_SYMBOL(param_set_bint);
+
+struct kernel_param_ops param_ops_bint = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = param_set_bint,
+ .get = param_get_int,
+};
+EXPORT_SYMBOL(param_ops_bint);
+
+/* We break the rule and mangle the string. */
+static int param_array(const char *name,
+ const char *val,
+ unsigned int min, unsigned int max,
+ void *elem, int elemsize,
+ int (*set)(const char *, const struct kernel_param *kp),
+ s16 level,
+ unsigned int *num)
+{
+ int ret;
+ struct kernel_param kp;
+ char save;
+
+ /* Get the name right for errors. */
+ kp.name = name;
+ kp.arg = elem;
+ kp.level = level;
+
+ *num = 0;
+ /* We expect a comma-separated list of values. */
+ do {
+ int len;
+
+ if (*num == max) {
+ pr_err("%s: can only take %i arguments\n", name, max);
+ return -EINVAL;
+ }
+ len = strcspn(val, ",");
+
+ /* nul-terminate and parse */
+ save = val[len];
+ ((char *)val)[len] = '\0';
+ BUG_ON(!mutex_is_locked(&param_lock));
+ ret = set(val, &kp);
+
+ if (ret != 0)
+ return ret;
+ kp.arg += elemsize;
+ val += len+1;
+ (*num)++;
+ } while (save == ',');
+
+ if (*num < min) {
+ pr_err("%s: needs at least %i arguments\n", name, min);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int param_array_set(const char *val, const struct kernel_param *kp)
+{
+ const struct kparam_array *arr = kp->arr;
+ unsigned int temp_num;
+
+ return param_array(kp->name, val, 1, arr->max, arr->elem,
+ arr->elemsize, arr->ops->set, kp->level,
+ arr->num ?: &temp_num);
+}
+
+static int param_array_get(char *buffer, const struct kernel_param *kp)
+{
+ int i, off, ret;
+ const struct kparam_array *arr = kp->arr;
+ struct kernel_param p;
+
+ p = *kp;
+ for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ if (i)
+ buffer[off++] = ',';
+ p.arg = arr->elem + arr->elemsize * i;
+ BUG_ON(!mutex_is_locked(&param_lock));
+ ret = arr->ops->get(buffer + off, &p);
+ if (ret < 0)
+ return ret;
+ off += ret;
+ }
+ buffer[off] = '\0';
+ return off;
+}
+
+static void param_array_free(void *arg)
+{
+ unsigned int i;
+ const struct kparam_array *arr = arg;
+
+ if (arr->ops->free)
+ for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
+ arr->ops->free(arr->elem + arr->elemsize * i);
+}
+
+struct kernel_param_ops param_array_ops = {
+ .set = param_array_set,
+ .get = param_array_get,
+ .free = param_array_free,
+};
+EXPORT_SYMBOL(param_array_ops);
+
+int param_set_copystring(const char *val, const struct kernel_param *kp)
+{
+ const struct kparam_string *kps = kp->str;
+
+ if (strlen(val)+1 > kps->maxlen) {
+ pr_err("%s: string doesn't fit in %u chars.\n",
+ kp->name, kps->maxlen-1);
+ return -ENOSPC;
+ }
+ strcpy(kps->string, val);
+ return 0;
+}
+EXPORT_SYMBOL(param_set_copystring);
+
+int param_get_string(char *buffer, const struct kernel_param *kp)
+{
+ const struct kparam_string *kps = kp->str;
+ return strlcpy(buffer, kps->string, kps->maxlen);
+}
+EXPORT_SYMBOL(param_get_string);
+
+struct kernel_param_ops param_ops_string = {
+ .set = param_set_copystring,
+ .get = param_get_string,
+};
+EXPORT_SYMBOL(param_ops_string);
+
+/* sysfs output in /sys/modules/XYZ/parameters/ */
+#define to_module_attr(n) container_of(n, struct module_attribute, attr)
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
+
+struct param_attribute
+{
+ struct module_attribute mattr;
+ const struct kernel_param *param;
+};
+
+struct module_param_attrs
+{
+ unsigned int num;
+ struct attribute_group grp;
+ struct param_attribute attrs[0];
+};
+
+#ifdef CONFIG_SYSFS
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
+
+static ssize_t param_attr_show(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buf)
+{
+ int count;
+ struct param_attribute *attribute = to_param_attr(mattr);
+
+ if (!attribute->param->ops->get)
+ return -EPERM;
+
+ mutex_lock(&param_lock);
+ count = attribute->param->ops->get(buf, attribute->param);
+ mutex_unlock(&param_lock);
+ if (count > 0) {
+ strcat(buf, "\n");
+ ++count;
+ }
+ return count;
+}
+
+/* sysfs always hands a nul-terminated string in buf. We rely on that. */
+static ssize_t param_attr_store(struct module_attribute *mattr,
+ struct module_kobject *km,
+ const char *buf, size_t len)
+{
+ int err;
+ struct param_attribute *attribute = to_param_attr(mattr);
+
+ if (!attribute->param->ops->set)
+ return -EPERM;
+
+ mutex_lock(&param_lock);
+ param_check_unsafe(attribute->param);
+ err = attribute->param->ops->set(buf, attribute->param);
+ mutex_unlock(&param_lock);
+ if (!err)
+ return len;
+ return err;
+}
+#endif
+
+#ifdef CONFIG_MODULES
+#define __modinit
+#else
+#define __modinit __init
+#endif
+
+#ifdef CONFIG_SYSFS
+void __kernel_param_lock(void)
+{
+ mutex_lock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_lock);
+
+void __kernel_param_unlock(void)
+{
+ mutex_unlock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_unlock);
+
+/*
+ * add_sysfs_param - add a parameter to sysfs
+ * @mk: struct module_kobject
+ * @kparam: the actual parameter definition to add to sysfs
+ * @name: name of parameter
+ *
+ * Create a kobject if for a (per-module) parameter if mp NULL, and
+ * create file in sysfs. Returns an error on out of memory. Always cleans up
+ * if there's an error.
+ */
+static __modinit int add_sysfs_param(struct module_kobject *mk,
+ const struct kernel_param *kp,
+ const char *name)
+{
+ struct module_param_attrs *new_mp;
+ struct attribute **new_attrs;
+ unsigned int i;
+
+ /* We don't bother calling this with invisible parameters. */
+ BUG_ON(!kp->perm);
+
+ if (!mk->mp) {
+ /* First allocation. */
+ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL);
+ if (!mk->mp)
+ return -ENOMEM;
+ mk->mp->grp.name = "parameters";
+ /* NULL-terminated attribute array. */
+ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]),
+ GFP_KERNEL);
+ /* Caller will cleanup via free_module_param_attrs */
+ if (!mk->mp->grp.attrs)
+ return -ENOMEM;
+ }
+
+ /* Enlarge allocations. */
+ new_mp = krealloc(mk->mp,
+ sizeof(*mk->mp) +
+ sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ GFP_KERNEL);
+ if (!new_mp)
+ return -ENOMEM;
+ mk->mp = new_mp;
+
+ /* Extra pointer for NULL terminator */
+ new_attrs = krealloc(mk->mp->grp.attrs,
+ sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
+ GFP_KERNEL);
+ if (!new_attrs)
+ return -ENOMEM;
+ mk->mp->grp.attrs = new_attrs;
+
+ /* Tack new one on the end. */
+ memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
+ mk->mp->attrs[mk->mp->num].param = kp;
+ mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ /* Do not allow runtime DAC changes to make param writable. */
+ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
+ mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ else
+ mk->mp->attrs[mk->mp->num].mattr.store = NULL;
+ mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
+ mk->mp->num++;
+
+ /* Fix up all the pointers, since krealloc can move us */
+ for (i = 0; i < mk->mp->num; i++)
+ mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr;
+ mk->mp->grp.attrs[mk->mp->num] = NULL;
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+static void free_module_param_attrs(struct module_kobject *mk)
+{
+ if (mk->mp)
+ kfree(mk->mp->grp.attrs);
+ kfree(mk->mp);
+ mk->mp = NULL;
+}
+
+/*
+ * module_param_sysfs_setup - setup sysfs support for one module
+ * @mod: module
+ * @kparam: module parameters (array)
+ * @num_params: number of module parameters
+ *
+ * Adds sysfs entries for module parameters under
+ * /sys/module/[mod->name]/parameters/
+ */
+int module_param_sysfs_setup(struct module *mod,
+ const struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int i, err;
+ bool params = false;
+
+ for (i = 0; i < num_params; i++) {
+ if (kparam[i].perm == 0)
+ continue;
+ err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
+ if (err) {
+ free_module_param_attrs(&mod->mkobj);
+ return err;
+ }
+ params = true;
+ }
+
+ if (!params)
+ return 0;
+
+ /* Create the param group. */
+ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+ if (err)
+ free_module_param_attrs(&mod->mkobj);
+ return err;
+}
+
+/*
+ * module_param_sysfs_remove - remove sysfs support for one module
+ * @mod: module
+ *
+ * Remove sysfs entries for module parameters and the corresponding
+ * kobject.
+ */
+void module_param_sysfs_remove(struct module *mod)
+{
+ if (mod->mkobj.mp) {
+ sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+ /* We are positive that no one is using any param
+ * attrs at this point. Deallocate immediately. */
+ free_module_param_attrs(&mod->mkobj);
+ }
+}
+#endif
+
+void destroy_params(const struct kernel_param *params, unsigned num)
+{
+ unsigned int i;
+
+ for (i = 0; i < num; i++)
+ if (params[i].ops->free)
+ params[i].ops->free(params[i].arg);
+}
+
+static struct module_kobject * __init locate_module_kobject(const char *name)
+{
+ struct module_kobject *mk;
+ struct kobject *kobj;
+ int err;
+
+ kobj = kset_find_obj(module_kset, name);
+ if (kobj) {
+ mk = to_module_kobject(kobj);
+ } else {
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ BUG_ON(!mk);
+
+ mk->mod = THIS_MODULE;
+ mk->kobj.kset = module_kset;
+ err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
+ "%s", name);
+#ifdef CONFIG_MODULES
+ if (!err)
+ err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+#endif
+ if (err) {
+ kobject_put(&mk->kobj);
+ pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
+ name, err);
+ return NULL;
+ }
+
+ /* So that we hold reference in both cases. */
+ kobject_get(&mk->kobj);
+ }
+
+ return mk;
+}
+
+static void __init kernel_add_sysfs_param(const char *name,
+ const struct kernel_param *kparam,
+ unsigned int name_skip)
+{
+ struct module_kobject *mk;
+ int err;
+
+ mk = locate_module_kobject(name);
+ if (!mk)
+ return;
+
+ /* We need to remove old parameters before adding more. */
+ if (mk->mp)
+ sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+
+ /* These should not fail at boot. */
+ err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
+ BUG_ON(err);
+ err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
+ BUG_ON(err);
+ kobject_uevent(&mk->kobj, KOBJ_ADD);
+ kobject_put(&mk->kobj);
+}
+
+/*
+ * param_sysfs_builtin - add sysfs parameters for built-in modules
+ *
+ * Add module_parameters to sysfs for "modules" built into the kernel.
+ *
+ * The "module" name (KBUILD_MODNAME) is stored before a dot, the
+ * "parameter" name is stored behind a dot in kernel_param->name. So,
+ * extract the "module" name for all built-in kernel_param-eters,
+ * and for all who have the same, call kernel_add_sysfs_param.
+ */
+static void __init param_sysfs_builtin(void)
+{
+ const struct kernel_param *kp;
+ unsigned int name_len;
+ char modname[MODULE_NAME_LEN];
+
+ for (kp = __start___param; kp < __stop___param; kp++) {
+ char *dot;
+
+ if (kp->perm == 0)
+ continue;
+
+ dot = strchr(kp->name, '.');
+ if (!dot) {
+ /* This happens for core_param() */
+ strcpy(modname, "kernel");
+ name_len = 0;
+ } else {
+ name_len = dot - kp->name + 1;
+ strlcpy(modname, kp->name, name_len);
+ }
+ kernel_add_sysfs_param(modname, kp, name_len);
+ }
+}
+
+ssize_t __modver_version_show(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buf)
+{
+ struct module_version_attribute *vattr =
+ container_of(mattr, struct module_version_attribute, mattr);
+
+ return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
+}
+
+extern const struct module_version_attribute *__start___modver[];
+extern const struct module_version_attribute *__stop___modver[];
+
+static void __init version_sysfs_builtin(void)
+{
+ const struct module_version_attribute **p;
+ struct module_kobject *mk;
+ int err;
+
+ for (p = __start___modver; p < __stop___modver; p++) {
+ const struct module_version_attribute *vattr = *p;
+
+ mk = locate_module_kobject(vattr->module_name);
+ if (mk) {
+ err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+ kobject_uevent(&mk->kobj, KOBJ_ADD);
+ kobject_put(&mk->kobj);
+ }
+ }
+}
+
+/* module-related sysfs stuff */
+
+static ssize_t module_attr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct module_attribute *attribute;
+ struct module_kobject *mk;
+ int ret;
+
+ attribute = to_module_attr(attr);
+ mk = to_module_kobject(kobj);
+
+ if (!attribute->show)
+ return -EIO;
+
+ ret = attribute->show(attribute, mk, buf);
+
+ return ret;
+}
+
+static ssize_t module_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct module_attribute *attribute;
+ struct module_kobject *mk;
+ int ret;
+
+ attribute = to_module_attr(attr);
+ mk = to_module_kobject(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ ret = attribute->store(attribute, mk, buf, len);
+
+ return ret;
+}
+
+static const struct sysfs_ops module_sysfs_ops = {
+ .show = module_attr_show,
+ .store = module_attr_store,
+};
+
+static int uevent_filter(struct kset *kset, struct kobject *kobj)
+{
+ struct kobj_type *ktype = get_ktype(kobj);
+
+ if (ktype == &module_ktype)
+ return 1;
+ return 0;
+}
+
+static const struct kset_uevent_ops module_uevent_ops = {
+ .filter = uevent_filter,
+};
+
+struct kset *module_kset;
+int module_sysfs_initialized;
+
+static void module_kobj_release(struct kobject *kobj)
+{
+ struct module_kobject *mk = to_module_kobject(kobj);
+ complete(mk->kobj_completion);
+}
+
+struct kobj_type module_ktype = {
+ .release = module_kobj_release,
+ .sysfs_ops = &module_sysfs_ops,
+};
+
+/*
+ * param_sysfs_init - wrapper for built-in params support
+ */
+static int __init param_sysfs_init(void)
+{
+ module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
+ if (!module_kset) {
+ printk(KERN_WARNING "%s (%d): error creating kset\n",
+ __FILE__, __LINE__);
+ return -ENOMEM;
+ }
+ module_sysfs_initialized = 1;
+
+ version_sysfs_builtin();
+ param_sysfs_builtin();
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/pid.c b/kernel/pid.c
new file mode 100644
index 000000000..4fd07d5b7
--- /dev/null
+++ b/kernel/pid.c
@@ -0,0 +1,609 @@
+/*
+ * Generic pidhash and scalable, time-bounded PID allocator
+ *
+ * (C) 2002-2003 Nadia Yvette Chambers, IBM
+ * (C) 2004 Nadia Yvette Chambers, Oracle
+ * (C) 2002-2004 Ingo Molnar, Red Hat
+ *
+ * pid-structures are backing objects for tasks sharing a given ID to chain
+ * against. There is very little to them aside from hashing them and
+ * parking tasks using given ID's on a list.
+ *
+ * The hash is always changed with the tasklist_lock write-acquired,
+ * and the hash is only accessed with the tasklist_lock at least
+ * read-acquired, so there's no additional SMP locking needed here.
+ *
+ * We have a list of bitmap pages, which bitmaps represent the PID space.
+ * Allocating and freeing PIDs is completely lockless. The worst-case
+ * allocation scenario when all but one out of 1 million PIDs possible are
+ * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
+ * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
+ *
+ * Pid namespaces:
+ * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
+ * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
+ * Many thanks to Oleg Nesterov for comments and help
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+#include <linux/pid_namespace.h>
+#include <linux/init_task.h>
+#include <linux/syscalls.h>
+#include <linux/proc_ns.h>
+#include <linux/proc_fs.h>
+
+#define pid_hashfn(nr, ns) \
+ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
+static struct hlist_head *pid_hash;
+static unsigned int pidhash_shift = 4;
+struct pid init_struct_pid = INIT_STRUCT_PID;
+
+int pid_max = PID_MAX_DEFAULT;
+
+#define RESERVED_PIDS 300
+
+int pid_max_min = RESERVED_PIDS + 1;
+int pid_max_max = PID_MAX_LIMIT;
+
+static inline int mk_pid(struct pid_namespace *pid_ns,
+ struct pidmap *map, int off)
+{
+ return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
+}
+
+#define find_next_offset(map, off) \
+ find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
+
+/*
+ * PID-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way a low pid_max
+ * value does not cause lots of bitmaps to be allocated, but
+ * the scheme scales to up to 4 million PIDs, runtime.
+ */
+struct pid_namespace init_pid_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .pidmap = {
+ [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
+ },
+ .last_pid = 0,
+ .nr_hashed = PIDNS_HASH_ADDING,
+ .level = 0,
+ .child_reaper = &init_task,
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_PID_INIT_INO,
+#ifdef CONFIG_PID_NS
+ .ns.ops = &pidns_operations,
+#endif
+};
+EXPORT_SYMBOL_GPL(init_pid_ns);
+
+/*
+ * Note: disable interrupts while the pidmap_lock is held as an
+ * interrupt might come in and do read_lock(&tasklist_lock).
+ *
+ * If we don't disable interrupts there is a nasty deadlock between
+ * detach_pid()->free_pid() and another cpu that does
+ * spin_lock(&pidmap_lock) followed by an interrupt routine that does
+ * read_lock(&tasklist_lock);
+ *
+ * After we clean up the tasklist_lock and know there are no
+ * irq handlers that take it we can leave the interrupts enabled.
+ * For now it is easier to be safe than to prove it can't happen.
+ */
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+
+static void free_pidmap(struct upid *upid)
+{
+ int nr = upid->nr;
+ struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
+ int offset = nr & BITS_PER_PAGE_MASK;
+
+ clear_bit(offset, map->page);
+ atomic_inc(&map->nr_free);
+}
+
+/*
+ * If we started walking pids at 'base', is 'a' seen before 'b'?
+ */
+static int pid_before(int base, int a, int b)
+{
+ /*
+ * This is the same as saying
+ *
+ * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
+ * and that mapping orders 'a' and 'b' with respect to 'base'.
+ */
+ return (unsigned)(a - base) < (unsigned)(b - base);
+}
+
+/*
+ * We might be racing with someone else trying to set pid_ns->last_pid
+ * at the pid allocation time (there's also a sysctl for this, but racing
+ * with this one is OK, see comment in kernel/pid_namespace.c about it).
+ * We want the winner to have the "later" value, because if the
+ * "earlier" value prevails, then a pid may get reused immediately.
+ *
+ * Since pids rollover, it is not sufficient to just pick the bigger
+ * value. We have to consider where we started counting from.
+ *
+ * 'base' is the value of pid_ns->last_pid that we observed when
+ * we started looking for a pid.
+ *
+ * 'pid' is the pid that we eventually found.
+ */
+static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
+{
+ int prev;
+ int last_write = base;
+ do {
+ prev = last_write;
+ last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
+ } while ((prev != last_write) && (pid_before(base, last_write, pid)));
+}
+
+static int alloc_pidmap(struct pid_namespace *pid_ns)
+{
+ int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ struct pidmap *map;
+
+ pid = last + 1;
+ if (pid >= pid_max)
+ pid = RESERVED_PIDS;
+ offset = pid & BITS_PER_PAGE_MASK;
+ map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+ /*
+ * If last_pid points into the middle of the map->page we
+ * want to scan this bitmap block twice, the second time
+ * we start with offset == 0 (or RESERVED_PIDS).
+ */
+ max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
+ for (i = 0; i <= max_scan; ++i) {
+ if (unlikely(!map->page)) {
+ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ /*
+ * Free the page if someone raced with us
+ * installing it:
+ */
+ spin_lock_irq(&pidmap_lock);
+ if (!map->page) {
+ map->page = page;
+ page = NULL;
+ }
+ spin_unlock_irq(&pidmap_lock);
+ kfree(page);
+ if (unlikely(!map->page))
+ return -ENOMEM;
+ }
+ if (likely(atomic_read(&map->nr_free))) {
+ for ( ; ; ) {
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->nr_free);
+ set_last_pid(pid_ns, last, pid);
+ return pid;
+ }
+ offset = find_next_offset(map, offset);
+ if (offset >= BITS_PER_PAGE)
+ break;
+ pid = mk_pid(pid_ns, map, offset);
+ if (pid >= pid_max)
+ break;
+ }
+ }
+ if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &pid_ns->pidmap[0];
+ offset = RESERVED_PIDS;
+ if (unlikely(last == offset))
+ break;
+ }
+ pid = mk_pid(pid_ns, map, offset);
+ }
+ return -EAGAIN;
+}
+
+int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
+{
+ int offset;
+ struct pidmap *map, *end;
+
+ if (last >= PID_MAX_LIMIT)
+ return -1;
+
+ offset = (last + 1) & BITS_PER_PAGE_MASK;
+ map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
+ end = &pid_ns->pidmap[PIDMAP_ENTRIES];
+ for (; map < end; map++, offset = 0) {
+ if (unlikely(!map->page))
+ continue;
+ offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+ if (offset < BITS_PER_PAGE)
+ return mk_pid(pid_ns, map, offset);
+ }
+ return -1;
+}
+
+void put_pid(struct pid *pid)
+{
+ struct pid_namespace *ns;
+
+ if (!pid)
+ return;
+
+ ns = pid->numbers[pid->level].ns;
+ if ((atomic_read(&pid->count) == 1) ||
+ atomic_dec_and_test(&pid->count)) {
+ kmem_cache_free(ns->pid_cachep, pid);
+ put_pid_ns(ns);
+ }
+}
+EXPORT_SYMBOL_GPL(put_pid);
+
+static void delayed_put_pid(struct rcu_head *rhp)
+{
+ struct pid *pid = container_of(rhp, struct pid, rcu);
+ put_pid(pid);
+}
+
+void free_pid(struct pid *pid)
+{
+ /* We can be called with write_lock_irq(&tasklist_lock) held */
+ int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pidmap_lock, flags);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+ hlist_del_rcu(&upid->pid_chain);
+ switch(--ns->nr_hashed) {
+ case 2:
+ case 1:
+ /* When all that is left in the pid namespace
+ * is the reaper wake up the reaper. The reaper
+ * may be sleeping in zap_pid_ns_processes().
+ */
+ wake_up_process(ns->child_reaper);
+ break;
+ case PIDNS_HASH_ADDING:
+ /* Handle a fork failure of the first process */
+ WARN_ON(ns->child_reaper);
+ ns->nr_hashed = 0;
+ /* fall through */
+ case 0:
+ schedule_work(&ns->proc_work);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&pidmap_lock, flags);
+
+ for (i = 0; i <= pid->level; i++)
+ free_pidmap(pid->numbers + i);
+
+ call_rcu(&pid->rcu, delayed_put_pid);
+}
+
+struct pid *alloc_pid(struct pid_namespace *ns)
+{
+ struct pid *pid;
+ enum pid_type type;
+ int i, nr;
+ struct pid_namespace *tmp;
+ struct upid *upid;
+ int retval = -ENOMEM;
+
+ pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
+ if (!pid)
+ return ERR_PTR(retval);
+
+ tmp = ns;
+ pid->level = ns->level;
+ for (i = ns->level; i >= 0; i--) {
+ nr = alloc_pidmap(tmp);
+ if (IS_ERR_VALUE(nr)) {
+ retval = nr;
+ goto out_free;
+ }
+
+ pid->numbers[i].nr = nr;
+ pid->numbers[i].ns = tmp;
+ tmp = tmp->parent;
+ }
+
+ if (unlikely(is_child_reaper(pid))) {
+ if (pid_ns_prepare_proc(ns))
+ goto out_free;
+ }
+
+ get_pid_ns(ns);
+ atomic_set(&pid->count, 1);
+ for (type = 0; type < PIDTYPE_MAX; ++type)
+ INIT_HLIST_HEAD(&pid->tasks[type]);
+
+ upid = pid->numbers + ns->level;
+ spin_lock_irq(&pidmap_lock);
+ if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ goto out_unlock;
+ for ( ; upid >= pid->numbers; --upid) {
+ hlist_add_head_rcu(&upid->pid_chain,
+ &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ upid->ns->nr_hashed++;
+ }
+ spin_unlock_irq(&pidmap_lock);
+
+ return pid;
+
+out_unlock:
+ spin_unlock_irq(&pidmap_lock);
+ put_pid_ns(ns);
+
+out_free:
+ while (++i <= ns->level)
+ free_pidmap(pid->numbers + i);
+
+ kmem_cache_free(ns->pid_cachep, pid);
+ return ERR_PTR(retval);
+}
+
+void disable_pid_allocation(struct pid_namespace *ns)
+{
+ spin_lock_irq(&pidmap_lock);
+ ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ spin_unlock_irq(&pidmap_lock);
+}
+
+struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
+{
+ struct upid *pnr;
+
+ hlist_for_each_entry_rcu(pnr,
+ &pid_hash[pid_hashfn(nr, ns)], pid_chain)
+ if (pnr->nr == nr && pnr->ns == ns)
+ return container_of(pnr, struct pid,
+ numbers[ns->level]);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(find_pid_ns);
+
+struct pid *find_vpid(int nr)
+{
+ return find_pid_ns(nr, task_active_pid_ns(current));
+}
+EXPORT_SYMBOL_GPL(find_vpid);
+
+/*
+ * attach_pid() must be called with the tasklist_lock write-held.
+ */
+void attach_pid(struct task_struct *task, enum pid_type type)
+{
+ struct pid_link *link = &task->pids[type];
+ hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
+}
+
+static void __change_pid(struct task_struct *task, enum pid_type type,
+ struct pid *new)
+{
+ struct pid_link *link;
+ struct pid *pid;
+ int tmp;
+
+ link = &task->pids[type];
+ pid = link->pid;
+
+ hlist_del_rcu(&link->node);
+ link->pid = new;
+
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (!hlist_empty(&pid->tasks[tmp]))
+ return;
+
+ free_pid(pid);
+}
+
+void detach_pid(struct task_struct *task, enum pid_type type)
+{
+ __change_pid(task, type, NULL);
+}
+
+void change_pid(struct task_struct *task, enum pid_type type,
+ struct pid *pid)
+{
+ __change_pid(task, type, pid);
+ attach_pid(task, type);
+}
+
+/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
+void transfer_pid(struct task_struct *old, struct task_struct *new,
+ enum pid_type type)
+{
+ new->pids[type].pid = old->pids[type].pid;
+ hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+}
+
+struct task_struct *pid_task(struct pid *pid, enum pid_type type)
+{
+ struct task_struct *result = NULL;
+ if (pid) {
+ struct hlist_node *first;
+ first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
+ lockdep_tasklist_lock_is_held());
+ if (first)
+ result = hlist_entry(first, struct task_struct, pids[(type)].node);
+ }
+ return result;
+}
+EXPORT_SYMBOL(pid_task);
+
+/*
+ * Must be called under rcu_read_lock().
+ */
+struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
+{
+ rcu_lockdep_assert(rcu_read_lock_held(),
+ "find_task_by_pid_ns() needs rcu_read_lock()"
+ " protection");
+ return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
+}
+
+struct task_struct *find_task_by_vpid(pid_t vnr)
+{
+ return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
+}
+
+struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
+{
+ struct pid *pid;
+ rcu_read_lock();
+ if (type != PIDTYPE_PID)
+ task = task->group_leader;
+ pid = get_pid(task->pids[type].pid);
+ rcu_read_unlock();
+ return pid;
+}
+EXPORT_SYMBOL_GPL(get_task_pid);
+
+struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
+{
+ struct task_struct *result;
+ rcu_read_lock();
+ result = pid_task(pid, type);
+ if (result)
+ get_task_struct(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL_GPL(get_pid_task);
+
+struct pid *find_get_pid(pid_t nr)
+{
+ struct pid *pid;
+
+ rcu_read_lock();
+ pid = get_pid(find_vpid(nr));
+ rcu_read_unlock();
+
+ return pid;
+}
+EXPORT_SYMBOL_GPL(find_get_pid);
+
+pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
+{
+ struct upid *upid;
+ pid_t nr = 0;
+
+ if (pid && ns->level <= pid->level) {
+ upid = &pid->numbers[ns->level];
+ if (upid->ns == ns)
+ nr = upid->nr;
+ }
+ return nr;
+}
+EXPORT_SYMBOL_GPL(pid_nr_ns);
+
+pid_t pid_vnr(struct pid *pid)
+{
+ return pid_nr_ns(pid, task_active_pid_ns(current));
+}
+EXPORT_SYMBOL_GPL(pid_vnr);
+
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+ struct pid_namespace *ns)
+{
+ pid_t nr = 0;
+
+ rcu_read_lock();
+ if (!ns)
+ ns = task_active_pid_ns(current);
+ if (likely(pid_alive(task))) {
+ if (type != PIDTYPE_PID)
+ task = task->group_leader;
+ nr = pid_nr_ns(task->pids[type].pid, ns);
+ }
+ rcu_read_unlock();
+
+ return nr;
+}
+EXPORT_SYMBOL(__task_pid_nr_ns);
+
+pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+ return pid_nr_ns(task_tgid(tsk), ns);
+}
+EXPORT_SYMBOL(task_tgid_nr_ns);
+
+struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
+{
+ return ns_of_pid(task_pid(tsk));
+}
+EXPORT_SYMBOL_GPL(task_active_pid_ns);
+
+/*
+ * Used by proc to find the first pid that is greater than or equal to nr.
+ *
+ * If there is a pid at nr this function is exactly the same as find_pid_ns.
+ */
+struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
+{
+ struct pid *pid;
+
+ do {
+ pid = find_pid_ns(nr, ns);
+ if (pid)
+ break;
+ nr = next_pidmap(ns, nr);
+ } while (nr > 0);
+
+ return pid;
+}
+
+/*
+ * The pid hash table is scaled according to the amount of memory in the
+ * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
+ * more.
+ */
+void __init pidhash_init(void)
+{
+ unsigned int i, pidhash_size;
+
+ pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
+ HASH_EARLY | HASH_SMALL,
+ &pidhash_shift, NULL,
+ 0, 4096);
+ pidhash_size = 1U << pidhash_shift;
+
+ for (i = 0; i < pidhash_size; i++)
+ INIT_HLIST_HEAD(&pid_hash[i]);
+}
+
+void __init pidmap_init(void)
+{
+ /* Veryify no one has done anything silly */
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+
+ /* bump default and minimum pid_max based on number of cpus */
+ pid_max = min(pid_max_max, max_t(int, pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ pid_max_min = max_t(int, pid_max_min,
+ PIDS_PER_CPU_MIN * num_possible_cpus());
+ pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+
+ init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ /* Reserve PID 0. We never call free_pidmap(0) */
+ set_bit(0, init_pid_ns.pidmap[0].page);
+ atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+
+ init_pid_ns.pid_cachep = KMEM_CACHE(pid,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
new file mode 100644
index 000000000..a65ba137f
--- /dev/null
+++ b/kernel/pid_namespace.c
@@ -0,0 +1,409 @@
+/*
+ * Pid namespaces
+ *
+ * Authors:
+ * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
+ * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
+ * Many thanks to Oleg Nesterov for comments and help
+ *
+ */
+
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/syscalls.h>
+#include <linux/err.h>
+#include <linux/acct.h>
+#include <linux/slab.h>
+#include <linux/proc_ns.h>
+#include <linux/reboot.h>
+#include <linux/export.h>
+
+struct pid_cache {
+ int nr_ids;
+ char name[16];
+ struct kmem_cache *cachep;
+ struct list_head list;
+};
+
+static LIST_HEAD(pid_caches_lh);
+static DEFINE_MUTEX(pid_caches_mutex);
+static struct kmem_cache *pid_ns_cachep;
+
+/*
+ * creates the kmem cache to allocate pids from.
+ * @nr_ids: the number of numerical ids this pid will have to carry
+ */
+
+static struct kmem_cache *create_pid_cachep(int nr_ids)
+{
+ struct pid_cache *pcache;
+ struct kmem_cache *cachep;
+
+ mutex_lock(&pid_caches_mutex);
+ list_for_each_entry(pcache, &pid_caches_lh, list)
+ if (pcache->nr_ids == nr_ids)
+ goto out;
+
+ pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
+ if (pcache == NULL)
+ goto err_alloc;
+
+ snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
+ cachep = kmem_cache_create(pcache->name,
+ sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (cachep == NULL)
+ goto err_cachep;
+
+ pcache->nr_ids = nr_ids;
+ pcache->cachep = cachep;
+ list_add(&pcache->list, &pid_caches_lh);
+out:
+ mutex_unlock(&pid_caches_mutex);
+ return pcache->cachep;
+
+err_cachep:
+ kfree(pcache);
+err_alloc:
+ mutex_unlock(&pid_caches_mutex);
+ return NULL;
+}
+
+static void proc_cleanup_work(struct work_struct *work)
+{
+ struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
+ pid_ns_release_proc(ns);
+}
+
+/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
+#define MAX_PID_NS_LEVEL 32
+
+static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
+ struct pid_namespace *parent_pid_ns)
+{
+ struct pid_namespace *ns;
+ unsigned int level = parent_pid_ns->level + 1;
+ int i;
+ int err;
+
+ if (level > MAX_PID_NS_LEVEL) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = -ENOMEM;
+ ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
+ if (ns == NULL)
+ goto out;
+
+ ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ns->pidmap[0].page)
+ goto out_free;
+
+ ns->pid_cachep = create_pid_cachep(level + 1);
+ if (ns->pid_cachep == NULL)
+ goto out_free_map;
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto out_free_map;
+ ns->ns.ops = &pidns_operations;
+
+ kref_init(&ns->kref);
+ ns->level = level;
+ ns->parent = get_pid_ns(parent_pid_ns);
+ ns->user_ns = get_user_ns(user_ns);
+ ns->nr_hashed = PIDNS_HASH_ADDING;
+ INIT_WORK(&ns->proc_work, proc_cleanup_work);
+
+ set_bit(0, ns->pidmap[0].page);
+ atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
+
+ for (i = 1; i < PIDMAP_ENTRIES; i++)
+ atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+
+ return ns;
+
+out_free_map:
+ kfree(ns->pidmap[0].page);
+out_free:
+ kmem_cache_free(pid_ns_cachep, ns);
+out:
+ return ERR_PTR(err);
+}
+
+static void delayed_free_pidns(struct rcu_head *p)
+{
+ kmem_cache_free(pid_ns_cachep,
+ container_of(p, struct pid_namespace, rcu));
+}
+
+static void destroy_pid_namespace(struct pid_namespace *ns)
+{
+ int i;
+
+ ns_free_inum(&ns->ns);
+ for (i = 0; i < PIDMAP_ENTRIES; i++)
+ kfree(ns->pidmap[i].page);
+ put_user_ns(ns->user_ns);
+ call_rcu(&ns->rcu, delayed_free_pidns);
+}
+
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct pid_namespace *old_ns)
+{
+ if (!(flags & CLONE_NEWPID))
+ return get_pid_ns(old_ns);
+ if (task_active_pid_ns(current) != old_ns)
+ return ERR_PTR(-EINVAL);
+ return create_pid_namespace(user_ns, old_ns);
+}
+
+static void free_pid_ns(struct kref *kref)
+{
+ struct pid_namespace *ns;
+
+ ns = container_of(kref, struct pid_namespace, kref);
+ destroy_pid_namespace(ns);
+}
+
+void put_pid_ns(struct pid_namespace *ns)
+{
+ struct pid_namespace *parent;
+
+ while (ns != &init_pid_ns) {
+ parent = ns->parent;
+ if (!kref_put(&ns->kref, free_pid_ns))
+ break;
+ ns = parent;
+ }
+}
+EXPORT_SYMBOL_GPL(put_pid_ns);
+
+void zap_pid_ns_processes(struct pid_namespace *pid_ns)
+{
+ int nr;
+ int rc;
+ struct task_struct *task, *me = current;
+ int init_pids = thread_group_leader(me) ? 1 : 2;
+
+ /* Don't allow any more processes into the pid namespace */
+ disable_pid_allocation(pid_ns);
+
+ /*
+ * Ignore SIGCHLD causing any terminated children to autoreap.
+ * This speeds up the namespace shutdown, plus see the comment
+ * below.
+ */
+ spin_lock_irq(&me->sighand->siglock);
+ me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
+ spin_unlock_irq(&me->sighand->siglock);
+
+ /*
+ * The last thread in the cgroup-init thread group is terminating.
+ * Find remaining pid_ts in the namespace, signal and wait for them
+ * to exit.
+ *
+ * Note: This signals each threads in the namespace - even those that
+ * belong to the same thread group, To avoid this, we would have
+ * to walk the entire tasklist looking a processes in this
+ * namespace, but that could be unnecessarily expensive if the
+ * pid namespace has just a few processes. Or we need to
+ * maintain a tasklist for each pid namespace.
+ *
+ */
+ read_lock(&tasklist_lock);
+ nr = next_pidmap(pid_ns, 1);
+ while (nr > 0) {
+ rcu_read_lock();
+
+ task = pid_task(find_vpid(nr), PIDTYPE_PID);
+ if (task && !__fatal_signal_pending(task))
+ send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
+
+ rcu_read_unlock();
+
+ nr = next_pidmap(pid_ns, nr);
+ }
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
+ * sys_wait4() will also block until our children traced from the
+ * parent namespace are detached and become EXIT_DEAD.
+ */
+ do {
+ clear_thread_flag(TIF_SIGPENDING);
+ rc = sys_wait4(-1, NULL, __WALL, NULL);
+ } while (rc != -ECHILD);
+
+ /*
+ * sys_wait4() above can't reap the EXIT_DEAD children but we do not
+ * really care, we could reparent them to the global init. We could
+ * exit and reap ->child_reaper even if it is not the last thread in
+ * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * pid_ns can not go away until proc_kill_sb() drops the reference.
+ *
+ * But this ns can also have other tasks injected by setns()+fork().
+ * Again, ignoring the user visible semantics we do not really need
+ * to wait until they are all reaped, but they can be reparented to
+ * us and thus we need to ensure that pid->child_reaper stays valid
+ * until they all go away. See free_pid()->wake_up_process().
+ *
+ * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
+ * if reparented.
+ */
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (pid_ns->nr_hashed == init_pids)
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ if (pid_ns->reboot)
+ current->signal->group_exit_code = pid_ns->reboot;
+
+ acct_exit_ns(pid_ns);
+ return;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int pid_ns_ctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ struct ctl_table tmp = *table;
+
+ if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Writing directly to ns' last_pid field is OK, since this field
+ * is volatile in a living namespace anyway and a code writing to
+ * it should synchronize its usage with external means.
+ */
+
+ tmp.data = &pid_ns->last_pid;
+ return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
+
+extern int pid_max;
+static int zero = 0;
+static struct ctl_table pid_ns_ctl_table[] = {
+ {
+ .procname = "ns_last_pid",
+ .maxlen = sizeof(int),
+ .mode = 0666, /* permissions are checked in the handler */
+ .proc_handler = pid_ns_ctl_handler,
+ .extra1 = &zero,
+ .extra2 = &pid_max,
+ },
+ { }
+};
+static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
+{
+ if (pid_ns == &init_pid_ns)
+ return 0;
+
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART2:
+ case LINUX_REBOOT_CMD_RESTART:
+ pid_ns->reboot = SIGHUP;
+ break;
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ case LINUX_REBOOT_CMD_HALT:
+ pid_ns->reboot = SIGINT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ read_lock(&tasklist_lock);
+ force_sig(SIGKILL, pid_ns->child_reaper);
+ read_unlock(&tasklist_lock);
+
+ do_exit(0);
+
+ /* Not reached */
+ return 0;
+}
+
+static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct pid_namespace, ns);
+}
+
+static struct ns_common *pidns_get(struct task_struct *task)
+{
+ struct pid_namespace *ns;
+
+ rcu_read_lock();
+ ns = task_active_pid_ns(task);
+ if (ns)
+ get_pid_ns(ns);
+ rcu_read_unlock();
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void pidns_put(struct ns_common *ns)
+{
+ put_pid_ns(to_pid_ns(ns));
+}
+
+static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *ancestor, *new = to_pid_ns(ns);
+
+ if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Only allow entering the current active pid namespace
+ * or a child of the current active pid namespace.
+ *
+ * This is required for fork to return a usable pid value and
+ * this maintains the property that processes and their
+ * children can not escape their current pid namespace.
+ */
+ if (new->level < active->level)
+ return -EINVAL;
+
+ ancestor = new;
+ while (ancestor->level > active->level)
+ ancestor = ancestor->parent;
+ if (ancestor != active)
+ return -EINVAL;
+
+ put_pid_ns(nsproxy->pid_ns_for_children);
+ nsproxy->pid_ns_for_children = get_pid_ns(new);
+ return 0;
+}
+
+const struct proc_ns_operations pidns_operations = {
+ .name = "pid",
+ .type = CLONE_NEWPID,
+ .get = pidns_get,
+ .put = pidns_put,
+ .install = pidns_install,
+};
+
+static __init int pid_namespaces_init(void)
+{
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ register_sysctl_paths(kern_path, pid_ns_ctl_table);
+#endif
+ return 0;
+}
+
+__initcall(pid_namespaces_init);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
new file mode 100644
index 000000000..89a46f3ff
--- /dev/null
+++ b/kernel/power/Kconfig
@@ -0,0 +1,581 @@
+config SUSPEND
+ bool "Suspend to RAM and standby"
+ depends on ARCH_SUSPEND_POSSIBLE
+ default y
+ ---help---
+ Allow the system to enter sleep states in which main memory is
+ powered and thus its contents are preserved, such as the
+ suspend-to-RAM state (e.g. the ACPI S3 state).
+
+config SUSPEND_FREEZER
+ bool "Enable freezer for suspend to RAM/standby" \
+ if ARCH_WANTS_FREEZER_CONTROL || BROKEN
+ depends on SUSPEND
+ default y
+ help
+ This allows you to turn off the freezer for suspend. If this is
+ done, no tasks are frozen for suspend to RAM/standby.
+
+ Turning OFF this setting is NOT recommended! If in doubt, say Y.
+
+config HIBERNATE_CALLBACKS
+ bool
+
+config HIBERNATION
+ bool "Hibernation (aka 'suspend to disk')"
+ depends on SWAP && ARCH_HIBERNATION_POSSIBLE
+ select HIBERNATE_CALLBACKS
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ select CRC32
+ ---help---
+ Enable the suspend to disk (STD) functionality, which is usually
+ called "hibernation" in user interfaces. STD checkpoints the
+ system and powers it off; and restores that checkpoint on reboot.
+
+ You can suspend your machine with 'echo disk > /sys/power/state'
+ after placing resume=/dev/swappartition on the kernel command line
+ in your bootloader's configuration file.
+
+ Alternatively, you can use the additional userland tools available
+ from <http://suspend.sf.net>.
+
+ In principle it does not require ACPI or APM, although for example
+ ACPI will be used for the final steps when it is available. One
+ of the reasons to use software suspend is that the firmware hooks
+ for suspend states like suspend-to-RAM (STR) often don't work very
+ well with Linux.
+
+ It creates an image which is saved in your active swap. Upon the next
+ boot, pass the 'resume=/dev/swappartition' argument to the kernel to
+ have it detect the saved image, restore memory state from it, and
+ continue to run as before. If you do not want the previous state to
+ be reloaded, then use the 'noresume' kernel command line argument.
+ Note, however, that fsck will be run on your filesystems and you will
+ need to run mkswap against the swap partition used for the suspend.
+
+ It also works with swap files to a limited extent (for details see
+ <file:Documentation/power/swsusp-and-swap-files.txt>).
+
+ Right now you may boot without resuming and resume later but in the
+ meantime you cannot use the swap partition(s)/file(s) involved in
+ suspending. Also in this case you must not use the filesystems
+ that were mounted before the suspend. In particular, you MUST NOT
+ MOUNT any journaled filesystems mounted before the suspend or they
+ will get corrupted in a nasty way.
+
+ For more information take a look at <file:Documentation/power/swsusp.txt>.
+
+config ARCH_SAVE_PAGE_KEYS
+ bool
+
+config PM_STD_PARTITION
+ string "Default resume partition"
+ depends on HIBERNATION
+ default ""
+ ---help---
+ The default resume partition is the partition that the suspend-
+ to-disk implementation will look for a suspended disk image.
+
+ The partition specified here will be different for almost every user.
+ It should be a valid swap partition (at least for now) that is turned
+ on before suspending.
+
+ The partition specified can be overridden by specifying:
+
+ resume=/dev/<other device>
+
+ which will set the resume partition to the device specified.
+
+ Note there is currently not a way to specify which device to save the
+ suspended image to. It will simply pick the first available swap
+ device.
+
+menuconfig TOI_CORE
+ bool "Enhanced Hibernation (TuxOnIce)"
+ depends on HIBERNATION
+ default y
+ ---help---
+ TuxOnIce is the 'new and improved' suspend support.
+
+ See the TuxOnIce home page (tuxonice.net)
+ for FAQs, HOWTOs and other documentation.
+
+ comment "Image Storage (you need at least one allocator)"
+ depends on TOI_CORE
+
+ config TOI_FILE
+ bool "File Allocator"
+ depends on TOI_CORE
+ default y
+ ---help---
+ This option enables support for storing an image in a
+ simple file. You might want this if your swap is
+ sometimes full enough that you don't have enough spare
+ space to store an image.
+
+ config TOI_SWAP
+ bool "Swap Allocator"
+ depends on TOI_CORE && SWAP
+ default y
+ ---help---
+ This option enables support for storing an image in your
+ swap space.
+
+ comment "General Options"
+ depends on TOI_CORE
+
+ config TOI_PRUNE
+ bool "Image pruning support"
+ depends on TOI_CORE && CRYPTO && BROKEN
+ default y
+ ---help---
+ This option adds support for using cryptoapi hashing
+ algorithms to identify pages with the same content. We
+ then write a much smaller pointer to the first copy of
+ the data instead of a complete (perhaps compressed)
+ additional copy.
+
+ You probably want this, so say Y here.
+
+ comment "No image pruning support available without Cryptoapi support."
+ depends on TOI_CORE && !CRYPTO
+
+ config TOI_CRYPTO
+ bool "Compression support"
+ depends on TOI_CORE && CRYPTO
+ default y
+ ---help---
+ This option adds support for using cryptoapi compression
+ algorithms. Compression is particularly useful as it can
+ more than double your suspend and resume speed (depending
+ upon how well your image compresses).
+
+ You probably want this, so say Y here.
+
+ comment "No compression support available without Cryptoapi support."
+ depends on TOI_CORE && !CRYPTO
+
+ config TOI_USERUI
+ bool "Userspace User Interface support"
+ depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
+ default y
+ ---help---
+ This option enabled support for a userspace based user interface
+ to TuxOnIce, which allows you to have a nice display while suspending
+ and resuming, and also enables features such as pressing escape to
+ cancel a cycle or interactive debugging.
+
+ config TOI_USERUI_DEFAULT_PATH
+ string "Default userui program location"
+ default "/usr/local/sbin/tuxoniceui_text"
+ depends on TOI_USERUI
+ ---help---
+ This entry allows you to specify a default path to the userui binary.
+
+ config TOI_DEFAULT_IMAGE_SIZE_LIMIT
+ int "Default image size limit"
+ range -2 65536
+ default "-2"
+ depends on TOI_CORE
+ ---help---
+ This entry allows you to specify a default image size limit. It can
+ be overridden at run-time using /sys/power/tuxonice/image_size_limit.
+
+ config TOI_KEEP_IMAGE
+ bool "Allow Keep Image Mode"
+ depends on TOI_CORE
+ ---help---
+ This option allows you to keep and image and reuse it. It is intended
+ __ONLY__ for use with systems where all filesystems are mounted read-
+ only (kiosks, for example). To use it, compile this option in and boot
+ normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend.
+ When you resume, the image will not be removed. You will be unable to turn
+ off swap partitions (assuming you are using the swap allocator), but future
+ suspends simply do a power-down. The image can be updated using the
+ kernel command line parameter suspend_act= to turn off the keep image
+ bit. Keep image mode is a little less user friendly on purpose - it
+ should not be used without thought!
+
+ config TOI_INCREMENTAL
+ bool "Incremental Image Support"
+ depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE
+ default n
+ ---help---
+ This option enables the work in progress toward using the dirty page
+ tracking to record changes to pages. It is hoped that
+ this will be an initial step toward implementing storing just
+ the differences between consecutive images, which will
+ increase the amount of storage needed for the image, but also
+ increase the speed at which writing an image occurs and
+ reduce the wear and tear on drives.
+
+ At the moment, all that is implemented is the first step of keeping
+ an existing image and then comparing it to the contents in memory
+ (by setting /sys/power/tuxonice/verify_image to 1 and triggering a
+ (fake) resume) to see what the page change tracking should find to be
+ different. If you have verify_image set to 1, TuxOnIce will automatically
+ invalidate the old image when you next try to hibernate, so there's no
+ greater chance of disk corruption than normal.
+
+ comment "No incremental image support available without Keep Image support."
+ depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT
+
+ config TOI_REPLACE_SWSUSP
+ bool "Replace swsusp by default"
+ default y
+ depends on TOI_CORE
+ ---help---
+ TuxOnIce can replace swsusp. This option makes that the default state,
+ requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
+ to use the vanilla kernel functionality. Note that your initrd/ramfs will
+ need to do this before trying to resume, too.
+ With overriding swsusp enabled, echoing disk to /sys/power/state will
+ start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
+ the swap and file allocators are compiled in, the swap allocator will be
+ used by default.
+
+ config TOI_IGNORE_LATE_INITCALL
+ bool "Wait for initrd/ramfs to run, by default"
+ default n
+ depends on TOI_CORE
+ ---help---
+ When booting, TuxOnIce can check for an image and start to resume prior
+ to any initrd/ramfs running (via a late initcall).
+
+ If you don't have an initrd/ramfs, this is what you want to happen -
+ otherwise you won't be able to safely resume. You should set this option
+ to 'No'.
+
+ If, however, you want your initrd/ramfs to run anyway before resuming,
+ you need to tell TuxOnIce to ignore that earlier opportunity to resume.
+ This can be done either by using this compile time option, or by
+ overriding this option with the boot-time parameter toi_initramfs_resume_only=1.
+
+ Note that if TuxOnIce can't resume at the earlier opportunity, the
+ value of this option won't matter - the initramfs/initrd (if any) will
+ run anyway.
+
+ menuconfig TOI_CLUSTER
+ bool "Cluster support"
+ default n
+ depends on TOI_CORE && NET && BROKEN
+ ---help---
+ Support for linking multiple machines in a cluster so that they suspend
+ and resume together.
+
+ config TOI_DEFAULT_CLUSTER_INTERFACE
+ string "Default cluster interface"
+ depends on TOI_CLUSTER
+ ---help---
+ The default interface on which to communicate with other nodes in
+ the cluster.
+
+ If no value is set here, cluster support will be disabled by default.
+
+ config TOI_DEFAULT_CLUSTER_KEY
+ string "Default cluster key"
+ default "Default"
+ depends on TOI_CLUSTER
+ ---help---
+ The default key used by this node. All nodes in the same cluster
+ have the same key. Multiple clusters may coexist on the same lan
+ by using different values for this key.
+
+ config TOI_CLUSTER_IMAGE_TIMEOUT
+ int "Timeout when checking for image"
+ default 15
+ depends on TOI_CLUSTER
+ ---help---
+ Timeout (seconds) before continuing to boot when waiting to see
+ whether other nodes might have an image. Set to -1 to wait
+ indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue
+ booting sooner than this timeout.
+
+ config TOI_CLUSTER_WAIT_UNTIL_NODES
+ int "Nodes without image before continuing"
+ default 0
+ depends on TOI_CLUSTER
+ ---help---
+ When booting and no image is found, we wait to see if other nodes
+ have an image before continuing to boot. This value lets us
+ continue after seeing a certain number of nodes without an image,
+ instead of continuing to wait for the timeout. Set to 0 to only
+ use the timeout.
+
+ config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE
+ string "Default pre-hibernate script"
+ depends on TOI_CLUSTER
+ ---help---
+ The default script to be called when starting to hibernate.
+
+ config TOI_DEFAULT_CLUSTER_POST_HIBERNATE
+ string "Default post-hibernate script"
+ depends on TOI_CLUSTER
+ ---help---
+ The default script to be called after resuming from hibernation.
+
+ config TOI_DEFAULT_WAIT
+ int "Default waiting time for emergency boot messages"
+ default "25"
+ range -1 32768
+ depends on TOI_CORE
+ help
+ TuxOnIce can display warnings very early in the process of resuming,
+ if (for example) it appears that you have booted a kernel that doesn't
+ match an image on disk. It can then give you the opportunity to either
+ continue booting that kernel, or reboot the machine. This option can be
+ used to control how long to wait in such circumstances. -1 means wait
+ forever. 0 means don't wait at all (do the default action, which will
+ generally be to continue booting and remove the image). Values of 1 or
+ more indicate a number of seconds (up to 255) to wait before doing the
+ default.
+
+ config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE
+ int "Default extra pages allowance"
+ default "2000"
+ range 500 32768
+ depends on TOI_CORE
+ help
+ This value controls the default for the allowance TuxOnIce makes for
+ drivers to allocate extra memory during the atomic copy. The default
+ value of 2000 will be okay in most cases. If you are using
+ DRI, the easiest way to find what value to use is to try to hibernate
+ and look at how many pages were actually needed in the sysfs entry
+ /sys/power/tuxonice/debug_info (first number on the last line), adding
+ a little extra because the value is not always the same.
+
+ config TOI_CHECKSUM
+ bool "Checksum pageset2"
+ default n
+ depends on TOI_CORE
+ select CRYPTO
+ select CRYPTO_ALGAPI
+ select CRYPTO_MD4
+ ---help---
+ Adds support for checksumming pageset2 pages, to ensure you really get an
+ atomic copy. Since some filesystems (XFS especially) change metadata even
+ when there's no other activity, we need this to check for pages that have
+ been changed while we were saving the page cache. If your debugging output
+ always says no pages were resaved, you may be able to safely disable this
+ option.
+
+config TOI
+ bool
+ depends on TOI_CORE!=n
+ default y
+
+config TOI_ZRAM_SUPPORT
+ def_bool y
+ depends on TOI && ZRAM!=n
+
+config PM_SLEEP
+ def_bool y
+ depends on SUSPEND || HIBERNATE_CALLBACKS
+ select PM
+
+config PM_SLEEP_SMP
+ def_bool y
+ depends on SMP
+ depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
+ depends on PM_SLEEP
+ select HOTPLUG_CPU
+
+config PM_AUTOSLEEP
+ bool "Opportunistic sleep"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow the kernel to trigger a system transition into a global sleep
+ state automatically whenever there are no active wakeup sources.
+
+config PM_WAKELOCKS
+ bool "User space wakeup sources interface"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow user space to create, activate and deactivate wakeup source
+ objects with the help of a sysfs-based interface.
+
+config PM_WAKELOCKS_LIMIT
+ int "Maximum number of user space wakeup sources (0 = no limit)"
+ range 0 100000
+ default 100
+ depends on PM_WAKELOCKS
+
+config PM_WAKELOCKS_GC
+ bool "Garbage collector for user space wakeup sources"
+ depends on PM_WAKELOCKS
+ default y
+
+config PM
+ bool "Device power management core functionality"
+ ---help---
+ Enable functionality allowing I/O devices to be put into energy-saving
+ (low power) states, for example after a specified period of inactivity
+ (autosuspended), and woken up in response to a hardware-generated
+ wake-up event or a driver's request.
+
+ Hardware support is generally required for this functionality to work
+ and the bus type drivers of the buses the devices are on are
+ responsible for the actual handling of device suspend requests and
+ wake-up events.
+
+config PM_DEBUG
+ bool "Power Management Debug Support"
+ depends on PM
+ ---help---
+ This option enables various debugging support in the Power Management
+ code. This is helpful when debugging and reporting PM bugs, like
+ suspend support.
+
+config PM_ADVANCED_DEBUG
+ bool "Extra PM attributes in sysfs for low-level debugging/testing"
+ depends on PM_DEBUG
+ ---help---
+ Add extra sysfs attributes allowing one to access some Power Management
+ fields of device objects from user space. If you are not a kernel
+ developer interested in debugging/testing Power Management, say "no".
+
+config PM_TEST_SUSPEND
+ bool "Test suspend/resume and wakealarm during bootup"
+ depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
+ ---help---
+ This option will let you suspend your machine during bootup, and
+ make it wake up a few seconds later using an RTC wakeup alarm.
+ Enable this with a kernel parameter like "test_suspend=mem".
+
+ You probably want to have your system's RTC driver statically
+ linked, ensuring that it's available when this test runs.
+
+config PM_SLEEP_DEBUG
+ def_bool y
+ depends on PM_DEBUG && PM_SLEEP
+
+config DPM_WATCHDOG
+ bool "Device suspend/resume watchdog"
+ depends on PM_DEBUG && PSTORE
+ ---help---
+ Sets up a watchdog timer to capture drivers that are
+ locked up attempting to suspend/resume a device.
+ A detected lockup causes system panic with message
+ captured in pstore device for inspection in subsequent
+ boot session.
+
+config DPM_WATCHDOG_TIMEOUT
+ int "Watchdog timeout in seconds"
+ range 1 120
+ default 60
+ depends on DPM_WATCHDOG
+
+config PM_TRACE
+ bool
+ help
+ This enables code to save the last PM event point across
+ reboot. The architecture needs to support this, x86 for
+ example does by saving things in the RTC, see below.
+
+ The architecture specific code must provide the extern
+ functions from <linux/resume-trace.h> as well as the
+ <asm/resume-trace.h> header with a TRACE_RESUME() macro.
+
+ The way the information is presented is architecture-
+ dependent, x86 will print the information during a
+ late_initcall.
+
+config PM_TRACE_RTC
+ bool "Suspend/resume event tracing"
+ depends on PM_SLEEP_DEBUG
+ depends on X86
+ select PM_TRACE
+ ---help---
+ This enables some cheesy code to save the last PM event point in the
+ RTC across reboots, so that you can debug a machine that just hangs
+ during suspend (or more commonly, during resume).
+
+ To use this debugging feature you should attempt to suspend the
+ machine, reboot it and then run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+ CAUTION: this option will cause your machine's real-time clock to be
+ set to an invalid time after a resume.
+
+config APM_EMULATION
+ tristate "Advanced Power Management Emulation"
+ depends on PM && SYS_SUPPORTS_APM_EMULATION
+ help
+ APM is a BIOS specification for saving power using several different
+ techniques. This is mostly useful for battery powered laptops with
+ APM compliant BIOSes. If you say Y here, the system time will be
+ reset after a RESUME operation, the /proc/apm device will provide
+ battery status information, and user-space programs will receive
+ notification of APM "events" (e.g. battery status change).
+
+ In order to use APM, you will need supporting software. For location
+ and more information, read <file:Documentation/power/apm-acpi.txt>
+ and the Battery Powered Linux mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ This driver does not spin down disk drives (see the hdparm(8)
+ manpage ("man 8 hdparm") for that), and it doesn't turn off
+ VESA-compliant "green" monitors.
+
+ Generally, if you don't have a battery in your machine, there isn't
+ much point in using this driver and you should say N. If you get
+ random kernel OOPSes or reboots that don't seem to be related to
+ anything, try disabling/enabling this option (or disabling/enabling
+ APM in your BIOS).
+
+config PM_OPP
+ bool
+ select SRCU
+ ---help---
+ SOCs have a standard set of tuples consisting of frequency and
+ voltage pairs that the device will support per voltage domain. This
+ is called Operating Performance Point or OPP. The actual definitions
+ of OPP varies over silicon within the same family of devices.
+
+ OPP layer organizes the data internally using device pointers
+ representing individual voltage domains and provides SOC
+ implementations a ready to use framework to manage OPPs.
+ For more information, read <file:Documentation/power/opp.txt>
+
+config PM_CLK
+ def_bool y
+ depends on PM && HAVE_CLK
+
+config PM_GENERIC_DOMAINS
+ bool
+ depends on PM
+
+config WQ_POWER_EFFICIENT_DEFAULT
+ bool "Enable workqueue power-efficient mode by default"
+ depends on PM
+ default n
+ help
+ Per-cpu workqueues are generally preferred because they show
+ better performance thanks to cache locality; unfortunately,
+ per-cpu workqueues tend to be more power hungry than unbound
+ workqueues.
+
+ Enabling workqueue.power_efficient kernel parameter makes the
+ per-cpu workqueues which were observed to contribute
+ significantly to power consumption unbound, leading to measurably
+ lower power usage at the cost of small performance overhead.
+
+ This config option determines whether workqueue.power_efficient
+ is enabled by default.
+
+ If in doubt, say N.
+
+config PM_GENERIC_DOMAINS_SLEEP
+ def_bool y
+ depends on PM_SLEEP && PM_GENERIC_DOMAINS
+
+config PM_GENERIC_DOMAINS_OF
+ def_bool y
+ depends on PM_GENERIC_DOMAINS && OF
+
+config CPU_PM
+ bool
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
new file mode 100644
index 000000000..b8d7b68f7
--- /dev/null
+++ b/kernel/power/Makefile
@@ -0,0 +1,47 @@
+
+ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+
+tuxonice_core-y := tuxonice_modules.o
+
+obj-$(CONFIG_TOI) += tuxonice_builtin.o
+obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \
+ tuxonice_copy_before_write.o
+
+tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o
+
+# Compile these in after allocation debugging, if used.
+
+tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \
+ tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \
+ tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \
+ tuxonice_power_off.o tuxonice_atomic_copy.o
+
+tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o
+
+tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o
+
+obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
+obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o
+obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
+
+tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \
+ tuxonice_bio_signature.o
+
+obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o
+obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o
+obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o
+
+obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o
+
+obj-y += qos.o
+obj-$(CONFIG_PM) += main.o
+obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
+obj-$(CONFIG_FREEZER) += process.o
+obj-$(CONFIG_SUSPEND) += suspend.o
+obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
+obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
+ block_io.o
+obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
+obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
+
+obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
new file mode 100644
index 000000000..9012ecf7b
--- /dev/null
+++ b/kernel/power/autosleep.c
@@ -0,0 +1,128 @@
+/*
+ * kernel/power/autosleep.c
+ *
+ * Opportunistic sleep support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ */
+
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/pm_wakeup.h>
+
+#include "power.h"
+
+static suspend_state_t autosleep_state;
+static struct workqueue_struct *autosleep_wq;
+/*
+ * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
+ * is active, otherwise a deadlock with try_to_suspend() is possible.
+ * Alternatively mutex_lock_interruptible() can be used. This will then fail
+ * if an auto_sleep cycle tries to freeze processes.
+ */
+static DEFINE_MUTEX(autosleep_lock);
+static struct wakeup_source *autosleep_ws;
+
+static void try_to_suspend(struct work_struct *work)
+{
+ unsigned int initial_count, final_count;
+
+ if (!pm_get_wakeup_count(&initial_count, true))
+ goto out;
+
+ mutex_lock(&autosleep_lock);
+
+ if (!pm_save_wakeup_count(initial_count) ||
+ system_state != SYSTEM_RUNNING) {
+ mutex_unlock(&autosleep_lock);
+ goto out;
+ }
+
+ if (autosleep_state == PM_SUSPEND_ON) {
+ mutex_unlock(&autosleep_lock);
+ return;
+ }
+ if (autosleep_state >= PM_SUSPEND_MAX)
+ hibernate();
+ else
+ pm_suspend(autosleep_state);
+
+ mutex_unlock(&autosleep_lock);
+
+ if (!pm_get_wakeup_count(&final_count, false))
+ goto out;
+
+ /*
+ * If the wakeup occured for an unknown reason, wait to prevent the
+ * system from trying to suspend and waking up in a tight loop.
+ */
+ if (final_count == initial_count)
+ schedule_timeout_uninterruptible(HZ / 2);
+
+ out:
+ queue_up_suspend_work();
+}
+
+static DECLARE_WORK(suspend_work, try_to_suspend);
+
+void queue_up_suspend_work(void)
+{
+ if (autosleep_state > PM_SUSPEND_ON)
+ queue_work(autosleep_wq, &suspend_work);
+}
+
+suspend_state_t pm_autosleep_state(void)
+{
+ return autosleep_state;
+}
+
+int pm_autosleep_lock(void)
+{
+ return mutex_lock_interruptible(&autosleep_lock);
+}
+
+void pm_autosleep_unlock(void)
+{
+ mutex_unlock(&autosleep_lock);
+}
+
+int pm_autosleep_set_state(suspend_state_t state)
+{
+
+#ifndef CONFIG_HIBERNATION
+ if (state >= PM_SUSPEND_MAX)
+ return -EINVAL;
+#endif
+
+ __pm_stay_awake(autosleep_ws);
+
+ mutex_lock(&autosleep_lock);
+
+ autosleep_state = state;
+
+ __pm_relax(autosleep_ws);
+
+ if (state > PM_SUSPEND_ON) {
+ pm_wakep_autosleep_enabled(true);
+ queue_up_suspend_work();
+ } else {
+ pm_wakep_autosleep_enabled(false);
+ }
+
+ mutex_unlock(&autosleep_lock);
+ return 0;
+}
+
+int __init pm_autosleep_init(void)
+{
+ autosleep_ws = wakeup_source_register("autosleep");
+ if (!autosleep_ws)
+ return -ENOMEM;
+
+ autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
+ if (autosleep_wq)
+ return 0;
+
+ wakeup_source_unregister(autosleep_ws);
+ return -ENOMEM;
+}
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000..9a58bc258
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
+/*
+ * This file provides functions for block I/O operations on swap/file.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/bio.h>
+#include <linux/kernel.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+#include "power.h"
+
+/**
+ * submit - submit BIO request.
+ * @rw: READ or WRITE.
+ * @off physical offset of page.
+ * @page: page we're reading or writing.
+ * @bio_chain: list of pending biod (for async reading)
+ *
+ * Straight from the textbook - allocate and initialize the bio.
+ * If we're reading, make sure the page is marked as dirty.
+ * Then submit it and, if @bio_chain == NULL, wait.
+ */
+static int submit(int rw, struct block_device *bdev, sector_t sector,
+ struct page *page, struct bio **bio_chain)
+{
+ const int bio_rw = rw | REQ_SYNC;
+ struct bio *bio;
+
+ bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio->bi_end_io = end_swap_bio_read;
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
+ (unsigned long long)sector);
+ bio_put(bio);
+ return -EFAULT;
+ }
+
+ lock_page(page);
+ bio_get(bio);
+
+ if (bio_chain == NULL) {
+ submit_bio(bio_rw, bio);
+ wait_on_page_locked(page);
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ bio_put(bio);
+ } else {
+ if (rw == READ)
+ get_page(page); /* These pages are freed later */
+ bio->bi_private = *bio_chain;
+ *bio_chain = bio;
+ submit_bio(bio_rw, bio);
+ }
+ return 0;
+}
+
+int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+ return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
+ virt_to_page(addr), bio_chain);
+}
+
+int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+ return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
+ virt_to_page(addr), bio_chain);
+}
+
+int hib_wait_on_bio_chain(struct bio **bio_chain)
+{
+ struct bio *bio;
+ struct bio *next_bio;
+ int ret = 0;
+
+ if (bio_chain == NULL)
+ return 0;
+
+ bio = *bio_chain;
+ if (bio == NULL)
+ return 0;
+ while (bio) {
+ struct page *page;
+
+ next_bio = bio->bi_private;
+ page = bio->bi_io_vec[0].bv_page;
+ wait_on_page_locked(page);
+ if (!PageUptodate(page) || PageError(page))
+ ret = -EIO;
+ put_page(page);
+ bio_put(bio);
+ bio = next_bio;
+ }
+ *bio_chain = NULL;
+ return ret;
+}
diff --git a/kernel/power/console.c b/kernel/power/console.c
new file mode 100644
index 000000000..aba9c545a
--- /dev/null
+++ b/kernel/power/console.c
@@ -0,0 +1,151 @@
+/*
+ * Functions for saving/restoring console.
+ *
+ * Originally from swsusp.
+ */
+
+#include <linux/console.h>
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/vt.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "power.h"
+
+#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
+
+static int orig_fgconsole, orig_kmsg;
+
+static DEFINE_MUTEX(vt_switch_mutex);
+
+struct pm_vt_switch {
+ struct list_head head;
+ struct device *dev;
+ bool required;
+};
+
+static LIST_HEAD(pm_vt_switch_list);
+
+
+/**
+ * pm_vt_switch_required - indicate VT switch at suspend requirements
+ * @dev: device
+ * @required: if true, caller needs VT switch at suspend/resume time
+ *
+ * The different console drivers may or may not require VT switches across
+ * suspend/resume, depending on how they handle restoring video state and
+ * what may be running.
+ *
+ * Drivers can indicate support for switchless suspend/resume, which can
+ * save time and flicker, by using this routine and passing 'false' as
+ * the argument. If any loaded driver needs VT switching, or the
+ * no_console_suspend argument has been passed on the command line, VT
+ * switches will occur.
+ */
+void pm_vt_switch_required(struct device *dev, bool required)
+{
+ struct pm_vt_switch *entry, *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ /* already registered, update requirement */
+ tmp->required = required;
+ goto out;
+ }
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out;
+
+ entry->required = required;
+ entry->dev = dev;
+
+ list_add(&entry->head, &pm_vt_switch_list);
+out:
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_required);
+
+/**
+ * pm_vt_switch_unregister - stop tracking a device's VT switching needs
+ * @dev: device
+ *
+ * Remove @dev from the vt switch list.
+ */
+void pm_vt_switch_unregister(struct device *dev)
+{
+ struct pm_vt_switch *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ list_del(&tmp->head);
+ kfree(tmp);
+ break;
+ }
+ }
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_unregister);
+
+/*
+ * There are three cases when a VT switch on suspend/resume are required:
+ * 1) no driver has indicated a requirement one way or another, so preserve
+ * the old behavior
+ * 2) console suspend is disabled, we want to see debug messages across
+ * suspend/resume
+ * 3) any registered driver indicates it needs a VT switch
+ *
+ * If none of these conditions is present, meaning we have at least one driver
+ * that doesn't need the switch, and none that do, we can avoid it to make
+ * resume look a little prettier (and suspend too, but that's usually hidden,
+ * e.g. when closing the lid on a laptop).
+ */
+static bool pm_vt_switch(void)
+{
+ struct pm_vt_switch *entry;
+ bool ret = true;
+
+ mutex_lock(&vt_switch_mutex);
+ if (list_empty(&pm_vt_switch_list))
+ goto out;
+
+ if (!console_suspend_enabled)
+ goto out;
+
+ list_for_each_entry(entry, &pm_vt_switch_list, head) {
+ if (entry->required)
+ goto out;
+ }
+
+ ret = false;
+out:
+ mutex_unlock(&vt_switch_mutex);
+ return ret;
+}
+
+int pm_prepare_console(void)
+{
+ if (!pm_vt_switch())
+ return 0;
+
+ orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
+ if (orig_fgconsole < 0)
+ return 1;
+
+ orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
+ return 0;
+}
+
+void pm_restore_console(void)
+{
+ if (!pm_vt_switch())
+ return;
+
+ if (orig_fgconsole >= 0) {
+ vt_move_to_console(orig_fgconsole, 0);
+ vt_kmsg_redirect(orig_kmsg);
+ }
+}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
new file mode 100644
index 000000000..b7d3bc724
--- /dev/null
+++ b/kernel/power/hibernate.c
@@ -0,0 +1,1178 @@
+/*
+ * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
+ * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
+ * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/export.h>
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/async.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pm.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <linux/gfp.h>
+#include <linux/syscore_ops.h>
+#include <linux/ctype.h>
+#include <linux/genhd.h>
+#include <linux/ktime.h>
+#include <trace/events/power.h>
+
+#include "tuxonice.h"
+
+
+static int nocompress;
+static int noresume;
+static int nohibernate;
+static int resume_wait;
+static unsigned int resume_delay;
+char resume_file[256] = CONFIG_PM_STD_PARTITION;
+dev_t swsusp_resume_device;
+sector_t swsusp_resume_block;
+__visible int in_suspend __nosavedata;
+
+enum {
+ HIBERNATION_INVALID,
+ HIBERNATION_PLATFORM,
+ HIBERNATION_SHUTDOWN,
+ HIBERNATION_REBOOT,
+#ifdef CONFIG_SUSPEND
+ HIBERNATION_SUSPEND,
+#endif
+ /* keep last */
+ __HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+bool freezer_test_done;
+
+static const struct platform_hibernation_ops *hibernation_ops;
+
+bool hibernation_available(void)
+{
+ return (nohibernate == 0);
+}
+
+/**
+ * hibernation_set_ops - Set the global hibernate operations.
+ * @ops: Hibernation operations to use in subsequent hibernation transitions.
+ */
+void hibernation_set_ops(const struct platform_hibernation_ops *ops)
+{
+ if (ops && !(ops->begin && ops->end && ops->pre_snapshot
+ && ops->prepare && ops->finish && ops->enter && ops->pre_restore
+ && ops->restore_cleanup && ops->leave)) {
+ WARN_ON(1);
+ return;
+ }
+ lock_system_sleep();
+ hibernation_ops = ops;
+ if (ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else if (hibernation_mode == HIBERNATION_PLATFORM)
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+
+ unlock_system_sleep();
+}
+EXPORT_SYMBOL_GPL(hibernation_set_ops);
+
+static bool entering_platform_hibernation;
+
+bool system_entering_hibernation(void)
+{
+ return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
+
+#ifdef CONFIG_PM_DEBUG
+static void hibernation_debug_sleep(void)
+{
+ printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+}
+
+static int hibernation_test(int level)
+{
+ if (pm_test_level == level) {
+ hibernation_debug_sleep();
+ return 1;
+ }
+ return 0;
+}
+#else /* !CONFIG_PM_DEBUG */
+static int hibernation_test(int level) { return 0; }
+#endif /* !CONFIG_PM_DEBUG */
+
+/**
+ * platform_begin - Call platform to start hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
+ */
+int platform_begin(int platform_mode)
+{
+ return (platform_mode && hibernation_ops) ?
+ hibernation_ops->begin() : 0;
+}
+
+/**
+ * platform_end - Call platform to finish transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ */
+void platform_end(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->end();
+}
+
+/**
+ * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for creating a hibernate image,
+ * if so configured, and return an error code if that fails.
+ */
+
+int platform_pre_snapshot(int platform_mode)
+{
+ return (platform_mode && hibernation_ops) ?
+ hibernation_ops->pre_snapshot() : 0;
+}
+
+/**
+ * platform_leave - Call platform to prepare a transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver prepare to prepare the machine for switching to the
+ * normal mode of operation.
+ *
+ * This routine is called on one CPU with interrupts disabled.
+ */
+void platform_leave(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->leave();
+}
+
+/**
+ * platform_finish - Call platform to switch the system to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the machine to the normal mode of
+ * operation.
+ *
+ * This routine must be called after platform_prepare().
+ */
+void platform_finish(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->finish();
+}
+
+/**
+ * platform_pre_restore - Prepare for hibernate image restoration.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for resume from a hibernation
+ * image.
+ *
+ * If the restore fails after this function has been called,
+ * platform_restore_cleanup() must be called.
+ */
+int platform_pre_restore(int platform_mode)
+{
+ return (platform_mode && hibernation_ops) ?
+ hibernation_ops->pre_restore() : 0;
+}
+
+/**
+ * platform_restore_cleanup - Switch to the working state after failing restore.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the system to the normal mode of operation
+ * after a failing restore.
+ *
+ * If platform_pre_restore() has been called before the failing restore, this
+ * function must be called too, regardless of the result of
+ * platform_pre_restore().
+ */
+void platform_restore_cleanup(int platform_mode)
+{
+ if (platform_mode && hibernation_ops)
+ hibernation_ops->restore_cleanup();
+}
+
+/**
+ * platform_recover - Recover from a failure to suspend devices.
+ * @platform_mode: Whether or not to use the platform driver.
+ */
+void platform_recover(int platform_mode)
+{
+ if (platform_mode && hibernation_ops && hibernation_ops->recover)
+ hibernation_ops->recover();
+}
+
+/**
+ * swsusp_show_speed - Print time elapsed between two events during hibernation.
+ * @start: Starting event.
+ * @stop: Final event.
+ * @nr_pages: Number of memory pages processed between @start and @stop.
+ * @msg: Additional diagnostic message to print.
+ */
+void swsusp_show_speed(ktime_t start, ktime_t stop,
+ unsigned nr_pages, char *msg)
+{
+ ktime_t diff;
+ u64 elapsed_centisecs64;
+ unsigned int centisecs;
+ unsigned int k;
+ unsigned int kps;
+
+ diff = ktime_sub(stop, start);
+ elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
+ centisecs = elapsed_centisecs64;
+ if (centisecs == 0)
+ centisecs = 1; /* avoid div-by-zero */
+ k = nr_pages * (PAGE_SIZE / 1024);
+ kps = (k * 100) / centisecs;
+ printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
+ msg, k,
+ centisecs / 100, centisecs % 100,
+ kps / 1000, (kps % 1000) / 10);
+}
+
+/**
+ * create_image - Create a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' "late" and "noirq" freeze callbacks, create a
+ * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
+ *
+ * Control reappears in this routine after the subsequent restore.
+ */
+static int create_image(int platform_mode)
+{
+ int error;
+
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to power down, "
+ "aborting hibernation\n");
+ return error;
+ }
+
+ error = platform_pre_snapshot(platform_mode);
+ if (error || hibernation_test(TEST_PLATFORM))
+ goto Platform_finish;
+
+ error = disable_nonboot_cpus();
+ if (error || hibernation_test(TEST_CPUS))
+ goto Enable_cpus;
+
+ local_irq_disable();
+
+ error = syscore_suspend();
+ if (error) {
+ printk(KERN_ERR "PM: Some system devices failed to power down, "
+ "aborting hibernation\n");
+ goto Enable_irqs;
+ }
+
+ if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
+ goto Power_up;
+
+ in_suspend = 1;
+ save_processor_state();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
+ error = swsusp_arch_suspend();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
+ if (error)
+ printk(KERN_ERR "PM: Error %d creating hibernation image\n",
+ error);
+ /* Restore control flow magically appears here */
+ restore_processor_state();
+ if (!in_suspend)
+ events_check_enabled = false;
+
+ platform_leave(platform_mode);
+
+ Power_up:
+ syscore_resume();
+
+ Enable_irqs:
+ local_irq_enable();
+
+ Enable_cpus:
+ enable_nonboot_cpus();
+
+ Platform_finish:
+ platform_finish(platform_mode);
+
+ dpm_resume_start(in_suspend ?
+ (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+
+ return error;
+}
+
+/**
+ * hibernation_snapshot - Quiesce devices and create a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
+ *
+ * This routine must be called with pm_mutex held.
+ */
+int hibernation_snapshot(int platform_mode)
+{
+ pm_message_t msg;
+ int error;
+
+ error = platform_begin(platform_mode);
+ if (error)
+ goto Close;
+
+ /* Preallocate image memory before shutting down devices. */
+ error = hibernate_preallocate_memory();
+ if (error)
+ goto Close;
+
+ error = freeze_kernel_threads();
+ if (error)
+ goto Cleanup;
+
+ if (hibernation_test(TEST_FREEZER)) {
+
+ /*
+ * Indicate to the caller that we are returning due to a
+ * successful freezer test.
+ */
+ freezer_test_done = true;
+ goto Thaw;
+ }
+
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error) {
+ dpm_complete(PMSG_RECOVER);
+ goto Thaw;
+ }
+
+ suspend_console();
+ pm_restrict_gfp_mask();
+
+ error = dpm_suspend(PMSG_FREEZE);
+
+ if (error || hibernation_test(TEST_DEVICES))
+ platform_recover(platform_mode);
+ else
+ error = create_image(platform_mode);
+
+ /*
+ * In the case that we call create_image() above, the control
+ * returns here (1) after the image has been created or the
+ * image creation has failed and (2) after a successful restore.
+ */
+
+ /* We may need to release the preallocated image pages here. */
+ if (error || !in_suspend)
+ swsusp_free();
+
+ msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
+ dpm_resume(msg);
+
+ if (error || !in_suspend)
+ pm_restore_gfp_mask();
+
+ resume_console();
+ dpm_complete(msg);
+
+ Close:
+ platform_end(platform_mode);
+ return error;
+
+ Thaw:
+ thaw_kernel_threads();
+ Cleanup:
+ swsusp_free();
+ goto Close;
+}
+
+/**
+ * resume_target_kernel - Restore system state from a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
+ * contents of highmem that have not been restored yet from the image and run
+ * the low-level code that will restore the remaining contents of memory and
+ * switch to the just restored target kernel.
+ */
+static int resume_target_kernel(bool platform_mode)
+{
+ int error;
+
+ error = dpm_suspend_end(PMSG_QUIESCE);
+ if (error) {
+ printk(KERN_ERR "PM: Some devices failed to power down, "
+ "aborting resume\n");
+ return error;
+ }
+
+ error = platform_pre_restore(platform_mode);
+ if (error)
+ goto Cleanup;
+
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+
+ local_irq_disable();
+
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+
+ save_processor_state();
+ error = restore_highmem();
+ if (!error) {
+ error = swsusp_arch_resume();
+ /*
+ * The code below is only ever reached in case of a failure.
+ * Otherwise, execution continues at the place where
+ * swsusp_arch_suspend() was called.
+ */
+ BUG_ON(!error);
+ /*
+ * This call to restore_highmem() reverts the changes made by
+ * the previous one.
+ */
+ restore_highmem();
+ }
+ /*
+ * The only reason why swsusp_arch_resume() can fail is memory being
+ * very tight, so we have to free it as soon as we can to avoid
+ * subsequent failures.
+ */
+ swsusp_free();
+ restore_processor_state();
+ touch_softlockup_watchdog();
+
+ syscore_resume();
+
+ Enable_irqs:
+ local_irq_enable();
+
+ Enable_cpus:
+ enable_nonboot_cpus();
+
+ Cleanup:
+ platform_restore_cleanup(platform_mode);
+
+ dpm_resume_start(PMSG_RECOVER);
+
+ return error;
+}
+
+/**
+ * hibernation_restore - Quiesce devices and restore from a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
+ *
+ * This routine must be called with pm_mutex held. If it is successful, control
+ * reappears in the restored target kernel in hibernation_snapshot().
+ */
+int hibernation_restore(int platform_mode)
+{
+ int error;
+
+ pm_prepare_console();
+ suspend_console();
+ pm_restrict_gfp_mask();
+ error = dpm_suspend_start(PMSG_QUIESCE);
+ if (!error) {
+ error = resume_target_kernel(platform_mode);
+ /*
+ * The above should either succeed and jump to the new kernel,
+ * or return with an error. Otherwise things are just
+ * undefined, so let's be paranoid.
+ */
+ BUG_ON(!error);
+ }
+ dpm_resume_end(PMSG_RECOVER);
+ pm_restore_gfp_mask();
+ resume_console();
+ pm_restore_console();
+ return error;
+}
+
+/**
+ * hibernation_platform_enter - Power off the system using the platform driver.
+ */
+int hibernation_platform_enter(void)
+{
+ int error;
+
+ if (!hibernation_ops)
+ return -ENOSYS;
+
+ /*
+ * We have cancelled the power transition by running
+ * hibernation_ops->finish() before saving the image, so we should let
+ * the firmware know that we're going to enter the sleep state after all
+ */
+ error = hibernation_ops->begin();
+ if (error)
+ goto Close;
+
+ entering_platform_hibernation = true;
+ suspend_console();
+ error = dpm_suspend_start(PMSG_HIBERNATE);
+ if (error) {
+ if (hibernation_ops->recover)
+ hibernation_ops->recover();
+ goto Resume_devices;
+ }
+
+ error = dpm_suspend_end(PMSG_HIBERNATE);
+ if (error)
+ goto Resume_devices;
+
+ error = hibernation_ops->prepare();
+ if (error)
+ goto Platform_finish;
+
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Platform_finish;
+
+ local_irq_disable();
+ syscore_suspend();
+ if (pm_wakeup_pending()) {
+ error = -EAGAIN;
+ goto Power_up;
+ }
+
+ hibernation_ops->enter();
+ /* We should never get here */
+ while (1);
+
+ Power_up:
+ syscore_resume();
+ local_irq_enable();
+ enable_nonboot_cpus();
+
+ Platform_finish:
+ hibernation_ops->finish();
+
+ dpm_resume_start(PMSG_RESTORE);
+
+ Resume_devices:
+ entering_platform_hibernation = false;
+ dpm_resume_end(PMSG_RESTORE);
+ resume_console();
+
+ Close:
+ hibernation_ops->end();
+
+ return error;
+}
+
+/**
+ * power_down - Shut the machine down for hibernation.
+ *
+ * Use the platform driver, if configured, to put the system into the sleep
+ * state corresponding to hibernation, or try to power it off or reboot,
+ * depending on the value of hibernation_mode.
+ */
+static void power_down(void)
+{
+#ifdef CONFIG_SUSPEND
+ int error;
+#endif
+
+ switch (hibernation_mode) {
+ case HIBERNATION_REBOOT:
+ kernel_restart(NULL);
+ break;
+ case HIBERNATION_PLATFORM:
+ hibernation_platform_enter();
+ case HIBERNATION_SHUTDOWN:
+ if (pm_power_off)
+ kernel_power_off();
+ break;
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ if (error) {
+ if (hibernation_ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+ power_down();
+ }
+ /*
+ * Restore swap signature.
+ */
+ error = swsusp_unmark();
+ if (error)
+ printk(KERN_ERR "PM: Swap will be unusable! "
+ "Try swapon -a.\n");
+ return;
+#endif
+ }
+ kernel_halt();
+ /*
+ * Valid image is on the disk, if we continue we risk serious data
+ * corruption after resume.
+ */
+ printk(KERN_CRIT "PM: Please power down manually\n");
+ while (1)
+ cpu_relax();
+}
+
+/**
+ * hibernate - Carry out system hibernation, including saving the image.
+ */
+int hibernate(void)
+{
+ int error;
+
+ if (test_action_state(TOI_REPLACE_SWSUSP))
+ return try_tuxonice_hibernate();
+
+ if (!hibernation_available()) {
+ pr_debug("PM: Hibernation not available.\n");
+ return -EPERM;
+ }
+
+ lock_system_sleep();
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ goto Unlock;
+ }
+
+ pm_prepare_console();
+ error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ if (error)
+ goto Exit;
+
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+
+ error = freeze_processes();
+ if (error)
+ goto Exit;
+
+ lock_device_hotplug();
+ /* Allocate memory management structures */
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Thaw;
+
+ error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
+ if (error || freezer_test_done)
+ goto Free_bitmaps;
+
+ if (in_suspend) {
+ unsigned int flags = 0;
+
+ if (hibernation_mode == HIBERNATION_PLATFORM)
+ flags |= SF_PLATFORM_MODE;
+ if (nocompress)
+ flags |= SF_NOCOMPRESS_MODE;
+ else
+ flags |= SF_CRC32_MODE;
+
+ pr_debug("PM: writing image.\n");
+ error = swsusp_write(flags);
+ swsusp_free();
+ if (!error)
+ power_down();
+ in_suspend = 0;
+ pm_restore_gfp_mask();
+ } else {
+ pr_debug("PM: Image restored successfully.\n");
+ }
+
+ Free_bitmaps:
+ free_basic_memory_bitmaps();
+ Thaw:
+ unlock_device_hotplug();
+ thaw_processes();
+
+ /* Don't bother checking whether freezer_test_done is true */
+ freezer_test_done = false;
+ Exit:
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+ pm_restore_console();
+ atomic_inc(&snapshot_device_available);
+ Unlock:
+ unlock_system_sleep();
+ return error;
+}
+
+
+/**
+ * software_resume - Resume from a saved hibernation image.
+ *
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
+ *
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory is restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
+ */
+int software_resume(void)
+{
+ int error;
+ unsigned int flags;
+
+ resume_attempted = 1;
+
+ /*
+ * We can't know (until an image header - if any - is loaded), whether
+ * we did override swsusp. We therefore ensure that both are tried.
+ */
+ try_tuxonice_resume();
+
+ /*
+ * If the user said "noresume".. bail out early.
+ */
+ if (noresume || !hibernation_available())
+ return 0;
+
+ /*
+ * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
+ * is configured into the kernel. Since the regular hibernate
+ * trigger path is via sysfs which takes a buffer mutex before
+ * calling hibernate functions (which take pm_mutex) this can
+ * cause lockdep to complain about a possible ABBA deadlock
+ * which cannot happen since we're in the boot code here and
+ * sysfs can't be invoked yet. Therefore, we use a subclass
+ * here to avoid lockdep complaining.
+ */
+ mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+
+ if (swsusp_resume_device)
+ goto Check_image;
+
+ if (!strlen(resume_file)) {
+ error = -ENOENT;
+ goto Unlock;
+ }
+
+ pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
+
+ if (resume_delay) {
+ printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
+ resume_delay);
+ ssleep(resume_delay);
+ }
+
+ /* Check if the device is there */
+ swsusp_resume_device = name_to_dev_t(resume_file);
+
+ /*
+ * name_to_dev_t is ineffective to verify parition if resume_file is in
+ * integer format. (e.g. major:minor)
+ */
+ if (isdigit(resume_file[0]) && resume_wait) {
+ int partno;
+ while (!get_gendisk(swsusp_resume_device, &partno))
+ msleep(10);
+ }
+
+ if (!swsusp_resume_device) {
+ /*
+ * Some device discovery might still be in progress; we need
+ * to wait for this to finish.
+ */
+ wait_for_device_probe();
+
+ if (resume_wait) {
+ while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
+ msleep(10);
+ async_synchronize_full();
+ }
+
+ swsusp_resume_device = name_to_dev_t(resume_file);
+ if (!swsusp_resume_device) {
+ error = -ENODEV;
+ goto Unlock;
+ }
+ }
+
+ Check_image:
+ pr_debug("PM: Hibernation image partition %d:%d present\n",
+ MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
+
+ pr_debug("PM: Looking for hibernation image.\n");
+ error = swsusp_check();
+ if (error)
+ goto Unlock;
+
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ swsusp_close(FMODE_READ);
+ goto Unlock;
+ }
+
+ pm_prepare_console();
+ error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ if (error)
+ goto Close_Finish;
+
+ pr_debug("PM: Preparing processes for restore.\n");
+ error = freeze_processes();
+ if (error)
+ goto Close_Finish;
+
+ pr_debug("PM: Loading hibernation image.\n");
+
+ lock_device_hotplug();
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Thaw;
+
+ error = swsusp_read(&flags);
+ swsusp_close(FMODE_READ);
+ if (!error)
+ hibernation_restore(flags & SF_PLATFORM_MODE);
+
+ printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+ swsusp_free();
+ free_basic_memory_bitmaps();
+ Thaw:
+ unlock_device_hotplug();
+ thaw_processes();
+ Finish:
+ pm_notifier_call_chain(PM_POST_RESTORE);
+ pm_restore_console();
+ atomic_inc(&snapshot_device_available);
+ /* For success case, the suspend path will release the lock */
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ pr_debug("PM: Hibernation image not present or could not be loaded.\n");
+ return error;
+ Close_Finish:
+ swsusp_close(FMODE_READ);
+ goto Finish;
+}
+
+late_initcall_sync(software_resume);
+
+
+static const char * const hibernation_modes[] = {
+ [HIBERNATION_PLATFORM] = "platform",
+ [HIBERNATION_SHUTDOWN] = "shutdown",
+ [HIBERNATION_REBOOT] = "reboot",
+#ifdef CONFIG_SUSPEND
+ [HIBERNATION_SUSPEND] = "suspend",
+#endif
+};
+
+/*
+ * /sys/power/disk - Control hibernation mode.
+ *
+ * Hibernation can be handled in several ways. There are a few different ways
+ * to put the system into the sleep state: using the platform driver (e.g. ACPI
+ * or other hibernation_ops), powering it off or rebooting it (for testing
+ * mostly).
+ *
+ * The sysfs file /sys/power/disk provides an interface for selecting the
+ * hibernation mode to use. Reading from this file causes the available modes
+ * to be printed. There are 3 modes that can be supported:
+ *
+ * 'platform'
+ * 'shutdown'
+ * 'reboot'
+ *
+ * If a platform hibernation driver is in use, 'platform' will be supported
+ * and will be used by default. Otherwise, 'shutdown' will be used by default.
+ * The selected option (i.e. the one corresponding to the current value of
+ * hibernation_mode) is enclosed by a square bracket.
+ *
+ * To select a given hibernation mode it is necessary to write the mode's
+ * string representation (as returned by reading from /sys/power/disk) back
+ * into /sys/power/disk.
+ */
+
+static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int i;
+ char *start = buf;
+
+ if (!hibernation_available())
+ return sprintf(buf, "[disabled]\n");
+
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!hibernation_modes[i])
+ continue;
+ switch (i) {
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+#endif
+ break;
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
+ break;
+ /* not a valid mode, continue with loop */
+ continue;
+ }
+ if (i == hibernation_mode)
+ buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
+ else
+ buf += sprintf(buf, "%s ", hibernation_modes[i]);
+ }
+ buf += sprintf(buf, "\n");
+ return buf-start;
+}
+
+static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = 0;
+ int i;
+ int len;
+ char *p;
+ int mode = HIBERNATION_INVALID;
+
+ if (!hibernation_available())
+ return -EPERM;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ lock_system_sleep();
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (len == strlen(hibernation_modes[i])
+ && !strncmp(buf, hibernation_modes[i], len)) {
+ mode = i;
+ break;
+ }
+ }
+ if (mode != HIBERNATION_INVALID) {
+ switch (mode) {
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+ case HIBERNATION_SUSPEND:
+#endif
+ hibernation_mode = mode;
+ break;
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
+ hibernation_mode = mode;
+ else
+ error = -EINVAL;
+ }
+ } else
+ error = -EINVAL;
+
+ if (!error)
+ pr_debug("PM: Hibernation mode set to '%s'\n",
+ hibernation_modes[mode]);
+ unlock_system_sleep();
+ return error ? error : n;
+}
+
+power_attr(disk);
+
+static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
+ MINOR(swsusp_resume_device));
+}
+
+static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ dev_t res;
+ int len = n;
+ char *name;
+
+ if (len && buf[len-1] == '\n')
+ len--;
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ res = name_to_dev_t(name);
+ kfree(name);
+ if (!res)
+ return -EINVAL;
+
+ lock_system_sleep();
+ swsusp_resume_device = res;
+ unlock_system_sleep();
+ printk(KERN_INFO "PM: Starting manual resume from disk\n");
+ noresume = 0;
+ software_resume();
+ return n;
+}
+
+power_attr(resume);
+
+static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", image_size);
+}
+
+static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long size;
+
+ if (sscanf(buf, "%lu", &size) == 1) {
+ image_size = size;
+ return n;
+ }
+
+ return -EINVAL;
+}
+
+power_attr(image_size);
+
+static ssize_t reserved_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", reserved_size);
+}
+
+static ssize_t reserved_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long size;
+
+ if (sscanf(buf, "%lu", &size) == 1) {
+ reserved_size = size;
+ return n;
+ }
+
+ return -EINVAL;
+}
+
+power_attr(reserved_size);
+
+static struct attribute * g[] = {
+ &disk_attr.attr,
+ &resume_attr.attr,
+ &image_size_attr.attr,
+ &reserved_size_attr.attr,
+ NULL,
+};
+
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+
+static int __init pm_disk_init(void)
+{
+ return sysfs_create_group(power_kobj, &attr_group);
+}
+
+core_initcall(pm_disk_init);
+
+
+static int __init resume_setup(char *str)
+{
+ if (noresume)
+ return 1;
+
+ strncpy( resume_file, str, 255 );
+ return 1;
+}
+
+static int __init resume_offset_setup(char *str)
+{
+ unsigned long long offset;
+
+ if (noresume)
+ return 1;
+
+ if (sscanf(str, "%llu", &offset) == 1)
+ swsusp_resume_block = offset;
+
+ return 1;
+}
+
+static int __init hibernate_setup(char *str)
+{
+ if (!strncmp(str, "noresume", 8))
+ noresume = 1;
+ else if (!strncmp(str, "nocompress", 10))
+ nocompress = 1;
+ else if (!strncmp(str, "no", 2)) {
+ noresume = 1;
+ nohibernate = 1;
+ }
+ return 1;
+}
+
+static int __init noresume_setup(char *str)
+{
+ noresume = 1;
+ set_toi_state(TOI_NORESUME_SPECIFIED);
+ return 1;
+}
+
+static int __init resumewait_setup(char *str)
+{
+ resume_wait = 1;
+ return 1;
+}
+
+static int __init resumedelay_setup(char *str)
+{
+ int rc = kstrtouint(str, 0, &resume_delay);
+
+ if (rc)
+ return rc;
+ return 1;
+}
+
+static int __init nohibernate_setup(char *str)
+{
+ noresume = 1;
+ nohibernate = 1;
+ return 1;
+}
+
+static int __init kaslr_nohibernate_setup(char *str)
+{
+ return nohibernate_setup(str);
+}
+
+__setup("noresume", noresume_setup);
+__setup("resume_offset=", resume_offset_setup);
+__setup("resume=", resume_setup);
+__setup("hibernate=", hibernate_setup);
+__setup("resumewait", resumewait_setup);
+__setup("resumedelay=", resumedelay_setup);
+__setup("nohibernate", nohibernate_setup);
+__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
new file mode 100644
index 000000000..86e8157a4
--- /dev/null
+++ b/kernel/power/main.c
@@ -0,0 +1,646 @@
+/*
+ * kernel/power/main.c - PM subsystem core functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ *
+ * This file is released under the GPLv2
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/pm-trace.h>
+#include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "power.h"
+
+DEFINE_MUTEX(pm_mutex);
+
+#ifdef CONFIG_PM_SLEEP
+
+/* Routines for PM-transition notifications */
+
+static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
+
+int register_pm_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&pm_chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(register_pm_notifier);
+
+int unregister_pm_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&pm_chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_pm_notifier);
+
+int pm_notifier_call_chain(unsigned long val)
+{
+ int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+
+ return notifier_to_errno(ret);
+}
+
+/* If set, devices may be suspended and resumed asynchronously. */
+int pm_async_enabled = 1;
+
+static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", pm_async_enabled);
+}
+
+static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_async_enabled = val;
+ return n;
+}
+
+power_attr(pm_async);
+
+#ifdef CONFIG_PM_DEBUG
+int pm_test_level = TEST_NONE;
+
+static const char * const pm_tests[__TEST_AFTER_LAST] = {
+ [TEST_NONE] = "none",
+ [TEST_CORE] = "core",
+ [TEST_CPUS] = "processors",
+ [TEST_PLATFORM] = "platform",
+ [TEST_DEVICES] = "devices",
+ [TEST_FREEZER] = "freezer",
+};
+
+static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ char *s = buf;
+ int level;
+
+ for (level = TEST_FIRST; level <= TEST_MAX; level++)
+ if (pm_tests[level]) {
+ if (level == pm_test_level)
+ s += sprintf(s, "[%s] ", pm_tests[level]);
+ else
+ s += sprintf(s, "%s ", pm_tests[level]);
+ }
+
+ if (s != buf)
+ /* convert the last space to a newline */
+ *(s-1) = '\n';
+
+ return (s - buf);
+}
+
+static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ const char * const *s;
+ int level;
+ char *p;
+ int len;
+ int error = -EINVAL;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ lock_system_sleep();
+
+ level = TEST_FIRST;
+ for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
+ if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
+ pm_test_level = level;
+ error = 0;
+ break;
+ }
+
+ unlock_system_sleep();
+
+ return error ? error : n;
+}
+
+power_attr(pm_test);
+#endif /* CONFIG_PM_DEBUG */
+
+#ifdef CONFIG_DEBUG_FS
+static char *suspend_step_name(enum suspend_stat_step step)
+{
+ switch (step) {
+ case SUSPEND_FREEZE:
+ return "freeze";
+ case SUSPEND_PREPARE:
+ return "prepare";
+ case SUSPEND_SUSPEND:
+ return "suspend";
+ case SUSPEND_SUSPEND_NOIRQ:
+ return "suspend_noirq";
+ case SUSPEND_RESUME_NOIRQ:
+ return "resume_noirq";
+ case SUSPEND_RESUME:
+ return "resume";
+ default:
+ return "";
+ }
+}
+
+static int suspend_stats_show(struct seq_file *s, void *unused)
+{
+ int i, index, last_dev, last_errno, last_step;
+
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
+ last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
+ last_errno %= REC_FAILED_NUM;
+ last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
+ last_step %= REC_FAILED_NUM;
+ seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
+ "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
+ "success", suspend_stats.success,
+ "fail", suspend_stats.fail,
+ "failed_freeze", suspend_stats.failed_freeze,
+ "failed_prepare", suspend_stats.failed_prepare,
+ "failed_suspend", suspend_stats.failed_suspend,
+ "failed_suspend_late",
+ suspend_stats.failed_suspend_late,
+ "failed_suspend_noirq",
+ suspend_stats.failed_suspend_noirq,
+ "failed_resume", suspend_stats.failed_resume,
+ "failed_resume_early",
+ suspend_stats.failed_resume_early,
+ "failed_resume_noirq",
+ suspend_stats.failed_resume_noirq);
+ seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
+ suspend_stats.failed_devs[last_dev]);
+ for (i = 1; i < REC_FAILED_NUM; i++) {
+ index = last_dev + REC_FAILED_NUM - i;
+ index %= REC_FAILED_NUM;
+ seq_printf(s, "\t\t\t%-s\n",
+ suspend_stats.failed_devs[index]);
+ }
+ seq_printf(s, " last_failed_errno:\t%-d\n",
+ suspend_stats.errno[last_errno]);
+ for (i = 1; i < REC_FAILED_NUM; i++) {
+ index = last_errno + REC_FAILED_NUM - i;
+ index %= REC_FAILED_NUM;
+ seq_printf(s, "\t\t\t%-d\n",
+ suspend_stats.errno[index]);
+ }
+ seq_printf(s, " last_failed_step:\t%-s\n",
+ suspend_step_name(
+ suspend_stats.failed_steps[last_step]));
+ for (i = 1; i < REC_FAILED_NUM; i++) {
+ index = last_step + REC_FAILED_NUM - i;
+ index %= REC_FAILED_NUM;
+ seq_printf(s, "\t\t\t%-s\n",
+ suspend_step_name(
+ suspend_stats.failed_steps[index]));
+ }
+
+ return 0;
+}
+
+static int suspend_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, suspend_stats_show, NULL);
+}
+
+static const struct file_operations suspend_stats_operations = {
+ .open = suspend_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init pm_debugfs_init(void)
+{
+ debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
+ NULL, NULL, &suspend_stats_operations);
+ return 0;
+}
+
+late_initcall(pm_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
+
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_PM_SLEEP_DEBUG
+/*
+ * pm_print_times: print time taken by devices to suspend and resume.
+ *
+ * show() returns whether printing of suspend and resume times is enabled.
+ * store() accepts 0 or 1. 0 disables printing and 1 enables it.
+ */
+bool pm_print_times_enabled;
+
+static ssize_t pm_print_times_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", pm_print_times_enabled);
+}
+
+static ssize_t pm_print_times_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_print_times_enabled = !!val;
+ return n;
+}
+
+power_attr(pm_print_times);
+
+static inline void pm_print_times_init(void)
+{
+ pm_print_times_enabled = !!initcall_debug;
+}
+#else /* !CONFIG_PP_SLEEP_DEBUG */
+static inline void pm_print_times_init(void) {}
+#endif /* CONFIG_PM_SLEEP_DEBUG */
+
+struct kobject *power_kobj;
+
+/**
+ * state - control system sleep states.
+ *
+ * show() returns available sleep state labels, which may be "mem", "standby",
+ * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
+ * description of what they mean.
+ *
+ * store() accepts one of those strings, translates it into the proper
+ * enumerated value, and initiates a suspend transition.
+ */
+static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ char *s = buf;
+#ifdef CONFIG_SUSPEND
+ suspend_state_t i;
+
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (pm_states[i])
+ s += sprintf(s,"%s ", pm_states[i]);
+
+#endif
+ if (hibernation_available())
+ s += sprintf(s, "disk ");
+ if (s != buf)
+ /* convert the last space to a newline */
+ *(s-1) = '\n';
+ return (s - buf);
+}
+
+static suspend_state_t decode_state(const char *buf, size_t n)
+{
+#ifdef CONFIG_SUSPEND
+ suspend_state_t state;
+#endif
+ char *p;
+ int len;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ /* Check hibernation first. */
+ if (len == 4 && !strncmp(buf, "disk", len))
+ return PM_SUSPEND_MAX;
+
+#ifdef CONFIG_SUSPEND
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = pm_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
+ return state;
+ }
+#endif
+
+ return PM_SUSPEND_ON;
+}
+
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ state = decode_state(buf, n);
+ if (state < PM_SUSPEND_MAX)
+ error = pm_suspend(state);
+ else if (state == PM_SUSPEND_MAX)
+ error = hibernate();
+ else
+ error = -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
+ return error ? error : n;
+}
+
+power_attr(state);
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The 'wakeup_count' attribute, along with the functions defined in
+ * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
+ * handled in a non-racy way.
+ *
+ * If a wakeup event occurs when the system is in a sleep state, it simply is
+ * woken up. In turn, if an event that would wake the system up from a sleep
+ * state occurs when it is undergoing a transition to that sleep state, the
+ * transition should be aborted. Moreover, if such an event occurs when the
+ * system is in the working state, an attempt to start a transition to the
+ * given sleep state should fail during certain period after the detection of
+ * the event. Using the 'state' attribute alone is not sufficient to satisfy
+ * these requirements, because a wakeup event may occur exactly when 'state'
+ * is being written to and may be delivered to user space right before it is
+ * frozen, so the event will remain only partially processed until the system is
+ * woken up by another event. In particular, it won't cause the transition to
+ * a sleep state to be aborted.
+ *
+ * This difficulty may be overcome if user space uses 'wakeup_count' before
+ * writing to 'state'. It first should read from 'wakeup_count' and store
+ * the read value. Then, after carrying out its own preparations for the system
+ * transition to a sleep state, it should write the stored value to
+ * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
+ * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
+ * is allowed to write to 'state', but the transition will be aborted if there
+ * are any wakeup events detected after 'wakeup_count' was written to.
+ */
+
+static ssize_t wakeup_count_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ unsigned int val;
+
+ return pm_get_wakeup_count(&val, true) ?
+ sprintf(buf, "%u\n", val) : -EINTR;
+}
+
+static ssize_t wakeup_count_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned int val;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ error = -EINVAL;
+ if (sscanf(buf, "%u", &val) == 1) {
+ if (pm_save_wakeup_count(val))
+ error = n;
+ else
+ pm_print_active_wakeup_sources();
+ }
+
+ out:
+ pm_autosleep_unlock();
+ return error;
+}
+
+power_attr(wakeup_count);
+
+#ifdef CONFIG_PM_AUTOSLEEP
+static ssize_t autosleep_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ suspend_state_t state = pm_autosleep_state();
+
+ if (state == PM_SUSPEND_ON)
+ return sprintf(buf, "off\n");
+
+#ifdef CONFIG_SUSPEND
+ if (state < PM_SUSPEND_MAX)
+ return sprintf(buf, "%s\n", pm_states[state] ?
+ pm_states[state] : "error");
+#endif
+#ifdef CONFIG_HIBERNATION
+ return sprintf(buf, "disk\n");
+#else
+ return sprintf(buf, "error");
+#endif
+}
+
+static ssize_t autosleep_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state = decode_state(buf, n);
+ int error;
+
+ if (state == PM_SUSPEND_ON
+ && strcmp(buf, "off") && strcmp(buf, "off\n"))
+ return -EINVAL;
+
+ error = pm_autosleep_set_state(state);
+ return error ? error : n;
+}
+
+power_attr(autosleep);
+#endif /* CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+static ssize_t wake_lock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, true);
+}
+
+static ssize_t wake_lock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_lock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_lock);
+
+static ssize_t wake_unlock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, false);
+}
+
+static ssize_t wake_unlock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_unlock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_unlock);
+
+#endif /* CONFIG_PM_WAKELOCKS */
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_PM_TRACE
+int pm_trace_enabled;
+
+static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", pm_trace_enabled);
+}
+
+static ssize_t
+pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int val;
+
+ if (sscanf(buf, "%d", &val) == 1) {
+ pm_trace_enabled = !!val;
+ if (pm_trace_enabled) {
+ pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n"
+ "PM: Correct system time has to be restored manually after resume.\n");
+ }
+ return n;
+ }
+ return -EINVAL;
+}
+
+power_attr(pm_trace);
+
+static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return show_trace_dev_match(buf, PAGE_SIZE);
+}
+
+static ssize_t
+pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ return -EINVAL;
+}
+
+power_attr(pm_trace_dev_match);
+
+#endif /* CONFIG_PM_TRACE */
+
+#ifdef CONFIG_FREEZER
+static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", freeze_timeout_msecs);
+}
+
+static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ freeze_timeout_msecs = val;
+ return n;
+}
+
+power_attr(pm_freeze_timeout);
+
+#endif /* CONFIG_FREEZER*/
+
+static struct attribute * g[] = {
+ &state_attr.attr,
+#ifdef CONFIG_PM_TRACE
+ &pm_trace_attr.attr,
+ &pm_trace_dev_match_attr.attr,
+#endif
+#ifdef CONFIG_PM_SLEEP
+ &pm_async_attr.attr,
+ &wakeup_count_attr.attr,
+#ifdef CONFIG_PM_AUTOSLEEP
+ &autosleep_attr.attr,
+#endif
+#ifdef CONFIG_PM_WAKELOCKS
+ &wake_lock_attr.attr,
+ &wake_unlock_attr.attr,
+#endif
+#ifdef CONFIG_PM_DEBUG
+ &pm_test_attr.attr,
+#endif
+#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_print_times_attr.attr,
+#endif
+#endif
+#ifdef CONFIG_FREEZER
+ &pm_freeze_timeout_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+struct workqueue_struct *pm_wq;
+EXPORT_SYMBOL_GPL(pm_wq);
+
+static int __init pm_start_workqueue(void)
+{
+ pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
+
+ return pm_wq ? 0 : -ENOMEM;
+}
+
+static int __init pm_init(void)
+{
+ int error = pm_start_workqueue();
+ if (error)
+ return error;
+ hibernate_image_size_init();
+ hibernate_reserved_size_init();
+ power_kobj = kobject_create_and_add("power", NULL);
+ if (!power_kobj)
+ return -ENOMEM;
+ error = sysfs_create_group(power_kobj, &attr_group);
+ if (error)
+ return error;
+ pm_print_times_init();
+ return pm_autosleep_init();
+}
+
+core_initcall(pm_init);
diff --git a/kernel/power/power.h b/kernel/power/power.h
new file mode 100644
index 000000000..095ed9f03
--- /dev/null
+++ b/kernel/power/power.h
@@ -0,0 +1,335 @@
+#include <linux/suspend.h>
+#include <linux/suspend_ioctls.h>
+#include <linux/utsname.h>
+#include <linux/freezer.h>
+#include <linux/compiler.h>
+
+struct swsusp_info {
+ struct new_utsname uts;
+ u32 version_code;
+ unsigned long num_physpages;
+ int cpus;
+ unsigned long image_pages;
+ unsigned long pages;
+ unsigned long size;
+} __aligned(PAGE_SIZE);
+
+#ifdef CONFIG_HIBERNATION
+/* kernel/power/snapshot.c */
+extern void __init hibernate_reserved_size_init(void);
+extern void __init hibernate_image_size_init(void);
+
+#ifdef CONFIG_ARCH_HIBERNATION_HEADER
+/* Maximum size of architecture specific data in a hibernation header */
+#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
+
+extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
+extern int arch_hibernation_header_restore(void *addr);
+
+static inline int init_header_complete(struct swsusp_info *info)
+{
+ return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
+}
+
+static inline char *check_image_kernel(struct swsusp_info *info)
+{
+ return arch_hibernation_header_restore(info) ?
+ "architecture specific data" : NULL;
+}
+#else
+extern char *check_image_kernel(struct swsusp_info *info);
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+extern int init_header(struct swsusp_info *info);
+
+extern char resume_file[256];
+/*
+ * Keep some memory free so that I/O operations can succeed without paging
+ * [Might this be more than 4 MB?]
+ */
+#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
+
+/*
+ * Keep 1 MB of memory free so that device drivers can allocate some pages in
+ * their .suspend() routines without breaking the suspend to disk.
+ */
+#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
+
+asmlinkage int swsusp_save(void);
+
+/* kernel/power/hibernate.c */
+extern bool freezer_test_done;
+
+extern int hibernation_snapshot(int platform_mode);
+extern int hibernation_restore(int platform_mode);
+extern int hibernation_platform_enter(void);
+
+#else /* !CONFIG_HIBERNATION */
+
+static inline void hibernate_reserved_size_init(void) {}
+static inline void hibernate_image_size_init(void) {}
+#endif /* !CONFIG_HIBERNATION */
+
+extern int pfn_is_nosave(unsigned long);
+
+#define power_attr(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = 0644, \
+ }, \
+ .show = _name##_show, \
+ .store = _name##_store, \
+}
+
+extern struct pbe *restore_pblist;
+
+/* Preferred image size in bytes (default 500 MB) */
+extern unsigned long image_size;
+/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
+extern unsigned long reserved_size;
+extern int in_suspend;
+extern dev_t swsusp_resume_device;
+extern sector_t swsusp_resume_block;
+
+extern asmlinkage int swsusp_arch_suspend(void);
+extern asmlinkage int swsusp_arch_resume(void);
+
+extern int create_basic_memory_bitmaps(void);
+extern void free_basic_memory_bitmaps(void);
+extern int hibernate_preallocate_memory(void);
+
+/**
+ * Auxiliary structure used for reading the snapshot image data and
+ * metadata from and writing them to the list of page backup entries
+ * (PBEs) which is the main data structure of swsusp.
+ *
+ * Using struct snapshot_handle we can transfer the image, including its
+ * metadata, as a continuous sequence of bytes with the help of
+ * snapshot_read_next() and snapshot_write_next().
+ *
+ * The code that writes the image to a storage or transfers it to
+ * the user land is required to use snapshot_read_next() for this
+ * purpose and it should not make any assumptions regarding the internal
+ * structure of the image. Similarly, the code that reads the image from
+ * a storage or transfers it from the user land is required to use
+ * snapshot_write_next().
+ *
+ * This may allow us to change the internal structure of the image
+ * in the future with considerably less effort.
+ */
+
+struct snapshot_handle {
+ unsigned int cur; /* number of the block of PAGE_SIZE bytes the
+ * next operation will refer to (ie. current)
+ */
+ void *buffer; /* address of the block to read from
+ * or write to
+ */
+ int sync_read; /* Set to one to notify the caller of
+ * snapshot_write_next() that it may
+ * need to call wait_on_bio_chain()
+ */
+};
+
+/* This macro returns the address from/to which the caller of
+ * snapshot_read_next()/snapshot_write_next() is allowed to
+ * read/write data after the function returns
+ */
+#define data_of(handle) ((handle).buffer)
+
+extern unsigned int snapshot_additional_pages(struct zone *zone);
+extern unsigned long snapshot_get_image_size(void);
+extern int snapshot_read_next(struct snapshot_handle *handle);
+extern int snapshot_write_next(struct snapshot_handle *handle);
+extern void snapshot_write_finalize(struct snapshot_handle *handle);
+extern int snapshot_image_loaded(struct snapshot_handle *handle);
+
+/* If unset, the snapshot device cannot be open. */
+extern atomic_t snapshot_device_available;
+
+extern sector_t alloc_swapdev_block(int swap);
+extern void free_all_swap_pages(int swap);
+extern int swsusp_swap_in_use(void);
+
+/*
+ * Flags that can be passed from the hibernatig hernel to the "boot" kernel in
+ * the image header.
+ */
+#define SF_PLATFORM_MODE 1
+#define SF_NOCOMPRESS_MODE 2
+#define SF_CRC32_MODE 4
+
+/* kernel/power/hibernate.c */
+extern int swsusp_check(void);
+extern void swsusp_free(void);
+extern int swsusp_read(unsigned int *flags_p);
+extern int swsusp_write(unsigned int flags);
+extern void swsusp_close(fmode_t);
+#ifdef CONFIG_SUSPEND
+extern int swsusp_unmark(void);
+#endif
+
+/* kernel/power/block_io.c */
+extern struct block_device *hib_resume_bdev;
+
+extern int hib_bio_read_page(pgoff_t page_off, void *addr,
+ struct bio **bio_chain);
+extern int hib_bio_write_page(pgoff_t page_off, void *addr,
+ struct bio **bio_chain);
+extern int hib_wait_on_bio_chain(struct bio **bio_chain);
+
+struct timeval;
+/* kernel/power/swsusp.c */
+extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
+
+#ifdef CONFIG_SUSPEND
+/* kernel/power/suspend.c */
+extern const char *pm_labels[];
+extern const char *pm_states[];
+
+extern int suspend_devices_and_enter(suspend_state_t state);
+#else /* !CONFIG_SUSPEND */
+static inline int suspend_devices_and_enter(suspend_state_t state)
+{
+ return -ENOSYS;
+}
+#endif /* !CONFIG_SUSPEND */
+
+#ifdef CONFIG_PM_TEST_SUSPEND
+/* kernel/power/suspend_test.c */
+extern void suspend_test_start(void);
+extern void suspend_test_finish(const char *label);
+#else /* !CONFIG_PM_TEST_SUSPEND */
+static inline void suspend_test_start(void) {}
+static inline void suspend_test_finish(const char *label) {}
+#endif /* !CONFIG_PM_TEST_SUSPEND */
+
+#ifdef CONFIG_PM_SLEEP
+/* kernel/power/main.c */
+extern int pm_notifier_call_chain(unsigned long val);
+#endif
+
+#ifdef CONFIG_HIGHMEM
+int restore_highmem(void);
+#else
+static inline unsigned int count_highmem_pages(void) { return 0; }
+static inline int restore_highmem(void) { return 0; }
+#endif
+
+/*
+ * Suspend test levels
+ */
+enum {
+ /* keep first */
+ TEST_NONE,
+ TEST_CORE,
+ TEST_CPUS,
+ TEST_PLATFORM,
+ TEST_DEVICES,
+ TEST_FREEZER,
+ /* keep last */
+ __TEST_AFTER_LAST
+};
+
+#define TEST_FIRST TEST_NONE
+#define TEST_MAX (__TEST_AFTER_LAST - 1)
+
+extern int pm_test_level;
+
+#ifdef CONFIG_SUSPEND_FREEZER
+static inline int suspend_freeze_processes(void)
+{
+ int error;
+
+ error = freeze_processes();
+ /*
+ * freeze_processes() automatically thaws every task if freezing
+ * fails. So we need not do anything extra upon error.
+ */
+ if (error)
+ return error;
+
+ error = freeze_kernel_threads();
+ /*
+ * freeze_kernel_threads() thaws only kernel threads upon freezing
+ * failure. So we have to thaw the userspace tasks ourselves.
+ */
+ if (error)
+ thaw_processes();
+
+ return error;
+}
+
+static inline void suspend_thaw_processes(void)
+{
+ thaw_processes();
+}
+#else
+static inline int suspend_freeze_processes(void)
+{
+ return 0;
+}
+
+static inline void suspend_thaw_processes(void)
+{
+}
+#endif
+
+extern struct page *saveable_page(struct zone *z, unsigned long p);
+#ifdef CONFIG_HIGHMEM
+struct page *saveable_highmem_page(struct zone *z, unsigned long p);
+#else
+static
+inline void *saveable_highmem_page(struct zone *z, unsigned long p)
+{
+ return NULL;
+}
+#endif
+
+#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
+extern struct list_head nosave_regions;
+
+/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+#ifdef CONFIG_PM_AUTOSLEEP
+
+/* kernel/power/autosleep.c */
+extern int pm_autosleep_init(void);
+extern int pm_autosleep_lock(void);
+extern void pm_autosleep_unlock(void);
+extern suspend_state_t pm_autosleep_state(void);
+extern int pm_autosleep_set_state(suspend_state_t state);
+
+#else /* !CONFIG_PM_AUTOSLEEP */
+
+static inline int pm_autosleep_init(void) { return 0; }
+static inline int pm_autosleep_lock(void) { return 0; }
+static inline void pm_autosleep_unlock(void) {}
+static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
+
+#endif /* !CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+
+/* kernel/power/wakelock.c */
+extern ssize_t pm_show_wakelocks(char *buf, bool show_active);
+extern int pm_wake_lock(const char *buf);
+extern int pm_wake_unlock(const char *buf);
+
+#endif /* !CONFIG_PM_WAKELOCKS */
+
+#ifdef CONFIG_TOI
+unsigned long toi_get_nonconflicting_page(void);
+#define BM_END_OF_MAP (~0UL)
+#else
+#define toi_get_nonconflicting_page() (0)
+#endif
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
new file mode 100644
index 000000000..7ef6866b5
--- /dev/null
+++ b/kernel/power/poweroff.c
@@ -0,0 +1,46 @@
+/*
+ * poweroff.c - sysrq handler to gracefully power down machine.
+ *
+ * This file is released under the GPL v2
+ */
+
+#include <linux/kernel.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/workqueue.h>
+#include <linux/reboot.h>
+#include <linux/cpumask.h>
+
+/*
+ * When the user hits Sys-Rq o to power down the machine this is the
+ * callback we use.
+ */
+
+static void do_poweroff(struct work_struct *dummy)
+{
+ kernel_power_off();
+}
+
+static DECLARE_WORK(poweroff_work, do_poweroff);
+
+static void handle_poweroff(int key)
+{
+ /* run sysrq poweroff on boot cpu */
+ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
+}
+
+static struct sysrq_key_op sysrq_poweroff_op = {
+ .handler = handle_poweroff,
+ .help_msg = "poweroff(o)",
+ .action_msg = "Power Off",
+ .enable_mask = SYSRQ_ENABLE_BOOT,
+};
+
+static int __init pm_sysrq_init(void)
+{
+ register_sysrq_key('o', &sysrq_poweroff_op);
+ return 0;
+}
+
+subsys_initcall(pm_sysrq_init);
diff --git a/kernel/power/process.c b/kernel/power/process.c
new file mode 100644
index 000000000..564f786df
--- /dev/null
+++ b/kernel/power/process.c
@@ -0,0 +1,237 @@
+/*
+ * drivers/power/process.c - Functions for starting/stopping processes on
+ * suspend transitions.
+ *
+ * Originally from swsusp.
+ */
+
+
+#undef DEBUG
+
+#include <linux/interrupt.h>
+#include <linux/oom.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/freezer.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/kmod.h>
+#include <trace/events/power.h>
+
+/*
+ * Timeout for stopping processes
+ */
+unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
+
+static int try_to_freeze_tasks(bool user_only)
+{
+ struct task_struct *g, *p;
+ unsigned long end_time;
+ unsigned int todo;
+ bool wq_busy = false;
+ struct timeval start, end;
+ u64 elapsed_msecs64;
+ unsigned int elapsed_msecs;
+ bool wakeup = false;
+ int sleep_usecs = USEC_PER_MSEC;
+
+ do_gettimeofday(&start);
+
+ end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
+
+ if (!user_only)
+ freeze_workqueues_begin();
+
+ while (true) {
+ todo = 0;
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p == current || !freeze_task(p))
+ continue;
+
+ if (!freezer_should_skip(p))
+ todo++;
+ }
+ read_unlock(&tasklist_lock);
+
+ if (!user_only) {
+ wq_busy = freeze_workqueues_busy();
+ todo += wq_busy;
+ }
+
+ if (!todo || time_after(jiffies, end_time))
+ break;
+
+ if (pm_wakeup_pending()) {
+ wakeup = true;
+ break;
+ }
+
+ /*
+ * We need to retry, but first give the freezing tasks some
+ * time to enter the refrigerator. Start with an initial
+ * 1 ms sleep followed by exponential backoff until 8 ms.
+ */
+ usleep_range(sleep_usecs / 2, sleep_usecs);
+ if (sleep_usecs < 8 * USEC_PER_MSEC)
+ sleep_usecs *= 2;
+ }
+
+ do_gettimeofday(&end);
+ elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
+ do_div(elapsed_msecs64, NSEC_PER_MSEC);
+ elapsed_msecs = elapsed_msecs64;
+
+ if (todo) {
+ pr_cont("\n");
+ pr_err("Freezing of tasks %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ wakeup ? "aborted" : "failed",
+ elapsed_msecs / 1000, elapsed_msecs % 1000,
+ todo - wq_busy, wq_busy);
+
+ if (!wakeup) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p != current && !freezer_should_skip(p)
+ && freezing(p) && !frozen(p))
+ sched_show_task(p);
+ }
+ read_unlock(&tasklist_lock);
+ }
+ } else {
+ pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
+ elapsed_msecs % 1000);
+ }
+
+ return todo ? -EBUSY : 0;
+}
+
+/**
+ * freeze_processes - Signal user space processes to enter the refrigerator.
+ * The current thread will not be frozen. The same process that calls
+ * freeze_processes must later call thaw_processes.
+ *
+ * On success, returns 0. On failure, -errno and system is fully thawed.
+ */
+int freeze_processes(void)
+{
+ int error;
+
+ error = __usermodehelper_disable(UMH_FREEZING);
+ if (error)
+ return error;
+
+ /* Make sure this task doesn't get frozen */
+ current->flags |= PF_SUSPEND_TASK;
+
+ if (!pm_freezing)
+ atomic_inc(&system_freezing_cnt);
+
+ pm_wakeup_clear();
+ pr_info("Freezing user space processes ... ");
+ pm_freezing = true;
+ error = try_to_freeze_tasks(true);
+ if (!error) {
+ __usermodehelper_set_disable_depth(UMH_DISABLED);
+ pr_cont("done.");
+ }
+ pr_cont("\n");
+ BUG_ON(in_atomic());
+
+ /*
+ * Now that the whole userspace is frozen we need to disbale
+ * the OOM killer to disallow any further interference with
+ * killable tasks.
+ */
+ if (!error && !oom_killer_disable())
+ error = -EBUSY;
+
+ if (error)
+ thaw_processes();
+ return error;
+}
+
+/**
+ * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
+ *
+ * On success, returns 0. On failure, -errno and only the kernel threads are
+ * thawed, so as to give a chance to the caller to do additional cleanups
+ * (if any) before thawing the userspace tasks. So, it is the responsibility
+ * of the caller to thaw the userspace tasks, when the time is right.
+ */
+int freeze_kernel_threads(void)
+{
+ int error;
+
+ pr_info("Freezing remaining freezable tasks ... ");
+
+ pm_nosig_freezing = true;
+ error = try_to_freeze_tasks(false);
+ if (!error)
+ pr_cont("done.");
+
+ pr_cont("\n");
+ BUG_ON(in_atomic());
+
+ if (error)
+ thaw_kernel_threads();
+ return error;
+}
+
+void thaw_processes(void)
+{
+ struct task_struct *g, *p;
+ struct task_struct *curr = current;
+
+ trace_suspend_resume(TPS("thaw_processes"), 0, true);
+ if (pm_freezing)
+ atomic_dec(&system_freezing_cnt);
+ pm_freezing = false;
+ pm_nosig_freezing = false;
+
+ oom_killer_enable();
+
+ pr_info("Restarting tasks ... ");
+
+ __usermodehelper_set_disable_depth(UMH_FREEZING);
+ thaw_workqueues();
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ /* No other threads should have PF_SUSPEND_TASK set */
+ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
+ __thaw_task(p);
+ }
+ read_unlock(&tasklist_lock);
+
+ WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
+ curr->flags &= ~PF_SUSPEND_TASK;
+
+ usermodehelper_enable();
+
+ schedule();
+ pr_cont("done.\n");
+ trace_suspend_resume(TPS("thaw_processes"), 0, false);
+}
+
+void thaw_kernel_threads(void)
+{
+ struct task_struct *g, *p;
+
+ pm_nosig_freezing = false;
+ pr_info("Restarting kernel threads ... ");
+
+ thaw_workqueues();
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
+ __thaw_task(p);
+ }
+ read_unlock(&tasklist_lock);
+
+ schedule();
+ pr_cont("done.\n");
+}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
new file mode 100644
index 000000000..97b0df713
--- /dev/null
+++ b/kernel/power/qos.c
@@ -0,0 +1,713 @@
+/*
+ * This module exposes the interface to kernel space for specifying
+ * QoS dependencies. It provides infrastructure for registration of:
+ *
+ * Dependents on a QoS value : register requests
+ * Watchers of QoS value : get notified when target QoS value changes
+ *
+ * This QoS design is best effort based. Dependents register their QoS needs.
+ * Watchers register to keep track of the current QoS needs of the system.
+ *
+ * There are 3 basic classes of QoS parameter: latency, timeout, throughput
+ * each have defined units:
+ * latency: usec
+ * timeout: usec <-- currently not used.
+ * throughput: kbs (kilo byte / sec)
+ *
+ * There are lists of pm_qos_objects each one wrapping requests, notifiers
+ *
+ * User mode requests on a QOS parameter register themselves to the
+ * subsystem by opening the device node /dev/... and writing there request to
+ * the node. As long as the process holds a file handle open to the node the
+ * client continues to be accounted for. Upon file release the usermode
+ * request is removed and a new qos target is computed. This way when the
+ * request that the application has is cleaned up when closes the file
+ * pointer or exits the pm_qos_object will get an opportunity to clean up.
+ *
+ * Mark Gross <mgross@linux.intel.com>
+ */
+
+/*#define DEBUG*/
+
+#include <linux/pm_qos.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/string.h>
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <trace/events/power.h>
+
+/*
+ * locking rule: all changes to constraints or notifiers lists
+ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
+ * held, taken with _irqsave. One lock to rule them all
+ */
+struct pm_qos_object {
+ struct pm_qos_constraints *constraints;
+ struct miscdevice pm_qos_power_miscdev;
+ char *name;
+};
+
+static DEFINE_SPINLOCK(pm_qos_lock);
+
+static struct pm_qos_object null_pm_qos;
+
+static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
+static struct pm_qos_constraints cpu_dma_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
+ .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &cpu_dma_lat_notifier,
+};
+static struct pm_qos_object cpu_dma_pm_qos = {
+ .constraints = &cpu_dma_constraints,
+ .name = "cpu_dma_latency",
+};
+
+static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
+static struct pm_qos_constraints network_lat_constraints = {
+ .list = PLIST_HEAD_INIT(network_lat_constraints.list),
+ .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &network_lat_notifier,
+};
+static struct pm_qos_object network_lat_pm_qos = {
+ .constraints = &network_lat_constraints,
+ .name = "network_latency",
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
+static struct pm_qos_constraints network_tput_constraints = {
+ .list = PLIST_HEAD_INIT(network_tput_constraints.list),
+ .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &network_throughput_notifier,
+};
+static struct pm_qos_object network_throughput_pm_qos = {
+ .constraints = &network_tput_constraints,
+ .name = "network_throughput",
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
+static struct pm_qos_constraints memory_bw_constraints = {
+ .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
+ .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .type = PM_QOS_SUM,
+ .notifiers = &memory_bandwidth_notifier,
+};
+static struct pm_qos_object memory_bandwidth_pm_qos = {
+ .constraints = &memory_bw_constraints,
+ .name = "memory_bandwidth",
+};
+
+
+static struct pm_qos_object *pm_qos_array[] = {
+ &null_pm_qos,
+ &cpu_dma_pm_qos,
+ &network_lat_pm_qos,
+ &network_throughput_pm_qos,
+ &memory_bandwidth_pm_qos,
+};
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos);
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos);
+static int pm_qos_power_open(struct inode *inode, struct file *filp);
+static int pm_qos_power_release(struct inode *inode, struct file *filp);
+
+static const struct file_operations pm_qos_power_fops = {
+ .write = pm_qos_power_write,
+ .read = pm_qos_power_read,
+ .open = pm_qos_power_open,
+ .release = pm_qos_power_release,
+ .llseek = noop_llseek,
+};
+
+/* unlocked internal variant */
+static inline int pm_qos_get_value(struct pm_qos_constraints *c)
+{
+ struct plist_node *node;
+ int total_value = 0;
+
+ if (plist_head_empty(&c->list))
+ return c->no_constraint_value;
+
+ switch (c->type) {
+ case PM_QOS_MIN:
+ return plist_first(&c->list)->prio;
+
+ case PM_QOS_MAX:
+ return plist_last(&c->list)->prio;
+
+ case PM_QOS_SUM:
+ plist_for_each(node, &c->list)
+ total_value += node->prio;
+
+ return total_value;
+
+ default:
+ /* runtime check for not using enum */
+ BUG();
+ return PM_QOS_DEFAULT_VALUE;
+ }
+}
+
+s32 pm_qos_read_value(struct pm_qos_constraints *c)
+{
+ return c->target_value;
+}
+
+static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
+{
+ c->target_value = value;
+}
+
+static inline int pm_qos_get_value(struct pm_qos_constraints *c);
+static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
+{
+ struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
+ struct pm_qos_constraints *c;
+ struct pm_qos_request *req;
+ char *type;
+ unsigned long flags;
+ int tot_reqs = 0;
+ int active_reqs = 0;
+
+ if (IS_ERR_OR_NULL(qos)) {
+ pr_err("%s: bad qos param!\n", __func__);
+ return -EINVAL;
+ }
+ c = qos->constraints;
+ if (IS_ERR_OR_NULL(c)) {
+ pr_err("%s: Bad constraints on qos?\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Lock to ensure we have a snapshot */
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ if (plist_head_empty(&c->list)) {
+ seq_puts(s, "Empty!\n");
+ goto out;
+ }
+
+ switch (c->type) {
+ case PM_QOS_MIN:
+ type = "Minimum";
+ break;
+ case PM_QOS_MAX:
+ type = "Maximum";
+ break;
+ case PM_QOS_SUM:
+ type = "Sum";
+ break;
+ default:
+ type = "Unknown";
+ }
+
+ plist_for_each_entry(req, &c->list, node) {
+ char *state = "Default";
+
+ if ((req->node).prio != c->default_value) {
+ active_reqs++;
+ state = "Active";
+ }
+ tot_reqs++;
+ seq_printf(s, "%d: %d: %s\n", tot_reqs,
+ (req->node).prio, state);
+ }
+
+ seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
+ type, pm_qos_get_value(c), active_reqs, tot_reqs);
+
+out:
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ return 0;
+}
+
+static int pm_qos_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pm_qos_dbg_show_requests,
+ inode->i_private);
+}
+
+static const struct file_operations pm_qos_debug_fops = {
+ .open = pm_qos_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * pm_qos_update_target - manages the constraints list and calls the notifiers
+ * if needed
+ * @c: constraints data struct
+ * @node: request to add to the list, to update or to remove
+ * @action: action to take on the constraints list
+ * @value: value of the request to add or update
+ *
+ * This function returns 1 if the aggregated constraint value has changed, 0
+ * otherwise.
+ */
+int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
+ enum pm_qos_req_action action, int value)
+{
+ unsigned long flags;
+ int prev_value, curr_value, new_value;
+ int ret;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ prev_value = pm_qos_get_value(c);
+ if (value == PM_QOS_DEFAULT_VALUE)
+ new_value = c->default_value;
+ else
+ new_value = value;
+
+ switch (action) {
+ case PM_QOS_REMOVE_REQ:
+ plist_del(node, &c->list);
+ break;
+ case PM_QOS_UPDATE_REQ:
+ /*
+ * to change the list, we atomically remove, reinit
+ * with new value and add, then see if the extremal
+ * changed
+ */
+ plist_del(node, &c->list);
+ case PM_QOS_ADD_REQ:
+ plist_node_init(node, new_value);
+ plist_add(node, &c->list);
+ break;
+ default:
+ /* no action */
+ ;
+ }
+
+ curr_value = pm_qos_get_value(c);
+ pm_qos_set_value(c, curr_value);
+
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ trace_pm_qos_update_target(action, prev_value, curr_value);
+ if (prev_value != curr_value) {
+ ret = 1;
+ if (c->notifiers)
+ blocking_notifier_call_chain(c->notifiers,
+ (unsigned long)curr_value,
+ NULL);
+ } else {
+ ret = 0;
+ }
+ return ret;
+}
+
+/**
+ * pm_qos_flags_remove_req - Remove device PM QoS flags request.
+ * @pqf: Device PM QoS flags set to remove the request from.
+ * @req: Request to remove from the set.
+ */
+static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
+ struct pm_qos_flags_request *req)
+{
+ s32 val = 0;
+
+ list_del(&req->node);
+ list_for_each_entry(req, &pqf->list, node)
+ val |= req->flags;
+
+ pqf->effective_flags = val;
+}
+
+/**
+ * pm_qos_update_flags - Update a set of PM QoS flags.
+ * @pqf: Set of flags to update.
+ * @req: Request to add to the set, to modify, or to remove from the set.
+ * @action: Action to take on the set.
+ * @val: Value of the request to add or modify.
+ *
+ * Update the given set of PM QoS flags and call notifiers if the aggregate
+ * value has changed. Returns 1 if the aggregate constraint value has changed,
+ * 0 otherwise.
+ */
+bool pm_qos_update_flags(struct pm_qos_flags *pqf,
+ struct pm_qos_flags_request *req,
+ enum pm_qos_req_action action, s32 val)
+{
+ unsigned long irqflags;
+ s32 prev_value, curr_value;
+
+ spin_lock_irqsave(&pm_qos_lock, irqflags);
+
+ prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
+
+ switch (action) {
+ case PM_QOS_REMOVE_REQ:
+ pm_qos_flags_remove_req(pqf, req);
+ break;
+ case PM_QOS_UPDATE_REQ:
+ pm_qos_flags_remove_req(pqf, req);
+ case PM_QOS_ADD_REQ:
+ req->flags = val;
+ INIT_LIST_HEAD(&req->node);
+ list_add_tail(&req->node, &pqf->list);
+ pqf->effective_flags |= val;
+ break;
+ default:
+ /* no action */
+ ;
+ }
+
+ curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
+
+ spin_unlock_irqrestore(&pm_qos_lock, irqflags);
+
+ trace_pm_qos_update_flags(action, prev_value, curr_value);
+ return prev_value != curr_value;
+}
+
+/**
+ * pm_qos_request - returns current system wide qos expectation
+ * @pm_qos_class: identification of which qos value is requested
+ *
+ * This function returns the current target value.
+ */
+int pm_qos_request(int pm_qos_class)
+{
+ return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
+}
+EXPORT_SYMBOL_GPL(pm_qos_request);
+
+int pm_qos_request_active(struct pm_qos_request *req)
+{
+ return req->pm_qos_class != 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_request_active);
+
+static void __pm_qos_update_request(struct pm_qos_request *req,
+ s32 new_value)
+{
+ trace_pm_qos_update_request(req->pm_qos_class, new_value);
+
+ if (new_value != req->node.prio)
+ pm_qos_update_target(
+ pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_UPDATE_REQ, new_value);
+}
+
+/**
+ * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
+ * @work: work struct for the delayed work (timeout)
+ *
+ * This cancels the timeout request by falling back to the default at timeout.
+ */
+static void pm_qos_work_fn(struct work_struct *work)
+{
+ struct pm_qos_request *req = container_of(to_delayed_work(work),
+ struct pm_qos_request,
+ work);
+
+ __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+}
+
+/**
+ * pm_qos_add_request - inserts new qos request into the list
+ * @req: pointer to a preallocated handle
+ * @pm_qos_class: identifies which list of qos request to use
+ * @value: defines the qos request
+ *
+ * This function inserts a new entry in the pm_qos_class list of requested qos
+ * performance characteristics. It recomputes the aggregate QoS expectations
+ * for the pm_qos_class of parameters and initializes the pm_qos_request
+ * handle. Caller needs to save this handle for later use in updates and
+ * removal.
+ */
+
+void pm_qos_add_request(struct pm_qos_request *req,
+ int pm_qos_class, s32 value)
+{
+ if (!req) /*guard against callers passing in null */
+ return;
+
+ if (pm_qos_request_active(req)) {
+ WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
+ return;
+ }
+ req->pm_qos_class = pm_qos_class;
+ INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
+ trace_pm_qos_add_request(pm_qos_class, value);
+ pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
+ &req->node, PM_QOS_ADD_REQ, value);
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_request);
+
+/**
+ * pm_qos_update_request - modifies an existing qos request
+ * @req : handle to list element holding a pm_qos request to use
+ * @value: defines the qos request
+ *
+ * Updates an existing qos request for the pm_qos_class of parameters along
+ * with updating the target pm_qos_class value.
+ *
+ * Attempts are made to make this code callable on hot code paths.
+ */
+void pm_qos_update_request(struct pm_qos_request *req,
+ s32 new_value)
+{
+ if (!req) /*guard against callers passing in null */
+ return;
+
+ if (!pm_qos_request_active(req)) {
+ WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+ return;
+ }
+
+ cancel_delayed_work_sync(&req->work);
+ __pm_qos_update_request(req, new_value);
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_request);
+
+/**
+ * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
+ * @req : handle to list element holding a pm_qos request to use
+ * @new_value: defines the temporal qos request
+ * @timeout_us: the effective duration of this qos request in usecs.
+ *
+ * After timeout_us, this qos request is cancelled automatically.
+ */
+void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
+ unsigned long timeout_us)
+{
+ if (!req)
+ return;
+ if (WARN(!pm_qos_request_active(req),
+ "%s called for unknown object.", __func__))
+ return;
+
+ cancel_delayed_work_sync(&req->work);
+
+ trace_pm_qos_update_request_timeout(req->pm_qos_class,
+ new_value, timeout_us);
+ if (new_value != req->node.prio)
+ pm_qos_update_target(
+ pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_UPDATE_REQ, new_value);
+
+ schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
+}
+
+/**
+ * pm_qos_remove_request - modifies an existing qos request
+ * @req: handle to request list element
+ *
+ * Will remove pm qos request from the list of constraints and
+ * recompute the current target value for the pm_qos_class. Call this
+ * on slow code paths.
+ */
+void pm_qos_remove_request(struct pm_qos_request *req)
+{
+ if (!req) /*guard against callers passing in null */
+ return;
+ /* silent return to keep pcm code cleaner */
+
+ if (!pm_qos_request_active(req)) {
+ WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ return;
+ }
+
+ cancel_delayed_work_sync(&req->work);
+
+ trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
+ pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_REMOVE_REQ,
+ PM_QOS_DEFAULT_VALUE);
+ memset(req, 0, sizeof(*req));
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_request);
+
+/**
+ * pm_qos_add_notifier - sets notification entry for changes to target value
+ * @pm_qos_class: identifies which qos target changes should be notified.
+ * @notifier: notifier block managed by caller.
+ *
+ * will register the notifier into a notification chain that gets called
+ * upon changes to the pm_qos_class target value.
+ */
+int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+ int retval;
+
+ retval = blocking_notifier_chain_register(
+ pm_qos_array[pm_qos_class]->constraints->notifiers,
+ notifier);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
+
+/**
+ * pm_qos_remove_notifier - deletes notification entry from chain.
+ * @pm_qos_class: identifies which qos target changes are notified.
+ * @notifier: notifier block to be removed.
+ *
+ * will remove the notifier from the notification chain that gets called
+ * upon changes to the pm_qos_class target value.
+ */
+int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+ int retval;
+
+ retval = blocking_notifier_chain_unregister(
+ pm_qos_array[pm_qos_class]->constraints->notifiers,
+ notifier);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
+
+/* User space interface to PM QoS classes via misc devices */
+static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
+{
+ qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
+ qos->pm_qos_power_miscdev.name = qos->name;
+ qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+
+ if (d) {
+ (void)debugfs_create_file(qos->name, S_IRUGO, d,
+ (void *)qos, &pm_qos_debug_fops);
+ }
+
+ return misc_register(&qos->pm_qos_power_miscdev);
+}
+
+static int find_pm_qos_object_by_minor(int minor)
+{
+ int pm_qos_class;
+
+ for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
+ pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
+ if (minor ==
+ pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
+ return pm_qos_class;
+ }
+ return -1;
+}
+
+static int pm_qos_power_open(struct inode *inode, struct file *filp)
+{
+ long pm_qos_class;
+
+ pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
+ if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
+ struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
+ filp->private_data = req;
+
+ return 0;
+ }
+ return -EPERM;
+}
+
+static int pm_qos_power_release(struct inode *inode, struct file *filp)
+{
+ struct pm_qos_request *req;
+
+ req = filp->private_data;
+ pm_qos_remove_request(req);
+ kfree(req);
+
+ return 0;
+}
+
+
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value;
+ unsigned long flags;
+ struct pm_qos_request *req = filp->private_data;
+
+ if (!req)
+ return -EINVAL;
+ if (!pm_qos_request_active(req))
+ return -EINVAL;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
+}
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value;
+ struct pm_qos_request *req;
+
+ if (count == sizeof(s32)) {
+ if (copy_from_user(&value, buf, sizeof(s32)))
+ return -EFAULT;
+ } else {
+ int ret;
+
+ ret = kstrtos32_from_user(buf, count, 16, &value);
+ if (ret)
+ return ret;
+ }
+
+ req = filp->private_data;
+ pm_qos_update_request(req, value);
+
+ return count;
+}
+
+
+static int __init pm_qos_power_init(void)
+{
+ int ret = 0;
+ int i;
+ struct dentry *d;
+
+ BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+
+ d = debugfs_create_dir("pm_qos", NULL);
+ if (IS_ERR_OR_NULL(d))
+ d = NULL;
+
+ for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
+ ret = register_pm_qos_misc(pm_qos_array[i], d);
+ if (ret < 0) {
+ printk(KERN_ERR "pm_qos_param: %s setup failed\n",
+ pm_qos_array[i]->name);
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+late_initcall(pm_qos_power_init);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
new file mode 100644
index 000000000..ba9d20ebc
--- /dev/null
+++ b/kernel/power/snapshot.c
@@ -0,0 +1,2722 @@
+/*
+ * linux/kernel/power/snapshot.c
+ *
+ * This file provides system snapshot/restore functionality for swsusp.
+ *
+ * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pm.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/console.h>
+#include <linux/highmem.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/compiler.h>
+#include <linux/ktime.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#include "tuxonice_modules.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_alloc.h"
+#include "power.h"
+
+static int swsusp_page_is_free(struct page *);
+static void swsusp_set_page_forbidden(struct page *);
+static void swsusp_unset_page_forbidden(struct page *);
+
+/*
+ * Number of bytes to reserve for memory allocations made by device drivers
+ * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
+ * cause image creation to fail (tunable via /sys/power/reserved_size).
+ */
+unsigned long reserved_size;
+
+void __init hibernate_reserved_size_init(void)
+{
+ reserved_size = SPARE_PAGES * PAGE_SIZE;
+}
+
+/*
+ * Preferred image size in bytes (tunable via /sys/power/image_size).
+ * When it is set to N, swsusp will do its best to ensure the image
+ * size will not exceed N bytes, but if that is impossible, it will
+ * try to create the smallest image possible.
+ */
+unsigned long image_size;
+
+void __init hibernate_image_size_init(void)
+{
+ image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+}
+
+/* List of PBEs needed for restoring the pages that were allocated before
+ * the suspend and included in the suspend image, but have also been
+ * allocated by the "resume" kernel, so their contents cannot be written
+ * directly to their "original" page frames.
+ */
+struct pbe *restore_pblist;
+
+/* Pointer to an auxiliary buffer (1 page) */
+static void *buffer;
+
+/**
+ * @safe_needed - on resume, for storing the PBE list and the image,
+ * we can only use memory pages that do not conflict with the pages
+ * used before suspend. The unsafe pages have PageNosaveFree set
+ * and we count them using unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree
+ * so that swsusp_free() can release it.
+ */
+
+#define PG_ANY 0
+#define PG_SAFE 1
+#define PG_UNSAFE_CLEAR 1
+#define PG_UNSAFE_KEEP 0
+
+static unsigned int allocated_unsafe_pages;
+
+static void *get_image_page(gfp_t gfp_mask, int safe_needed)
+{
+ void *res;
+
+ if (toi_running)
+ return (void *) toi_get_nonconflicting_page();
+
+ res = (void *)get_zeroed_page(gfp_mask);
+ if (safe_needed)
+ while (res && swsusp_page_is_free(virt_to_page(res))) {
+ /* The page is unsafe, mark it for swsusp_free() */
+ swsusp_set_page_forbidden(virt_to_page(res));
+ allocated_unsafe_pages++;
+ res = (void *)get_zeroed_page(gfp_mask);
+ }
+ if (res) {
+ swsusp_set_page_forbidden(virt_to_page(res));
+ swsusp_set_page_free(virt_to_page(res));
+ }
+ return res;
+}
+
+unsigned long get_safe_page(gfp_t gfp_mask)
+{
+ return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+}
+
+static struct page *alloc_image_page(gfp_t gfp_mask)
+{
+ struct page *page;
+
+ page = alloc_page(gfp_mask);
+ if (page) {
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
+ }
+ return page;
+}
+
+/**
+ * free_image_page - free page represented by @addr, allocated with
+ * get_image_page (page flags set by it must be cleared)
+ */
+
+static inline void free_image_page(void *addr, int clear_nosave_free)
+{
+ struct page *page;
+
+ BUG_ON(!virt_addr_valid(addr));
+
+ page = virt_to_page(addr);
+
+ if (toi_running) {
+ toi__free_page(29, page);
+ return;
+ }
+
+ swsusp_unset_page_forbidden(page);
+ if (clear_nosave_free)
+ swsusp_unset_page_free(page);
+
+ __free_page(page);
+}
+
+/* struct linked_page is used to build chains of pages */
+
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __packed;
+
+static inline void
+free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+{
+ while (list) {
+ struct linked_page *lp = list->next;
+
+ free_image_page(list, clear_page_nosave);
+ list = lp;
+ }
+}
+
+/**
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
+
+struct chain_allocator {
+ struct linked_page *chain; /* the chain */
+ unsigned int used_space; /* total size of objects allocated out
+ * of the current page
+ */
+ gfp_t gfp_mask; /* mask for allocating pages */
+ int safe_needed; /* if set, only "safe" pages are allocated */
+};
+
+static void
+chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+{
+ ca->chain = NULL;
+ ca->used_space = LINKED_PAGE_DATA_SIZE;
+ ca->gfp_mask = gfp_mask;
+ ca->safe_needed = safe_needed;
+}
+
+static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
+{
+ void *ret;
+
+ if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
+ struct linked_page *lp;
+
+ lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+ if (!lp)
+ return NULL;
+
+ lp->next = ca->chain;
+ ca->chain = lp;
+ ca->used_space = 0;
+ }
+ ret = ca->chain->data + ca->used_space;
+ ca->used_space += size;
+ return ret;
+}
+
+/**
+ * Data types related to memory bitmaps.
+ *
+ * Memory bitmap is a structure consiting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresonds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each blocks of bitmap in which information is stored.
+ *
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
+ *
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
+ *
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * pfns that correspond to the start and end of the represented zone.
+ *
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bitmap)
+ * It also contains the pfns that correspond to the start and end of
+ * the represented memory area.
+ *
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
+ *
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leave nodes. The linked leave nodes are used for fast linear
+ * access of the memory bitmap.
+ *
+ * The struct rtree_node represents one node of the radix tree.
+ */
+
+#define BM_END_OF_MAP (~0UL)
+
+#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
+#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
+#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
+
+/*
+ * struct rtree_node is a wrapper struct to link the nodes
+ * of the rtree together for easy linear iteration over
+ * bits and easy freeing
+ */
+struct rtree_node {
+ struct list_head list;
+ unsigned long *data;
+};
+
+/*
+ * struct mem_zone_bm_rtree represents a bitmap used for one
+ * populated memory zone.
+ */
+struct mem_zone_bm_rtree {
+ struct list_head list; /* Link Zones together */
+ struct list_head nodes; /* Radix Tree inner nodes */
+ struct list_head leaves; /* Radix Tree leaves */
+ unsigned long start_pfn; /* Zone start page frame */
+ unsigned long end_pfn; /* Zone end page frame + 1 */
+ struct rtree_node *rtree; /* Radix Tree Root */
+ int levels; /* Number of Radix Tree Levels */
+ unsigned int blocks; /* Number of Bitmap Blocks */
+};
+
+/* strcut bm_position is used for browsing memory bitmaps */
+
+struct bm_position {
+ struct mem_zone_bm_rtree *zone;
+ struct rtree_node *node;
+ unsigned long node_pfn;
+ int node_bit;
+};
+
+#define BM_POSITION_SLOTS (NR_CPUS * 2)
+
+struct memory_bitmap {
+ struct list_head zones;
+ struct linked_page *p_list; /* list of pages used to store zone
+ * bitmap objects and bitmap block
+ * objects
+ */
+ struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */
+};
+
+/* Functions that operate on memory bitmaps */
+
+#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
+#if BITS_PER_LONG == 32
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
+#else
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
+#endif
+#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
+
+/*
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ *
+ * This function is used to allocate inner nodes as well as the
+ * leave nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
+ */
+static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ struct list_head *list)
+{
+ struct rtree_node *node;
+
+ node = chain_alloc(ca, sizeof(struct rtree_node));
+ if (!node)
+ return NULL;
+
+ node->data = get_image_page(gfp_mask, safe_needed);
+ if (!node->data)
+ return NULL;
+
+ list_add_tail(&node->list, list);
+
+ return node;
+}
+
+/*
+ * add_rtree_block - Add a new leave node to the radix tree
+ *
+ * The leave nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
+ */
+static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
+ int safe_needed, struct chain_allocator *ca)
+{
+ struct rtree_node *node, *block, **dst;
+ unsigned int levels_needed, block_nr;
+ int i;
+
+ block_nr = zone->blocks;
+ levels_needed = 0;
+
+ /* How many levels do we need for this block nr? */
+ while (block_nr) {
+ levels_needed += 1;
+ block_nr >>= BM_RTREE_LEVEL_SHIFT;
+ }
+
+ /* Make sure the rtree has enough levels */
+ for (i = zone->levels; i < levels_needed; i++) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
+ return -ENOMEM;
+
+ node->data[0] = (unsigned long)zone->rtree;
+ zone->rtree = node;
+ zone->levels += 1;
+ }
+
+ /* Allocate new block */
+ block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
+ if (!block)
+ return -ENOMEM;
+
+ /* Now walk the rtree to insert the block */
+ node = zone->rtree;
+ dst = &zone->rtree;
+ block_nr = zone->blocks;
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ if (!node) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
+ return -ENOMEM;
+ *dst = node;
+ }
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ dst = (struct rtree_node **)&((*dst)->data[index]);
+ node = *dst;
+ }
+
+ zone->blocks += 1;
+ *dst = block;
+
+ return 0;
+}
+
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free);
+
+/*
+ * create_zone_bm_rtree - create a radix tree for one zone
+ *
+ * Allocated the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocated and builds the radix tree for the
+ * zone.
+ */
+static struct mem_zone_bm_rtree *
+create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start, unsigned long end)
+{
+ struct mem_zone_bm_rtree *zone;
+ unsigned int i, nr_blocks;
+ unsigned long pages;
+
+ pages = end - start;
+ zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
+ if (!zone)
+ return NULL;
+
+ INIT_LIST_HEAD(&zone->nodes);
+ INIT_LIST_HEAD(&zone->leaves);
+ zone->start_pfn = start;
+ zone->end_pfn = end;
+ nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
+ free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
+ return NULL;
+ }
+ }
+
+ return zone;
+}
+
+/*
+ * free_zone_bm_rtree - Free the memory of the radix tree
+ *
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
+ */
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ free_image_page(node->data, clear_nosave_free);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ free_image_page(node->data, clear_nosave_free);
+}
+
+void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ int index;
+
+ for (index = 0; index < BM_POSITION_SLOTS; index++) {
+ bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ list);
+ bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur[index].node_pfn = 0;
+ bm->cur[index].node_bit = 0;
+ }
+}
+
+static void memory_bm_clear_current(struct memory_bitmap *bm, int index);
+unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
+
+/**
+ * memory_bm_clear
+ * @param bm - The bitmap to clear
+ *
+ * Only run while single threaded - locking not needed
+ */
+void memory_bm_clear(struct memory_bitmap *bm)
+{
+ memory_bm_position_reset(bm);
+
+ while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) {
+ memory_bm_clear_current(bm, 0);
+ }
+
+ memory_bm_position_reset(bm);
+}
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
+struct mem_extent {
+ struct list_head hook;
+ unsigned long start;
+ unsigned long end;
+};
+
+/**
+ * free_mem_extents - free a list of memory extents
+ * @list - list of extents to empty
+ */
+static void free_mem_extents(struct list_head *list)
+{
+ struct mem_extent *ext, *aux;
+
+ list_for_each_entry_safe(ext, aux, list, hook) {
+ list_del(&ext->hook);
+ kfree(ext);
+ }
+}
+
+/**
+ * create_mem_extents - create a list of memory extents representing
+ * contiguous ranges of PFNs
+ * @list - list to put the extents into
+ * @gfp_mask - mask to use for memory allocations
+ */
+static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
+{
+ struct zone *zone;
+
+ INIT_LIST_HEAD(list);
+
+ for_each_populated_zone(zone) {
+ unsigned long zone_start, zone_end;
+ struct mem_extent *ext, *cur, *aux;
+
+ zone_start = zone->zone_start_pfn;
+ zone_end = zone_end_pfn(zone);
+
+ list_for_each_entry(ext, list, hook)
+ if (zone_start <= ext->end)
+ break;
+
+ if (&ext->hook == list || zone_end < ext->start) {
+ /* New extent is necessary */
+ struct mem_extent *new_ext;
+
+ new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
+ if (!new_ext) {
+ free_mem_extents(list);
+ return -ENOMEM;
+ }
+ new_ext->start = zone_start;
+ new_ext->end = zone_end;
+ list_add_tail(&new_ext->hook, &ext->hook);
+ continue;
+ }
+
+ /* Merge this zone's range of PFNs with the existing one */
+ if (zone_start < ext->start)
+ ext->start = zone_start;
+ if (zone_end > ext->end)
+ ext->end = zone_end;
+
+ /* More merging may be possible */
+ cur = ext;
+ list_for_each_entry_safe_continue(cur, aux, list, hook) {
+ if (zone_end < cur->start)
+ break;
+ if (zone_end < cur->end)
+ ext->end = cur->end;
+ list_del(&cur->hook);
+ kfree(cur);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * memory_bm_create - allocate memory for a memory bitmap
+ */
+static int
+memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+{
+ struct chain_allocator ca;
+ struct list_head mem_extents;
+ struct mem_extent *ext;
+ int error;
+
+ chain_init(&ca, gfp_mask, safe_needed);
+ INIT_LIST_HEAD(&bm->zones);
+
+ error = create_mem_extents(&mem_extents, gfp_mask);
+ if (error)
+ return error;
+
+ list_for_each_entry(ext, &mem_extents, hook) {
+ struct mem_zone_bm_rtree *zone;
+
+ zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
+ ext->start, ext->end);
+ if (!zone) {
+ error = -ENOMEM;
+ goto Error;
+ }
+ list_add_tail(&zone->list, &bm->zones);
+ }
+
+ bm->p_list = ca.chain;
+
+ memory_bm_position_reset(bm);
+ Exit:
+ free_mem_extents(&mem_extents);
+ return error;
+
+ Error:
+ bm->p_list = ca.chain;
+ memory_bm_free(bm, PG_UNSAFE_CLEAR);
+ goto Exit;
+}
+
+/**
+ * memory_bm_free - free memory occupied by the memory bitmap @bm
+ */
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
+{
+ struct mem_zone_bm_rtree *zone;
+
+ list_for_each_entry(zone, &bm->zones, list)
+ free_zone_bm_rtree(zone, clear_nosave_free);
+
+ free_list_of_pages(bm->p_list, clear_nosave_free);
+
+ INIT_LIST_HEAD(&bm->zones);
+}
+
+/**
+ * memory_bm_find_bit - Find the bit for pfn in the memory
+ * bitmap
+ *
+ * Find the bit in the bitmap @bm that corresponds to given pfn.
+ * The cur.zone, cur.block and cur.node_pfn member of @bm are
+ * updated.
+ * It walks the radix tree to find the page which contains the bit for
+ * pfn and returns the bit position in **addr and *bit_nr.
+ */
+int memory_bm_find_bit(struct memory_bitmap *bm, int index,
+ unsigned long pfn, void **addr, unsigned int *bit_nr)
+{
+ struct mem_zone_bm_rtree *curr, *zone;
+ struct rtree_node *node;
+ int i, block_nr;
+
+ if (!bm->cur[index].zone) {
+ // Reset
+ bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ list);
+ bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur[index].node_pfn = 0;
+ bm->cur[index].node_bit = 0;
+ }
+
+ zone = bm->cur[index].zone;
+
+ if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
+ goto zone_found;
+
+ zone = NULL;
+
+ /* Find the right zone */
+ list_for_each_entry(curr, &bm->zones, list) {
+ if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
+ zone = curr;
+ break;
+ }
+ }
+
+ if (!zone)
+ return -EFAULT;
+
+zone_found:
+ /*
+ * We have a zone. Now walk the radix tree to find the leave
+ * node for our pfn.
+ */
+
+ node = bm->cur[index].node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn)
+ goto node_found;
+
+ node = zone->rtree;
+ block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
+
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ BUG_ON(node->data[index] == 0);
+ node = (struct rtree_node *)node->data[index];
+ }
+
+node_found:
+ /* Update last position */
+ bm->cur[index].zone = zone;
+ bm->cur[index].node = node;
+ bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+
+ /* Set return values */
+ *addr = node->data;
+ *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
+
+ return 0;
+}
+
+void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ BUG_ON(error);
+ set_bit(bit, addr);
+}
+
+int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ if (!error)
+ set_bit(bit, addr);
+
+ return error;
+}
+
+void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ BUG_ON(error);
+ clear_bit(bit, addr);
+}
+
+static void memory_bm_clear_current(struct memory_bitmap *bm, int index)
+{
+ int bit;
+
+ bit = max(bm->cur[index].node_bit - 1, 0);
+ clear_bit(bit, bm->cur[index].node->data);
+}
+
+int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+ BUG_ON(error);
+ return test_bit(bit, addr);
+}
+
+static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+
+ return !memory_bm_find_bit(bm, index, pfn, &addr, &bit);
+}
+
+/*
+ * rtree_next_node - Jumps to the next leave node
+ *
+ * Sets the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
+ *
+ * Returns true if there is a next node, false otherwise.
+ */
+static bool rtree_next_node(struct memory_bitmap *bm, int index)
+{
+ bm->cur[index].node = list_entry(bm->cur[index].node->list.next,
+ struct rtree_node, list);
+ if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) {
+ bm->cur[index].node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur[index].node_bit = 0;
+ touch_softlockup_watchdog();
+ return true;
+ }
+
+ /* No more nodes, goto next zone */
+ bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next,
+ struct mem_zone_bm_rtree, list);
+ if (&bm->cur[index].zone->list != &bm->zones) {
+ bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur[index].node_pfn = 0;
+ bm->cur[index].node_bit = 0;
+ return true;
+ }
+
+ /* No more zones */
+ return false;
+}
+
+/**
+ * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ *
+ * Starting from the last returned position this function searches
+ * for the next set bit in the memory bitmap and returns its
+ * number. If no more bit is set BM_END_OF_MAP is returned.
+ *
+ * It is required to run memory_bm_position_reset() before the
+ * first call to this function.
+ */
+unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index)
+{
+ unsigned long bits, pfn, pages;
+ int bit;
+
+ index += NR_CPUS; /* Iteration state is separated from get/set/test */
+
+ do {
+ pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn;
+ bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur[index].node->data, bits,
+ bm->cur[index].node_bit);
+ if (bit < bits) {
+ pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit;
+ bm->cur[index].node_bit = bit + 1;
+ return pfn;
+ }
+ } while (rtree_next_node(bm, index));
+
+ return BM_END_OF_MAP;
+}
+
+LIST_HEAD(nosave_regions);
+
+/**
+ * register_nosave_region - register a range of page frames the contents
+ * of which should not be saved during the suspend (to be used in the early
+ * initialization code)
+ */
+
+void __init
+__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
+ int use_kmalloc)
+{
+ struct nosave_region *region;
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ if (!list_empty(&nosave_regions)) {
+ /* Try to extend the previous region (they should be sorted) */
+ region = list_entry(nosave_regions.prev,
+ struct nosave_region, list);
+ if (region->end_pfn == start_pfn) {
+ region->end_pfn = end_pfn;
+ goto Report;
+ }
+ }
+ if (use_kmalloc) {
+ /* during init, this shouldn't fail */
+ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
+ BUG_ON(!region);
+ } else
+ /* This allocation cannot fail */
+ region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+ region->start_pfn = start_pfn;
+ region->end_pfn = end_pfn;
+ list_add_tail(&region->list, &nosave_regions);
+ Report:
+ printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+}
+
+/*
+ * Set bits in this map correspond to the page frames the contents of which
+ * should not be saved during the suspend.
+ */
+static struct memory_bitmap *forbidden_pages_map;
+
+/* Set bits in this map correspond to free page frames. */
+static struct memory_bitmap *free_pages_map;
+
+/*
+ * Each page frame allocated for creating the image is marked by setting the
+ * corresponding bits in forbidden_pages_map and free_pages_map simultaneously
+ */
+
+void swsusp_set_page_free(struct page *page)
+{
+ if (free_pages_map)
+ memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page));
+}
+
+static int swsusp_page_is_free(struct page *page)
+{
+ return free_pages_map ?
+ memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0;
+}
+
+void swsusp_unset_page_free(struct page *page)
+{
+ if (free_pages_map)
+ memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page));
+}
+
+static void swsusp_set_page_forbidden(struct page *page)
+{
+ if (forbidden_pages_map)
+ memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page));
+}
+
+int swsusp_page_is_forbidden(struct page *page)
+{
+ return forbidden_pages_map ?
+ memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0;
+}
+
+static void swsusp_unset_page_forbidden(struct page *page)
+{
+ if (forbidden_pages_map)
+ memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page));
+}
+
+/**
+ * mark_nosave_pages - set bits corresponding to the page frames the
+ * contents of which should not be saved in a given bitmap.
+ */
+
+static void mark_nosave_pages(struct memory_bitmap *bm)
+{
+ struct nosave_region *region;
+
+ if (list_empty(&nosave_regions))
+ return;
+
+ list_for_each_entry(region, &nosave_regions, list) {
+ unsigned long pfn;
+
+ pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+ (unsigned long long) region->start_pfn << PAGE_SHIFT,
+ ((unsigned long long) region->end_pfn << PAGE_SHIFT)
+ - 1);
+
+ for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, 0, pfn);
+ }
+ }
+}
+
+/**
+ * create_basic_memory_bitmaps - create bitmaps needed for marking page
+ * frames that should not be saved and free page frames. The pointers
+ * forbidden_pages_map and free_pages_map are only modified if everything
+ * goes well, because we don't want the bits to be used before both bitmaps
+ * are set up.
+ */
+
+int create_basic_memory_bitmaps(void)
+{
+ struct memory_bitmap *bm1, *bm2;
+ int error = 0;
+
+ if (forbidden_pages_map && free_pages_map)
+ return 0;
+ else
+ BUG_ON(forbidden_pages_map || free_pages_map);
+
+ bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+ if (!bm1)
+ return -ENOMEM;
+
+ error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
+ if (error)
+ goto Free_first_object;
+
+ bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+ if (!bm2)
+ goto Free_first_bitmap;
+
+ error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY);
+ if (error)
+ goto Free_second_object;
+
+ forbidden_pages_map = bm1;
+ free_pages_map = bm2;
+ mark_nosave_pages(forbidden_pages_map);
+
+ pr_debug("PM: Basic memory bitmaps created\n");
+
+ return 0;
+
+ Free_second_object:
+ kfree(bm2);
+ Free_first_bitmap:
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ Free_first_object:
+ kfree(bm1);
+ return -ENOMEM;
+}
+
+/**
+ * free_basic_memory_bitmaps - free memory bitmaps allocated by
+ * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
+ * so that the bitmaps themselves are not referred to while they are being
+ * freed.
+ */
+
+void free_basic_memory_bitmaps(void)
+{
+ struct memory_bitmap *bm1, *bm2;
+
+ if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
+ return;
+
+ bm1 = forbidden_pages_map;
+ bm2 = free_pages_map;
+ forbidden_pages_map = NULL;
+ free_pages_map = NULL;
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ kfree(bm1);
+ memory_bm_free(bm2, PG_UNSAFE_CLEAR);
+ kfree(bm2);
+
+ pr_debug("PM: Basic memory bitmaps freed\n");
+}
+
+/**
+ * snapshot_additional_pages - estimate the number of additional pages
+ * be needed for setting up the suspend image data structures for given
+ * zone (usually the returned value is greater than the exact number)
+ */
+
+unsigned int snapshot_additional_pages(struct zone *zone)
+{
+ unsigned int rtree, nodes;
+
+ rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
+ LINKED_PAGE_DATA_SIZE);
+ while (nodes > 1) {
+ nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
+ rtree += nodes;
+ }
+
+ return 2 * rtree;
+}
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * count_free_highmem_pages - compute the total number of free highmem
+ * pages, system-wide.
+ */
+
+static unsigned int count_free_highmem_pages(void)
+{
+ struct zone *zone;
+ unsigned int cnt = 0;
+
+ for_each_populated_zone(zone)
+ if (is_highmem(zone))
+ cnt += zone_page_state(zone, NR_FREE_PAGES);
+
+ return cnt;
+}
+
+/**
+ * saveable_highmem_page - Determine whether a highmem page should be
+ * included in the suspend image.
+ *
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't a part of a free chunk of pages.
+ */
+struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
+{
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ return NULL;
+
+ BUG_ON(!PageHighMem(page));
+
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
+ PageReserved(page))
+ return NULL;
+
+ if (page_is_guard(page))
+ return NULL;
+
+ return page;
+}
+
+/**
+ * count_highmem_pages - compute the total number of saveable highmem
+ * pages.
+ */
+
+static unsigned int count_highmem_pages(void)
+{
+ struct zone *zone;
+ unsigned int n = 0;
+
+ for_each_populated_zone(zone) {
+ unsigned long pfn, max_zone_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (saveable_highmem_page(zone, pfn))
+ n++;
+ }
+ return n;
+}
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * saveable_page - Determine whether a non-highmem page should be included
+ * in the suspend image.
+ *
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't a part of
+ * a free chunk of pages.
+ */
+struct page *saveable_page(struct zone *zone, unsigned long pfn)
+{
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ return NULL;
+
+ page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ return NULL;
+
+ BUG_ON(PageHighMem(page));
+
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page)
+ && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
+ return NULL;
+
+ if (page_is_guard(page))
+ return NULL;
+
+ return page;
+}
+
+/**
+ * count_data_pages - compute the total number of saveable non-highmem
+ * pages.
+ */
+
+static unsigned int count_data_pages(void)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+ unsigned int n = 0;
+
+ for_each_populated_zone(zone) {
+ if (is_highmem(zone))
+ continue;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (saveable_page(zone, pfn))
+ n++;
+ }
+ return n;
+}
+
+/* This is needed, because copy_page and memcpy are not usable for copying
+ * task structs.
+ */
+static inline void do_copy_page(long *dst, long *src)
+{
+ int n;
+
+ for (n = PAGE_SIZE / sizeof(long); n; n--)
+ *dst++ = *src++;
+}
+
+
+/**
+ * safe_copy_page - check if the page we are going to copy is marked as
+ * present in the kernel page tables (this always is the case if
+ * CONFIG_DEBUG_PAGEALLOC is not set and in that case
+ * kernel_page_present() always returns 'true').
+ */
+static void safe_copy_page(void *dst, struct page *s_page)
+{
+ if (kernel_page_present(s_page)) {
+ do_copy_page(dst, page_address(s_page));
+ } else {
+ kernel_map_pages(s_page, 1, 1);
+ do_copy_page(dst, page_address(s_page));
+ kernel_map_pages(s_page, 1, 0);
+ }
+}
+
+
+#ifdef CONFIG_HIGHMEM
+static inline struct page *
+page_is_saveable(struct zone *zone, unsigned long pfn)
+{
+ return is_highmem(zone) ?
+ saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
+}
+
+static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+ struct page *s_page, *d_page;
+ void *src, *dst;
+
+ s_page = pfn_to_page(src_pfn);
+ d_page = pfn_to_page(dst_pfn);
+ if (PageHighMem(s_page)) {
+ src = kmap_atomic(s_page);
+ dst = kmap_atomic(d_page);
+ do_copy_page(dst, src);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+ } else {
+ if (PageHighMem(d_page)) {
+ /* Page pointed to by src may contain some kernel
+ * data modified by kmap_atomic()
+ */
+ safe_copy_page(buffer, s_page);
+ dst = kmap_atomic(d_page);
+ copy_page(dst, buffer);
+ kunmap_atomic(dst);
+ } else {
+ safe_copy_page(page_address(d_page), s_page);
+ }
+ }
+}
+#else
+#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
+
+static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+{
+ safe_copy_page(page_address(pfn_to_page(dst_pfn)),
+ pfn_to_page(src_pfn));
+}
+#endif /* CONFIG_HIGHMEM */
+
+static void
+copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+{
+ struct zone *zone;
+ unsigned long pfn;
+
+ for_each_populated_zone(zone) {
+ unsigned long max_zone_pfn;
+
+ mark_free_pages(zone);
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (page_is_saveable(zone, pfn))
+ memory_bm_set_bit(orig_bm, 0, pfn);
+ }
+ memory_bm_position_reset(orig_bm);
+ memory_bm_position_reset(copy_bm);
+ for(;;) {
+ pfn = memory_bm_next_pfn(orig_bm, 0);
+ if (unlikely(pfn == BM_END_OF_MAP))
+ break;
+ copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn);
+ }
+}
+
+/* Total number of image pages */
+static unsigned int nr_copy_pages;
+/* Number of pages needed for saving the original pfns of the image pages */
+static unsigned int nr_meta_pages;
+/*
+ * Numbers of normal and highmem page frames allocated for hibernation image
+ * before suspending devices.
+ */
+unsigned int alloc_normal, alloc_highmem;
+/*
+ * Memory bitmap used for marking saveable pages (during hibernation) or
+ * hibernation image pages (during restore)
+ */
+static struct memory_bitmap orig_bm;
+/*
+ * Memory bitmap used during hibernation for marking allocated page frames that
+ * will contain copies of saveable pages. During restore it is initially used
+ * for marking hibernation image pages, but then the set bits from it are
+ * duplicated in @orig_bm and it is released. On highmem systems it is next
+ * used for marking "safe" highmem pages, but it has to be reinitialized for
+ * this purpose.
+ */
+static struct memory_bitmap copy_bm;
+
+/**
+ * swsusp_free - free pages allocated for the suspend.
+ *
+ * Suspend pages are alocated before the atomic copy is made, so we
+ * need to release them after the resume.
+ */
+
+void swsusp_free(void)
+{
+ unsigned long fb_pfn, fr_pfn;
+
+ if (!forbidden_pages_map || !free_pages_map)
+ goto out;
+
+ memory_bm_position_reset(forbidden_pages_map);
+ memory_bm_position_reset(free_pages_map);
+
+loop:
+ fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+
+ /*
+ * Find the next bit set in both bitmaps. This is guaranteed to
+ * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
+ */
+ do {
+ if (fb_pfn < fr_pfn)
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0);
+ if (fr_pfn < fb_pfn)
+ fr_pfn = memory_bm_next_pfn(free_pages_map, 0);
+ } while (fb_pfn != fr_pfn);
+
+ if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
+ struct page *page = pfn_to_page(fr_pfn);
+
+ memory_bm_clear_current(forbidden_pages_map, 0);
+ memory_bm_clear_current(free_pages_map, 0);
+ __free_page(page);
+ goto loop;
+ }
+
+out:
+ nr_copy_pages = 0;
+ nr_meta_pages = 0;
+ restore_pblist = NULL;
+ buffer = NULL;
+ alloc_normal = 0;
+ alloc_highmem = 0;
+}
+
+/* Helper functions used for the shrinking of memory. */
+
+#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
+
+/**
+ * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * @nr_pages: Number of page frames to allocate.
+ * @mask: GFP flags to use for the allocation.
+ *
+ * Return value: Number of page frames actually allocated
+ */
+static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
+{
+ unsigned long nr_alloc = 0;
+
+ while (nr_pages > 0) {
+ struct page *page;
+
+ page = alloc_image_page(mask);
+ if (!page)
+ break;
+ memory_bm_set_bit(&copy_bm, 0, page_to_pfn(page));
+ if (PageHighMem(page))
+ alloc_highmem++;
+ else
+ alloc_normal++;
+ nr_pages--;
+ nr_alloc++;
+ }
+
+ return nr_alloc;
+}
+
+static unsigned long preallocate_image_memory(unsigned long nr_pages,
+ unsigned long avail_normal)
+{
+ unsigned long alloc;
+
+ if (avail_normal <= alloc_normal)
+ return 0;
+
+ alloc = avail_normal - alloc_normal;
+ if (nr_pages < alloc)
+ alloc = nr_pages;
+
+ return preallocate_image_pages(alloc, GFP_IMAGE);
+}
+
+#ifdef CONFIG_HIGHMEM
+static unsigned long preallocate_image_highmem(unsigned long nr_pages)
+{
+ return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
+}
+
+/**
+ * __fraction - Compute (an approximation of) x * (multiplier / base)
+ */
+static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
+{
+ x *= multiplier;
+ do_div(x, base);
+ return (unsigned long)x;
+}
+
+static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
+ unsigned long highmem,
+ unsigned long total)
+{
+ unsigned long alloc = __fraction(nr_pages, highmem, total);
+
+ return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
+}
+#else /* CONFIG_HIGHMEM */
+static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
+{
+ return 0;
+}
+
+static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
+ unsigned long highmem,
+ unsigned long total)
+{
+ return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * free_unnecessary_pages - Release preallocated pages not needed for the image
+ */
+static unsigned long free_unnecessary_pages(void)
+{
+ unsigned long save, to_free_normal, to_free_highmem, free;
+
+ save = count_data_pages();
+ if (alloc_normal >= save) {
+ to_free_normal = alloc_normal - save;
+ save = 0;
+ } else {
+ to_free_normal = 0;
+ save -= alloc_normal;
+ }
+ save += count_highmem_pages();
+ if (alloc_highmem >= save) {
+ to_free_highmem = alloc_highmem - save;
+ } else {
+ to_free_highmem = 0;
+ save -= alloc_highmem;
+ if (to_free_normal > save)
+ to_free_normal -= save;
+ else
+ to_free_normal = 0;
+ }
+ free = to_free_normal + to_free_highmem;
+
+ memory_bm_position_reset(&copy_bm);
+
+ while (to_free_normal > 0 || to_free_highmem > 0) {
+ unsigned long pfn = memory_bm_next_pfn(&copy_bm, 0);
+ struct page *page = pfn_to_page(pfn);
+
+ if (PageHighMem(page)) {
+ if (!to_free_highmem)
+ continue;
+ to_free_highmem--;
+ alloc_highmem--;
+ } else {
+ if (!to_free_normal)
+ continue;
+ to_free_normal--;
+ alloc_normal--;
+ }
+ memory_bm_clear_bit(&copy_bm, 0, pfn);
+ swsusp_unset_page_forbidden(page);
+ swsusp_unset_page_free(page);
+ __free_page(page);
+ }
+
+ return free;
+}
+
+/**
+ * minimum_image_size - Estimate the minimum acceptable size of an image
+ * @saveable: Number of saveable pages in the system.
+ *
+ * We want to avoid attempting to free too much memory too hard, so estimate the
+ * minimum acceptable size of a hibernation image to use as the lower limit for
+ * preallocating memory.
+ *
+ * We assume that the minimum image size should be proportional to
+ *
+ * [number of saveable pages] - [number of pages that can be freed in theory]
+ *
+ * where the second term is the sum of (1) reclaimable slab pages, (2) active
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
+ * minus mapped file pages.
+ */
+static unsigned long minimum_image_size(unsigned long saveable)
+{
+ unsigned long size;
+
+ size = global_page_state(NR_SLAB_RECLAIMABLE)
+ + global_page_state(NR_ACTIVE_ANON)
+ + global_page_state(NR_INACTIVE_ANON)
+ + global_page_state(NR_ACTIVE_FILE)
+ + global_page_state(NR_INACTIVE_FILE)
+ - global_page_state(NR_FILE_MAPPED);
+
+ return saveable <= size ? 0 : saveable - size;
+}
+
+/**
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ *
+ * To create a hibernation image it is necessary to make a copy of every page
+ * frame in use. We also need a number of page frames to be free during
+ * hibernation for allocations made while saving the image and for device
+ * drivers, in case they need to allocate memory from their hibernation
+ * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
+ * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through
+ * /sys/power/reserved_size, respectively). To make this happen, we compute the
+ * total number of available page frames and allocate at least
+ *
+ * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
+ * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ *
+ * of them, which corresponds to the maximum size of a hibernation image.
+ *
+ * If image_size is set below the number following from the above formula,
+ * the preallocation of memory is continued until the total number of saveable
+ * pages in the system is below the requested image size or the minimum
+ * acceptable image size returned by minimum_image_size(), whichever is greater.
+ */
+int hibernate_preallocate_memory(void)
+{
+ struct zone *zone;
+ unsigned long saveable, size, max_size, count, highmem, pages = 0;
+ unsigned long alloc, save_highmem, pages_highmem, avail_normal;
+ ktime_t start, stop;
+ int error;
+
+ printk(KERN_INFO "PM: Preallocating image memory... ");
+ start = ktime_get();
+
+ error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
+ if (error)
+ goto err_out;
+
+ error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
+ if (error)
+ goto err_out;
+
+ alloc_normal = 0;
+ alloc_highmem = 0;
+
+ /* Count the number of saveable data pages. */
+ save_highmem = count_highmem_pages();
+ saveable = count_data_pages();
+
+ /*
+ * Compute the total number of page frames we can use (count) and the
+ * number of pages needed for image metadata (size).
+ */
+ count = saveable;
+ saveable += save_highmem;
+ highmem = save_highmem;
+ size = 0;
+ for_each_populated_zone(zone) {
+ size += snapshot_additional_pages(zone);
+ if (is_highmem(zone))
+ highmem += zone_page_state(zone, NR_FREE_PAGES);
+ else
+ count += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ avail_normal = count;
+ count += highmem;
+ count -= totalreserve_pages;
+
+ /* Add number of pages required for page keys (s390 only). */
+ size += page_key_additional_pages(saveable);
+
+ /* Compute the maximum number of saveable pages to leave in memory. */
+ max_size = (count - (size + PAGES_FOR_IO)) / 2
+ - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
+ /* Compute the desired number of image pages specified by image_size. */
+ size = DIV_ROUND_UP(image_size, PAGE_SIZE);
+ if (size > max_size)
+ size = max_size;
+ /*
+ * If the desired number of image pages is at least as large as the
+ * current number of saveable pages in memory, allocate page frames for
+ * the image and we're done.
+ */
+ if (size >= saveable) {
+ pages = preallocate_image_highmem(save_highmem);
+ pages += preallocate_image_memory(saveable - pages, avail_normal);
+ goto out;
+ }
+
+ /* Estimate the minimum size of the image. */
+ pages = minimum_image_size(saveable);
+ /*
+ * To avoid excessive pressure on the normal zone, leave room in it to
+ * accommodate an image of the minimum size (unless it's already too
+ * small, in which case don't preallocate pages from it at all).
+ */
+ if (avail_normal > pages)
+ avail_normal -= pages;
+ else
+ avail_normal = 0;
+ if (size < pages)
+ size = min_t(unsigned long, pages, max_size);
+
+ /*
+ * Let the memory management subsystem know that we're going to need a
+ * large number of page frames to allocate and make it free some memory.
+ * NOTE: If this is not done, performance will be hurt badly in some
+ * test cases.
+ */
+ shrink_all_memory(saveable - size);
+
+ /*
+ * The number of saveable pages in memory was too high, so apply some
+ * pressure to decrease it. First, make room for the largest possible
+ * image and fail if that doesn't work. Next, try to decrease the size
+ * of the image as much as indicated by 'size' using allocations from
+ * highmem and non-highmem zones separately.
+ */
+ pages_highmem = preallocate_image_highmem(highmem / 2);
+ alloc = count - max_size;
+ if (alloc > pages_highmem)
+ alloc -= pages_highmem;
+ else
+ alloc = 0;
+ pages = preallocate_image_memory(alloc, avail_normal);
+ if (pages < alloc) {
+ /* We have exhausted non-highmem pages, try highmem. */
+ alloc -= pages;
+ pages += pages_highmem;
+ pages_highmem = preallocate_image_highmem(alloc);
+ if (pages_highmem < alloc)
+ goto err_out;
+ pages += pages_highmem;
+ /*
+ * size is the desired number of saveable pages to leave in
+ * memory, so try to preallocate (all memory - size) pages.
+ */
+ alloc = (count - pages) - size;
+ pages += preallocate_image_highmem(alloc);
+ } else {
+ /*
+ * There are approximately max_size saveable pages at this point
+ * and we want to reduce this number down to size.
+ */
+ alloc = max_size - size;
+ size = preallocate_highmem_fraction(alloc, highmem, count);
+ pages_highmem += size;
+ alloc -= size;
+ size = preallocate_image_memory(alloc, avail_normal);
+ pages_highmem += preallocate_image_highmem(alloc - size);
+ pages += pages_highmem + size;
+ }
+
+ /*
+ * We only need as many page frames for the image as there are saveable
+ * pages in memory, but we have allocated more. Release the excessive
+ * ones now.
+ */
+ pages -= free_unnecessary_pages();
+
+ out:
+ stop = ktime_get();
+ printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+ swsusp_show_speed(start, stop, pages, "Allocated");
+
+ return 0;
+
+ err_out:
+ printk(KERN_CONT "\n");
+ swsusp_free();
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * count_pages_for_highmem - compute the number of non-highmem pages
+ * that will be necessary for creating copies of highmem pages.
+ */
+
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
+{
+ unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
+
+ if (free_highmem >= nr_highmem)
+ nr_highmem = 0;
+ else
+ nr_highmem -= free_highmem;
+
+ return nr_highmem;
+}
+#else
+static unsigned int
+count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * enough_free_mem - Make sure we have enough free memory for the
+ * snapshot image.
+ */
+
+static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
+{
+ struct zone *zone;
+ unsigned int free = alloc_normal;
+
+ for_each_populated_zone(zone)
+ if (!is_highmem(zone))
+ free += zone_page_state(zone, NR_FREE_PAGES);
+
+ nr_pages += count_pages_for_highmem(nr_highmem);
+ pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, free);
+
+ return free > nr_pages + PAGES_FOR_IO;
+}
+
+#ifdef CONFIG_HIGHMEM
+/**
+ * get_highmem_buffer - if there are some highmem pages in the suspend
+ * image, we may need the buffer to copy them and/or load their data.
+ */
+
+static inline int get_highmem_buffer(int safe_needed)
+{
+ buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ return buffer ? 0 : -ENOMEM;
+}
+
+/**
+ * alloc_highmem_image_pages - allocate some highmem pages for the image.
+ * Try to allocate as many pages as needed, but if the number of free
+ * highmem pages is lesser than that, allocate them all.
+ */
+
+static inline unsigned int
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+{
+ unsigned int to_alloc = count_free_highmem_pages();
+
+ if (to_alloc > nr_highmem)
+ to_alloc = nr_highmem;
+
+ nr_highmem -= to_alloc;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_image_page(__GFP_HIGHMEM);
+ memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ }
+ return nr_highmem;
+}
+#else
+static inline int get_highmem_buffer(int safe_needed) { return 0; }
+
+static inline unsigned int
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * swsusp_alloc - allocate memory for the suspend image
+ *
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
+ *
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
+ */
+
+static int
+swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
+ unsigned int nr_pages, unsigned int nr_highmem)
+{
+ if (nr_highmem > 0) {
+ if (get_highmem_buffer(PG_ANY))
+ goto err_out;
+ if (nr_highmem > alloc_highmem) {
+ nr_highmem -= alloc_highmem;
+ nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
+ }
+ }
+ if (nr_pages > alloc_normal) {
+ nr_pages -= alloc_normal;
+ while (nr_pages-- > 0) {
+ struct page *page;
+
+ page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+ if (!page)
+ goto err_out;
+ memory_bm_set_bit(copy_bm, 0, page_to_pfn(page));
+ }
+ }
+
+ return 0;
+
+ err_out:
+ swsusp_free();
+ return -ENOMEM;
+}
+
+asmlinkage __visible int swsusp_save(void)
+{
+ unsigned int nr_pages, nr_highmem;
+
+ if (toi_running)
+ return toi_post_context_save();
+
+ printk(KERN_INFO "PM: Creating hibernation image:\n");
+
+ drain_local_pages(NULL);
+ nr_pages = count_data_pages();
+ nr_highmem = count_highmem_pages();
+ printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+
+ if (!enough_free_mem(nr_pages, nr_highmem)) {
+ printk(KERN_ERR "PM: Not enough free memory\n");
+ return -ENOMEM;
+ }
+
+ if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+ printk(KERN_ERR "PM: Memory allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /* During allocating of suspend pagedir, new cold pages may appear.
+ * Kill them.
+ */
+ drain_local_pages(NULL);
+ copy_data_pages(&copy_bm, &orig_bm);
+
+ /*
+ * End of critical section. From now on, we can write to memory,
+ * but we should not touch disk. This specially means we must _not_
+ * touch swap space! Except we must write out our image of course.
+ */
+
+ nr_pages += nr_highmem;
+ nr_copy_pages = nr_pages;
+ nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
+
+ printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
+ nr_pages);
+
+ return 0;
+}
+
+#ifndef CONFIG_ARCH_HIBERNATION_HEADER
+static int init_header_complete(struct swsusp_info *info)
+{
+ memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
+ info->version_code = LINUX_VERSION_CODE;
+ return 0;
+}
+
+char *check_image_kernel(struct swsusp_info *info)
+{
+ if (info->version_code != LINUX_VERSION_CODE)
+ return "kernel version";
+ if (strcmp(info->uts.sysname,init_utsname()->sysname))
+ return "system type";
+ if (strcmp(info->uts.release,init_utsname()->release))
+ return "kernel release";
+ if (strcmp(info->uts.version,init_utsname()->version))
+ return "version";
+ if (strcmp(info->uts.machine,init_utsname()->machine))
+ return "machine";
+ return NULL;
+}
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+
+unsigned long snapshot_get_image_size(void)
+{
+ return nr_copy_pages + nr_meta_pages + 1;
+}
+
+int init_header(struct swsusp_info *info)
+{
+ memset(info, 0, sizeof(struct swsusp_info));
+ info->num_physpages = get_num_physpages();
+ info->image_pages = nr_copy_pages;
+ info->pages = snapshot_get_image_size();
+ info->size = info->pages;
+ info->size <<= PAGE_SHIFT;
+ return init_header_complete(info);
+}
+
+/**
+ * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
+ * are stored in the array @buf[] (1 page at a time)
+ */
+
+static inline void
+pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+{
+ int j;
+
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ buf[j] = memory_bm_next_pfn(bm, 0);
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
+ /* Save page key for data page (s390 only). */
+ page_key_read(buf + j);
+ }
+}
+
+/**
+ * snapshot_read_next - used for reading the system memory snapshot.
+ *
+ * On the first call to it @handle should point to a zeroed
+ * snapshot_handle structure. The structure gets updated and a pointer
+ * to it should be passed to this function every next time.
+ *
+ * On success the function returns a positive number. Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro.
+ *
+ * The function returns 0 to indicate the end of data stream condition,
+ * and a negative number is returned on error. In such cases the
+ * structure pointed to by @handle is not updated and should not be used
+ * any more.
+ */
+
+int snapshot_read_next(struct snapshot_handle *handle)
+{
+ if (handle->cur > nr_meta_pages + nr_copy_pages)
+ return 0;
+
+ if (!buffer) {
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+ if (!buffer)
+ return -ENOMEM;
+ }
+ if (!handle->cur) {
+ int error;
+
+ error = init_header((struct swsusp_info *)buffer);
+ if (error)
+ return error;
+ handle->buffer = buffer;
+ memory_bm_position_reset(&orig_bm);
+ memory_bm_position_reset(&copy_bm);
+ } else if (handle->cur <= nr_meta_pages) {
+ clear_page(buffer);
+ pack_pfns(buffer, &orig_bm);
+ } else {
+ struct page *page;
+
+ page = pfn_to_page(memory_bm_next_pfn(&copy_bm, 0));
+ if (PageHighMem(page)) {
+ /* Highmem pages are copied to the buffer,
+ * because we can't return with a kmapped
+ * highmem page (we may not be called again).
+ */
+ void *kaddr;
+
+ kaddr = kmap_atomic(page);
+ copy_page(buffer, kaddr);
+ kunmap_atomic(kaddr);
+ handle->buffer = buffer;
+ } else {
+ handle->buffer = page_address(page);
+ }
+ }
+ handle->cur++;
+ return PAGE_SIZE;
+}
+
+/**
+ * mark_unsafe_pages - mark the pages that cannot be used for storing
+ * the image during resume, because they conflict with the pages that
+ * had been used before suspend
+ */
+
+static int mark_unsafe_pages(struct memory_bitmap *bm)
+{
+ struct zone *zone;
+ unsigned long pfn, max_zone_pfn;
+
+ /* Clear page flags */
+ for_each_populated_zone(zone) {
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+ if (pfn_valid(pfn))
+ swsusp_unset_page_free(pfn_to_page(pfn));
+ }
+
+ /* Mark pages that correspond to the "original" pfns as "unsafe" */
+ memory_bm_position_reset(bm);
+ do {
+ pfn = memory_bm_next_pfn(bm, 0);
+ if (likely(pfn != BM_END_OF_MAP)) {
+ if (likely(pfn_valid(pfn)))
+ swsusp_set_page_free(pfn_to_page(pfn));
+ else
+ return -EFAULT;
+ }
+ } while (pfn != BM_END_OF_MAP);
+
+ allocated_unsafe_pages = 0;
+
+ return 0;
+}
+
+static void
+duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
+{
+ unsigned long pfn;
+
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src, 0);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, 0, pfn);
+ pfn = memory_bm_next_pfn(src, 0);
+ }
+}
+
+static int check_header(struct swsusp_info *info)
+{
+ char *reason;
+
+ reason = check_image_kernel(info);
+ if (!reason && info->num_physpages != get_num_physpages())
+ reason = "memory size";
+ if (reason) {
+ printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+ return -EPERM;
+ }
+ return 0;
+}
+
+/**
+ * load header - check the image header and copy data from it
+ */
+
+static int
+load_header(struct swsusp_info *info)
+{
+ int error;
+
+ restore_pblist = NULL;
+ error = check_header(info);
+ if (!error) {
+ nr_copy_pages = info->image_pages;
+ nr_meta_pages = info->pages - info->image_pages - 1;
+ }
+ return error;
+}
+
+/**
+ * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
+ * the corresponding bit in the memory bitmap @bm
+ */
+static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
+{
+ int j;
+
+ for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+ if (unlikely(buf[j] == BM_END_OF_MAP))
+ break;
+
+ /* Extract and buffer page key for data page (s390 only). */
+ page_key_memorize(buf + j);
+
+ if (memory_bm_pfn_present(bm, 0, buf[j]))
+ memory_bm_set_bit(bm, 0, buf[j]);
+ else
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/* List of "safe" pages that may be used to store data loaded from the suspend
+ * image
+ */
+static struct linked_page *safe_pages_list;
+
+#ifdef CONFIG_HIGHMEM
+/* struct highmem_pbe is used for creating the list of highmem pages that
+ * should be restored atomically during the resume from disk, because the page
+ * frames they have occupied before the suspend are in use.
+ */
+struct highmem_pbe {
+ struct page *copy_page; /* data is here now */
+ struct page *orig_page; /* data was here before the suspend */
+ struct highmem_pbe *next;
+};
+
+/* List of highmem PBEs needed for restoring the highmem pages that were
+ * allocated before the suspend and included in the suspend image, but have
+ * also been allocated by the "resume" kernel, so their contents cannot be
+ * written directly to their "original" page frames.
+ */
+static struct highmem_pbe *highmem_pblist;
+
+/**
+ * count_highmem_image_pages - compute the number of highmem pages in the
+ * suspend image. The bits in the memory bitmap @bm that correspond to the
+ * image pages are assumed to be set.
+ */
+
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
+{
+ unsigned long pfn;
+ unsigned int cnt = 0;
+
+ memory_bm_position_reset(bm);
+ pfn = memory_bm_next_pfn(bm, 0);
+ while (pfn != BM_END_OF_MAP) {
+ if (PageHighMem(pfn_to_page(pfn)))
+ cnt++;
+
+ pfn = memory_bm_next_pfn(bm, 0);
+ }
+ return cnt;
+}
+
+/**
+ * prepare_highmem_image - try to allocate as many highmem pages as
+ * there are highmem image pages (@nr_highmem_p points to the variable
+ * containing the number of highmem image pages). The pages that are
+ * "safe" (ie. will not be overwritten when the suspend image is
+ * restored) have the corresponding bits set in @bm (it must be
+ * unitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem
+ * image pages.
+ */
+
+static unsigned int safe_highmem_pages;
+
+static struct memory_bitmap *safe_highmem_bm;
+
+static int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ unsigned int to_alloc;
+
+ if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
+ return -ENOMEM;
+
+ if (get_highmem_buffer(PG_SAFE))
+ return -ENOMEM;
+
+ to_alloc = count_free_highmem_pages();
+ if (to_alloc > *nr_highmem_p)
+ to_alloc = *nr_highmem_p;
+ else
+ *nr_highmem_p = to_alloc;
+
+ safe_highmem_pages = 0;
+ while (to_alloc-- > 0) {
+ struct page *page;
+
+ page = alloc_page(__GFP_HIGHMEM);
+ if (!swsusp_page_is_free(page)) {
+ /* The page is "safe", set its bit the bitmap */
+ memory_bm_set_bit(bm, 0, page_to_pfn(page));
+ safe_highmem_pages++;
+ }
+ /* Mark the page as allocated */
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
+ }
+ memory_bm_position_reset(bm);
+ safe_highmem_bm = bm;
+ return 0;
+}
+
+/**
+ * get_highmem_page_buffer - for given highmem image page find the buffer
+ * that suspend_write_next() should set for its caller to write to.
+ *
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
+ *
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem page is set to the page to which
+ * the data will have to be copied from @buffer.
+ */
+
+static struct page *last_highmem_page;
+
+static void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ struct highmem_pbe *pbe;
+ void *kaddr;
+
+ if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
+ */
+ last_highmem_page = page;
+ return buffer;
+ }
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
+ */
+ pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
+ if (!pbe) {
+ swsusp_free();
+ return ERR_PTR(-ENOMEM);
+ }
+ pbe->orig_page = page;
+ if (safe_highmem_pages > 0) {
+ struct page *tmp;
+
+ /* Copy of the page will be stored in high memory */
+ kaddr = buffer;
+ tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0));
+ safe_highmem_pages--;
+ last_highmem_page = tmp;
+ pbe->copy_page = tmp;
+ } else {
+ /* Copy of the page will be stored in normal memory */
+ kaddr = safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->copy_page = virt_to_page(kaddr);
+ }
+ pbe->next = highmem_pblist;
+ highmem_pblist = pbe;
+ return kaddr;
+}
+
+/**
+ * copy_last_highmem_page - copy the contents of a highmem image from
+ * @buffer, where the caller of snapshot_write_next() has place them,
+ * to the right location represented by @last_highmem_page .
+ */
+
+static void copy_last_highmem_page(void)
+{
+ if (last_highmem_page) {
+ void *dst;
+
+ dst = kmap_atomic(last_highmem_page);
+ copy_page(dst, buffer);
+ kunmap_atomic(dst);
+ last_highmem_page = NULL;
+ }
+}
+
+static inline int last_highmem_page_copied(void)
+{
+ return !last_highmem_page;
+}
+
+static inline void free_highmem_data(void)
+{
+ if (safe_highmem_bm)
+ memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
+
+ if (buffer)
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+}
+#else
+static unsigned int
+count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+
+static inline int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+ return 0;
+}
+
+static inline void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static inline void copy_last_highmem_page(void) {}
+static inline int last_highmem_page_copied(void) { return 1; }
+static inline void free_highmem_data(void) {}
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * prepare_image - use the memory bitmap @bm to mark the pages that will
+ * be overwritten in the process of restoring the system memory state
+ * from the suspend image ("unsafe" pages) and allocate memory for the
+ * image.
+ *
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for the image data, but not to assign these
+ * pages to specific tasks initially. Instead, we just mark them as
+ * allocated and create a lists of "safe" pages that will be used
+ * later. On systems with high memory a list of "safe" highmem pages is
+ * also created.
+ */
+
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
+static int
+prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+{
+ unsigned int nr_pages, nr_highmem;
+ struct linked_page *sp_list, *lp;
+ int error;
+
+ /* If there is no highmem, the buffer will not be necessary */
+ free_image_page(buffer, PG_UNSAFE_CLEAR);
+ buffer = NULL;
+
+ nr_highmem = count_highmem_image_pages(bm);
+ error = mark_unsafe_pages(bm);
+ if (error)
+ goto Free;
+
+ error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(new_bm, bm);
+ memory_bm_free(bm, PG_UNSAFE_KEEP);
+ if (nr_highmem > 0) {
+ error = prepare_highmem_image(bm, &nr_highmem);
+ if (error)
+ goto Free;
+ }
+ /* Reserve some safe pages for potential later use.
+ *
+ * NOTE: This way we make sure there will be enough safe pages for the
+ * chain_alloc() in get_buffer(). It is a bit wasteful, but
+ * nr_copy_pages cannot be greater than 50% of the memory anyway.
+ */
+ sp_list = NULL;
+ /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
+ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
+ while (nr_pages > 0) {
+ lp = get_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!lp) {
+ error = -ENOMEM;
+ goto Free;
+ }
+ lp->next = sp_list;
+ sp_list = lp;
+ nr_pages--;
+ }
+ /* Preallocate memory for the image */
+ safe_pages_list = NULL;
+ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ while (nr_pages > 0) {
+ lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
+ if (!lp) {
+ error = -ENOMEM;
+ goto Free;
+ }
+ if (!swsusp_page_is_free(virt_to_page(lp))) {
+ /* The page is "safe", add it to the list */
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
+ }
+ /* Mark the page as allocated */
+ swsusp_set_page_forbidden(virt_to_page(lp));
+ swsusp_set_page_free(virt_to_page(lp));
+ nr_pages--;
+ }
+ /* Free the reserved safe pages so that chain_alloc() can use them */
+ while (sp_list) {
+ lp = sp_list->next;
+ free_image_page(sp_list, PG_UNSAFE_CLEAR);
+ sp_list = lp;
+ }
+ return 0;
+
+ Free:
+ swsusp_free();
+ return error;
+}
+
+/**
+ * get_buffer - compute the address that snapshot_write_next() should
+ * set for its caller to write to.
+ */
+
+static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
+{
+ struct pbe *pbe;
+ struct page *page;
+ unsigned long pfn = memory_bm_next_pfn(bm, 0);
+
+ if (pfn == BM_END_OF_MAP)
+ return ERR_PTR(-EFAULT);
+
+ page = pfn_to_page(pfn);
+ if (PageHighMem(page))
+ return get_highmem_page_buffer(page, ca);
+
+ if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
+ /* We have allocated the "original" page frame and we can
+ * use it directly to store the loaded page.
+ */
+ return page_address(page);
+
+ /* The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the loaded page.
+ */
+ pbe = chain_alloc(ca, sizeof(struct pbe));
+ if (!pbe) {
+ swsusp_free();
+ return ERR_PTR(-ENOMEM);
+ }
+ pbe->orig_address = page_address(page);
+ pbe->address = safe_pages_list;
+ safe_pages_list = safe_pages_list->next;
+ pbe->next = restore_pblist;
+ restore_pblist = pbe;
+ return pbe->address;
+}
+
+/**
+ * snapshot_write_next - used for writing the system memory snapshot.
+ *
+ * On the first call to it @handle should point to a zeroed
+ * snapshot_handle structure. The structure gets updated and a pointer
+ * to it should be passed to this function every next time.
+ *
+ * On success the function returns a positive number. Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro.
+ *
+ * The function returns 0 to indicate the "end of file" condition,
+ * and a negative number is returned on error. In such cases the
+ * structure pointed to by @handle is not updated and should not be used
+ * any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle)
+{
+ static struct chain_allocator ca;
+ int error = 0;
+
+ /* Check if we have already loaded the entire image */
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
+ return 0;
+
+ handle->sync_read = 1;
+
+ if (!handle->cur) {
+ if (!buffer)
+ /* This makes the buffer be freed by swsusp_free() */
+ buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+
+ if (!buffer)
+ return -ENOMEM;
+
+ handle->buffer = buffer;
+ } else if (handle->cur == 1) {
+ error = load_header(buffer);
+ if (error)
+ return error;
+
+ error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
+ if (error)
+ return error;
+
+ /* Allocate buffer for page keys. */
+ error = page_key_alloc(nr_copy_pages);
+ if (error)
+ return error;
+
+ } else if (handle->cur <= nr_meta_pages + 1) {
+ error = unpack_orig_pfns(buffer, &copy_bm);
+ if (error)
+ return error;
+
+ if (handle->cur == nr_meta_pages + 1) {
+ error = prepare_image(&orig_bm, &copy_bm);
+ if (error)
+ return error;
+
+ chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+ memory_bm_position_reset(&orig_bm);
+ restore_pblist = NULL;
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ handle->sync_read = 0;
+ if (IS_ERR(handle->buffer))
+ return PTR_ERR(handle->buffer);
+ }
+ } else {
+ copy_last_highmem_page();
+ /* Restore page key for data page (s390 only). */
+ page_key_write(handle->buffer);
+ handle->buffer = get_buffer(&orig_bm, &ca);
+ if (IS_ERR(handle->buffer))
+ return PTR_ERR(handle->buffer);
+ if (handle->buffer != buffer)
+ handle->sync_read = 0;
+ }
+ handle->cur++;
+ return PAGE_SIZE;
+}
+
+/**
+ * snapshot_write_finalize - must be called after the last call to
+ * snapshot_write_next() in case the last page in the image happens
+ * to be a highmem page and its contents should be stored in the
+ * highmem. Additionally, it releases the memory that will not be
+ * used any more.
+ */
+
+void snapshot_write_finalize(struct snapshot_handle *handle)
+{
+ copy_last_highmem_page();
+ /* Restore page key for data page (s390 only). */
+ page_key_write(handle->buffer);
+ page_key_free();
+ /* Free only if we have loaded the image entirely */
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
+ memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ free_highmem_data();
+ }
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+ return !(!nr_copy_pages || !last_highmem_page_copied() ||
+ handle->cur <= nr_meta_pages + nr_copy_pages);
+}
+
+#ifdef CONFIG_HIGHMEM
+/* Assumes that @buf is ready and points to a "safe" page */
+static inline void
+swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+{
+ void *kaddr1, *kaddr2;
+
+ kaddr1 = kmap_atomic(p1);
+ kaddr2 = kmap_atomic(p2);
+ copy_page(buf, kaddr1);
+ copy_page(kaddr1, kaddr2);
+ copy_page(kaddr2, buf);
+ kunmap_atomic(kaddr2);
+ kunmap_atomic(kaddr1);
+}
+
+/**
+ * restore_highmem - for each highmem page that was allocated before
+ * the suspend and included in the suspend image, and also has been
+ * allocated by the "resume" kernel swap its current (ie. "before
+ * resume") contents with the previous (ie. "before suspend") one.
+ *
+ * If the resume eventually fails, we can call this function once
+ * again and restore the "before resume" highmem state.
+ */
+
+int restore_highmem(void)
+{
+ struct highmem_pbe *pbe = highmem_pblist;
+ void *buf;
+
+ if (!pbe)
+ return 0;
+
+ buf = get_image_page(GFP_ATOMIC, PG_SAFE);
+ if (!buf)
+ return -ENOMEM;
+
+ while (pbe) {
+ swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
+ pbe = pbe->next;
+ }
+ free_image_page(buf, PG_UNSAFE_CLEAR);
+ return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map,
+ *pageset1_copy_map, *io_map, *page_resave_map, *compare_map;
+
+int resume_attempted;
+
+int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
+ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
+{
+ int result;
+
+ memory_bm_position_reset(bm);
+
+ do {
+ result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
+
+ if (result)
+ return result;
+ } while (rtree_next_node(bm, 0));
+ return 0;
+}
+
+int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
+ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size))
+{
+ int result;
+
+ memory_bm_position_reset(bm);
+
+ do {
+ result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE);
+
+ if (result)
+ return result;
+
+ } while (rtree_next_node(bm, 0));
+ return 0;
+}
+
+int memory_bm_space_needed(struct memory_bitmap *bm)
+{
+ unsigned long bytes = 0;
+
+ memory_bm_position_reset(bm);
+ do {
+ bytes += PAGE_SIZE;
+ } while (rtree_next_node(bm, 0));
+ return bytes;
+}
+
+int toi_alloc_bitmap(struct memory_bitmap **bm)
+{
+ int error;
+ struct memory_bitmap *bm1;
+
+ bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+ if (!bm1)
+ return -ENOMEM;
+
+ error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
+ if (error) {
+ printk("Error returned - %d.\n", error);
+ kfree(bm1);
+ return -ENOMEM;
+ }
+
+ *bm = bm1;
+ return 0;
+}
+
+void toi_free_bitmap(struct memory_bitmap **bm)
+{
+ if (!*bm)
+ return;
+
+ memory_bm_free(*bm, 0);
+ kfree(*bm);
+ *bm = NULL;
+}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 000000000..8d7a1ef72
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,536 @@
+/*
+ * kernel/power/suspend.c - Suspend to RAM and standby functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/syscalls.h>
+#include <linux/gfp.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/suspend.h>
+#include <linux/syscore_ops.h>
+#include <linux/ftrace.h>
+#include <trace/events/power.h>
+#include <linux/compiler.h>
+#include <linux/moduleparam.h>
+
+#include "power.h"
+
+const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
+const char *pm_states[PM_SUSPEND_MAX];
+
+static const struct platform_suspend_ops *suspend_ops;
+static const struct platform_freeze_ops *freeze_ops;
+static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
+
+void freeze_set_ops(const struct platform_freeze_ops *ops)
+{
+ lock_system_sleep();
+ freeze_ops = ops;
+ unlock_system_sleep();
+}
+
+static void freeze_begin(void)
+{
+ suspend_freeze_state = FREEZE_STATE_NONE;
+}
+
+static void freeze_enter(void)
+{
+ spin_lock_irq(&suspend_freeze_lock);
+ if (pm_wakeup_pending())
+ goto out;
+
+ suspend_freeze_state = FREEZE_STATE_ENTER;
+ spin_unlock_irq(&suspend_freeze_lock);
+
+ get_online_cpus();
+ cpuidle_resume();
+
+ /* Push all the CPUs into the idle loop. */
+ wake_up_all_idle_cpus();
+ pr_debug("PM: suspend-to-idle\n");
+ /* Make the current CPU wait so it can enter the idle loop too. */
+ wait_event(suspend_freeze_wait_head,
+ suspend_freeze_state == FREEZE_STATE_WAKE);
+ pr_debug("PM: resume from suspend-to-idle\n");
+
+ cpuidle_pause();
+ put_online_cpus();
+
+ spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+ suspend_freeze_state = FREEZE_STATE_NONE;
+ spin_unlock_irq(&suspend_freeze_lock);
+}
+
+void freeze_wake(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&suspend_freeze_lock, flags);
+ if (suspend_freeze_state > FREEZE_STATE_NONE) {
+ suspend_freeze_state = FREEZE_STATE_WAKE;
+ wake_up(&suspend_freeze_wait_head);
+ }
+ spin_unlock_irqrestore(&suspend_freeze_lock, flags);
+}
+EXPORT_SYMBOL_GPL(freeze_wake);
+
+static bool valid_state(suspend_state_t state)
+{
+ /*
+ * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
+ * support and need to be valid to the low level
+ * implementation, no valid callback implies that none are valid.
+ */
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/*
+ * If this is set, the "mem" label always corresponds to the deepest sleep state
+ * available, the "standby" label corresponds to the second deepest sleep state
+ * available (if any), and the "freeze" label corresponds to the remaining
+ * available sleep state (if there is one).
+ */
+static bool relative_states;
+
+static int __init sleep_states_setup(char *str)
+{
+ relative_states = !strncmp(str, "1", 1);
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
+ return 1;
+}
+
+__setup("relative_sleep_states=", sleep_states_setup);
+
+/**
+ * suspend_set_ops - Set the global suspend method table.
+ * @ops: Suspend operations to use.
+ */
+void suspend_set_ops(const struct platform_suspend_ops *ops)
+{
+ suspend_state_t i;
+ int j = 0;
+
+ lock_system_sleep();
+
+ suspend_ops = ops;
+ for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
+ if (valid_state(i)) {
+ pm_states[i] = pm_labels[j++];
+ } else if (!relative_states) {
+ pm_states[i] = NULL;
+ j++;
+ }
+
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
+
+ unlock_system_sleep();
+}
+EXPORT_SYMBOL_GPL(suspend_set_ops);
+
+/**
+ * suspend_valid_only_mem - Generic memory-only valid callback.
+ *
+ * Platform drivers that implement mem suspend only and only need to check for
+ * that in their .valid() callback can use this instead of rolling their own
+ * .valid() callback.
+ */
+int suspend_valid_only_mem(suspend_state_t state)
+{
+ return state == PM_SUSPEND_MEM;
+}
+EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
+
+static bool sleep_state_supported(suspend_state_t state)
+{
+ return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+}
+
+static int platform_suspend_prepare(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ suspend_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_late(suspend_state_t state)
+{
+ return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
+ freeze_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_noirq(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ suspend_ops->prepare_late() : 0;
+}
+
+static void platform_resume_noirq(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ suspend_ops->wake();
+}
+
+static void platform_resume_early(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
+ freeze_ops->restore();
+}
+
+static void platform_resume_finish(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ suspend_ops->finish();
+}
+
+static int platform_suspend_begin(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
+ return freeze_ops->begin();
+ else if (suspend_ops->begin)
+ return suspend_ops->begin(state);
+ else
+ return 0;
+}
+
+static void platform_resume_end(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
+ freeze_ops->end();
+ else if (suspend_ops->end)
+ suspend_ops->end();
+}
+
+static void platform_recover(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ suspend_ops->recover();
+}
+
+static bool platform_suspend_again(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ suspend_ops->suspend_again() : false;
+}
+
+#ifdef CONFIG_PM_DEBUG
+static unsigned int pm_test_delay = 5;
+module_param(pm_test_delay, uint, 0644);
+MODULE_PARM_DESC(pm_test_delay,
+ "Number of seconds to wait before resuming from suspend test");
+#endif
+
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level == level) {
+ printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n",
+ pm_test_delay);
+ mdelay(pm_test_delay * 1000);
+ return 1;
+ }
+#endif /* !CONFIG_PM_DEBUG */
+ return 0;
+}
+
+/**
+ * suspend_prepare - Prepare for entering system sleep state.
+ *
+ * Common code run for every system sleep state that can be entered (except for
+ * hibernation). Run suspend notifiers, allocate the "suspend" console and
+ * freeze processes.
+ */
+static int suspend_prepare(suspend_state_t state)
+{
+ int error;
+
+ if (!sleep_state_supported(state))
+ return -EPERM;
+
+ pm_prepare_console();
+
+ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+ if (error)
+ goto Finish;
+
+ trace_suspend_resume(TPS("freeze_processes"), 0, true);
+ error = suspend_freeze_processes();
+ trace_suspend_resume(TPS("freeze_processes"), 0, false);
+ if (!error)
+ return 0;
+
+ suspend_stats.failed_freeze++;
+ dpm_save_failed_step(SUSPEND_FREEZE);
+ Finish:
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+ return error;
+}
+
+/* default implementation */
+void __weak arch_suspend_disable_irqs(void)
+{
+ local_irq_disable();
+}
+
+/* default implementation */
+void __weak arch_suspend_enable_irqs(void)
+{
+ local_irq_enable();
+}
+
+/**
+ * suspend_enter - Make the system enter the given sleep state.
+ * @state: System sleep state to enter.
+ * @wakeup: Returns information that the sleep state should not be re-entered.
+ *
+ * This function should be called after devices have been suspended.
+ */
+static int suspend_enter(suspend_state_t state, bool *wakeup)
+{
+ int error;
+
+ error = platform_suspend_prepare(state);
+ if (error)
+ goto Platform_finish;
+
+ error = dpm_suspend_late(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: late suspend of devices failed\n");
+ goto Platform_finish;
+ }
+ error = platform_suspend_prepare_late(state);
+ if (error)
+ goto Devices_early_resume;
+
+ error = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ goto Platform_early_resume;
+ }
+ error = platform_suspend_prepare_noirq(state);
+ if (error)
+ goto Platform_wake;
+
+ if (suspend_test(TEST_PLATFORM))
+ goto Platform_wake;
+
+ /*
+ * PM_SUSPEND_FREEZE equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus we should invoke freeze_enter() soon after
+ * all the devices are suspended.
+ */
+ if (state == PM_SUSPEND_FREEZE) {
+ trace_suspend_resume(TPS("machine_suspend"), state, true);
+ freeze_enter();
+ trace_suspend_resume(TPS("machine_suspend"), state, false);
+ goto Platform_wake;
+ }
+
+ error = disable_nonboot_cpus();
+ if (error || suspend_test(TEST_CPUS))
+ goto Enable_cpus;
+
+ arch_suspend_disable_irqs();
+ BUG_ON(!irqs_disabled());
+
+ error = syscore_suspend();
+ if (!error) {
+ *wakeup = pm_wakeup_pending();
+ if (!(suspend_test(TEST_CORE) || *wakeup)) {
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, true);
+ error = suspend_ops->enter(state);
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, false);
+ events_check_enabled = false;
+ }
+ syscore_resume();
+ }
+
+ arch_suspend_enable_irqs();
+ BUG_ON(irqs_disabled());
+
+ Enable_cpus:
+ enable_nonboot_cpus();
+
+ Platform_wake:
+ platform_resume_noirq(state);
+ dpm_resume_noirq(PMSG_RESUME);
+
+ Platform_early_resume:
+ platform_resume_early(state);
+
+ Devices_early_resume:
+ dpm_resume_early(PMSG_RESUME);
+
+ Platform_finish:
+ platform_resume_finish(state);
+ return error;
+}
+
+/**
+ * suspend_devices_and_enter - Suspend devices and enter system sleep state.
+ * @state: System sleep state to enter.
+ */
+int suspend_devices_and_enter(suspend_state_t state)
+{
+ int error;
+ bool wakeup = false;
+
+ if (!sleep_state_supported(state))
+ return -ENOSYS;
+
+ error = platform_suspend_begin(state);
+ if (error)
+ goto Close;
+
+ suspend_console();
+ suspend_test_start();
+ error = dpm_suspend_start(PMSG_SUSPEND);
+ if (error) {
+ pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ goto Recover_platform;
+ }
+ suspend_test_finish("suspend devices");
+ if (suspend_test(TEST_DEVICES))
+ goto Recover_platform;
+
+ do {
+ error = suspend_enter(state, &wakeup);
+ } while (!error && !wakeup && platform_suspend_again(state));
+
+ Resume_devices:
+ suspend_test_start();
+ dpm_resume_end(PMSG_RESUME);
+ suspend_test_finish("resume devices");
+ trace_suspend_resume(TPS("resume_console"), state, true);
+ resume_console();
+ trace_suspend_resume(TPS("resume_console"), state, false);
+
+ Close:
+ platform_resume_end(state);
+ return error;
+
+ Recover_platform:
+ platform_recover(state);
+ goto Resume_devices;
+}
+
+/**
+ * suspend_finish - Clean up before finishing the suspend sequence.
+ *
+ * Call platform code to clean up, restart processes, and free the console that
+ * we've allocated. This routine is not called for hibernation.
+ */
+static void suspend_finish(void)
+{
+ suspend_thaw_processes();
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+}
+
+/**
+ * enter_state - Do common work needed to enter system sleep state.
+ * @state: System sleep state to enter.
+ *
+ * Make sure that no one else is trying to put the system into a sleep state.
+ * Fail if that's not the case. Otherwise, prepare for system suspend, make the
+ * system enter the given sleep state and clean up after wakeup.
+ */
+static int enter_state(suspend_state_t state)
+{
+ int error;
+
+ trace_suspend_resume(TPS("suspend_enter"), state, true);
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
+ pr_warning("PM: Unsupported test mode for freeze state,"
+ "please choose none/freezer/devices/platform.\n");
+ return -EAGAIN;
+ }
+#endif
+ } else if (!valid_state(state)) {
+ return -EINVAL;
+ }
+ if (!mutex_trylock(&pm_mutex))
+ return -EBUSY;
+
+ if (state == PM_SUSPEND_FREEZE)
+ freeze_begin();
+
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ error = suspend_prepare(state);
+ if (error)
+ goto Unlock;
+
+ if (suspend_test(TEST_FREEZER))
+ goto Finish;
+
+ trace_suspend_resume(TPS("suspend_enter"), state, false);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ pm_restrict_gfp_mask();
+ error = suspend_devices_and_enter(state);
+ pm_restore_gfp_mask();
+
+ Finish:
+ pr_debug("PM: Finishing wakeup.\n");
+ suspend_finish();
+ Unlock:
+ mutex_unlock(&pm_mutex);
+ return error;
+}
+
+/**
+ * pm_suspend - Externally visible function for suspending the system.
+ * @state: System sleep state to enter.
+ *
+ * Check if the value of @state represents one of the supported states,
+ * execute enter_state() and update system suspend statistics.
+ */
+int pm_suspend(suspend_state_t state)
+{
+ int error;
+
+ if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
+ return -EINVAL;
+
+ error = enter_state(state);
+ if (error) {
+ suspend_stats.fail++;
+ dpm_save_failed_errno(error);
+ } else {
+ suspend_stats.success++;
+ }
+ return error;
+}
+EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 000000000..084452e34
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,218 @@
+/*
+ * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
+ *
+ * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/rtc.h>
+
+#include "power.h"
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending. Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS 10
+
+static unsigned long suspend_test_start_time;
+static u32 test_repeat_count_max = 1;
+static u32 test_repeat_count_current;
+
+void suspend_test_start(void)
+{
+ /* FIXME Use better timebase than "jiffies", ideally a clocksource.
+ * What we want is a hardware counter that will work correctly even
+ * during the irqs-are-off stages of the suspend/resume cycle...
+ */
+ suspend_test_start_time = jiffies;
+}
+
+void suspend_test_finish(const char *label)
+{
+ long nj = jiffies - suspend_test_start_time;
+ unsigned msec;
+
+ msec = jiffies_to_msecs(abs(nj));
+ pr_info("PM: %s took %d.%03d seconds\n", label,
+ msec / 1000, msec % 1000);
+
+ /* Warning on suspend means the RTC alarm period needs to be
+ * larger -- the system was sooo slooowwww to suspend that the
+ * alarm (should have) fired before the system went to sleep!
+ *
+ * Warning on either suspend or resume also means the system
+ * has some performance issues. The stack dump of a WARN_ON
+ * is more likely to get the right attention than a printk...
+ */
+ WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+ "Component: %s, time: %u\n", label, msec);
+}
+
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system. RTCs wake alarms are a common self-contained mechanism.
+ */
+
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+ static char err_readtime[] __initdata =
+ KERN_ERR "PM: can't read %s time, err %d\n";
+ static char err_wakealarm [] __initdata =
+ KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+ static char err_suspend[] __initdata =
+ KERN_ERR "PM: suspend test failed, error %d\n";
+ static char info_test[] __initdata =
+ KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+
+ unsigned long now;
+ struct rtc_wkalrm alm;
+ int status;
+
+ /* this may fail if the RTC hasn't been initialized */
+repeat:
+ status = rtc_read_time(rtc, &alm.time);
+ if (status < 0) {
+ printk(err_readtime, dev_name(&rtc->dev), status);
+ return;
+ }
+ rtc_tm_to_time(&alm.time, &now);
+
+ memset(&alm, 0, sizeof alm);
+ rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ alm.enabled = true;
+
+ status = rtc_set_alarm(rtc, &alm);
+ if (status < 0) {
+ printk(err_wakealarm, dev_name(&rtc->dev), status);
+ return;
+ }
+
+ if (state == PM_SUSPEND_MEM) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ if (status == -ENODEV)
+ state = PM_SUSPEND_STANDBY;
+ }
+ if (state == PM_SUSPEND_STANDBY) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ if (status < 0)
+ state = PM_SUSPEND_FREEZE;
+ }
+ if (state == PM_SUSPEND_FREEZE) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+
+ if (status < 0)
+ printk(err_suspend, status);
+
+ test_repeat_count_current++;
+ if (test_repeat_count_current < test_repeat_count_max)
+ goto repeat;
+
+ /* Some platforms can't detect that the alarm triggered the
+ * wakeup, or (accordingly) disable it after it afterwards.
+ * It's supposed to give oneshot behavior; cope.
+ */
+ alm.enabled = false;
+ rtc_set_alarm(rtc, &alm);
+}
+
+static int __init has_wakealarm(struct device *dev, const void *data)
+{
+ struct rtc_device *candidate = to_rtc_device(dev);
+
+ if (!candidate->ops->set_alarm)
+ return 0;
+ if (!device_may_wakeup(candidate->dev.parent))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time. They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static const char *test_state_label __initdata;
+
+static char warn_bad_state[] __initdata =
+ KERN_WARNING "PM: can't test '%s' suspend state\n";
+
+static int __init setup_test_suspend(char *value)
+{
+ int i;
+ char *repeat;
+ char *suspend_type;
+
+ /* example : "=mem[,N]" ==> "mem[,N]" */
+ value++;
+ suspend_type = strsep(&value, ",");
+ if (!suspend_type)
+ return 0;
+
+ repeat = strsep(&value, ",");
+ if (repeat) {
+ if (kstrtou32(repeat, 0, &test_repeat_count_max))
+ return 0;
+ }
+
+ for (i = 0; pm_labels[i]; i++)
+ if (!strcmp(pm_labels[i], suspend_type)) {
+ test_state_label = pm_labels[i];
+ return 0;
+ }
+
+ printk(warn_bad_state, suspend_type);
+ return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+
+static int __init test_suspend(void)
+{
+ static char warn_no_rtc[] __initdata =
+ KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+
+ struct rtc_device *rtc = NULL;
+ struct device *dev;
+ suspend_state_t test_state;
+
+ /* PM is initialized by now; is that state testable? */
+ if (!test_state_label)
+ return 0;
+
+ for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
+ const char *state_label = pm_states[test_state];
+
+ if (state_label && !strcmp(test_state_label, state_label))
+ break;
+ }
+ if (test_state == PM_SUSPEND_MAX) {
+ printk(warn_bad_state, test_state_label);
+ return 0;
+ }
+
+ /* RTCs have initialized by now too ... can we use one? */
+ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
+ if (dev)
+ rtc = rtc_class_open(dev_name(dev));
+ if (!rtc) {
+ printk(warn_no_rtc);
+ return 0;
+ }
+
+ /* go for it */
+ test_wakealarm(rtc, test_state);
+ rtc_class_close(rtc);
+ return 0;
+}
+late_initcall(test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 000000000..570aff817
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,1512 @@
+/*
+ * linux/kernel/power/swap.c
+ *
+ * This file provides functions for reading the suspend image from
+ * and writing it to a swap partition.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/genhd.h>
+#include <linux/device.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/slab.h>
+#include <linux/lzo.h>
+#include <linux/vmalloc.h>
+#include <linux/cpumask.h>
+#include <linux/atomic.h>
+#include <linux/kthread.h>
+#include <linux/crc32.h>
+#include <linux/ktime.h>
+
+#include "power.h"
+
+#define HIBERNATE_SIG "S1SUSPEND"
+
+/*
+ * The swap map is a data structure used for keeping track of each page
+ * written to a swap partition. It consists of many swap_map_page
+ * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
+ * These structures are stored on the swap and linked together with the
+ * help of the .next_swap member.
+ *
+ * The swap map is created during suspend. The swap map pages are
+ * allocated and populated one at a time, so we only need one memory
+ * page to set up the entire structure.
+ *
+ * During resume we pick up all swap_map_page structures into a list.
+ */
+
+#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
+
+/*
+ * Number of free pages that are not high.
+ */
+static inline unsigned long low_free_pages(void)
+{
+ return nr_free_pages() - nr_free_highpages();
+}
+
+/*
+ * Number of pages required to be kept free while writing the image. Always
+ * half of all available low pages before the writing starts.
+ */
+static inline unsigned long reqd_free_pages(void)
+{
+ return low_free_pages() / 2;
+}
+
+struct swap_map_page {
+ sector_t entries[MAP_PAGE_ENTRIES];
+ sector_t next_swap;
+};
+
+struct swap_map_page_list {
+ struct swap_map_page *map;
+ struct swap_map_page_list *next;
+};
+
+/**
+ * The swap_map_handle structure is used for handling swap in
+ * a file-alike way
+ */
+
+struct swap_map_handle {
+ struct swap_map_page *cur;
+ struct swap_map_page_list *maps;
+ sector_t cur_swap;
+ sector_t first_sector;
+ unsigned int k;
+ unsigned long reqd_free_pages;
+ u32 crc32;
+};
+
+struct swsusp_header {
+ char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
+ sizeof(u32)];
+ u32 crc32;
+ sector_t image;
+ unsigned int flags; /* Flags to pass to the "boot" kernel */
+ char orig_sig[10];
+ char sig[10];
+} __packed;
+
+static struct swsusp_header *swsusp_header;
+
+/**
+ * The following functions are used for tracing the allocated
+ * swap pages, so that they can be freed in case of an error.
+ */
+
+struct swsusp_extent {
+ struct rb_node node;
+ unsigned long start;
+ unsigned long end;
+};
+
+static struct rb_root swsusp_extents = RB_ROOT;
+
+static int swsusp_extents_insert(unsigned long swap_offset)
+{
+ struct rb_node **new = &(swsusp_extents.rb_node);
+ struct rb_node *parent = NULL;
+ struct swsusp_extent *ext;
+
+ /* Figure out where to put the new node */
+ while (*new) {
+ ext = rb_entry(*new, struct swsusp_extent, node);
+ parent = *new;
+ if (swap_offset < ext->start) {
+ /* Try to merge */
+ if (swap_offset == ext->start - 1) {
+ ext->start--;
+ return 0;
+ }
+ new = &((*new)->rb_left);
+ } else if (swap_offset > ext->end) {
+ /* Try to merge */
+ if (swap_offset == ext->end + 1) {
+ ext->end++;
+ return 0;
+ }
+ new = &((*new)->rb_right);
+ } else {
+ /* It already is in the tree */
+ return -EINVAL;
+ }
+ }
+ /* Add the new node and rebalance the tree. */
+ ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
+ if (!ext)
+ return -ENOMEM;
+
+ ext->start = swap_offset;
+ ext->end = swap_offset;
+ rb_link_node(&ext->node, parent, new);
+ rb_insert_color(&ext->node, &swsusp_extents);
+ return 0;
+}
+
+/**
+ * alloc_swapdev_block - allocate a swap page and register that it has
+ * been allocated, so that it can be freed in case of an error.
+ */
+
+sector_t alloc_swapdev_block(int swap)
+{
+ unsigned long offset;
+
+ offset = swp_offset(get_swap_page_of_type(swap));
+ if (offset) {
+ if (swsusp_extents_insert(offset))
+ swap_free(swp_entry(swap, offset));
+ else
+ return swapdev_block(swap, offset);
+ }
+ return 0;
+}
+
+/**
+ * free_all_swap_pages - free swap pages allocated for saving image data.
+ * It also frees the extents used to register which swap entries had been
+ * allocated.
+ */
+
+void free_all_swap_pages(int swap)
+{
+ struct rb_node *node;
+
+ while ((node = swsusp_extents.rb_node)) {
+ struct swsusp_extent *ext;
+ unsigned long offset;
+
+ ext = container_of(node, struct swsusp_extent, node);
+ rb_erase(node, &swsusp_extents);
+ for (offset = ext->start; offset <= ext->end; offset++)
+ swap_free(swp_entry(swap, offset));
+
+ kfree(ext);
+ }
+}
+
+int swsusp_swap_in_use(void)
+{
+ return (swsusp_extents.rb_node != NULL);
+}
+
+/*
+ * General things
+ */
+
+static unsigned short root_swap = 0xffff;
+struct block_device *hib_resume_bdev;
+
+/*
+ * Saving part
+ */
+
+static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
+{
+ int error;
+
+ hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
+ !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
+ memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
+ swsusp_header->image = handle->first_sector;
+ swsusp_header->flags = flags;
+ if (flags & SF_CRC32_MODE)
+ swsusp_header->crc32 = handle->crc32;
+ error = hib_bio_write_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ } else {
+ printk(KERN_ERR "PM: Swap header not found!\n");
+ error = -ENODEV;
+ }
+ return error;
+}
+
+/**
+ * swsusp_swap_check - check if the resume device is a swap device
+ * and get its index (if so)
+ *
+ * This is called before saving image
+ */
+static int swsusp_swap_check(void)
+{
+ int res;
+
+ res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
+ &hib_resume_bdev);
+ if (res < 0)
+ return res;
+
+ root_swap = res;
+ res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
+ if (res)
+ return res;
+
+ res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
+ if (res < 0)
+ blkdev_put(hib_resume_bdev, FMODE_WRITE);
+
+ return res;
+}
+
+/**
+ * write_page - Write one page to given swap location.
+ * @buf: Address we're writing.
+ * @offset: Offset of the swap page we're writing to.
+ * @bio_chain: Link the next write BIO here
+ */
+
+static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+{
+ void *src;
+ int ret;
+
+ if (!offset)
+ return -ENOSPC;
+
+ if (bio_chain) {
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
+ if (src) {
+ copy_page(src, buf);
+ } else {
+ ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+ if (ret)
+ return ret;
+ src = (void *)__get_free_page(__GFP_WAIT |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
+ if (src) {
+ copy_page(src, buf);
+ } else {
+ WARN_ON_ONCE(1);
+ bio_chain = NULL; /* Go synchronous */
+ src = buf;
+ }
+ }
+ } else {
+ src = buf;
+ }
+ return hib_bio_write_page(offset, src, bio_chain);
+}
+
+static void release_swap_writer(struct swap_map_handle *handle)
+{
+ if (handle->cur)
+ free_page((unsigned long)handle->cur);
+ handle->cur = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+ int ret;
+
+ ret = swsusp_swap_check();
+ if (ret) {
+ if (ret != -ENOSPC)
+ printk(KERN_ERR "PM: Cannot find swap device, try "
+ "swapon -a.\n");
+ return ret;
+ }
+ handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+ if (!handle->cur) {
+ ret = -ENOMEM;
+ goto err_close;
+ }
+ handle->cur_swap = alloc_swapdev_block(root_swap);
+ if (!handle->cur_swap) {
+ ret = -ENOSPC;
+ goto err_rel;
+ }
+ handle->k = 0;
+ handle->reqd_free_pages = reqd_free_pages();
+ handle->first_sector = handle->cur_swap;
+ return 0;
+err_rel:
+ release_swap_writer(handle);
+err_close:
+ swsusp_close(FMODE_WRITE);
+ return ret;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
+{
+ int error = 0;
+ sector_t offset;
+
+ if (!handle->cur)
+ return -EINVAL;
+ offset = alloc_swapdev_block(root_swap);
+ error = write_page(buf, offset, bio_chain);
+ if (error)
+ return error;
+ handle->cur->entries[handle->k++] = offset;
+ if (handle->k >= MAP_PAGE_ENTRIES) {
+ offset = alloc_swapdev_block(root_swap);
+ if (!offset)
+ return -ENOSPC;
+ handle->cur->next_swap = offset;
+ error = write_page(handle->cur, handle->cur_swap, bio_chain);
+ if (error)
+ goto out;
+ clear_page(handle->cur);
+ handle->cur_swap = offset;
+ handle->k = 0;
+
+ if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
+ /*
+ * Recalculate the number of required free pages, to
+ * make sure we never take more than half.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+ }
+ }
+ out:
+ return error;
+}
+
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+ if (handle->cur && handle->cur_swap)
+ return write_page(handle->cur, handle->cur_swap, NULL);
+ else
+ return -EINVAL;
+}
+
+static int swap_writer_finish(struct swap_map_handle *handle,
+ unsigned int flags, int error)
+{
+ if (!error) {
+ flush_swap_writer(handle);
+ printk(KERN_INFO "PM: S");
+ error = mark_swapfiles(handle, flags);
+ printk("|\n");
+ }
+
+ if (error)
+ free_all_swap_pages(root_swap);
+ release_swap_writer(handle);
+ swsusp_close(FMODE_WRITE);
+
+ return error;
+}
+
+/* We need to remember how much compressed data we need to read. */
+#define LZO_HEADER sizeof(size_t)
+
+/* Number of pages/bytes we'll compress at one time. */
+#define LZO_UNC_PAGES 32
+#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
+
+/* Number of pages/bytes we need for compressed data (worst case). */
+#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
+ LZO_HEADER, PAGE_SIZE)
+#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
+
+/* Maximum number of threads for compression/decompression. */
+#define LZO_THREADS 3
+
+/* Minimum/maximum number of pages for read buffering. */
+#define LZO_MIN_RD_PAGES 1024
+#define LZO_MAX_RD_PAGES 8192
+
+
+/**
+ * save_image - save the suspend image data
+ */
+
+static int save_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_write)
+{
+ unsigned int m;
+ int ret;
+ int nr_pages;
+ int err2;
+ struct bio *bio;
+ ktime_t start;
+ ktime_t stop;
+
+ printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+ nr_to_write);
+ m = nr_to_write / 10;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ bio = NULL;
+ start = ktime_get();
+ while (1) {
+ ret = snapshot_read_next(snapshot);
+ if (ret <= 0)
+ break;
+ ret = swap_write_page(handle, data_of(*snapshot), &bio);
+ if (ret)
+ break;
+ if (!(nr_pages % m))
+ printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
+ nr_pages++;
+ }
+ err2 = hib_wait_on_bio_chain(&bio);
+ stop = ktime_get();
+ if (!ret)
+ ret = err2;
+ if (!ret)
+ printk(KERN_INFO "PM: Image saving done.\n");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
+ return ret;
+}
+
+/**
+ * Structure used for CRC32.
+ */
+struct crc_data {
+ struct task_struct *thr; /* thread */
+ atomic_t ready; /* ready to start flag */
+ atomic_t stop; /* ready to stop flag */
+ unsigned run_threads; /* nr current threads */
+ wait_queue_head_t go; /* start crc update */
+ wait_queue_head_t done; /* crc update done */
+ u32 *crc32; /* points to handle's crc32 */
+ size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
+ unsigned char *unc[LZO_THREADS]; /* uncompressed data */
+};
+
+/**
+ * CRC32 update function that runs in its own thread.
+ */
+static int crc32_threadfn(void *data)
+{
+ struct crc_data *d = data;
+ unsigned i;
+
+ while (1) {
+ wait_event(d->go, atomic_read(&d->ready) ||
+ kthread_should_stop());
+ if (kthread_should_stop()) {
+ d->thr = NULL;
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ break;
+ }
+ atomic_set(&d->ready, 0);
+
+ for (i = 0; i < d->run_threads; i++)
+ *d->crc32 = crc32_le(*d->crc32,
+ d->unc[i], *d->unc_len[i]);
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ }
+ return 0;
+}
+/**
+ * Structure used for LZO data compression.
+ */
+struct cmp_data {
+ struct task_struct *thr; /* thread */
+ atomic_t ready; /* ready to start flag */
+ atomic_t stop; /* ready to stop flag */
+ int ret; /* return code */
+ wait_queue_head_t go; /* start compression */
+ wait_queue_head_t done; /* compression done */
+ size_t unc_len; /* uncompressed length */
+ size_t cmp_len; /* compressed length */
+ unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
+ unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
+};
+
+/**
+ * Compression function that runs in its own thread.
+ */
+static int lzo_compress_threadfn(void *data)
+{
+ struct cmp_data *d = data;
+
+ while (1) {
+ wait_event(d->go, atomic_read(&d->ready) ||
+ kthread_should_stop());
+ if (kthread_should_stop()) {
+ d->thr = NULL;
+ d->ret = -1;
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ break;
+ }
+ atomic_set(&d->ready, 0);
+
+ d->ret = lzo1x_1_compress(d->unc, d->unc_len,
+ d->cmp + LZO_HEADER, &d->cmp_len,
+ d->wrk);
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ }
+ return 0;
+}
+
+/**
+ * save_image_lzo - Save the suspend image data compressed with LZO.
+ * @handle: Swap map handle to use for saving the image.
+ * @snapshot: Image to read data from.
+ * @nr_to_write: Number of pages to save.
+ */
+static int save_image_lzo(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_write)
+{
+ unsigned int m;
+ int ret = 0;
+ int nr_pages;
+ int err2;
+ struct bio *bio;
+ ktime_t start;
+ ktime_t stop;
+ size_t off;
+ unsigned thr, run_threads, nr_threads;
+ unsigned char *page = NULL;
+ struct cmp_data *data = NULL;
+ struct crc_data *crc = NULL;
+
+ /*
+ * We'll limit the number of threads for compression to limit memory
+ * footprint.
+ */
+ nr_threads = num_online_cpus() - 1;
+ nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+
+ page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ if (!page) {
+ printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
+ data = vmalloc(sizeof(*data) * nr_threads);
+ if (!data) {
+ printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ for (thr = 0; thr < nr_threads; thr++)
+ memset(&data[thr], 0, offsetof(struct cmp_data, go));
+
+ crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ if (!crc) {
+ printk(KERN_ERR "PM: Failed to allocate crc\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ memset(crc, 0, offsetof(struct crc_data, go));
+
+ /*
+ * Start the compression threads.
+ */
+ for (thr = 0; thr < nr_threads; thr++) {
+ init_waitqueue_head(&data[thr].go);
+ init_waitqueue_head(&data[thr].done);
+
+ data[thr].thr = kthread_run(lzo_compress_threadfn,
+ &data[thr],
+ "image_compress/%u", thr);
+ if (IS_ERR(data[thr].thr)) {
+ data[thr].thr = NULL;
+ printk(KERN_ERR
+ "PM: Cannot start compression threads\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ }
+
+ /*
+ * Start the CRC32 thread.
+ */
+ init_waitqueue_head(&crc->go);
+ init_waitqueue_head(&crc->done);
+
+ handle->crc32 = 0;
+ crc->crc32 = &handle->crc32;
+ for (thr = 0; thr < nr_threads; thr++) {
+ crc->unc[thr] = data[thr].unc;
+ crc->unc_len[thr] = &data[thr].unc_len;
+ }
+
+ crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
+ if (IS_ERR(crc->thr)) {
+ crc->thr = NULL;
+ printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
+ /*
+ * Adjust the number of required free pages after all allocations have
+ * been done. We don't want to run out of pages when writing.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+
+ printk(KERN_INFO
+ "PM: Using %u thread(s) for compression.\n"
+ "PM: Compressing and saving image data (%u pages)...\n",
+ nr_threads, nr_to_write);
+ m = nr_to_write / 10;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ bio = NULL;
+ start = ktime_get();
+ for (;;) {
+ for (thr = 0; thr < nr_threads; thr++) {
+ for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
+ ret = snapshot_read_next(snapshot);
+ if (ret < 0)
+ goto out_finish;
+
+ if (!ret)
+ break;
+
+ memcpy(data[thr].unc + off,
+ data_of(*snapshot), PAGE_SIZE);
+
+ if (!(nr_pages % m))
+ printk(KERN_INFO
+ "PM: Image saving progress: "
+ "%3d%%\n",
+ nr_pages / m * 10);
+ nr_pages++;
+ }
+ if (!off)
+ break;
+
+ data[thr].unc_len = off;
+
+ atomic_set(&data[thr].ready, 1);
+ wake_up(&data[thr].go);
+ }
+
+ if (!thr)
+ break;
+
+ crc->run_threads = thr;
+ atomic_set(&crc->ready, 1);
+ wake_up(&crc->go);
+
+ for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
+ wait_event(data[thr].done,
+ atomic_read(&data[thr].stop));
+ atomic_set(&data[thr].stop, 0);
+
+ ret = data[thr].ret;
+
+ if (ret < 0) {
+ printk(KERN_ERR "PM: LZO compression failed\n");
+ goto out_finish;
+ }
+
+ if (unlikely(!data[thr].cmp_len ||
+ data[thr].cmp_len >
+ lzo1x_worst_compress(data[thr].unc_len))) {
+ printk(KERN_ERR
+ "PM: Invalid LZO compressed length\n");
+ ret = -1;
+ goto out_finish;
+ }
+
+ *(size_t *)data[thr].cmp = data[thr].cmp_len;
+
+ /*
+ * Given we are writing one page at a time to disk, we
+ * copy that much from the buffer, although the last
+ * bit will likely be smaller than full page. This is
+ * OK - we saved the length of the compressed data, so
+ * any garbage at the end will be discarded when we
+ * read it.
+ */
+ for (off = 0;
+ off < LZO_HEADER + data[thr].cmp_len;
+ off += PAGE_SIZE) {
+ memcpy(page, data[thr].cmp + off, PAGE_SIZE);
+
+ ret = swap_write_page(handle, page, &bio);
+ if (ret)
+ goto out_finish;
+ }
+ }
+
+ wait_event(crc->done, atomic_read(&crc->stop));
+ atomic_set(&crc->stop, 0);
+ }
+
+out_finish:
+ err2 = hib_wait_on_bio_chain(&bio);
+ stop = ktime_get();
+ if (!ret)
+ ret = err2;
+ if (!ret)
+ printk(KERN_INFO "PM: Image saving done.\n");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
+out_clean:
+ if (crc) {
+ if (crc->thr)
+ kthread_stop(crc->thr);
+ kfree(crc);
+ }
+ if (data) {
+ for (thr = 0; thr < nr_threads; thr++)
+ if (data[thr].thr)
+ kthread_stop(data[thr].thr);
+ vfree(data);
+ }
+ if (page) free_page((unsigned long)page);
+
+ return ret;
+}
+
+/**
+ * enough_swap - Make sure we have enough swap to save the image.
+ *
+ * Returns TRUE or FALSE after checking the total amount of swap
+ * space avaiable from the resume partition.
+ */
+
+static int enough_swap(unsigned int nr_pages, unsigned int flags)
+{
+ unsigned int free_swap = count_swap_pages(root_swap, 1);
+ unsigned int required;
+
+ pr_debug("PM: Free swap pages: %u\n", free_swap);
+
+ required = PAGES_FOR_IO + nr_pages;
+ return free_swap > required;
+}
+
+/**
+ * swsusp_write - Write entire image and metadata.
+ * @flags: flags to pass to the "boot" kernel in the image header
+ *
+ * It is important _NOT_ to umount filesystems at this point. We want
+ * them synced (in case something goes wrong) but we DO not want to mark
+ * filesystem clean: it is not. (And it does not matter, if we resume
+ * correctly, we'll mark system clean, anyway.)
+ */
+
+int swsusp_write(unsigned int flags)
+{
+ struct swap_map_handle handle;
+ struct snapshot_handle snapshot;
+ struct swsusp_info *header;
+ unsigned long pages;
+ int error;
+
+ pages = snapshot_get_image_size();
+ error = get_swap_writer(&handle);
+ if (error) {
+ printk(KERN_ERR "PM: Cannot get swap writer\n");
+ return error;
+ }
+ if (flags & SF_NOCOMPRESS_MODE) {
+ if (!enough_swap(pages, flags)) {
+ printk(KERN_ERR "PM: Not enough free swap\n");
+ error = -ENOSPC;
+ goto out_finish;
+ }
+ }
+ memset(&snapshot, 0, sizeof(struct snapshot_handle));
+ error = snapshot_read_next(&snapshot);
+ if (error < PAGE_SIZE) {
+ if (error >= 0)
+ error = -EFAULT;
+
+ goto out_finish;
+ }
+ header = (struct swsusp_info *)data_of(snapshot);
+ error = swap_write_page(&handle, header, NULL);
+ if (!error) {
+ error = (flags & SF_NOCOMPRESS_MODE) ?
+ save_image(&handle, &snapshot, pages - 1) :
+ save_image_lzo(&handle, &snapshot, pages - 1);
+ }
+out_finish:
+ error = swap_writer_finish(&handle, flags, error);
+ return error;
+}
+
+/**
+ * The following functions allow us to read data using a swap map
+ * in a file-alike way
+ */
+
+static void release_swap_reader(struct swap_map_handle *handle)
+{
+ struct swap_map_page_list *tmp;
+
+ while (handle->maps) {
+ if (handle->maps->map)
+ free_page((unsigned long)handle->maps->map);
+ tmp = handle->maps;
+ handle->maps = handle->maps->next;
+ kfree(tmp);
+ }
+ handle->cur = NULL;
+}
+
+static int get_swap_reader(struct swap_map_handle *handle,
+ unsigned int *flags_p)
+{
+ int error;
+ struct swap_map_page_list *tmp, *last;
+ sector_t offset;
+
+ *flags_p = swsusp_header->flags;
+
+ if (!swsusp_header->image) /* how can this happen? */
+ return -EINVAL;
+
+ handle->cur = NULL;
+ last = handle->maps = NULL;
+ offset = swsusp_header->image;
+ while (offset) {
+ tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
+ if (!tmp) {
+ release_swap_reader(handle);
+ return -ENOMEM;
+ }
+ memset(tmp, 0, sizeof(*tmp));
+ if (!handle->maps)
+ handle->maps = tmp;
+ if (last)
+ last->next = tmp;
+ last = tmp;
+
+ tmp->map = (struct swap_map_page *)
+ __get_free_page(__GFP_WAIT | __GFP_HIGH);
+ if (!tmp->map) {
+ release_swap_reader(handle);
+ return -ENOMEM;
+ }
+
+ error = hib_bio_read_page(offset, tmp->map, NULL);
+ if (error) {
+ release_swap_reader(handle);
+ return error;
+ }
+ offset = tmp->map->next_swap;
+ }
+ handle->k = 0;
+ handle->cur = handle->maps->map;
+ return 0;
+}
+
+static int swap_read_page(struct swap_map_handle *handle, void *buf,
+ struct bio **bio_chain)
+{
+ sector_t offset;
+ int error;
+ struct swap_map_page_list *tmp;
+
+ if (!handle->cur)
+ return -EINVAL;
+ offset = handle->cur->entries[handle->k];
+ if (!offset)
+ return -EFAULT;
+ error = hib_bio_read_page(offset, buf, bio_chain);
+ if (error)
+ return error;
+ if (++handle->k >= MAP_PAGE_ENTRIES) {
+ handle->k = 0;
+ free_page((unsigned long)handle->maps->map);
+ tmp = handle->maps;
+ handle->maps = handle->maps->next;
+ kfree(tmp);
+ if (!handle->maps)
+ release_swap_reader(handle);
+ else
+ handle->cur = handle->maps->map;
+ }
+ return error;
+}
+
+static int swap_reader_finish(struct swap_map_handle *handle)
+{
+ release_swap_reader(handle);
+
+ return 0;
+}
+
+/**
+ * load_image - load the image using the swap map handle
+ * @handle and the snapshot handle @snapshot
+ * (assume there are @nr_pages pages to load)
+ */
+
+static int load_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_read)
+{
+ unsigned int m;
+ int ret = 0;
+ ktime_t start;
+ ktime_t stop;
+ struct bio *bio;
+ int err2;
+ unsigned nr_pages;
+
+ printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
+ nr_to_read);
+ m = nr_to_read / 10;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ bio = NULL;
+ start = ktime_get();
+ for ( ; ; ) {
+ ret = snapshot_write_next(snapshot);
+ if (ret <= 0)
+ break;
+ ret = swap_read_page(handle, data_of(*snapshot), &bio);
+ if (ret)
+ break;
+ if (snapshot->sync_read)
+ ret = hib_wait_on_bio_chain(&bio);
+ if (ret)
+ break;
+ if (!(nr_pages % m))
+ printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
+ nr_pages++;
+ }
+ err2 = hib_wait_on_bio_chain(&bio);
+ stop = ktime_get();
+ if (!ret)
+ ret = err2;
+ if (!ret) {
+ printk(KERN_INFO "PM: Image loading done.\n");
+ snapshot_write_finalize(snapshot);
+ if (!snapshot_image_loaded(snapshot))
+ ret = -ENODATA;
+ }
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
+ return ret;
+}
+
+/**
+ * Structure used for LZO data decompression.
+ */
+struct dec_data {
+ struct task_struct *thr; /* thread */
+ atomic_t ready; /* ready to start flag */
+ atomic_t stop; /* ready to stop flag */
+ int ret; /* return code */
+ wait_queue_head_t go; /* start decompression */
+ wait_queue_head_t done; /* decompression done */
+ size_t unc_len; /* uncompressed length */
+ size_t cmp_len; /* compressed length */
+ unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
+};
+
+/**
+ * Deompression function that runs in its own thread.
+ */
+static int lzo_decompress_threadfn(void *data)
+{
+ struct dec_data *d = data;
+
+ while (1) {
+ wait_event(d->go, atomic_read(&d->ready) ||
+ kthread_should_stop());
+ if (kthread_should_stop()) {
+ d->thr = NULL;
+ d->ret = -1;
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ break;
+ }
+ atomic_set(&d->ready, 0);
+
+ d->unc_len = LZO_UNC_SIZE;
+ d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
+ d->unc, &d->unc_len);
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ }
+ return 0;
+}
+
+/**
+ * load_image_lzo - Load compressed image data and decompress them with LZO.
+ * @handle: Swap map handle to use for loading data.
+ * @snapshot: Image to copy uncompressed data into.
+ * @nr_to_read: Number of pages to load.
+ */
+static int load_image_lzo(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_read)
+{
+ unsigned int m;
+ int ret = 0;
+ int eof = 0;
+ struct bio *bio;
+ ktime_t start;
+ ktime_t stop;
+ unsigned nr_pages;
+ size_t off;
+ unsigned i, thr, run_threads, nr_threads;
+ unsigned ring = 0, pg = 0, ring_size = 0,
+ have = 0, want, need, asked = 0;
+ unsigned long read_pages = 0;
+ unsigned char **page = NULL;
+ struct dec_data *data = NULL;
+ struct crc_data *crc = NULL;
+
+ /*
+ * We'll limit the number of threads for decompression to limit memory
+ * footprint.
+ */
+ nr_threads = num_online_cpus() - 1;
+ nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+
+ page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
+ if (!page) {
+ printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
+ data = vmalloc(sizeof(*data) * nr_threads);
+ if (!data) {
+ printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ for (thr = 0; thr < nr_threads; thr++)
+ memset(&data[thr], 0, offsetof(struct dec_data, go));
+
+ crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ if (!crc) {
+ printk(KERN_ERR "PM: Failed to allocate crc\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ memset(crc, 0, offsetof(struct crc_data, go));
+
+ /*
+ * Start the decompression threads.
+ */
+ for (thr = 0; thr < nr_threads; thr++) {
+ init_waitqueue_head(&data[thr].go);
+ init_waitqueue_head(&data[thr].done);
+
+ data[thr].thr = kthread_run(lzo_decompress_threadfn,
+ &data[thr],
+ "image_decompress/%u", thr);
+ if (IS_ERR(data[thr].thr)) {
+ data[thr].thr = NULL;
+ printk(KERN_ERR
+ "PM: Cannot start decompression threads\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ }
+
+ /*
+ * Start the CRC32 thread.
+ */
+ init_waitqueue_head(&crc->go);
+ init_waitqueue_head(&crc->done);
+
+ handle->crc32 = 0;
+ crc->crc32 = &handle->crc32;
+ for (thr = 0; thr < nr_threads; thr++) {
+ crc->unc[thr] = data[thr].unc;
+ crc->unc_len[thr] = &data[thr].unc_len;
+ }
+
+ crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
+ if (IS_ERR(crc->thr)) {
+ crc->thr = NULL;
+ printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
+ /*
+ * Set the number of pages for read buffering.
+ * This is complete guesswork, because we'll only know the real
+ * picture once prepare_image() is called, which is much later on
+ * during the image load phase. We'll assume the worst case and
+ * say that none of the image pages are from high memory.
+ */
+ if (low_free_pages() > snapshot_get_image_size())
+ read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
+ read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
+
+ for (i = 0; i < read_pages; i++) {
+ page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
+ __GFP_WAIT | __GFP_HIGH :
+ __GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
+
+ if (!page[i]) {
+ if (i < LZO_CMP_PAGES) {
+ ring_size = i;
+ printk(KERN_ERR
+ "PM: Failed to allocate LZO pages\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ } else {
+ break;
+ }
+ }
+ }
+ want = ring_size = i;
+
+ printk(KERN_INFO
+ "PM: Using %u thread(s) for decompression.\n"
+ "PM: Loading and decompressing image data (%u pages)...\n",
+ nr_threads, nr_to_read);
+ m = nr_to_read / 10;
+ if (!m)
+ m = 1;
+ nr_pages = 0;
+ bio = NULL;
+ start = ktime_get();
+
+ ret = snapshot_write_next(snapshot);
+ if (ret <= 0)
+ goto out_finish;
+
+ for(;;) {
+ for (i = 0; !eof && i < want; i++) {
+ ret = swap_read_page(handle, page[ring], &bio);
+ if (ret) {
+ /*
+ * On real read error, finish. On end of data,
+ * set EOF flag and just exit the read loop.
+ */
+ if (handle->cur &&
+ handle->cur->entries[handle->k]) {
+ goto out_finish;
+ } else {
+ eof = 1;
+ break;
+ }
+ }
+ if (++ring >= ring_size)
+ ring = 0;
+ }
+ asked += i;
+ want -= i;
+
+ /*
+ * We are out of data, wait for some more.
+ */
+ if (!have) {
+ if (!asked)
+ break;
+
+ ret = hib_wait_on_bio_chain(&bio);
+ if (ret)
+ goto out_finish;
+ have += asked;
+ asked = 0;
+ if (eof)
+ eof = 2;
+ }
+
+ if (crc->run_threads) {
+ wait_event(crc->done, atomic_read(&crc->stop));
+ atomic_set(&crc->stop, 0);
+ crc->run_threads = 0;
+ }
+
+ for (thr = 0; have && thr < nr_threads; thr++) {
+ data[thr].cmp_len = *(size_t *)page[pg];
+ if (unlikely(!data[thr].cmp_len ||
+ data[thr].cmp_len >
+ lzo1x_worst_compress(LZO_UNC_SIZE))) {
+ printk(KERN_ERR
+ "PM: Invalid LZO compressed length\n");
+ ret = -1;
+ goto out_finish;
+ }
+
+ need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
+ PAGE_SIZE);
+ if (need > have) {
+ if (eof > 1) {
+ ret = -1;
+ goto out_finish;
+ }
+ break;
+ }
+
+ for (off = 0;
+ off < LZO_HEADER + data[thr].cmp_len;
+ off += PAGE_SIZE) {
+ memcpy(data[thr].cmp + off,
+ page[pg], PAGE_SIZE);
+ have--;
+ want++;
+ if (++pg >= ring_size)
+ pg = 0;
+ }
+
+ atomic_set(&data[thr].ready, 1);
+ wake_up(&data[thr].go);
+ }
+
+ /*
+ * Wait for more data while we are decompressing.
+ */
+ if (have < LZO_CMP_PAGES && asked) {
+ ret = hib_wait_on_bio_chain(&bio);
+ if (ret)
+ goto out_finish;
+ have += asked;
+ asked = 0;
+ if (eof)
+ eof = 2;
+ }
+
+ for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
+ wait_event(data[thr].done,
+ atomic_read(&data[thr].stop));
+ atomic_set(&data[thr].stop, 0);
+
+ ret = data[thr].ret;
+
+ if (ret < 0) {
+ printk(KERN_ERR
+ "PM: LZO decompression failed\n");
+ goto out_finish;
+ }
+
+ if (unlikely(!data[thr].unc_len ||
+ data[thr].unc_len > LZO_UNC_SIZE ||
+ data[thr].unc_len & (PAGE_SIZE - 1))) {
+ printk(KERN_ERR
+ "PM: Invalid LZO uncompressed length\n");
+ ret = -1;
+ goto out_finish;
+ }
+
+ for (off = 0;
+ off < data[thr].unc_len; off += PAGE_SIZE) {
+ memcpy(data_of(*snapshot),
+ data[thr].unc + off, PAGE_SIZE);
+
+ if (!(nr_pages % m))
+ printk(KERN_INFO
+ "PM: Image loading progress: "
+ "%3d%%\n",
+ nr_pages / m * 10);
+ nr_pages++;
+
+ ret = snapshot_write_next(snapshot);
+ if (ret <= 0) {
+ crc->run_threads = thr + 1;
+ atomic_set(&crc->ready, 1);
+ wake_up(&crc->go);
+ goto out_finish;
+ }
+ }
+ }
+
+ crc->run_threads = thr;
+ atomic_set(&crc->ready, 1);
+ wake_up(&crc->go);
+ }
+
+out_finish:
+ if (crc->run_threads) {
+ wait_event(crc->done, atomic_read(&crc->stop));
+ atomic_set(&crc->stop, 0);
+ }
+ stop = ktime_get();
+ if (!ret) {
+ printk(KERN_INFO "PM: Image loading done.\n");
+ snapshot_write_finalize(snapshot);
+ if (!snapshot_image_loaded(snapshot))
+ ret = -ENODATA;
+ if (!ret) {
+ if (swsusp_header->flags & SF_CRC32_MODE) {
+ if(handle->crc32 != swsusp_header->crc32) {
+ printk(KERN_ERR
+ "PM: Invalid image CRC32!\n");
+ ret = -ENODATA;
+ }
+ }
+ }
+ }
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
+out_clean:
+ for (i = 0; i < ring_size; i++)
+ free_page((unsigned long)page[i]);
+ if (crc) {
+ if (crc->thr)
+ kthread_stop(crc->thr);
+ kfree(crc);
+ }
+ if (data) {
+ for (thr = 0; thr < nr_threads; thr++)
+ if (data[thr].thr)
+ kthread_stop(data[thr].thr);
+ vfree(data);
+ }
+ vfree(page);
+
+ return ret;
+}
+
+/**
+ * swsusp_read - read the hibernation image.
+ * @flags_p: flags passed by the "frozen" kernel in the image header should
+ * be written into this memory location
+ */
+
+int swsusp_read(unsigned int *flags_p)
+{
+ int error;
+ struct swap_map_handle handle;
+ struct snapshot_handle snapshot;
+ struct swsusp_info *header;
+
+ memset(&snapshot, 0, sizeof(struct snapshot_handle));
+ error = snapshot_write_next(&snapshot);
+ if (error < PAGE_SIZE)
+ return error < 0 ? error : -EFAULT;
+ header = (struct swsusp_info *)data_of(snapshot);
+ error = get_swap_reader(&handle, flags_p);
+ if (error)
+ goto end;
+ if (!error)
+ error = swap_read_page(&handle, header, NULL);
+ if (!error) {
+ error = (*flags_p & SF_NOCOMPRESS_MODE) ?
+ load_image(&handle, &snapshot, header->pages - 1) :
+ load_image_lzo(&handle, &snapshot, header->pages - 1);
+ }
+ swap_reader_finish(&handle);
+end:
+ if (!error)
+ pr_debug("PM: Image successfully loaded\n");
+ else
+ pr_debug("PM: Error %d resuming\n", error);
+ return error;
+}
+
+/**
+ * swsusp_check - Check for swsusp signature in the resume device
+ */
+
+int swsusp_check(void)
+{
+ int error;
+
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+ FMODE_READ, NULL);
+ if (!IS_ERR(hib_resume_bdev)) {
+ set_blocksize(hib_resume_bdev, PAGE_SIZE);
+ clear_page(swsusp_header);
+ error = hib_bio_read_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ if (error)
+ goto put;
+
+ if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
+ /* Reset swap signature now */
+ error = hib_bio_write_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ } else {
+ error = -EINVAL;
+ }
+
+put:
+ if (error)
+ blkdev_put(hib_resume_bdev, FMODE_READ);
+ else
+ pr_debug("PM: Image signature found, resuming\n");
+ } else {
+ error = PTR_ERR(hib_resume_bdev);
+ }
+
+ if (error)
+ pr_debug("PM: Image not found (code %d)\n", error);
+
+ return error;
+}
+
+/**
+ * swsusp_close - close swap device.
+ */
+
+void swsusp_close(fmode_t mode)
+{
+ if (IS_ERR(hib_resume_bdev)) {
+ pr_debug("PM: Image device not initialised\n");
+ return;
+ }
+
+ blkdev_put(hib_resume_bdev, mode);
+}
+
+/**
+ * swsusp_unmark - Unmark swsusp signature in the resume device
+ */
+
+#ifdef CONFIG_SUSPEND
+int swsusp_unmark(void)
+{
+ int error;
+
+ hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
+ error = hib_bio_write_page(swsusp_resume_block,
+ swsusp_header, NULL);
+ } else {
+ printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+ error = -ENODEV;
+ }
+
+ /*
+ * We just returned from suspend, we don't need the image any more.
+ */
+ free_all_swap_pages(root_swap);
+
+ return error;
+}
+#endif
+
+static int swsusp_header_init(void)
+{
+ swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
+ if (!swsusp_header)
+ panic("Could not allocate memory for swsusp_header\n");
+ return 0;
+}
+
+core_initcall(swsusp_header_init);
diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
new file mode 100644
index 000000000..1aff98026
--- /dev/null
+++ b/kernel/power/tuxonice.h
@@ -0,0 +1,260 @@
+/*
+ * kernel/power/tuxonice.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations used throughout swsusp.
+ *
+ */
+
+#ifndef KERNEL_POWER_TOI_H
+#define KERNEL_POWER_TOI_H
+
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/suspend.h>
+#include <linux/fs.h>
+#include <asm/setup.h>
+#include "tuxonice_pageflags.h"
+#include "power.h"
+
+#define TOI_CORE_VERSION "3.3"
+#define TOI_HEADER_VERSION 3
+#define MY_BOOT_KERNEL_DATA_VERSION 4
+
+struct toi_boot_kernel_data {
+ int version;
+ int size;
+ unsigned long toi_action;
+ unsigned long toi_debug_state;
+ u32 toi_default_console_level;
+ int toi_io_time[2][2];
+ char toi_nosave_commandline[COMMAND_LINE_SIZE];
+ unsigned long pages_used[33];
+ unsigned long incremental_bytes_in;
+ unsigned long incremental_bytes_out;
+ unsigned long compress_bytes_in;
+ unsigned long compress_bytes_out;
+ unsigned long pruned_pages;
+};
+
+extern struct toi_boot_kernel_data toi_bkd;
+
+/* Location of book kernel data struct in kernel being resumed */
+extern unsigned long boot_kernel_data_buffer;
+
+/* == Action states == */
+
+enum {
+ TOI_REBOOT,
+ TOI_PAUSE,
+ TOI_LOGALL,
+ TOI_CAN_CANCEL,
+ TOI_KEEP_IMAGE,
+ TOI_FREEZER_TEST,
+ TOI_SINGLESTEP,
+ TOI_PAUSE_NEAR_PAGESET_END,
+ TOI_TEST_FILTER_SPEED,
+ TOI_TEST_BIO,
+ TOI_NO_PAGESET2,
+ TOI_IGNORE_ROOTFS,
+ TOI_REPLACE_SWSUSP,
+ TOI_PAGESET2_FULL,
+ TOI_ABORT_ON_RESAVE_NEEDED,
+ TOI_NO_MULTITHREADED_IO,
+ TOI_NO_DIRECT_LOAD, /* Obsolete */
+ TOI_LATE_CPU_HOTPLUG, /* Obsolete */
+ TOI_GET_MAX_MEM_ALLOCD,
+ TOI_NO_FLUSHER_THREAD,
+ TOI_NO_PS2_IF_UNNEEDED,
+ TOI_POST_RESUME_BREAKPOINT,
+ TOI_NO_READAHEAD,
+ TOI_TRACE_DEBUG_ON,
+ TOI_INCREMENTAL_IMAGE,
+};
+
+extern unsigned long toi_bootflags_mask;
+
+#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action))
+
+/* == Result states == */
+
+enum {
+ TOI_ABORTED,
+ TOI_ABORT_REQUESTED,
+ TOI_NOSTORAGE_AVAILABLE,
+ TOI_INSUFFICIENT_STORAGE,
+ TOI_FREEZING_FAILED,
+ TOI_KEPT_IMAGE,
+ TOI_WOULD_EAT_MEMORY,
+ TOI_UNABLE_TO_FREE_ENOUGH_MEMORY,
+ TOI_PM_SEM,
+ TOI_DEVICE_REFUSED,
+ TOI_SYSDEV_REFUSED,
+ TOI_EXTRA_PAGES_ALLOW_TOO_SMALL,
+ TOI_UNABLE_TO_PREPARE_IMAGE,
+ TOI_FAILED_MODULE_INIT,
+ TOI_FAILED_MODULE_CLEANUP,
+ TOI_FAILED_IO,
+ TOI_OUT_OF_MEMORY,
+ TOI_IMAGE_ERROR,
+ TOI_PLATFORM_PREP_FAILED,
+ TOI_CPU_HOTPLUG_FAILED,
+ TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */
+ TOI_RESAVE_NEEDED,
+ TOI_CANT_SUSPEND,
+ TOI_NOTIFIERS_PREPARE_FAILED,
+ TOI_PRE_SNAPSHOT_FAILED,
+ TOI_PRE_RESTORE_FAILED,
+ TOI_USERMODE_HELPERS_ERR,
+ TOI_CANT_USE_ALT_RESUME,
+ TOI_HEADER_TOO_BIG,
+ TOI_WAKEUP_EVENT,
+ TOI_SYSCORE_REFUSED,
+ TOI_DPM_PREPARE_FAILED,
+ TOI_DPM_SUSPEND_FAILED,
+ TOI_NUM_RESULT_STATES /* Used in printing debug info only */
+};
+
+extern unsigned long toi_result;
+
+#define set_result_state(bit) (test_and_set_bit(bit, &toi_result))
+#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \
+ test_and_set_bit(bit, &toi_result))
+#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result))
+#define test_result_state(bit) (test_bit(bit, &toi_result))
+
+/* == Debug sections and levels == */
+
+/* debugging levels. */
+enum {
+ TOI_STATUS = 0,
+ TOI_ERROR = 2,
+ TOI_LOW,
+ TOI_MEDIUM,
+ TOI_HIGH,
+ TOI_VERBOSE,
+};
+
+enum {
+ TOI_ANY_SECTION,
+ TOI_EAT_MEMORY,
+ TOI_IO,
+ TOI_HEADER,
+ TOI_WRITER,
+ TOI_MEMORY,
+ TOI_PAGEDIR,
+ TOI_COMPRESS,
+ TOI_BIO,
+};
+
+#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state))
+#define clear_debug_state(bit) \
+ (test_and_clear_bit(bit, &toi_bkd.toi_debug_state))
+#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state))
+
+/* == Steps in hibernating == */
+
+enum {
+ STEP_HIBERNATE_PREPARE_IMAGE,
+ STEP_HIBERNATE_SAVE_IMAGE,
+ STEP_HIBERNATE_POWERDOWN,
+ STEP_RESUME_CAN_RESUME,
+ STEP_RESUME_LOAD_PS1,
+ STEP_RESUME_DO_RESTORE,
+ STEP_RESUME_READ_PS2,
+ STEP_RESUME_GO,
+ STEP_RESUME_ALT_IMAGE,
+ STEP_CLEANUP,
+ STEP_QUIET_CLEANUP
+};
+
+/* == TuxOnIce states ==
+ (see also include/linux/suspend.h) */
+
+#define get_toi_state() (toi_state)
+#define restore_toi_state(saved_state) \
+ do { toi_state = saved_state; } while (0)
+
+/* == Module support == */
+
+struct toi_core_fns {
+ int (*post_context_save)(void);
+ unsigned long (*get_nonconflicting_page)(void);
+ int (*try_hibernate)(void);
+ void (*try_resume)(void);
+};
+
+extern struct toi_core_fns *toi_core_fns;
+
+/* == All else == */
+#define KB(x) ((x) << (PAGE_SHIFT - 10))
+#define MB(x) ((x) >> (20 - PAGE_SHIFT))
+
+extern int toi_start_anything(int toi_or_resume);
+extern void toi_finish_anything(int toi_or_resume);
+
+extern int save_image_part1(void);
+extern int toi_atomic_restore(void);
+
+extern int toi_try_hibernate(void);
+extern void toi_try_resume(void);
+
+extern int __toi_post_context_save(void);
+
+extern unsigned int nr_hibernates;
+extern char alt_resume_param[256];
+
+extern void copyback_post(void);
+extern int toi_hibernate(void);
+extern unsigned long extra_pd1_pages_used;
+
+#define SECTOR_SIZE 512
+
+extern void toi_early_boot_message(int can_erase_image, int default_answer,
+ char *warning_reason, ...);
+
+extern int do_check_can_resume(void);
+extern int do_toi_step(int step);
+extern int toi_launch_userspace_program(char *command, int channel_no,
+ int wait, int debug);
+
+extern char tuxonice_signature[9];
+
+extern int toi_start_other_threads(void);
+extern void toi_stop_other_threads(void);
+
+extern int toi_trace_index;
+#define TOI_TRACE_DEBUG(PFN, DESC, ...) \
+ do { \
+ if (test_action_state(TOI_TRACE_DEBUG_ON)) { \
+ printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \
+ } \
+ } while(0)
+
+#ifdef CONFIG_TOI_KEEP_IMAGE
+#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE))
+#else
+#define toi_keeping_image (0)
+#endif
+
+#ifdef CONFIG_TOI_INCREMENTAL
+extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose);
+extern int toi_reset_dirtiness(int verbose);
+extern void toi_cbw_write(void);
+extern void toi_cbw_restore(void);
+extern int toi_allocate_cbw_data(void);
+extern void toi_free_cbw_data(void);
+extern int toi_cbw_init(void);
+extern void toi_mark_tasks_cbw(void);
+#else
+static inline int toi_reset_dirtiness(int verbose) { return 0; }
+#define toi_cbw_write() do { } while(0)
+#define toi_cbw_restore() do { } while(0)
+#define toi_allocate_cbw_data() do { } while(0)
+#define toi_free_cbw_data() do { } while(0)
+static inline int toi_cbw_init(void) { return 0; }
+#endif
+#endif
diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c
new file mode 100644
index 000000000..5729240d8
--- /dev/null
+++ b/kernel/power/tuxonice_alloc.c
@@ -0,0 +1,308 @@
+/*
+ * kernel/power/tuxonice_alloc.c
+ *
+ * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include "tuxonice_modules.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice.h"
+
+#define TOI_ALLOC_PATHS 41
+
+static DEFINE_MUTEX(toi_alloc_mutex);
+
+static struct toi_module_ops toi_alloc_ops;
+
+static int toi_fail_num;
+
+static atomic_t toi_alloc_count[TOI_ALLOC_PATHS],
+ toi_free_count[TOI_ALLOC_PATHS],
+ toi_test_count[TOI_ALLOC_PATHS],
+ toi_fail_count[TOI_ALLOC_PATHS];
+static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS];
+static int cur_allocd, max_allocd;
+
+static char *toi_alloc_desc[TOI_ALLOC_PATHS] = {
+ "", /* 0 */
+ "get_io_info_struct",
+ "extent",
+ "extent (loading chain)",
+ "userui channel",
+ "userui arg", /* 5 */
+ "attention list metadata",
+ "extra pagedir memory metadata",
+ "bdev metadata",
+ "extra pagedir memory",
+ "header_locations_read", /* 10 */
+ "bio queue",
+ "prepare_readahead",
+ "i/o buffer",
+ "writer buffer in bio_init",
+ "checksum buffer", /* 15 */
+ "compression buffer",
+ "filewriter signature op",
+ "set resume param alloc1",
+ "set resume param alloc2",
+ "debugging info buffer", /* 20 */
+ "check can resume buffer",
+ "write module config buffer",
+ "read module config buffer",
+ "write image header buffer",
+ "read pageset1 buffer", /* 25 */
+ "get_have_image_data buffer",
+ "checksum page",
+ "worker rw loop",
+ "get nonconflicting page",
+ "ps1 load addresses", /* 30 */
+ "remove swap image",
+ "swap image exists",
+ "swap parse sig location",
+ "sysfs kobj",
+ "swap mark resume attempted buffer", /* 35 */
+ "cluster member",
+ "boot kernel data buffer",
+ "setting swap signature",
+ "block i/o bdev struct",
+ "copy before write", /* 40 */
+};
+
+#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \
+ do { \
+ BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \
+ \
+ if (FAIL_NUM == toi_fail_num) { \
+ atomic_inc(&toi_test_count[FAIL_NUM]); \
+ toi_fail_num = 0; \
+ return FAIL_VAL; \
+ } \
+ } while (0)
+
+static void alloc_update_stats(int fail_num, void *result, int size)
+{
+ if (!result) {
+ atomic_inc(&toi_fail_count[fail_num]);
+ return;
+ }
+
+ atomic_inc(&toi_alloc_count[fail_num]);
+ if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
+ mutex_lock(&toi_alloc_mutex);
+ toi_cur_allocd[fail_num]++;
+ cur_allocd += size;
+ if (unlikely(cur_allocd > max_allocd)) {
+ int i;
+
+ for (i = 0; i < TOI_ALLOC_PATHS; i++)
+ toi_max_allocd[i] = toi_cur_allocd[i];
+ max_allocd = cur_allocd;
+ }
+ mutex_unlock(&toi_alloc_mutex);
+ }
+}
+
+static void free_update_stats(int fail_num, int size)
+{
+ BUG_ON(fail_num >= TOI_ALLOC_PATHS);
+ atomic_inc(&toi_free_count[fail_num]);
+ if (unlikely(atomic_read(&toi_free_count[fail_num]) >
+ atomic_read(&toi_alloc_count[fail_num])))
+ dump_stack();
+ if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) {
+ mutex_lock(&toi_alloc_mutex);
+ cur_allocd -= size;
+ toi_cur_allocd[fail_num]--;
+ mutex_unlock(&toi_alloc_mutex);
+ }
+}
+
+void *toi_kzalloc(int fail_num, size_t size, gfp_t flags)
+{
+ void *result;
+
+ if (toi_alloc_ops.enabled)
+ MIGHT_FAIL(fail_num, NULL);
+ result = kzalloc(size, flags);
+ if (toi_alloc_ops.enabled)
+ alloc_update_stats(fail_num, result, size);
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ return result;
+}
+
+unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
+ unsigned int order)
+{
+ unsigned long result;
+
+ mask |= ___GFP_TOI_NOTRACK;
+ if (toi_alloc_ops.enabled)
+ MIGHT_FAIL(fail_num, 0);
+ result = __get_free_pages(mask, order);
+ if (toi_alloc_ops.enabled)
+ alloc_update_stats(fail_num, (void *) result,
+ PAGE_SIZE << order);
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ return result;
+}
+
+struct page *toi_alloc_page(int fail_num, gfp_t mask)
+{
+ struct page *result;
+
+ if (toi_alloc_ops.enabled)
+ MIGHT_FAIL(fail_num, NULL);
+ mask |= ___GFP_TOI_NOTRACK;
+ result = alloc_page(mask);
+ if (toi_alloc_ops.enabled)
+ alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ return result;
+}
+
+unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask)
+{
+ unsigned long result;
+
+ if (toi_alloc_ops.enabled)
+ MIGHT_FAIL(fail_num, 0);
+ mask |= ___GFP_TOI_NOTRACK;
+ result = get_zeroed_page(mask);
+ if (toi_alloc_ops.enabled)
+ alloc_update_stats(fail_num, (void *) result, PAGE_SIZE);
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ return result;
+}
+
+void toi_kfree(int fail_num, const void *arg, int size)
+{
+ if (arg && toi_alloc_ops.enabled)
+ free_update_stats(fail_num, size);
+
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ kfree(arg);
+}
+
+void toi_free_page(int fail_num, unsigned long virt)
+{
+ if (virt && toi_alloc_ops.enabled)
+ free_update_stats(fail_num, PAGE_SIZE);
+
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ free_page(virt);
+}
+
+void toi__free_page(int fail_num, struct page *page)
+{
+ if (page && toi_alloc_ops.enabled)
+ free_update_stats(fail_num, PAGE_SIZE);
+
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ __free_page(page);
+}
+
+void toi_free_pages(int fail_num, struct page *page, int order)
+{
+ if (page && toi_alloc_ops.enabled)
+ free_update_stats(fail_num, PAGE_SIZE << order);
+
+ if (fail_num == toi_trace_allocs)
+ dump_stack();
+ __free_pages(page, order);
+}
+
+void toi_alloc_print_debug_stats(void)
+{
+ int i, header_done = 0;
+
+ if (!toi_alloc_ops.enabled)
+ return;
+
+ for (i = 0; i < TOI_ALLOC_PATHS; i++)
+ if (atomic_read(&toi_alloc_count[i]) !=
+ atomic_read(&toi_free_count[i])) {
+ if (!header_done) {
+ printk(KERN_INFO "Idx Allocs Frees Tests "
+ " Fails Max Description\n");
+ header_done = 1;
+ }
+
+ printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i,
+ atomic_read(&toi_alloc_count[i]),
+ atomic_read(&toi_free_count[i]),
+ atomic_read(&toi_test_count[i]),
+ atomic_read(&toi_fail_count[i]),
+ toi_max_allocd[i],
+ toi_alloc_desc[i]);
+ }
+}
+
+static int toi_alloc_initialise(int starting_cycle)
+{
+ int i;
+
+ if (!starting_cycle)
+ return 0;
+
+ if (toi_trace_allocs)
+ dump_stack();
+
+ for (i = 0; i < TOI_ALLOC_PATHS; i++) {
+ atomic_set(&toi_alloc_count[i], 0);
+ atomic_set(&toi_free_count[i], 0);
+ atomic_set(&toi_test_count[i], 0);
+ atomic_set(&toi_fail_count[i], 0);
+ toi_cur_allocd[i] = 0;
+ toi_max_allocd[i] = 0;
+ };
+
+ max_allocd = 0;
+ cur_allocd = 0;
+ return 0;
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL),
+ SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0,
+ NULL),
+ SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_GET_MAX_MEM_ALLOCD, 0),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0,
+ NULL)
+};
+
+static struct toi_module_ops toi_alloc_ops = {
+ .type = MISC_HIDDEN_MODULE,
+ .name = "allocation debugging",
+ .directory = "alloc",
+ .module = THIS_MODULE,
+ .early = 1,
+ .initialise = toi_alloc_initialise,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+int toi_alloc_init(void)
+{
+ int result = toi_register_module(&toi_alloc_ops);
+ return result;
+}
+
+void toi_alloc_exit(void)
+{
+ toi_unregister_module(&toi_alloc_ops);
+}
diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h
new file mode 100644
index 000000000..28c5af193
--- /dev/null
+++ b/kernel/power/tuxonice_alloc.h
@@ -0,0 +1,54 @@
+/*
+ * kernel/power/tuxonice_alloc.h
+ *
+ * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/slab.h>
+#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN)
+#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN)
+
+#ifdef CONFIG_PM_DEBUG
+extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags);
+extern void toi_kfree(int fail_num, const void *arg, int size);
+
+extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask,
+ unsigned int order);
+#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0)
+extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask);
+extern void toi_free_page(int fail_num, unsigned long buf);
+extern void toi__free_page(int fail_num, struct page *page);
+extern void toi_free_pages(int fail_num, struct page *page, int order);
+extern struct page *toi_alloc_page(int fail_num, gfp_t mask);
+extern int toi_alloc_init(void);
+extern void toi_alloc_exit(void);
+
+extern void toi_alloc_print_debug_stats(void);
+
+#else /* CONFIG_PM_DEBUG */
+
+#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS))
+#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN))
+
+#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER)
+#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS)
+#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS)
+#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0)
+#define toi__free_page(FAIL, PAGE) __free_page(PAGE)
+#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER)
+#define toi_alloc_page(FAIL, MASK) alloc_page(MASK)
+static inline int toi_alloc_init(void)
+{
+ return 0;
+}
+
+static inline void toi_alloc_exit(void) { }
+
+static inline void toi_alloc_print_debug_stats(void) { }
+
+#endif
+
+extern int toi_trace_allocs;
diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c
new file mode 100644
index 000000000..7b9886f54
--- /dev/null
+++ b/kernel/power/tuxonice_atomic_copy.c
@@ -0,0 +1,469 @@
+/*
+ * kernel/power/tuxonice_atomic_copy.c
+ *
+ * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * Routines for doing the atomic save/restore.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <linux/console.h>
+#include <linux/syscore_ops.h>
+#include <linux/ftrace.h>
+#include <asm/suspend.h>
+#include "tuxonice.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_io.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_pageflags.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_atomic_copy.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_modules.h"
+
+unsigned long extra_pd1_pages_used;
+
+/**
+ * free_pbe_list - free page backup entries used by the atomic copy code.
+ * @list: List to free.
+ * @highmem: Whether the list is in highmem.
+ *
+ * Normally, this function isn't used. If, however, we need to abort before
+ * doing the atomic copy, we use this to free the pbes previously allocated.
+ **/
+static void free_pbe_list(struct pbe **list, int highmem)
+{
+ while (*list) {
+ int i;
+ struct pbe *free_pbe, *next_page = NULL;
+ struct page *page;
+
+ if (highmem) {
+ page = (struct page *) *list;
+ free_pbe = (struct pbe *) kmap(page);
+ } else {
+ page = virt_to_page(*list);
+ free_pbe = *list;
+ }
+
+ for (i = 0; i < PBES_PER_PAGE; i++) {
+ if (!free_pbe)
+ break;
+ if (highmem)
+ toi__free_page(29, free_pbe->address);
+ else
+ toi_free_page(29,
+ (unsigned long) free_pbe->address);
+ free_pbe = free_pbe->next;
+ }
+
+ if (highmem) {
+ if (free_pbe)
+ next_page = free_pbe;
+ kunmap(page);
+ } else {
+ if (free_pbe)
+ next_page = free_pbe;
+ }
+
+ toi__free_page(29, page);
+ *list = (struct pbe *) next_page;
+ };
+}
+
+/**
+ * copyback_post - post atomic-restore actions
+ *
+ * After doing the atomic restore, we have a few more things to do:
+ * 1) We want to retain some values across the restore, so we now copy
+ * these from the nosave variables to the normal ones.
+ * 2) Set the status flags.
+ * 3) Resume devices.
+ * 4) Tell userui so it can redraw & restore settings.
+ * 5) Reread the page cache.
+ **/
+void copyback_post(void)
+{
+ struct toi_boot_kernel_data *bkd =
+ (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
+
+ if (toi_activate_storage(1))
+ panic("Failed to reactivate our storage.");
+
+ toi_post_atomic_restore_modules(bkd);
+
+ toi_cond_pause(1, "About to reload secondary pagedir.");
+
+ if (read_pageset2(0))
+ panic("Unable to successfully reread the page cache.");
+
+ /*
+ * If the user wants to sleep again after resuming from full-off,
+ * it's most likely to be in order to suspend to ram, so we'll
+ * do this check after loading pageset2, to give them the fastest
+ * wakeup when they are ready to use the computer again.
+ */
+ toi_check_resleep();
+
+ if (test_action_state(TOI_INCREMENTAL_IMAGE))
+ toi_reset_dirtiness(1);
+}
+
+/**
+ * toi_copy_pageset1 - do the atomic copy of pageset1
+ *
+ * Make the atomic copy of pageset1. We can't use copy_page (as we once did)
+ * because we can't be sure what side effects it has. On my old Duron, with
+ * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt
+ * count at resume time 4 instead of 3.
+ *
+ * We don't want to call kmap_atomic unconditionally because it has the side
+ * effect of incrementing the preempt count, which will leave it one too high
+ * post resume (the page containing the preempt count will be copied after
+ * its incremented. This is essentially the same problem.
+ **/
+void toi_copy_pageset1(void)
+{
+ int i;
+ unsigned long source_index, dest_index;
+
+ memory_bm_position_reset(pageset1_map);
+ memory_bm_position_reset(pageset1_copy_map);
+
+ source_index = memory_bm_next_pfn(pageset1_map, 0);
+ dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
+
+ for (i = 0; i < pagedir1.size; i++) {
+ unsigned long *origvirt, *copyvirt;
+ struct page *origpage, *copypage;
+ int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1,
+ was_present1, was_present2;
+
+ origpage = pfn_to_page(source_index);
+ copypage = pfn_to_page(dest_index);
+
+ origvirt = PageHighMem(origpage) ?
+ kmap_atomic(origpage) :
+ page_address(origpage);
+
+ copyvirt = PageHighMem(copypage) ?
+ kmap_atomic(copypage) :
+ page_address(copypage);
+
+ was_present1 = kernel_page_present(origpage);
+ if (!was_present1)
+ kernel_map_pages(origpage, 1, 1);
+
+ was_present2 = kernel_page_present(copypage);
+ if (!was_present2)
+ kernel_map_pages(copypage, 1, 1);
+
+ while (loop >= 0) {
+ *(copyvirt + loop) = *(origvirt + loop);
+ loop--;
+ }
+
+ if (!was_present1)
+ kernel_map_pages(origpage, 1, 0);
+
+ if (!was_present2)
+ kernel_map_pages(copypage, 1, 0);
+
+ if (PageHighMem(origpage))
+ kunmap_atomic(origvirt);
+
+ if (PageHighMem(copypage))
+ kunmap_atomic(copyvirt);
+
+ source_index = memory_bm_next_pfn(pageset1_map, 0);
+ dest_index = memory_bm_next_pfn(pageset1_copy_map, 0);
+ }
+}
+
+/**
+ * __toi_post_context_save - steps after saving the cpu context
+ *
+ * Steps taken after saving the CPU state to make the actual
+ * atomic copy.
+ *
+ * Called from swsusp_save in snapshot.c via toi_post_context_save.
+ **/
+int __toi_post_context_save(void)
+{
+ unsigned long old_ps1_size = pagedir1.size;
+
+ check_checksums();
+
+ free_checksum_pages();
+
+ toi_recalculate_image_contents(1);
+
+ extra_pd1_pages_used = pagedir1.size > old_ps1_size ?
+ pagedir1.size - old_ps1_size : 0;
+
+ if (extra_pd1_pages_used > extra_pd1_pages_allowance) {
+ printk(KERN_INFO "Pageset1 has grown by %lu pages. "
+ "extra_pages_allowance is currently only %lu.\n",
+ pagedir1.size - old_ps1_size,
+ extra_pd1_pages_allowance);
+
+ /*
+ * Highlevel code will see this, clear the state and
+ * retry if we haven't already done so twice.
+ */
+ if (any_to_free(1)) {
+ set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
+ return 1;
+ }
+ if (try_allocate_extra_memory()) {
+ printk(KERN_INFO "Failed to allocate the extra memory"
+ " needed. Restarting the process.");
+ set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
+ return 1;
+ }
+ printk(KERN_INFO "However it looks like there's enough"
+ " free ram and storage to handle this, so "
+ " continuing anyway.");
+ /*
+ * What if try_allocate_extra_memory above calls
+ * toi_allocate_extra_pagedir_memory and it allocs a new
+ * slab page via toi_kzalloc which should be in ps1? So...
+ */
+ toi_recalculate_image_contents(1);
+ }
+
+ if (!test_action_state(TOI_TEST_FILTER_SPEED) &&
+ !test_action_state(TOI_TEST_BIO))
+ toi_copy_pageset1();
+
+ return 0;
+}
+
+/**
+ * toi_hibernate - high level code for doing the atomic copy
+ *
+ * High-level code which prepares to do the atomic copy. Loosely based
+ * on the swsusp version, but with the following twists:
+ * - We set toi_running so the swsusp code uses our code paths.
+ * - We give better feedback regarding what goes wrong if there is a
+ * problem.
+ * - We use an extra function to call the assembly, just in case this code
+ * is in a module (return address).
+ **/
+int toi_hibernate(void)
+{
+ int error;
+
+ error = toi_lowlevel_builtin();
+
+ if (!error) {
+ struct toi_boot_kernel_data *bkd =
+ (struct toi_boot_kernel_data *) boot_kernel_data_buffer;
+
+ /*
+ * The boot kernel's data may be larger (newer version) or
+ * smaller (older version) than ours. Copy the minimum
+ * of the two sizes, so that we don't overwrite valid values
+ * from pre-atomic copy.
+ */
+
+ memcpy(&toi_bkd, (char *) boot_kernel_data_buffer,
+ min_t(int, sizeof(struct toi_boot_kernel_data),
+ bkd->size));
+ }
+
+ return error;
+}
+
+/**
+ * toi_atomic_restore - prepare to do the atomic restore
+ *
+ * Get ready to do the atomic restore. This part gets us into the same
+ * state we are in prior to do calling do_toi_lowlevel while
+ * hibernating: hot-unplugging secondary cpus and freeze processes,
+ * before starting the thread that will do the restore.
+ **/
+int toi_atomic_restore(void)
+{
+ int error;
+
+ toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore.");
+
+ memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line,
+ strlen(saved_command_line));
+
+ toi_pre_atomic_restore_modules(&toi_bkd);
+
+ if (add_boot_kernel_data_pbe())
+ goto Failed;
+
+ toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
+
+ if (toi_go_atomic(PMSG_QUIESCE, 0))
+ goto Failed;
+
+ /* We'll ignore saved state, but this gets preempt count (etc) right */
+ save_processor_state();
+
+ error = swsusp_arch_resume();
+ /*
+ * Code below is only ever reached in case of failure. Otherwise
+ * execution continues at place where swsusp_arch_suspend was called.
+ *
+ * We don't know whether it's safe to continue (this shouldn't happen),
+ * so lets err on the side of caution.
+ */
+ BUG();
+
+Failed:
+ free_pbe_list(&restore_pblist, 0);
+#ifdef CONFIG_HIGHMEM
+ free_pbe_list(&restore_highmem_pblist, 1);
+#endif
+ return 1;
+}
+
+/**
+ * toi_go_atomic - do the actual atomic copy/restore
+ * @state: The state to use for dpm_suspend_start & power_down calls.
+ * @suspend_time: Whether we're suspending or resuming.
+ **/
+int toi_go_atomic(pm_message_t state, int suspend_time)
+{
+ if (suspend_time) {
+ if (platform_begin(1)) {
+ set_abort_result(TOI_PLATFORM_PREP_FAILED);
+ toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
+ return 1;
+ }
+
+ if (dpm_prepare(PMSG_FREEZE)) {
+ set_abort_result(TOI_DPM_PREPARE_FAILED);
+ dpm_complete(PMSG_RECOVER);
+ toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3);
+ return 1;
+ }
+ }
+
+ suspend_console();
+ pm_restrict_gfp_mask();
+
+ if (suspend_time) {
+ if (dpm_suspend(state)) {
+ set_abort_result(TOI_DPM_SUSPEND_FAILED);
+ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
+ return 1;
+ }
+ } else {
+ if (dpm_suspend_start(state)) {
+ set_abort_result(TOI_DPM_SUSPEND_FAILED);
+ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3);
+ return 1;
+ }
+ }
+
+ /* At this point, dpm_suspend_start() has been called, but *not*
+ * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now.
+ * Otherwise, drivers for some devices (e.g. interrupt controllers)
+ * become desynchronized with the actual state of the hardware
+ * at resume time, and evil weirdness ensues.
+ */
+
+ if (dpm_suspend_end(state)) {
+ set_abort_result(TOI_DEVICE_REFUSED);
+ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1);
+ return 1;
+ }
+
+ if (suspend_time) {
+ if (platform_pre_snapshot(1))
+ set_abort_result(TOI_PRE_SNAPSHOT_FAILED);
+ } else {
+ if (platform_pre_restore(1))
+ set_abort_result(TOI_PRE_RESTORE_FAILED);
+ }
+
+ if (test_result_state(TOI_ABORTED)) {
+ toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1);
+ return 1;
+ }
+
+ if (disable_nonboot_cpus()) {
+ set_abort_result(TOI_CPU_HOTPLUG_FAILED);
+ toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG,
+ suspend_time, 1);
+ return 1;
+ }
+
+ local_irq_disable();
+
+ if (syscore_suspend()) {
+ set_abort_result(TOI_SYSCORE_REFUSED);
+ toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1);
+ return 1;
+ }
+
+ if (suspend_time && pm_wakeup_pending()) {
+ set_abort_result(TOI_WAKEUP_EVENT);
+ toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * toi_end_atomic - post atomic copy/restore routines
+ * @stage: What step to start at.
+ * @suspend_time: Whether we're suspending or resuming.
+ * @error: Whether we're recovering from an error.
+ **/
+void toi_end_atomic(int stage, int suspend_time, int error)
+{
+ pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) :
+ PMSG_RESTORE;
+
+ switch (stage) {
+ case ATOMIC_ALL_STEPS:
+ if (!suspend_time) {
+ events_check_enabled = false;
+ }
+ platform_leave(1);
+ case ATOMIC_STEP_SYSCORE_RESUME:
+ syscore_resume();
+ case ATOMIC_STEP_IRQS:
+ local_irq_enable();
+ case ATOMIC_STEP_CPU_HOTPLUG:
+ enable_nonboot_cpus();
+ case ATOMIC_STEP_PLATFORM_FINISH:
+ if (!suspend_time && error & 2)
+ platform_restore_cleanup(1);
+ else
+ platform_finish(1);
+ dpm_resume_start(msg);
+ case ATOMIC_STEP_DEVICE_RESUME:
+ if (suspend_time && (error & 2))
+ platform_recover(1);
+ dpm_resume(msg);
+ if (!toi_in_suspend()) {
+ dpm_resume_end(PMSG_RECOVER);
+ }
+ if (error || !toi_in_suspend()) {
+ pm_restore_gfp_mask();
+ }
+ resume_console();
+ case ATOMIC_STEP_DPM_COMPLETE:
+ dpm_complete(msg);
+ case ATOMIC_STEP_PLATFORM_END:
+ platform_end(1);
+
+ toi_prepare_status(DONT_CLEAR_BAR, "Post atomic.");
+ }
+}
diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h
new file mode 100644
index 000000000..2de0e3b49
--- /dev/null
+++ b/kernel/power/tuxonice_atomic_copy.h
@@ -0,0 +1,25 @@
+/*
+ * kernel/power/tuxonice_atomic_copy.h
+ *
+ * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * Routines for doing the atomic save/restore.
+ */
+
+enum {
+ ATOMIC_ALL_STEPS,
+ ATOMIC_STEP_SYSCORE_RESUME,
+ ATOMIC_STEP_IRQS,
+ ATOMIC_STEP_CPU_HOTPLUG,
+ ATOMIC_STEP_PLATFORM_FINISH,
+ ATOMIC_STEP_DEVICE_RESUME,
+ ATOMIC_STEP_DPM_COMPLETE,
+ ATOMIC_STEP_PLATFORM_END,
+};
+
+int toi_go_atomic(pm_message_t state, int toi_time);
+void toi_end_atomic(int stage, int toi_time, int error);
+
+extern void platform_recover(int platform_mode);
diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h
new file mode 100644
index 000000000..201e3cd47
--- /dev/null
+++ b/kernel/power/tuxonice_bio.h
@@ -0,0 +1,78 @@
+/*
+ * kernel/power/tuxonice_bio.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains declarations for functions exported from
+ * tuxonice_bio.c, which contains low level io functions.
+ */
+
+#include <linux/buffer_head.h>
+#include "tuxonice_extent.h"
+
+void toi_put_extent_chain(struct hibernate_extent_chain *chain);
+int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
+ unsigned long start, unsigned long end);
+
+struct hibernate_extent_saved_state {
+ int extent_num;
+ struct hibernate_extent *extent_ptr;
+ unsigned long offset;
+};
+
+struct toi_bdev_info {
+ struct toi_bdev_info *next;
+ struct hibernate_extent_chain blocks;
+ struct block_device *bdev;
+ struct toi_module_ops *allocator;
+ int allocator_index;
+ struct hibernate_extent_chain allocations;
+ char name[266]; /* "swap on " or "file " + up to 256 chars */
+
+ /* Saved in header */
+ char uuid[17];
+ dev_t dev_t;
+ int prio;
+ int bmap_shift;
+ int blocks_per_page;
+ unsigned long pages_used;
+ struct hibernate_extent_saved_state saved_state[4];
+};
+
+struct toi_extent_iterate_state {
+ struct toi_bdev_info *current_chain;
+ int num_chains;
+ int saved_chain_number[4];
+ struct toi_bdev_info *saved_chain_ptr[4];
+};
+
+/*
+ * Our exported interface so the swapwriter and filewriter don't
+ * need these functions duplicated.
+ */
+struct toi_bio_ops {
+ int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
+ struct page *page);
+ int (*register_storage)(struct toi_bdev_info *new);
+ void (*free_storage)(void);
+};
+
+struct toi_allocator_ops {
+ unsigned long (*toi_swap_storage_available) (void);
+};
+
+extern struct toi_bio_ops toi_bio_ops;
+
+extern char *toi_writer_buffer;
+extern int toi_writer_buffer_posn;
+
+struct toi_bio_allocator_ops {
+ int (*register_storage) (void);
+ unsigned long (*storage_available)(void);
+ int (*allocate_storage) (struct toi_bdev_info *, unsigned long);
+ int (*bmap) (struct toi_bdev_info *);
+ void (*free_storage) (struct toi_bdev_info *);
+ unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used);
+};
diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c
new file mode 100644
index 000000000..364fae9db
--- /dev/null
+++ b/kernel/power/tuxonice_bio_chains.c
@@ -0,0 +1,1126 @@
+/*
+ * kernel/power/tuxonice_bio_devinfo.c
+ *
+ * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ */
+
+#include <linux/mm_types.h>
+#include "tuxonice_bio.h"
+#include "tuxonice_bio_internal.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_ui.h"
+#include "tuxonice.h"
+#include "tuxonice_io.h"
+
+static struct toi_bdev_info *prio_chain_head;
+static int num_chains;
+
+/* Pointer to current entry being loaded/saved. */
+struct toi_extent_iterate_state toi_writer_posn;
+
+#define metadata_size (sizeof(struct toi_bdev_info) - \
+ offsetof(struct toi_bdev_info, uuid))
+
+/*
+ * After section 0 (header) comes 2 => next_section[0] = 2
+ */
+static int next_section[3] = { 2, 3, 1 };
+
+/**
+ * dump_block_chains - print the contents of the bdev info array.
+ **/
+void dump_block_chains(void)
+{
+ int i = 0;
+ int j;
+ struct toi_bdev_info *cur_chain = prio_chain_head;
+
+ while (cur_chain) {
+ struct hibernate_extent *this = cur_chain->blocks.first;
+
+ printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio);
+
+ while (this) {
+ printk(KERN_CONT " [%lu-%lu]%s", this->start,
+ this->end, this->next ? "," : "");
+ this = this->next;
+ }
+
+ printk("\n");
+ cur_chain = cur_chain->next;
+ i++;
+ }
+
+ printk(KERN_DEBUG "Saved states:\n");
+ for (i = 0; i < 4; i++) {
+ printk(KERN_DEBUG "Slot %d: Chain %d.\n",
+ i, toi_writer_posn.saved_chain_number[i]);
+
+ cur_chain = prio_chain_head;
+ j = 0;
+ while (cur_chain) {
+ printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n",
+ j, cur_chain->saved_state[i].extent_num,
+ cur_chain->saved_state[i].offset);
+ cur_chain = cur_chain->next;
+ j++;
+ }
+ printk(KERN_CONT "\n");
+ }
+}
+
+/**
+ *
+ **/
+static void toi_extent_chain_next(void)
+{
+ struct toi_bdev_info *this = toi_writer_posn.current_chain;
+
+ if (!this->blocks.current_extent)
+ return;
+
+ if (this->blocks.current_offset == this->blocks.current_extent->end) {
+ if (this->blocks.current_extent->next) {
+ this->blocks.current_extent =
+ this->blocks.current_extent->next;
+ this->blocks.current_offset =
+ this->blocks.current_extent->start;
+ } else {
+ this->blocks.current_extent = NULL;
+ this->blocks.current_offset = 0;
+ }
+ } else
+ this->blocks.current_offset++;
+}
+
+/**
+ *
+ */
+
+static struct toi_bdev_info *__find_next_chain_same_prio(void)
+{
+ struct toi_bdev_info *start_chain = toi_writer_posn.current_chain;
+ struct toi_bdev_info *this = start_chain;
+ int orig_prio = this->prio;
+
+ do {
+ this = this->next;
+
+ if (!this)
+ this = prio_chain_head;
+
+ /* Back on original chain? Use it again. */
+ if (this == start_chain)
+ return start_chain;
+
+ } while (!this->blocks.current_extent || this->prio != orig_prio);
+
+ return this;
+}
+
+static void find_next_chain(void)
+{
+ struct toi_bdev_info *this;
+
+ this = __find_next_chain_same_prio();
+
+ /*
+ * If we didn't get another chain of the same priority that we
+ * can use, look for the next priority.
+ */
+ while (this && !this->blocks.current_extent)
+ this = this->next;
+
+ toi_writer_posn.current_chain = this;
+}
+
+/**
+ * toi_extent_state_next - go to the next extent
+ * @blocks: The number of values to progress.
+ * @stripe_mode: Whether to spread usage across all chains.
+ *
+ * Given a state, progress to the next valid entry. We may begin in an
+ * invalid state, as we do when invoked after extent_state_goto_start below.
+ *
+ * When using compression and expected_compression > 0, we let the image size
+ * be larger than storage, so we can validly run out of data to return.
+ **/
+static unsigned long toi_extent_state_next(int blocks, int current_stream)
+{
+ int i;
+
+ if (!toi_writer_posn.current_chain)
+ return -ENOSPC;
+
+ /* Assume chains always have lengths that are multiples of @blocks */
+ for (i = 0; i < blocks; i++)
+ toi_extent_chain_next();
+
+ /* The header stream is not striped */
+ if (current_stream ||
+ !toi_writer_posn.current_chain->blocks.current_extent)
+ find_next_chain();
+
+ return toi_writer_posn.current_chain ? 0 : -ENOSPC;
+}
+
+static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this)
+{
+ struct toi_bdev_info **prev_ptr;
+ struct toi_bdev_info *cur;
+
+ /* Loop through the existing chain, finding where to insert it */
+ prev_ptr = &prio_chain_head;
+ cur = prio_chain_head;
+
+ while (cur && cur->prio >= this->prio) {
+ prev_ptr = &cur->next;
+ cur = cur->next;
+ }
+
+ this->next = *prev_ptr;
+ *prev_ptr = this;
+
+ this = prio_chain_head;
+ while (this)
+ this = this->next;
+ num_chains++;
+}
+
+/**
+ * toi_extent_state_goto_start - reinitialize an extent chain iterator
+ * @state: Iterator to reinitialize
+ **/
+void toi_extent_state_goto_start(void)
+{
+ struct toi_bdev_info *this = prio_chain_head;
+
+ while (this) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Setting current extent to %p.", this->blocks.first);
+ this->blocks.current_extent = this->blocks.first;
+ if (this->blocks.current_extent) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Setting current offset to %lu.",
+ this->blocks.current_extent->start);
+ this->blocks.current_offset =
+ this->blocks.current_extent->start;
+ }
+
+ this = this->next;
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.",
+ prio_chain_head);
+ toi_writer_posn.current_chain = prio_chain_head;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start.");
+}
+
+/**
+ * toi_extent_state_save - save state of the iterator
+ * @state: Current state of the chain
+ * @saved_state: Iterator to populate
+ *
+ * Given a state and a struct hibernate_extent_state_store, save the current
+ * position in a format that can be used with relocated chains (at
+ * resume time).
+ **/
+void toi_extent_state_save(int slot)
+{
+ struct toi_bdev_info *cur_chain = prio_chain_head;
+ struct hibernate_extent *extent;
+ struct hibernate_extent_saved_state *chain_state;
+ int i = 0;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.",
+ slot);
+
+ if (!toi_writer_posn.current_chain) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => "
+ "chain_num = -1.");
+ toi_writer_posn.saved_chain_number[slot] = -1;
+ return;
+ }
+
+ while (cur_chain) {
+ i++;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) "
+ "state, slot %d.", i, cur_chain, slot);
+
+ chain_state = &cur_chain->saved_state[slot];
+
+ chain_state->offset = cur_chain->blocks.current_offset;
+
+ if (toi_writer_posn.current_chain == cur_chain) {
+ toi_writer_posn.saved_chain_number[slot] = i;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain "
+ "we were on => chain_num is %d.", i);
+ }
+
+ if (!cur_chain->blocks.current_extent) {
+ chain_state->extent_num = 0;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent "
+ "for this chain => extent_num %d is 0.",
+ i);
+ cur_chain = cur_chain->next;
+ continue;
+ }
+
+ extent = cur_chain->blocks.first;
+ chain_state->extent_num = 1;
+
+ while (extent != cur_chain->blocks.current_extent) {
+ chain_state->extent_num++;
+ extent = extent->next;
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i,
+ chain_state->extent_num);
+
+ cur_chain = cur_chain->next;
+ }
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Completed saving extent state slot %d.", slot);
+}
+
+/**
+ * toi_extent_state_restore - restore the position saved by extent_state_save
+ * @state: State to populate
+ * @saved_state: Iterator saved to restore
+ **/
+void toi_extent_state_restore(int slot)
+{
+ int i = 0;
+ struct toi_bdev_info *cur_chain = prio_chain_head;
+ struct hibernate_extent_saved_state *chain_state;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "toi_extent_state_restore - slot %d.", slot);
+
+ if (toi_writer_posn.saved_chain_number[slot] == -1) {
+ toi_writer_posn.current_chain = NULL;
+ return;
+ }
+
+ while (cur_chain) {
+ int posn;
+ int j;
+ i++;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) "
+ "state, slot %d.", i, cur_chain, slot);
+
+ chain_state = &cur_chain->saved_state[slot];
+
+ posn = chain_state->extent_num;
+
+ cur_chain->blocks.current_extent = cur_chain->blocks.first;
+ cur_chain->blocks.current_offset = chain_state->offset;
+
+ if (i == toi_writer_posn.saved_chain_number[slot]) {
+ toi_writer_posn.current_chain = cur_chain;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Found current chain.");
+ }
+
+ for (j = 0; j < 4; j++)
+ if (i == toi_writer_posn.saved_chain_number[j]) {
+ toi_writer_posn.saved_chain_ptr[j] = cur_chain;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Found saved chain ptr %d (%p) (offset"
+ " %d).", j, cur_chain,
+ cur_chain->saved_state[j].offset);
+ }
+
+ if (posn) {
+ while (--posn)
+ cur_chain->blocks.current_extent =
+ cur_chain->blocks.current_extent->next;
+ } else
+ cur_chain->blocks.current_extent = NULL;
+
+ cur_chain = cur_chain->next;
+ }
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done.");
+ if (test_action_state(TOI_LOGALL))
+ dump_block_chains();
+}
+
+/*
+ * Storage needed
+ *
+ * Returns amount of space in the image header required
+ * for the chain data. This ignores the links between
+ * pages, which we factor in when allocating the space.
+ */
+int toi_bio_devinfo_storage_needed(void)
+{
+ int result = sizeof(num_chains);
+ struct toi_bdev_info *chain = prio_chain_head;
+
+ while (chain) {
+ result += metadata_size;
+
+ /* Chain size */
+ result += sizeof(int);
+
+ /* Extents */
+ result += (2 * sizeof(unsigned long) *
+ chain->blocks.num_extents);
+
+ chain = chain->next;
+ }
+
+ result += 4 * sizeof(int);
+ return result;
+}
+
+static unsigned long chain_pages_used(struct toi_bdev_info *chain)
+{
+ struct hibernate_extent *this = chain->blocks.first;
+ struct hibernate_extent_saved_state *state = &chain->saved_state[3];
+ unsigned long size = 0;
+ int extent_idx = 1;
+
+ if (!state->extent_num) {
+ if (!this)
+ return 0;
+ else
+ return chain->blocks.size;
+ }
+
+ while (extent_idx < state->extent_num) {
+ size += (this->end - this->start + 1);
+ this = this->next;
+ extent_idx++;
+ }
+
+ /* We didn't use the one we're sitting on, so don't count it */
+ return size + state->offset - this->start;
+}
+
+void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain)
+{
+ unsigned long used = chain_pages_used(chain);
+
+ /* Free the storage */
+ unsigned long first_freed = 0;
+
+ if (chain->allocator->bio_allocator_ops->free_unused_storage)
+ first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used);
+
+ printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed);
+
+ /* Adjust / free the extents. */
+ toi_put_extent_chain_from(&chain->blocks, first_freed);
+
+ {
+ struct hibernate_extent *this = chain->blocks.first;
+ while (this) {
+ printk("Extent %lx-%lx.\n", this->start, this->end);
+ this = this->next;
+ }
+ }
+}
+
+/**
+ * toi_serialise_extent_chain - write a chain in the image
+ * @chain: Chain to write.
+ **/
+static int toi_serialise_extent_chain(struct toi_bdev_info *chain)
+{
+ struct hibernate_extent *this;
+ int ret;
+ int i = 1;
+
+ chain->pages_used = chain_pages_used(chain);
+
+ if (test_action_state(TOI_LOGALL))
+ dump_block_chains();
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).",
+ chain->dev_t);
+ /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */
+ ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
+ (char *) &chain->uuid, metadata_size);
+ if (ret)
+ return ret;
+
+ /* Num extents */
+ ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops,
+ (char *) &chain->blocks.num_extents, sizeof(int));
+ if (ret)
+ return ret;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
+ chain->blocks.num_extents);
+
+ this = chain->blocks.first;
+ while (this) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i);
+ ret = toiActiveAllocator->rw_header_chunk(WRITE,
+ &toi_blockwriter_ops,
+ (char *) this, 2 * sizeof(this->start));
+ if (ret)
+ return ret;
+ this = this->next;
+ i++;
+ }
+
+ return ret;
+}
+
+int toi_serialise_extent_chains(void)
+{
+ struct toi_bdev_info *this = prio_chain_head;
+ int result;
+
+ /* Write the number of chains */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)",
+ num_chains);
+ result = toiActiveAllocator->rw_header_chunk(WRITE,
+ &toi_blockwriter_ops, (char *) &num_chains,
+ sizeof(int));
+ if (result)
+ return result;
+
+ /* Then the chains themselves */
+ while (this) {
+ result = toi_serialise_extent_chain(this);
+ if (result)
+ return result;
+ this = this->next;
+ }
+
+ /*
+ * Finally, the chain we should be on at the start of each
+ * section.
+ */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers.");
+ result = toiActiveAllocator->rw_header_chunk(WRITE,
+ &toi_blockwriter_ops,
+ (char *) &toi_writer_posn.saved_chain_number[0],
+ 4 * sizeof(int));
+
+ return result;
+}
+
+int toi_register_storage_chain(struct toi_bdev_info *new)
+{
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.",
+ new);
+ toi_insert_chain_in_prio_list(new);
+ return 0;
+}
+
+static void free_bdev_info(struct toi_bdev_info *chain)
+{
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain);
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents.");
+ toi_put_extent_chain(&chain->blocks);
+
+ /*
+ * The allocator may need to do more than just free the chains
+ * (swap_free, for example). Don't call from boot kernel.
+ */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents.");
+ if (chain->allocator)
+ chain->allocator->bio_allocator_ops->free_storage(chain);
+
+ /*
+ * Dropping out of reading atomic copy? Need to undo
+ * toi_open_by_devnum.
+ */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev.");
+ if (chain->bdev && !IS_ERR(chain->bdev) &&
+ chain->bdev != resume_block_device &&
+ chain->bdev != header_block_device &&
+ test_toi_state(TOI_TRYING_TO_RESUME))
+ toi_close_bdev(chain->bdev);
+
+ /* Poison */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct.");
+ toi_kfree(39, chain, sizeof(*chain));
+
+ if (prio_chain_head == chain)
+ prio_chain_head = NULL;
+
+ num_chains--;
+}
+
+void free_all_bdev_info(void)
+{
+ struct toi_bdev_info *this = prio_chain_head;
+
+ while (this) {
+ struct toi_bdev_info *next = this->next;
+ free_bdev_info(this);
+ this = next;
+ }
+
+ memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn));
+ prio_chain_head = NULL;
+}
+
+static void set_up_start_position(void)
+{
+ toi_writer_posn.current_chain = prio_chain_head;
+ go_next_page(0, 0);
+}
+
+/**
+ * toi_load_extent_chain - read back a chain saved in the image
+ * @chain: Chain to load
+ *
+ * The linked list of extents is reconstructed from the disk. chain will point
+ * to the first entry.
+ **/
+int toi_load_extent_chain(int index, int *num_loaded)
+{
+ struct toi_bdev_info *chain = toi_kzalloc(39,
+ sizeof(struct toi_bdev_info), GFP_ATOMIC);
+ struct hibernate_extent *this, *last = NULL;
+ int i, ret;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index);
+ /* Get dev_t, prio, bmap_shift, blocks per page, positions */
+ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
+ (char *) &chain->uuid, metadata_size);
+
+ if (ret) {
+ printk(KERN_ERR "Failed to read the size of extent chain.\n");
+ toi_kfree(39, chain, sizeof(*chain));
+ return 1;
+ }
+
+ toi_bkd.pages_used[index] = chain->pages_used;
+
+ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
+ (char *) &chain->blocks.num_extents, sizeof(int));
+ if (ret) {
+ printk(KERN_ERR "Failed to read the size of extent chain.\n");
+ toi_kfree(39, chain, sizeof(*chain));
+ return 1;
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.",
+ chain->blocks.num_extents);
+
+ for (i = 0; i < chain->blocks.num_extents; i++) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1);
+
+ this = toi_kzalloc(2, sizeof(struct hibernate_extent),
+ TOI_ATOMIC_GFP);
+ if (!this) {
+ printk(KERN_INFO "Failed to allocate a new extent.\n");
+ free_bdev_info(chain);
+ return -ENOMEM;
+ }
+ this->next = NULL;
+ /* Get the next page */
+ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
+ NULL, (char *) this, 2 * sizeof(this->start));
+ if (ret) {
+ printk(KERN_INFO "Failed to read an extent.\n");
+ toi_kfree(2, this, sizeof(struct hibernate_extent));
+ free_bdev_info(chain);
+ return 1;
+ }
+
+ if (last)
+ last->next = this;
+ else {
+ char b1[32], b2[32], b3[32];
+ /*
+ * Open the bdev
+ */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Chain dev_t is %s. Resume dev t is %s. Header"
+ " bdev_t is %s.\n",
+ format_dev_t(b1, chain->dev_t),
+ format_dev_t(b2, resume_dev_t),
+ format_dev_t(b3, toi_sig_data->header_dev_t));
+
+ if (chain->dev_t == resume_dev_t)
+ chain->bdev = resume_block_device;
+ else if (chain->dev_t == toi_sig_data->header_dev_t)
+ chain->bdev = header_block_device;
+ else {
+ chain->bdev = toi_open_bdev(chain->uuid,
+ chain->dev_t, 1);
+ if (IS_ERR(chain->bdev)) {
+ free_bdev_info(chain);
+ return -ENODEV;
+ }
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift "
+ "is %d and blocks per page is %d.",
+ chain->bmap_shift,
+ chain->blocks_per_page);
+
+ chain->blocks.first = this;
+
+ /*
+ * Couldn't do this earlier, but can't do
+ * goto_start now - we may have already used blocks
+ * in the first chain.
+ */
+ chain->blocks.current_extent = this;
+ chain->blocks.current_offset = this->start;
+
+ /*
+ * Can't wait until we've read the whole chain
+ * before we insert it in the list. We might need
+ * this chain to read the next page in the header
+ */
+ toi_insert_chain_in_prio_list(chain);
+ }
+
+ /*
+ * We have to wait until 2 extents are loaded before setting up
+ * properly because if the first extent has only one page, we
+ * will need to put the position on the second extent. Sounds
+ * obvious, but it wasn't!
+ */
+ (*num_loaded)++;
+ if ((*num_loaded) == 2)
+ set_up_start_position();
+ last = this;
+ }
+
+ /*
+ * Shouldn't get empty chains, but it's not impossible. Link them in so
+ * they get freed properly later.
+ */
+ if (!chain->blocks.num_extents)
+ toi_insert_chain_in_prio_list(chain);
+
+ if (!chain->blocks.current_extent) {
+ chain->blocks.current_extent = chain->blocks.first;
+ if (chain->blocks.current_extent)
+ chain->blocks.current_offset =
+ chain->blocks.current_extent->start;
+ }
+ return 0;
+}
+
+int toi_load_extent_chains(void)
+{
+ int result;
+ int to_load;
+ int i;
+ int extents_loaded = 0;
+
+ result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL,
+ (char *) &to_load,
+ sizeof(int));
+ if (result)
+ return result;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load);
+
+ for (i = 0; i < to_load; i++) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.",
+ i, to_load);
+ result = toi_load_extent_chain(i, &extents_loaded);
+ if (result)
+ return result;
+ }
+
+ /* If we never got to a second extent, we still need to do this. */
+ if (extents_loaded == 1)
+ set_up_start_position();
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers.");
+ result = toiActiveAllocator->rw_header_chunk_noreadahead(READ,
+ &toi_blockwriter_ops,
+ (char *) &toi_writer_posn.saved_chain_number[0],
+ 4 * sizeof(int));
+
+ return result;
+}
+
+static int toi_end_of_stream(int writing, int section_barrier)
+{
+ struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
+ int compare_to = next_section[current_stream];
+ struct toi_bdev_info *compare_chain =
+ toi_writer_posn.saved_chain_ptr[compare_to];
+ int compare_offset = compare_chain ?
+ compare_chain->saved_state[compare_to].offset : 0;
+
+ if (!section_barrier)
+ return 0;
+
+ if (!cur_chain)
+ return 1;
+
+ if (cur_chain == compare_chain &&
+ cur_chain->blocks.current_offset == compare_offset) {
+ if (writing) {
+ if (!current_stream) {
+ debug_broken_header();
+ return 1;
+ }
+ } else {
+ more_readahead = 0;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Reached the end of stream %d "
+ "(not an error).", current_stream);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * go_next_page - skip blocks to the start of the next page
+ * @writing: Whether we're reading or writing the image.
+ *
+ * Go forward one page.
+ **/
+int go_next_page(int writing, int section_barrier)
+{
+ struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain;
+ int max = cur_chain ? cur_chain->blocks_per_page : 1;
+
+ /* Nope. Go foward a page - or maybe two. Don't stripe the header,
+ * so that bad fragmentation doesn't put the extent data containing
+ * the location of the second page out of the first header page.
+ */
+ if (toi_extent_state_next(max, current_stream)) {
+ /* Don't complain if readahead falls off the end */
+ if (writing && section_barrier) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. "
+ "Expected compression ratio too optimistic?");
+ if (test_action_state(TOI_LOGALL))
+ dump_block_chains();
+ }
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to "
+ "read/write. (Not necessarily a fatal error.");
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+int devices_of_same_priority(struct toi_bdev_info *this)
+{
+ struct toi_bdev_info *check = prio_chain_head;
+ int i = 0;
+
+ while (check) {
+ if (check->prio == this->prio)
+ i++;
+ check = check->next;
+ }
+
+ return i;
+}
+
+/**
+ * toi_bio_rw_page - do i/o on the next disk page in the image
+ * @writing: Whether reading or writing.
+ * @page: Page to do i/o on.
+ * @is_readahead: Whether we're doing readahead
+ * @free_group: The group used in allocating the page
+ *
+ * Submit a page for reading or writing, possibly readahead.
+ * Pass the group used in allocating the page as well, as it should
+ * be freed on completion of the bio if we're writing the page.
+ **/
+int toi_bio_rw_page(int writing, struct page *page,
+ int is_readahead, int free_group)
+{
+ int result = toi_end_of_stream(writing, 1);
+ struct toi_bdev_info *dev_info = toi_writer_posn.current_chain;
+
+ if (result) {
+ if (writing)
+ abort_hibernate(TOI_INSUFFICIENT_STORAGE,
+ "Insufficient storage for your image.");
+ else
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to "
+ "read/write another page when stream has "
+ "ended.");
+ return -ENOSPC;
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "%s %lx:%ld",
+ writing ? "Write" : "Read",
+ dev_info->dev_t, dev_info->blocks.current_offset);
+
+ result = toi_do_io(writing, dev_info->bdev,
+ dev_info->blocks.current_offset << dev_info->bmap_shift,
+ page, is_readahead, 0, free_group);
+
+ /* Ignore the result here - will check end of stream if come in again */
+ go_next_page(writing, 1);
+
+ if (result)
+ printk(KERN_ERR "toi_do_io returned %d.\n", result);
+ return result;
+}
+
+dev_t get_header_dev_t(void)
+{
+ return prio_chain_head->dev_t;
+}
+
+struct block_device *get_header_bdev(void)
+{
+ return prio_chain_head->bdev;
+}
+
+unsigned long get_headerblock(void)
+{
+ return prio_chain_head->blocks.first->start <<
+ prio_chain_head->bmap_shift;
+}
+
+int get_main_pool_phys_params(void)
+{
+ struct toi_bdev_info *this = prio_chain_head;
+ int result;
+
+ while (this) {
+ result = this->allocator->bio_allocator_ops->bmap(this);
+ if (result)
+ return result;
+ this = this->next;
+ }
+
+ return 0;
+}
+
+static int apply_header_reservation(void)
+{
+ int i;
+
+ if (!header_pages_reserved) {
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "No header pages reserved at the moment.");
+ return 0;
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation.");
+
+ /* Apply header space reservation */
+ toi_extent_state_goto_start();
+
+ for (i = 0; i < header_pages_reserved; i++)
+ if (go_next_page(1, 0))
+ return -ENOSPC;
+
+ /* The end of header pages will be the start of pageset 2 */
+ toi_extent_state_save(2);
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Finished applying header reservation.");
+ return 0;
+}
+
+static int toi_bio_register_storage(void)
+{
+ int result = 0;
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled ||
+ this_module->type != BIO_ALLOCATOR_MODULE)
+ continue;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Registering storage from %s.",
+ this_module->name);
+ result = this_module->bio_allocator_ops->register_storage();
+ if (result)
+ break;
+ }
+
+ return result;
+}
+
+void toi_bio_free_unused_storage(void)
+{
+ struct toi_bdev_info *this = prio_chain_head;
+
+ while (this) {
+ toi_bio_free_unused_storage_chain(this);
+ this = this->next;
+ }
+}
+
+int toi_bio_allocate_storage(unsigned long request)
+{
+ struct toi_bdev_info *chain = prio_chain_head;
+ unsigned long to_get = request;
+ unsigned long extra_pages, needed;
+ int no_free = 0;
+
+ if (!chain) {
+ int result = toi_bio_register_storage();
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
+ "Registering storage.");
+ if (result)
+ return 0;
+ chain = prio_chain_head;
+ if (!chain) {
+ printk("TuxOnIce: No storage was registered.\n");
+ return 0;
+ }
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: "
+ "Request is %lu pages.", request);
+ extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
+ + sizeof(int)), PAGE_SIZE);
+ needed = request + extra_pages + header_pages_reserved;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu "
+ "for header => %lu.",
+ extra_pages, header_pages_reserved, needed);
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.",
+ raw_pages_allocd);
+
+ to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get);
+
+ if (!to_get)
+ return apply_header_reservation();
+
+ while (to_get && chain) {
+ int num_group = devices_of_same_priority(chain);
+ int divisor = num_group - no_free;
+ int i;
+ unsigned long portion = DIV_ROUND_UP(to_get, divisor);
+ unsigned long got = 0;
+ unsigned long got_this_round = 0;
+ struct toi_bdev_info *top = chain;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ " Start of loop. To get is %lu. Divisor is %d.",
+ to_get, divisor);
+ no_free = 0;
+
+ /*
+ * We're aiming to spread the allocated storage as evenly
+ * as possible, but we also want to get all the storage we
+ * can off this priority.
+ */
+ for (i = 0; i < num_group; i++) {
+ struct toi_bio_allocator_ops *ops =
+ chain->allocator->bio_allocator_ops;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ " Asking for %lu pages from chain %p.",
+ portion, chain);
+ got = ops->allocate_storage(chain, portion);
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ " Got %lu pages from allocator %p.",
+ got, chain);
+ if (!got)
+ no_free++;
+ got_this_round += got;
+ chain = chain->next;
+ }
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a "
+ "total of %lu pages from %d allocators.",
+ got_this_round, divisor - no_free);
+
+ raw_pages_allocd += got_this_round;
+ to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd :
+ 0;
+
+ /*
+ * If we got anything from chains of this priority and we
+ * still have storage to allocate, go over this priority
+ * again.
+ */
+ if (got_this_round && to_get)
+ chain = top;
+ else
+ no_free = 0;
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling "
+ "get_main_pool_phys_params");
+ /* Now let swap allocator bmap the pages */
+ get_main_pool_phys_params();
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. Reserving header.");
+ return apply_header_reservation();
+}
+
+void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd)
+{
+ int i = 0;
+ struct toi_bdev_info *cur_chain = prio_chain_head;
+
+ while (cur_chain) {
+ cur_chain->pages_used = bkd->pages_used[i];
+ cur_chain = cur_chain->next;
+ i++;
+ }
+}
+
+int toi_bio_chains_debug_info(char *buffer, int size)
+{
+ /* Show what we actually used */
+ struct toi_bdev_info *cur_chain = prio_chain_head;
+ int len = 0;
+
+ while (cur_chain) {
+ len += scnprintf(buffer + len, size - len, " Used %lu pages "
+ "from %s.\n", cur_chain->pages_used,
+ cur_chain->name);
+ cur_chain = cur_chain->next;
+ }
+
+ return len;
+}
+
+void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
+{
+ struct toi_bdev_info *this = toi_writer_posn.current_chain,
+ *cmp = prio_chain_head;
+
+ ptr->save.chain = 1;
+ while (this != cmp) {
+ ptr->save.chain++;
+ cmp = cmp->next;
+ }
+ ptr->save.block = this->blocks.current_offset;
+
+ /* Save the raw info internally for quicker access when updating pointers */
+ ptr->bdev = this->bdev;
+ ptr->block = this->blocks.current_offset << this->bmap_shift;
+}
+
+void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr)
+{
+ int i = ptr->save.chain - 1;
+ struct toi_bdev_info *this;
+ struct hibernate_extent *hib;
+
+ /* Find chain by stored index */
+ this = prio_chain_head;
+ while (i) {
+ this = this->next;
+ i--;
+ }
+ toi_writer_posn.current_chain = this;
+
+ /* Restore block */
+ this->blocks.current_offset = ptr->save.block;
+
+ /* Find current offset from block number */
+ hib = this->blocks.first;
+
+ while (hib->start > ptr->save.block) {
+ hib = hib->next;
+ }
+
+ this->blocks.last_touched = this->blocks.current_extent = hib;
+}
diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c
new file mode 100644
index 000000000..d18f2751c
--- /dev/null
+++ b/kernel/power/tuxonice_bio_core.c
@@ -0,0 +1,1933 @@
+/*
+ * kernel/power/tuxonice_bio.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains block io functions for TuxOnIce. These are
+ * used by the swapwriter and it is planned that they will also
+ * be used by the NFSwriter.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/syscalls.h>
+#include <linux/suspend.h>
+#include <linux/ctype.h>
+#include <linux/fs_uuid.h>
+#include <linux/mount.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_bio_internal.h"
+
+#define MEMORY_ONLY 1
+#define THROTTLE_WAIT 2
+
+/* #define MEASURE_MUTEX_CONTENTION */
+#ifndef MEASURE_MUTEX_CONTENTION
+#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
+#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
+#else
+unsigned long mutex_times[2][2][NR_CPUS];
+#define my_mutex_lock(index, the_lock) do { \
+ int have_mutex; \
+ have_mutex = mutex_trylock(the_lock); \
+ if (!have_mutex) { \
+ mutex_lock(the_lock); \
+ mutex_times[index][0][smp_processor_id()]++; \
+ } else { \
+ mutex_times[index][1][smp_processor_id()]++; \
+ }
+
+#define my_mutex_unlock(index, the_lock) \
+ mutex_unlock(the_lock); \
+} while (0)
+#endif
+
+static int page_idx, reset_idx;
+
+static int target_outstanding_io = 1024;
+static int max_outstanding_writes, max_outstanding_reads;
+
+static struct page *bio_queue_head, *bio_queue_tail;
+static atomic_t toi_bio_queue_size;
+static DEFINE_SPINLOCK(bio_queue_lock);
+
+static int free_mem_throttle, throughput_throttle;
+int more_readahead = 1;
+static struct page *readahead_list_head, *readahead_list_tail;
+
+static struct page *waiting_on;
+
+static atomic_t toi_io_in_progress, toi_io_done;
+static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
+
+int current_stream;
+/* Not static, so that the allocators can setup and complete
+ * writing the header */
+char *toi_writer_buffer;
+int toi_writer_buffer_posn;
+
+static DEFINE_MUTEX(toi_bio_mutex);
+static DEFINE_MUTEX(toi_bio_readahead_mutex);
+
+static struct task_struct *toi_queue_flusher;
+static int toi_bio_queue_flush_pages(int dedicated_thread);
+
+struct toi_module_ops toi_blockwriter_ops;
+
+struct toi_incremental_image_pointer toi_inc_ptr[2][2];
+
+#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \
+ atomic_read(&toi_bio_queue_size))
+
+unsigned long raw_pages_allocd, header_pages_reserved;
+
+static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
+ int no_readahead);
+
+/**
+ * set_free_mem_throttle - set the point where we pause to avoid oom.
+ *
+ * Initially, this value is zero, but when we first fail to allocate memory,
+ * we set it (plus a buffer) and thereafter throttle i/o once that limit is
+ * reached.
+ **/
+static void set_free_mem_throttle(void)
+{
+ int new_throttle = nr_free_buffer_pages() + 256;
+
+ if (new_throttle > free_mem_throttle)
+ free_mem_throttle = new_throttle;
+}
+
+#define NUM_REASONS 7
+static atomic_t reasons[NUM_REASONS];
+static char *reason_name[NUM_REASONS] = {
+ "readahead not ready",
+ "bio allocation",
+ "synchronous I/O",
+ "toi_bio_get_new_page",
+ "memory low",
+ "readahead buffer allocation",
+ "throughput_throttle",
+};
+
+/* User Specified Parameters. */
+unsigned long resume_firstblock;
+dev_t resume_dev_t;
+struct block_device *resume_block_device;
+static atomic_t resume_bdev_open_count;
+
+struct block_device *header_block_device;
+
+/**
+ * toi_open_bdev: Open a bdev at resume time.
+ *
+ * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t
+ * (the user can have resume= pointing at a swap partition/file that isn't
+ * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the
+ * header. It will be from a swap partition that was enabled when we hibernated,
+ * but we don't know it's real index until we read that first page.
+ * dev_t: The device major/minor.
+ * display_errs: Whether to try to do this quietly.
+ *
+ * We stored a dev_t in the image header. Open the matching device without
+ * requiring /dev/<whatever> in most cases and record the details needed
+ * to close it later and avoid duplicating work.
+ */
+struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
+ int display_errs)
+{
+ struct block_device *bdev;
+ dev_t device = default_device;
+ char buf[32];
+ int retried = 0;
+
+retry:
+ if (uuid) {
+ struct fs_info seek;
+ strncpy((char *) &seek.uuid, uuid, 16);
+ seek.dev_t = 0;
+ seek.last_mount_size = 0;
+ device = blk_lookup_fs_info(&seek);
+ if (!device) {
+ device = default_device;
+ printk(KERN_DEBUG "Unable to resolve uuid. Falling back"
+ " to dev_t.\n");
+ } else
+ printk(KERN_DEBUG "Resolved uuid to device %s.\n",
+ format_dev_t(buf, device));
+ }
+
+ if (!device) {
+ printk(KERN_ERR "TuxOnIce attempting to open a "
+ "blank dev_t!\n");
+ dump_stack();
+ return NULL;
+ }
+ bdev = toi_open_by_devnum(device);
+
+ if (IS_ERR(bdev) || !bdev) {
+ if (!retried) {
+ retried = 1;
+ wait_for_device_probe();
+ goto retry;
+ }
+ if (display_errs)
+ toi_early_boot_message(1, TOI_CONTINUE_REQ,
+ "Failed to get access to block device "
+ "\"%x\" (error %d).\n Maybe you need "
+ "to run mknod and/or lvmsetup in an "
+ "initrd/ramfs?", device, bdev);
+ return ERR_PTR(-EINVAL);
+ }
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "TuxOnIce got bdev %p for dev_t %x.",
+ bdev, device);
+
+ return bdev;
+}
+
+static void toi_bio_reserve_header_space(unsigned long request)
+{
+ header_pages_reserved = request;
+}
+
+/**
+ * do_bio_wait - wait for some TuxOnIce I/O to complete
+ * @reason: The array index of the reason we're waiting.
+ *
+ * Wait for a particular page of I/O if we're after a particular page.
+ * If we're not after a particular page, wait instead for all in flight
+ * I/O to be completed or for us to have enough free memory to be able
+ * to submit more I/O.
+ *
+ * If we wait, we also update our statistics regarding why we waited.
+ **/
+static void do_bio_wait(int reason)
+{
+ struct page *was_waiting_on = waiting_on;
+
+ /* On SMP, waiting_on can be reset, so we make a copy */
+ if (was_waiting_on) {
+ wait_on_page_locked(was_waiting_on);
+ atomic_inc(&reasons[reason]);
+ } else {
+ atomic_inc(&reasons[reason]);
+
+ wait_event(num_in_progress_wait,
+ !atomic_read(&toi_io_in_progress) ||
+ nr_free_buffer_pages() > free_mem_throttle);
+ }
+}
+
+/**
+ * throttle_if_needed - wait for I/O completion if throttle points are reached
+ * @flags: What to check and how to act.
+ *
+ * Check whether we need to wait for some I/O to complete. We always check
+ * whether we have enough memory available, but may also (depending upon
+ * @reason) check if the throughput throttle limit has been reached.
+ **/
+static int throttle_if_needed(int flags)
+{
+ int free_pages = nr_free_buffer_pages();
+
+ /* Getting low on memory and I/O is in progress? */
+ while (unlikely(free_pages < free_mem_throttle) &&
+ atomic_read(&toi_io_in_progress) &&
+ !test_result_state(TOI_ABORTED)) {
+ if (!(flags & THROTTLE_WAIT))
+ return -ENOMEM;
+ do_bio_wait(4);
+ free_pages = nr_free_buffer_pages();
+ }
+
+ while (!(flags & MEMORY_ONLY) && throughput_throttle &&
+ TOTAL_OUTSTANDING_IO >= throughput_throttle &&
+ !test_result_state(TOI_ABORTED)) {
+ int result = toi_bio_queue_flush_pages(0);
+ if (result)
+ return result;
+ atomic_inc(&reasons[6]);
+ wait_event(num_in_progress_wait,
+ !atomic_read(&toi_io_in_progress) ||
+ TOTAL_OUTSTANDING_IO < throughput_throttle);
+ }
+
+ return 0;
+}
+
+/**
+ * update_throughput_throttle - update the raw throughput throttle
+ * @jif_index: The number of times this function has been called.
+ *
+ * This function is called four times per second by the core, and used to limit
+ * the amount of I/O we submit at once, spreading out our waiting through the
+ * whole job and letting userui get an opportunity to do its work.
+ *
+ * We don't start limiting I/O until 1/4s has gone so that we get a
+ * decent sample for our initial limit, and keep updating it because
+ * throughput may vary (on rotating media, eg) with our block number.
+ *
+ * We throttle to 1/10s worth of I/O.
+ **/
+static void update_throughput_throttle(int jif_index)
+{
+ int done = atomic_read(&toi_io_done);
+ throughput_throttle = done * 2 / 5 / jif_index;
+}
+
+/**
+ * toi_finish_all_io - wait for all outstanding i/o to complete
+ *
+ * Flush any queued but unsubmitted I/O and wait for it all to complete.
+ **/
+static int toi_finish_all_io(void)
+{
+ int result = toi_bio_queue_flush_pages(0);
+ toi_bio_queue_flusher_should_finish = 1;
+ wake_up(&toi_io_queue_flusher);
+ wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO);
+ return result;
+}
+
+/**
+ * toi_end_bio - bio completion function.
+ * @bio: bio that has completed.
+ * @err: Error value. Yes, like end_swap_bio_read, we ignore it.
+ *
+ * Function called by the block driver from interrupt context when I/O is
+ * completed. If we were writing the page, we want to free it and will have
+ * set bio->bi_private to the parameter we should use in telling the page
+ * allocation accounting code what the page was allocated for. If we're
+ * reading the page, it will be in the singly linked list made from
+ * page->private pointers.
+ **/
+static void toi_end_bio(struct bio *bio, int err)
+{
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+
+ unlock_page(page);
+ bio_put(bio);
+
+ if (waiting_on == page)
+ waiting_on = NULL;
+
+ put_page(page);
+
+ if (bio->bi_private)
+ toi__free_page((int) ((unsigned long) bio->bi_private) , page);
+
+ bio_put(bio);
+
+ atomic_dec(&toi_io_in_progress);
+ atomic_inc(&toi_io_done);
+
+ wake_up(&num_in_progress_wait);
+}
+
+/**
+ * submit - submit BIO request
+ * @writing: READ or WRITE.
+ * @dev: The block device we're using.
+ * @first_block: The first sector we're using.
+ * @page: The page being used for I/O.
+ * @free_group: If writing, the group that was used in allocating the page
+ * and which will be used in freeing the page from the completion
+ * routine.
+ *
+ * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the
+ * textbook - allocate and initialize the bio. If we're writing, make sure
+ * the page is marked as dirty. Then submit it and carry on."
+ *
+ * If we're just testing the speed of our own code, we fake having done all
+ * the hard work and all toi_end_bio immediately.
+ **/
+static int submit(int writing, struct block_device *dev, sector_t first_block,
+ struct page *page, int free_group)
+{
+ struct bio *bio = NULL;
+ int cur_outstanding_io, result;
+
+ /*
+ * Shouldn't throttle if reading - can deadlock in the single
+ * threaded case as pages are only freed when we use the
+ * readahead.
+ */
+ if (writing) {
+ result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT);
+ if (result)
+ return result;
+ }
+
+ while (!bio) {
+ bio = bio_alloc(TOI_ATOMIC_GFP, 1);
+ if (!bio) {
+ set_free_mem_throttle();
+ do_bio_wait(1);
+ }
+ }
+
+ bio->bi_bdev = dev;
+ bio->bi_iter.bi_sector = first_block;
+ bio->bi_private = (void *) ((unsigned long) free_group);
+ bio->bi_end_io = toi_end_bio;
+ bio->bi_flags |= (1 << BIO_TOI);
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n",
+ (unsigned long long) first_block);
+ bio_put(bio);
+ return -EFAULT;
+ }
+
+ bio_get(bio);
+
+ cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
+ if (writing) {
+ if (cur_outstanding_io > max_outstanding_writes)
+ max_outstanding_writes = cur_outstanding_io;
+ } else {
+ if (cur_outstanding_io > max_outstanding_reads)
+ max_outstanding_reads = cur_outstanding_io;
+ }
+
+ /* Still read the header! */
+ if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) {
+ /* Fake having done the hard work */
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ toi_end_bio(bio, 0);
+ } else
+ submit_bio(writing | REQ_SYNC, bio);
+
+ return 0;
+}
+
+/**
+ * toi_do_io: Prepare to do some i/o on a page and submit or batch it.
+ *
+ * @writing: Whether reading or writing.
+ * @bdev: The block device which we're using.
+ * @block0: The first sector we're reading or writing.
+ * @page: The page on which I/O is being done.
+ * @readahead_index: If doing readahead, the index (reset this flag when done).
+ * @syncio: Whether the i/o is being done synchronously.
+ *
+ * Prepare and start a read or write operation.
+ *
+ * Note that we always work with our own page. If writing, we might be given a
+ * compression buffer that will immediately be used to start compressing the
+ * next page. For reading, we do readahead and therefore don't know the final
+ * address where the data needs to go.
+ **/
+int toi_do_io(int writing, struct block_device *bdev, long block0,
+ struct page *page, int is_readahead, int syncio, int free_group)
+{
+ page->private = 0;
+
+ /* Do here so we don't race against toi_bio_get_next_page_read */
+ lock_page(page);
+
+ if (is_readahead) {
+ if (readahead_list_head)
+ readahead_list_tail->private = (unsigned long) page;
+ else
+ readahead_list_head = page;
+
+ readahead_list_tail = page;
+ }
+
+ /* Done before submitting to avoid races. */
+ if (syncio)
+ waiting_on = page;
+
+ /* Submit the page */
+ get_page(page);
+
+ if (submit(writing, bdev, block0, page, free_group))
+ return -EFAULT;
+
+ if (syncio)
+ do_bio_wait(2);
+
+ return 0;
+}
+
+/**
+ * toi_bdev_page_io - simpler interface to do directly i/o on a single page
+ * @writing: Whether reading or writing.
+ * @bdev: Block device on which we're operating.
+ * @pos: Sector at which page to read or write starts.
+ * @page: Page to be read/written.
+ *
+ * A simple interface to submit a page of I/O and wait for its completion.
+ * The caller must free the page used.
+ **/
+static int toi_bdev_page_io(int writing, struct block_device *bdev,
+ long pos, struct page *page)
+{
+ return toi_do_io(writing, bdev, pos, page, 0, 1, 0);
+}
+
+/**
+ * toi_bio_memory_needed - report the amount of memory needed for block i/o
+ *
+ * We want to have at least enough memory so as to have target_outstanding_io
+ * or more transactions on the fly at once. If we can do more, fine.
+ **/
+static int toi_bio_memory_needed(void)
+{
+ return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) +
+ sizeof(struct bio));
+}
+
+/**
+ * toi_bio_print_debug_stats - put out debugging info in the buffer provided
+ * @buffer: A buffer of size @size into which text should be placed.
+ * @size: The size of @buffer.
+ *
+ * Fill a buffer with debugging info. This is used for both our debug_info sysfs
+ * entry and for recording the same info in dmesg.
+ **/
+static int toi_bio_print_debug_stats(char *buffer, int size)
+{
+ int len = 0;
+
+ if (toiActiveAllocator != &toi_blockwriter_ops) {
+ len = scnprintf(buffer, size,
+ "- Block I/O inactive.\n");
+ return len;
+ }
+
+ len = scnprintf(buffer, size, "- Block I/O active.\n");
+
+ len += toi_bio_chains_debug_info(buffer + len, size - len);
+
+ len += scnprintf(buffer + len, size - len,
+ "- Max outstanding reads %d. Max writes %d.\n",
+ max_outstanding_reads, max_outstanding_writes);
+
+ len += scnprintf(buffer + len, size - len,
+ " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
+ target_outstanding_io,
+ PAGE_SIZE, (unsigned int) sizeof(struct request),
+ (unsigned int) sizeof(struct bio), toi_bio_memory_needed());
+
+#ifdef MEASURE_MUTEX_CONTENTION
+ {
+ int i;
+
+ len += scnprintf(buffer + len, size - len,
+ " Mutex contention while reading:\n Contended Free\n");
+
+ for_each_online_cpu(i)
+ len += scnprintf(buffer + len, size - len,
+ " %9lu %9lu\n",
+ mutex_times[0][0][i], mutex_times[0][1][i]);
+
+ len += scnprintf(buffer + len, size - len,
+ " Mutex contention while writing:\n Contended Free\n");
+
+ for_each_online_cpu(i)
+ len += scnprintf(buffer + len, size - len,
+ " %9lu %9lu\n",
+ mutex_times[1][0][i], mutex_times[1][1][i]);
+
+ }
+#endif
+
+ return len + scnprintf(buffer + len, size - len,
+ " Free mem throttle point reached %d.\n", free_mem_throttle);
+}
+
+static int total_header_bytes;
+static int unowned;
+
+void debug_broken_header(void)
+{
+ printk(KERN_DEBUG "Image header too big for size allocated!\n");
+ print_toi_header_storage_for_modules();
+ printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed());
+ printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header));
+ printk(KERN_DEBUG "Total unowned : %d.\n", unowned);
+ printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes,
+ DIV_ROUND_UP(total_header_bytes, PAGE_SIZE));
+ printk(KERN_DEBUG "Space needed now : %ld.\n",
+ get_header_storage_needed());
+ dump_block_chains();
+ abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small.");
+}
+
+static int toi_bio_update_previous_inc_img_ptr(int stream)
+{
+ int result;
+ char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP);
+ struct page *page;
+ struct toi_incremental_image_pointer *prev, *this;
+
+ prev = &toi_inc_ptr[stream][0];
+ this = &toi_inc_ptr[stream][1];
+
+ if (!buffer) {
+ // We're at the start of writing a pageset. Memory should not be that scarce.
+ return -ENOMEM;
+ }
+
+ page = virt_to_page(buffer);
+ result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0);
+
+ if (result)
+ goto out;
+
+ memcpy(buffer, (char *) this, sizeof(this->save));
+
+ result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12);
+
+ // If the IO is successfully submitted (!result), the page will be freed
+ // asynchronously on completion.
+out:
+ if (result)
+ toi__free_page(12, virt_to_page(buffer));
+ return result;
+}
+
+/**
+ * toi_rw_init_incremental - incremental image part of setting up to write new section
+ */
+static int toi_write_init_incremental(int stream)
+{
+ int result = 0;
+
+ // Remember the location of this block so we can link to it.
+ toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]);
+
+ // Update the pointer at the start of the last pageset with the same stream number.
+ result = toi_bio_update_previous_inc_img_ptr(stream);
+ if (result)
+ return result;
+
+ // Move the current to the previous slot.
+ memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]));
+
+ // Store a blank pointer at the start of this incremental pageset
+ memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1]));
+ result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
+ if (result)
+ return result;
+
+ // Serialise extent chains if this is an incremental pageset
+ return toi_serialise_extent_chains();
+}
+
+/**
+ * toi_read_init_incremental - incremental image part of setting up to read new section
+ */
+static int toi_read_init_incremental(int stream)
+{
+ int result;
+
+ // Set our position to the start of the next pageset
+ toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]);
+
+ // Read the start of the next incremental pageset (if any)
+ result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0);
+
+ if (!result)
+ result = toi_load_extent_chains();
+
+ return result;
+}
+
+/**
+ * toi_rw_init - prepare to read or write a stream in the image
+ * @writing: Whether reading or writing.
+ * @stream number: Section of the image being processed.
+ *
+ * Prepare to read or write a section ('stream') in the image.
+ **/
+static int toi_rw_init(int writing, int stream_number)
+{
+ if (stream_number)
+ toi_extent_state_restore(stream_number);
+ else
+ toi_extent_state_goto_start();
+
+ if (writing) {
+ reset_idx = 0;
+ if (!current_stream)
+ page_idx = 0;
+ } else {
+ reset_idx = 1;
+ }
+
+ atomic_set(&toi_io_done, 0);
+ if (!toi_writer_buffer)
+ toi_writer_buffer = (char *) toi_get_zeroed_page(11,
+ TOI_ATOMIC_GFP);
+ toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE;
+
+ current_stream = stream_number;
+
+ more_readahead = 1;
+
+ if (test_result_state(TOI_KEPT_IMAGE)) {
+ int result;
+
+ if (writing) {
+ result = toi_write_init_incremental(stream_number);
+ } else {
+ result = toi_read_init_incremental(stream_number);
+ }
+
+ if (result)
+ return result;
+ }
+
+ return toi_writer_buffer ? 0 : -ENOMEM;
+}
+
+/**
+ * toi_bio_queue_write - queue a page for writing
+ * @full_buffer: Pointer to a page to be queued
+ *
+ * Add a page to the queue to be submitted. If we're the queue flusher,
+ * we'll do this once we've dropped toi_bio_mutex, so other threads can
+ * continue to submit I/O while we're on the slow path doing the actual
+ * submission.
+ **/
+static void toi_bio_queue_write(char **full_buffer)
+{
+ struct page *page = virt_to_page(*full_buffer);
+ unsigned long flags;
+
+ *full_buffer = NULL;
+ page->private = 0;
+
+ spin_lock_irqsave(&bio_queue_lock, flags);
+ if (!bio_queue_head)
+ bio_queue_head = page;
+ else
+ bio_queue_tail->private = (unsigned long) page;
+
+ bio_queue_tail = page;
+ atomic_inc(&toi_bio_queue_size);
+
+ spin_unlock_irqrestore(&bio_queue_lock, flags);
+ wake_up(&toi_io_queue_flusher);
+}
+
+/**
+ * toi_rw_cleanup - Cleanup after i/o.
+ * @writing: Whether we were reading or writing.
+ *
+ * Flush all I/O and clean everything up after reading or writing a
+ * section of the image.
+ **/
+static int toi_rw_cleanup(int writing)
+{
+ int i, result = 0;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup.");
+ if (writing) {
+ if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
+ toi_bio_queue_write(&toi_writer_buffer);
+
+ while (bio_queue_head && !result)
+ result = toi_bio_queue_flush_pages(0);
+
+ if (result)
+ return result;
+
+ if (current_stream == 2)
+ toi_extent_state_save(1);
+ else if (current_stream == 1)
+ toi_extent_state_save(3);
+ }
+
+ result = toi_finish_all_io();
+
+ while (readahead_list_head) {
+ void *next = (void *) readahead_list_head->private;
+ toi__free_page(12, readahead_list_head);
+ readahead_list_head = next;
+ }
+
+ readahead_list_tail = NULL;
+
+ if (!current_stream)
+ return result;
+
+ for (i = 0; i < NUM_REASONS; i++) {
+ if (!atomic_read(&reasons[i]))
+ continue;
+ printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
+ reason_name[i], atomic_read(&reasons[i]));
+ atomic_set(&reasons[i], 0);
+ }
+
+ current_stream = 0;
+ return result;
+}
+
+/**
+ * toi_start_one_readahead - start one page of readahead
+ * @dedicated_thread: Is this a thread dedicated to doing readahead?
+ *
+ * Start one new page of readahead. If this is being called by a thread
+ * whose only just is to submit readahead, don't quit because we failed
+ * to allocate a page.
+ **/
+static int toi_start_one_readahead(int dedicated_thread)
+{
+ char *buffer = NULL;
+ int oom = 0, result;
+
+ result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
+ if (result) {
+ printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result);
+ return result;
+ }
+
+ mutex_lock(&toi_bio_readahead_mutex);
+
+ while (!buffer) {
+ buffer = (char *) toi_get_zeroed_page(12,
+ TOI_ATOMIC_GFP);
+ if (!buffer) {
+ if (oom && !dedicated_thread) {
+ mutex_unlock(&toi_bio_readahead_mutex);
+ printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result);
+ return -ENOMEM;
+ }
+
+ oom = 1;
+ set_free_mem_throttle();
+ do_bio_wait(5);
+ }
+ }
+
+ result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0);
+ if (result) {
+ printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result);
+ }
+ if (result == -ENOSPC)
+ toi__free_page(12, virt_to_page(buffer));
+ mutex_unlock(&toi_bio_readahead_mutex);
+ if (result) {
+ if (result == -ENOSPC)
+ toi_message(TOI_BIO, TOI_VERBOSE, 0,
+ "Last readahead page submitted.");
+ else
+ printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n",
+ result);
+ }
+ return result;
+}
+
+/**
+ * toi_start_new_readahead - start new readahead
+ * @dedicated_thread: Are we dedicated to this task?
+ *
+ * Start readahead of image pages.
+ *
+ * We can be called as a thread dedicated to this task (may be helpful on
+ * systems with lots of CPUs), in which case we don't exit until there's no
+ * more readahead.
+ *
+ * If this is not called by a dedicated thread, we top up our queue until
+ * there's no more readahead to submit, we've submitted the number given
+ * in target_outstanding_io or the number in progress exceeds the target
+ * outstanding I/O value.
+ *
+ * No mutex needed because this is only ever called by the first cpu.
+ **/
+static int toi_start_new_readahead(int dedicated_thread)
+{
+ int last_result, num_submitted = 0;
+
+ /* Start a new readahead? */
+ if (!more_readahead)
+ return 0;
+
+ do {
+ last_result = toi_start_one_readahead(dedicated_thread);
+
+ if (last_result) {
+ if (last_result == -ENOMEM || last_result == -ENOSPC)
+ return 0;
+
+ printk(KERN_DEBUG
+ "Begin read chunk returned %d.\n",
+ last_result);
+ } else
+ num_submitted++;
+
+ } while (more_readahead && !last_result &&
+ (dedicated_thread ||
+ (num_submitted < target_outstanding_io &&
+ atomic_read(&toi_io_in_progress) < target_outstanding_io)));
+
+ return last_result;
+}
+
+/**
+ * bio_io_flusher - start the dedicated I/O flushing routine
+ * @writing: Whether we're writing the image.
+ **/
+static int bio_io_flusher(int writing)
+{
+
+ if (writing)
+ return toi_bio_queue_flush_pages(1);
+ else
+ return toi_start_new_readahead(1);
+}
+
+/**
+ * toi_bio_get_next_page_read - read a disk page, perhaps with readahead
+ * @no_readahead: Whether we can use readahead
+ *
+ * Read a page from disk, submitting readahead and cleaning up finished i/o
+ * while we wait for the page we're after.
+ **/
+static int toi_bio_get_next_page_read(int no_readahead)
+{
+ char *virt;
+ struct page *old_readahead_list_head;
+
+ /*
+ * When reading the second page of the header, we have to
+ * delay submitting the read until after we've gotten the
+ * extents out of the first page.
+ */
+ if (unlikely(no_readahead)) {
+ int result = toi_start_one_readahead(0);
+ if (result) {
+ printk(KERN_EMERG "No readahead and toi_start_one_readahead "
+ "returned non-zero.\n");
+ return -EIO;
+ }
+ }
+
+ if (unlikely(!readahead_list_head)) {
+ /*
+ * If the last page finishes exactly on the page
+ * boundary, we will be called one extra time and
+ * have no data to return. In this case, we should
+ * not BUG(), like we used to!
+ */
+ if (!more_readahead) {
+ printk(KERN_EMERG "No more readahead.\n");
+ return -ENOSPC;
+ }
+ if (unlikely(toi_start_one_readahead(0))) {
+ printk(KERN_EMERG "No readahead and "
+ "toi_start_one_readahead returned non-zero.\n");
+ return -EIO;
+ }
+ }
+
+ if (PageLocked(readahead_list_head)) {
+ waiting_on = readahead_list_head;
+ do_bio_wait(0);
+ }
+
+ virt = page_address(readahead_list_head);
+ memcpy(toi_writer_buffer, virt, PAGE_SIZE);
+
+ mutex_lock(&toi_bio_readahead_mutex);
+ old_readahead_list_head = readahead_list_head;
+ readahead_list_head = (struct page *) readahead_list_head->private;
+ mutex_unlock(&toi_bio_readahead_mutex);
+ toi__free_page(12, old_readahead_list_head);
+ return 0;
+}
+
+/**
+ * toi_bio_queue_flush_pages - flush the queue of pages queued for writing
+ * @dedicated_thread: Whether we're a dedicated thread
+ *
+ * Flush the queue of pages ready to be written to disk.
+ *
+ * If we're a dedicated thread, stay in here until told to leave,
+ * sleeping in wait_event.
+ *
+ * The first thread is normally the only one to come in here. Another
+ * thread can enter this routine too, though, via throttle_if_needed.
+ * Since that's the case, we must be careful to only have one thread
+ * doing this work at a time. Otherwise we have a race and could save
+ * pages out of order.
+ *
+ * If an error occurs, free all remaining pages without submitting them
+ * for I/O.
+ **/
+
+int toi_bio_queue_flush_pages(int dedicated_thread)
+{
+ unsigned long flags;
+ int result = 0;
+ static DEFINE_MUTEX(busy);
+
+ if (!mutex_trylock(&busy))
+ return 0;
+
+top:
+ spin_lock_irqsave(&bio_queue_lock, flags);
+ while (bio_queue_head) {
+ struct page *page = bio_queue_head;
+ bio_queue_head = (struct page *) page->private;
+ if (bio_queue_tail == page)
+ bio_queue_tail = NULL;
+ atomic_dec(&toi_bio_queue_size);
+ spin_unlock_irqrestore(&bio_queue_lock, flags);
+
+ /* Don't generate more error messages if already had one */
+ if (!result)
+ result = toi_bio_rw_page(WRITE, page, 0, 11);
+ /*
+ * If writing the page failed, don't drop out.
+ * Flush the rest of the queue too.
+ */
+ if (result)
+ toi__free_page(11 , page);
+ spin_lock_irqsave(&bio_queue_lock, flags);
+ }
+ spin_unlock_irqrestore(&bio_queue_lock, flags);
+
+ if (dedicated_thread) {
+ wait_event(toi_io_queue_flusher, bio_queue_head ||
+ toi_bio_queue_flusher_should_finish);
+ if (likely(!toi_bio_queue_flusher_should_finish))
+ goto top;
+ toi_bio_queue_flusher_should_finish = 0;
+ }
+
+ mutex_unlock(&busy);
+ return result;
+}
+
+/**
+ * toi_bio_get_new_page - get a new page for I/O
+ * @full_buffer: Pointer to a page to allocate.
+ **/
+static int toi_bio_get_new_page(char **full_buffer)
+{
+ int result = throttle_if_needed(THROTTLE_WAIT);
+ if (result)
+ return result;
+
+ while (!*full_buffer) {
+ *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
+ if (!*full_buffer) {
+ set_free_mem_throttle();
+ do_bio_wait(3);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O
+ * @writing: Bool - whether writing (or reading).
+ * @buffer: The start of the buffer to write or fill.
+ * @buffer_size: The size of the buffer to write or fill.
+ * @no_readahead: Don't try to start readhead (when getting extents).
+ **/
+static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
+ int no_readahead)
+{
+ int bytes_left = buffer_size, result = 0;
+
+ while (bytes_left) {
+ char *source_start = buffer + buffer_size - bytes_left;
+ char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
+ int capacity = PAGE_SIZE - toi_writer_buffer_posn;
+ char *to = writing ? dest_start : source_start;
+ char *from = writing ? source_start : dest_start;
+
+ if (bytes_left <= capacity) {
+ memcpy(to, from, bytes_left);
+ toi_writer_buffer_posn += bytes_left;
+ return 0;
+ }
+
+ /* Complete this page and start a new one */
+ memcpy(to, from, capacity);
+ bytes_left -= capacity;
+
+ if (!writing) {
+ /*
+ * Perform actual I/O:
+ * read readahead_list_head into toi_writer_buffer
+ */
+ int result = toi_bio_get_next_page_read(no_readahead);
+ if (result && bytes_left) {
+ printk("toi_bio_get_next_page_read "
+ "returned %d. Expecting to read %d bytes.\n", result, bytes_left);
+ return result;
+ }
+ } else {
+ toi_bio_queue_write(&toi_writer_buffer);
+ result = toi_bio_get_new_page(&toi_writer_buffer);
+ if (result) {
+ printk(KERN_ERR "toi_bio_get_new_page returned "
+ "%d.\n", result);
+ return result;
+ }
+ }
+
+ toi_writer_buffer_posn = 0;
+ toi_cond_pause(0, NULL);
+ }
+
+ return 0;
+}
+
+/**
+ * toi_bio_read_page - read a page of the image
+ * @pfn: The pfn where the data belongs.
+ * @buffer_page: The page containing the (possibly compressed) data.
+ * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE).
+ *
+ * Read a (possibly compressed) page from the image, into buffer_page,
+ * returning its pfn and the buffer size.
+ **/
+static int toi_bio_read_page(unsigned long *pfn, int buf_type,
+ void *buffer_page, unsigned int *buf_size)
+{
+ int result = 0;
+ int this_idx;
+ char *buffer_virt = TOI_MAP(buf_type, buffer_page);
+
+ /*
+ * Only call start_new_readahead if we don't have a dedicated thread
+ * and we're the queue flusher.
+ */
+ if (current == toi_queue_flusher && more_readahead &&
+ !test_action_state(TOI_NO_READAHEAD)) {
+ int result2 = toi_start_new_readahead(0);
+ if (result2) {
+ printk(KERN_DEBUG "Queue flusher and "
+ "toi_start_one_readahead returned non-zero.\n");
+ result = -EIO;
+ goto out;
+ }
+ }
+
+ my_mutex_lock(0, &toi_bio_mutex);
+
+ /*
+ * Structure in the image:
+ * [destination pfn|page size|page data]
+ * buf_size is PAGE_SIZE
+ * We can validly find there's nothing to read in a multithreaded
+ * situation.
+ */
+ if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) ||
+ toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
+ toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
+ toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
+ result = -ENODATA;
+ goto out_unlock;
+ }
+
+ if (reset_idx) {
+ page_idx = this_idx;
+ reset_idx = 0;
+ } else {
+ page_idx++;
+ if (!this_idx)
+ result = -ENODATA;
+ else if (page_idx != this_idx)
+ printk(KERN_ERR "Got page index %d, expected %d.\n",
+ this_idx, page_idx);
+ }
+
+out_unlock:
+ my_mutex_unlock(0, &toi_bio_mutex);
+out:
+ TOI_UNMAP(buf_type, buffer_page);
+ return result;
+}
+
+/**
+ * toi_bio_write_page - write a page of the image
+ * @pfn: The pfn where the data belongs.
+ * @buffer_page: The page containing the (possibly compressed) data.
+ * @buf_size: The number of bytes on @buffer_page used.
+ *
+ * Write a (possibly compressed) page to the image from the buffer, together
+ * with it's index and buffer size.
+ **/
+static int toi_bio_write_page(unsigned long pfn, int buf_type,
+ void *buffer_page, unsigned int buf_size)
+{
+ char *buffer_virt;
+ int result = 0, result2 = 0;
+
+ if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
+ return 0;
+
+ my_mutex_lock(1, &toi_bio_mutex);
+
+ if (test_result_state(TOI_ABORTED)) {
+ my_mutex_unlock(1, &toi_bio_mutex);
+ return 0;
+ }
+
+ buffer_virt = TOI_MAP(buf_type, buffer_page);
+ page_idx++;
+
+ /*
+ * Structure in the image:
+ * [destination pfn|page size|page data]
+ * buf_size is PAGE_SIZE
+ */
+ if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) ||
+ toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
+ toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
+ toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
+ printk(KERN_DEBUG "toi_rw_buffer returned non-zero to "
+ "toi_bio_write_page.\n");
+ result = -EIO;
+ }
+
+ TOI_UNMAP(buf_type, buffer_page);
+ my_mutex_unlock(1, &toi_bio_mutex);
+
+ if (current == toi_queue_flusher)
+ result2 = toi_bio_queue_flush_pages(0);
+
+ return result ? result : result2;
+}
+
+/**
+ * _toi_rw_header_chunk - read or write a portion of the image header
+ * @writing: Whether reading or writing.
+ * @owner: The module for which we're writing.
+ * Used for confirming that modules
+ * don't use more header space than they asked for.
+ * @buffer: Address of the data to write.
+ * @buffer_size: Size of the data buffer.
+ * @no_readahead: Don't try to start readhead (when getting extents).
+ *
+ * Perform PAGE_SIZE I/O. Start readahead if needed.
+ **/
+static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
+ char *buffer, int buffer_size, int no_readahead)
+{
+ int result = 0;
+
+ if (owner) {
+ owner->header_used += buffer_size;
+ toi_message(TOI_HEADER, TOI_LOW, 1,
+ "Header: %s : %d bytes (%d/%d) from offset %d.",
+ owner->name,
+ buffer_size, owner->header_used,
+ owner->header_requested,
+ toi_writer_buffer_posn);
+ if (owner->header_used > owner->header_requested && writing) {
+ printk(KERN_EMERG "TuxOnIce module %s is using more "
+ "header space (%u) than it requested (%u).\n",
+ owner->name,
+ owner->header_used,
+ owner->header_requested);
+ return buffer_size;
+ }
+ } else {
+ unowned += buffer_size;
+ toi_message(TOI_HEADER, TOI_LOW, 1,
+ "Header: (No owner): %d bytes (%d total so far) from "
+ "offset %d.", buffer_size, unowned,
+ toi_writer_buffer_posn);
+ }
+
+ if (!writing && !no_readahead && more_readahead) {
+ result = toi_start_new_readahead(0);
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead "
+ "returned %d.", result);
+ }
+
+ if (!result) {
+ result = toi_rw_buffer(writing, buffer, buffer_size,
+ no_readahead);
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned "
+ "%d.", result);
+ }
+
+ total_header_bytes += buffer_size;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning "
+ "%d.", result);
+ return result;
+}
+
+static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
+ char *buffer, int size)
+{
+ return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
+}
+
+static int toi_rw_header_chunk_noreadahead(int writing,
+ struct toi_module_ops *owner, char *buffer, int size)
+{
+ return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
+}
+
+/**
+ * toi_bio_storage_needed - get the amount of storage needed for my fns
+ **/
+static int toi_bio_storage_needed(void)
+{
+ return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed();
+}
+
+/**
+ * toi_bio_save_config_info - save block I/O config to image header
+ * @buf: PAGE_SIZE'd buffer into which data should be saved.
+ **/
+static int toi_bio_save_config_info(char *buf)
+{
+ int *ints = (int *) buf;
+ ints[0] = target_outstanding_io;
+ return sizeof(int);
+}
+
+/**
+ * toi_bio_load_config_info - restore block I/O config
+ * @buf: Data to be reloaded.
+ * @size: Size of the buffer saved.
+ **/
+static void toi_bio_load_config_info(char *buf, int size)
+{
+ int *ints = (int *) buf;
+ target_outstanding_io = ints[0];
+}
+
+void close_resume_dev_t(int force)
+{
+ if (!resume_block_device)
+ return;
+
+ if (force)
+ atomic_set(&resume_bdev_open_count, 0);
+ else
+ atomic_dec(&resume_bdev_open_count);
+
+ if (!atomic_read(&resume_bdev_open_count)) {
+ toi_close_bdev(resume_block_device);
+ resume_block_device = NULL;
+ }
+}
+
+int open_resume_dev_t(int force, int quiet)
+{
+ if (force) {
+ close_resume_dev_t(1);
+ atomic_set(&resume_bdev_open_count, 1);
+ } else
+ atomic_inc(&resume_bdev_open_count);
+
+ if (resume_block_device)
+ return 0;
+
+ resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0);
+ if (IS_ERR(resume_block_device)) {
+ if (!quiet)
+ toi_early_boot_message(1, TOI_CONTINUE_REQ,
+ "Failed to open device %x, where"
+ " the header should be found.",
+ resume_dev_t);
+ resume_block_device = NULL;
+ atomic_set(&resume_bdev_open_count, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * toi_bio_initialise - initialise bio code at start of some action
+ * @starting_cycle: Whether starting a hibernation cycle, or just reading or
+ * writing a sysfs value.
+ **/
+static int toi_bio_initialise(int starting_cycle)
+{
+ int result;
+
+ if (!starting_cycle || !resume_dev_t)
+ return 0;
+
+ max_outstanding_writes = 0;
+ max_outstanding_reads = 0;
+ current_stream = 0;
+ toi_queue_flusher = current;
+#ifdef MEASURE_MUTEX_CONTENTION
+ {
+ int i, j, k;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < 2; j++)
+ for_each_online_cpu(k)
+ mutex_times[i][j][k] = 0;
+ }
+#endif
+ result = open_resume_dev_t(0, 1);
+
+ if (result)
+ return result;
+
+ return get_signature_page();
+}
+
+static unsigned long raw_to_real(unsigned long raw)
+{
+ unsigned long extra;
+
+ extra = (raw * (sizeof(unsigned long) + sizeof(int)) +
+ (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) /
+ (PAGE_SIZE + sizeof(unsigned long) + sizeof(int));
+
+ return raw > extra ? raw - extra : 0;
+}
+
+static unsigned long toi_bio_storage_available(void)
+{
+ unsigned long sum = 0;
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled ||
+ this_module->type != BIO_ALLOCATOR_MODULE)
+ continue;
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage "
+ "available from %s.", this_module->name);
+ sum += this_module->bio_allocator_ops->storage_available();
+ }
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu "
+ "pages (%d header pages).", sum, header_pages_reserved);
+
+ return sum > header_pages_reserved ?
+ raw_to_real(sum - header_pages_reserved) : 0;
+
+}
+
+static unsigned long toi_bio_storage_allocated(void)
+{
+ return raw_pages_allocd > header_pages_reserved ?
+ raw_to_real(raw_pages_allocd - header_pages_reserved) : 0;
+}
+
+/*
+ * If we have read part of the image, we might have filled memory with
+ * data that should be zeroed out.
+ */
+static void toi_bio_noresume_reset(void)
+{
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset.");
+ toi_rw_cleanup(READ);
+ free_all_bdev_info();
+}
+
+/**
+ * toi_bio_cleanup - cleanup after some action
+ * @finishing_cycle: Whether completing a cycle.
+ **/
+static void toi_bio_cleanup(int finishing_cycle)
+{
+ if (!finishing_cycle)
+ return;
+
+ if (toi_writer_buffer) {
+ toi_free_page(11, (unsigned long) toi_writer_buffer);
+ toi_writer_buffer = NULL;
+ }
+
+ forget_signature_page();
+
+ if (header_block_device && toi_sig_data &&
+ toi_sig_data->header_dev_t != resume_dev_t)
+ toi_close_bdev(header_block_device);
+
+ header_block_device = NULL;
+
+ close_resume_dev_t(0);
+}
+
+static int toi_bio_write_header_init(void)
+{
+ int result;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init");
+ toi_rw_init(WRITE, 0);
+ toi_writer_buffer_posn = 0;
+
+ /* Info needed to bootstrap goes at the start of the header.
+ * First we save the positions and devinfo, including the number
+ * of header pages. Then we save the structs containing data needed
+ * for reading the header pages back.
+ * Note that even if header pages take more than one page, when we
+ * read back the info, we will have restored the location of the
+ * next header page by the time we go to use it.
+ */
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains.");
+ result = toi_serialise_extent_chains();
+
+ if (result)
+ return result;
+
+ /*
+ * Signature page hasn't been modified at this point. Write it in
+ * the header so we can restore it later.
+ */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page.");
+ return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops,
+ (char *) toi_cur_sig_page,
+ PAGE_SIZE);
+}
+
+static int toi_bio_write_header_cleanup(void)
+{
+ int result = 0;
+
+ if (toi_writer_buffer_posn)
+ toi_bio_queue_write(&toi_writer_buffer);
+
+ result = toi_finish_all_io();
+
+ unowned = 0;
+ total_header_bytes = 0;
+
+ /* Set signature to save we have an image */
+ if (!result)
+ result = toi_bio_mark_have_image();
+
+ return result;
+}
+
+/*
+ * toi_bio_read_header_init()
+ *
+ * Description:
+ * 1. Attempt to read the device specified with resume=.
+ * 2. Check the contents of the swap header for our signature.
+ * 3. Warn, ignore, reset and/or continue as appropriate.
+ * 4. If continuing, read the toi_swap configuration section
+ * of the header and set up block device info so we can read
+ * the rest of the header & image.
+ *
+ * Returns:
+ * May not return if user choose to reboot at a warning.
+ * -EINVAL if cannot resume at this time. Booting should continue
+ * normally.
+ */
+
+static int toi_bio_read_header_init(void)
+{
+ int result = 0;
+ char buf[32];
+
+ toi_writer_buffer_posn = 0;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init");
+
+ if (!toi_sig_data) {
+ printk(KERN_INFO "toi_bio_read_header_init called when we "
+ "haven't verified there is an image!\n");
+ return -EINVAL;
+ }
+
+ /*
+ * If the header is not on the resume_swap_dev_t, get the resume device
+ * first.
+ */
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.",
+ toi_sig_data->header_dev_t);
+ if (toi_sig_data->have_uuid) {
+ struct fs_info seek;
+ dev_t device;
+
+ strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16);
+ seek.dev_t = toi_sig_data->header_dev_t;
+ seek.last_mount_size = 0;
+ device = blk_lookup_fs_info(&seek);
+ if (device) {
+ printk("Using dev_t %s, returned by blk_lookup_fs_info.\n",
+ format_dev_t(buf, device));
+ toi_sig_data->header_dev_t = device;
+ }
+ }
+ if (toi_sig_data->header_dev_t != resume_dev_t) {
+ header_block_device = toi_open_bdev(NULL,
+ toi_sig_data->header_dev_t, 1);
+
+ if (IS_ERR(header_block_device))
+ return PTR_ERR(header_block_device);
+ } else
+ header_block_device = resume_block_device;
+
+ if (!toi_writer_buffer)
+ toi_writer_buffer = (char *) toi_get_zeroed_page(11,
+ TOI_ATOMIC_GFP);
+ more_readahead = 1;
+
+ /*
+ * Read toi_swap configuration.
+ * Headerblock size taken into account already.
+ */
+ result = toi_bio_ops.bdev_page_io(READ, header_block_device,
+ toi_sig_data->first_header_block,
+ virt_to_page((unsigned long) toi_writer_buffer));
+ if (result)
+ return result;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains.");
+ result = toi_load_extent_chains();
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page.");
+ toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
+ if (!toi_orig_sig_page) {
+ printk(KERN_ERR "Failed to allocate memory for the current"
+ " image signature.\n");
+ return -ENOMEM;
+ }
+
+ return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops,
+ (char *) toi_orig_sig_page,
+ PAGE_SIZE);
+}
+
+static int toi_bio_read_header_cleanup(void)
+{
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup.");
+ return toi_rw_cleanup(READ);
+}
+
+/* Works only for digits and letters, but small and fast */
+#define TOLOWER(x) ((x) | 0x20)
+
+/*
+ * UUID must be 32 chars long. It may have dashes, but nothing
+ * else.
+ */
+char *uuid_from_commandline(char *commandline)
+{
+ int low = 0;
+ char *result = NULL, *output, *ptr;
+
+ if (strncmp(commandline, "UUID=", 5))
+ return NULL;
+
+ result = kzalloc(17, GFP_KERNEL);
+ if (!result) {
+ printk("Failed to kzalloc UUID text memory.\n");
+ return NULL;
+ }
+
+ ptr = commandline + 5;
+ output = result;
+
+ while (*ptr && (output - result) < 16) {
+ if (isxdigit(*ptr)) {
+ int value = isdigit(*ptr) ? *ptr - '0' :
+ TOLOWER(*ptr) - 'a' + 10;
+ if (low) {
+ *output += value;
+ output++;
+ } else {
+ *output = value << 4;
+ }
+ low = !low;
+ } else if (*ptr != '-')
+ break;
+ ptr++;
+ }
+
+ if ((output - result) < 16 || *ptr) {
+ printk(KERN_DEBUG "Found resume=UUID=, but the value looks "
+ "invalid.\n");
+ kfree(result);
+ result = NULL;
+ }
+
+ return result;
+}
+
+#define retry_if_fails(command) \
+do { \
+ command; \
+ if (!resume_dev_t && !waited_for_device_probe) { \
+ wait_for_device_probe(); \
+ command; \
+ waited_for_device_probe = 1; \
+ } \
+} while(0)
+
+/**
+ * try_to_open_resume_device: Try to parse and open resume=
+ *
+ * Any "swap:" has been stripped away and we just have the path to deal with.
+ * We attempt to do name_to_dev_t, open and stat the file. Having opened the
+ * file, get the struct block_device * to match.
+ */
+static int try_to_open_resume_device(char *commandline, int quiet)
+{
+ struct kstat stat;
+ int error = 0;
+ char *uuid = uuid_from_commandline(commandline);
+ int waited_for_device_probe = 0;
+
+ resume_dev_t = MKDEV(0, 0);
+
+ if (!strlen(commandline))
+ retry_if_fails(toi_bio_scan_for_image(quiet));
+
+ if (uuid) {
+ struct fs_info seek;
+ strncpy((char *) &seek.uuid, uuid, 16);
+ seek.dev_t = resume_dev_t;
+ seek.last_mount_size = 0;
+ retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek));
+ kfree(uuid);
+ }
+
+ if (!resume_dev_t)
+ retry_if_fails(resume_dev_t = name_to_dev_t(commandline));
+
+ if (!resume_dev_t) {
+ struct file *file = filp_open(commandline,
+ O_RDONLY|O_LARGEFILE, 0);
+
+ if (!IS_ERR(file) && file) {
+ vfs_getattr(&file->f_path, &stat);
+ filp_close(file, NULL);
+ } else
+ error = vfs_stat(commandline, &stat);
+ if (!error)
+ resume_dev_t = stat.rdev;
+ }
+
+ if (!resume_dev_t) {
+ if (quiet)
+ return 1;
+
+ if (test_toi_state(TOI_TRYING_TO_RESUME))
+ toi_early_boot_message(1, toi_translate_err_default,
+ "Failed to translate \"%s\" into a device id.\n",
+ commandline);
+ else
+ printk("TuxOnIce: Can't translate \"%s\" into a device "
+ "id yet.\n", commandline);
+ return 1;
+ }
+
+ return open_resume_dev_t(1, quiet);
+}
+
+/*
+ * Parse Image Location
+ *
+ * Attempt to parse a resume= parameter.
+ * Swap Writer accepts:
+ * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
+ *
+ * Where:
+ * DEVNAME is convertable to a dev_t by name_to_dev_t
+ * FIRSTBLOCK is the location of the first block in the swap file
+ * (specifying for a swap partition is nonsensical but not prohibited).
+ * Data is validated by attempting to read a swap header from the
+ * location given. Failure will result in toi_swap refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
+ */
+static int toi_bio_parse_sig_location(char *commandline,
+ int only_allocator, int quiet)
+{
+ char *thischar, *devstart, *colon = NULL;
+ int signature_found, result = -EINVAL, temp_result = 0;
+
+ if (strncmp(commandline, "swap:", 5) &&
+ strncmp(commandline, "file:", 5)) {
+ /*
+ * Failing swap:, we'll take a simple resume=/dev/hda2, or a
+ * blank value (scan) but fall through to other allocators
+ * if /dev/ or UUID= isn't matched.
+ */
+ if (strncmp(commandline, "/dev/", 5) &&
+ strncmp(commandline, "UUID=", 5) &&
+ strlen(commandline))
+ return 1;
+ } else
+ commandline += 5;
+
+ devstart = commandline;
+ thischar = commandline;
+ while ((*thischar != ':') && (*thischar != '@') &&
+ ((thischar - commandline) < 250) && (*thischar))
+ thischar++;
+
+ if (*thischar == ':') {
+ colon = thischar;
+ *colon = 0;
+ thischar++;
+ }
+
+ while ((thischar - commandline) < 250 && *thischar)
+ thischar++;
+
+ if (colon) {
+ unsigned long block;
+ temp_result = kstrtoul(colon + 1, 0, &block);
+ if (!temp_result)
+ resume_firstblock = (int) block;
+ } else
+ resume_firstblock = 0;
+
+ clear_toi_state(TOI_CAN_HIBERNATE);
+ clear_toi_state(TOI_CAN_RESUME);
+
+ if (!temp_result)
+ temp_result = try_to_open_resume_device(devstart, quiet);
+
+ if (colon)
+ *colon = ':';
+
+ /* No error if we only scanned */
+ if (temp_result)
+ return strlen(commandline) ? -EINVAL : 1;
+
+ signature_found = toi_bio_image_exists(quiet);
+
+ if (signature_found != -1) {
+ result = 0;
+ /*
+ * TODO: If only file storage, CAN_HIBERNATE should only be
+ * set if file allocator's target is valid.
+ */
+ set_toi_state(TOI_CAN_HIBERNATE);
+ set_toi_state(TOI_CAN_RESUME);
+ } else
+ if (!quiet)
+ printk(KERN_ERR "TuxOnIce: Block I/O: No "
+ "signature found at %s.\n", devstart);
+
+ return result;
+}
+
+static void toi_bio_release_storage(void)
+{
+ header_pages_reserved = 0;
+ raw_pages_allocd = 0;
+
+ free_all_bdev_info();
+}
+
+/* toi_swap_remove_image
+ *
+ */
+static int toi_bio_remove_image(void)
+{
+ int result;
+
+ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image.");
+
+ result = toi_bio_restore_original_signature();
+
+ /*
+ * We don't do a sanity check here: we want to restore the swap
+ * whatever version of kernel made the hibernate image.
+ *
+ * We need to write swap, but swap may not be enabled so
+ * we write the device directly
+ *
+ * If we don't have an current_signature_page, we didn't
+ * read an image header, so don't change anything.
+ */
+
+ toi_bio_release_storage();
+
+ return result;
+}
+
+struct toi_bio_ops toi_bio_ops = {
+ .bdev_page_io = toi_bdev_page_io,
+ .register_storage = toi_register_storage_chain,
+ .free_storage = toi_bio_release_storage,
+};
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io,
+ 0, 16384, 0, NULL),
+};
+
+struct toi_module_ops toi_blockwriter_ops = {
+ .type = WRITER_MODULE,
+ .name = "block i/o",
+ .directory = "block_io",
+ .module = THIS_MODULE,
+ .memory_needed = toi_bio_memory_needed,
+ .print_debug_info = toi_bio_print_debug_stats,
+ .storage_needed = toi_bio_storage_needed,
+ .save_config_info = toi_bio_save_config_info,
+ .load_config_info = toi_bio_load_config_info,
+ .initialise = toi_bio_initialise,
+ .cleanup = toi_bio_cleanup,
+ .post_atomic_restore = toi_bio_chains_post_atomic,
+
+ .rw_init = toi_rw_init,
+ .rw_cleanup = toi_rw_cleanup,
+ .read_page = toi_bio_read_page,
+ .write_page = toi_bio_write_page,
+ .rw_header_chunk = toi_rw_header_chunk,
+ .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead,
+ .io_flusher = bio_io_flusher,
+ .update_throughput_throttle = update_throughput_throttle,
+ .finish_all_io = toi_finish_all_io,
+
+ .noresume_reset = toi_bio_noresume_reset,
+ .storage_available = toi_bio_storage_available,
+ .storage_allocated = toi_bio_storage_allocated,
+ .reserve_header_space = toi_bio_reserve_header_space,
+ .allocate_storage = toi_bio_allocate_storage,
+ .free_unused_storage = toi_bio_free_unused_storage,
+ .image_exists = toi_bio_image_exists,
+ .mark_resume_attempted = toi_bio_mark_resume_attempted,
+ .write_header_init = toi_bio_write_header_init,
+ .write_header_cleanup = toi_bio_write_header_cleanup,
+ .read_header_init = toi_bio_read_header_init,
+ .read_header_cleanup = toi_bio_read_header_cleanup,
+ .get_header_version = toi_bio_get_header_version,
+ .remove_image = toi_bio_remove_image,
+ .parse_sig_location = toi_bio_parse_sig_location,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/**
+ * toi_block_io_load - load time routine for block I/O module
+ *
+ * Register block i/o ops and sysfs entries.
+ **/
+static __init int toi_block_io_load(void)
+{
+ return toi_register_module(&toi_blockwriter_ops);
+}
+
+late_initcall(toi_block_io_load);
diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h
new file mode 100644
index 000000000..cf9211ed9
--- /dev/null
+++ b/kernel/power/tuxonice_bio_internal.h
@@ -0,0 +1,101 @@
+/*
+ * kernel/power/tuxonice_bio_internal.h
+ *
+ * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains declarations for functions exported from
+ * tuxonice_bio.c, which contains low level io functions.
+ */
+
+/* Extent chains */
+void toi_extent_state_goto_start(void);
+void toi_extent_state_save(int slot);
+int go_next_page(int writing, int section_barrier);
+void toi_extent_state_restore(int slot);
+void free_all_bdev_info(void);
+int devices_of_same_priority(struct toi_bdev_info *this);
+int toi_register_storage_chain(struct toi_bdev_info *new);
+int toi_serialise_extent_chains(void);
+int toi_load_extent_chains(void);
+int toi_bio_rw_page(int writing, struct page *page, int is_readahead,
+ int free_group);
+int toi_bio_restore_original_signature(void);
+int toi_bio_devinfo_storage_needed(void);
+unsigned long get_headerblock(void);
+dev_t get_header_dev_t(void);
+struct block_device *get_header_bdev(void);
+int toi_bio_allocate_storage(unsigned long request);
+void toi_bio_free_unused_storage(void);
+
+/* Signature functions */
+#define HaveImage "HaveImage"
+#define NoImage "TuxOnIce"
+#define sig_size (sizeof(HaveImage))
+
+struct sig_data {
+ char sig[sig_size];
+ int have_image;
+ int resumed_before;
+
+ char have_uuid;
+ char header_uuid[17];
+ dev_t header_dev_t;
+ unsigned long first_header_block;
+
+ /* Repeat the signature to be sure we have a header version */
+ char sig2[sig_size];
+ int header_version;
+};
+
+void forget_signature_page(void);
+int toi_check_for_signature(void);
+int toi_bio_image_exists(int quiet);
+int get_signature_page(void);
+int toi_bio_mark_resume_attempted(int);
+extern char *toi_cur_sig_page;
+extern char *toi_orig_sig_page;
+int toi_bio_mark_have_image(void);
+extern struct sig_data *toi_sig_data;
+extern dev_t resume_dev_t;
+extern struct block_device *resume_block_device;
+extern struct block_device *header_block_device;
+extern unsigned long resume_firstblock;
+
+struct block_device *open_bdev(dev_t device, int display_errs);
+extern int current_stream;
+extern int more_readahead;
+int toi_do_io(int writing, struct block_device *bdev, long block0,
+ struct page *page, int is_readahead, int syncio, int free_group);
+int get_main_pool_phys_params(void);
+
+void toi_close_bdev(struct block_device *bdev);
+struct block_device *toi_open_bdev(char *uuid, dev_t default_device,
+ int display_errs);
+
+extern struct toi_module_ops toi_blockwriter_ops;
+void dump_block_chains(void);
+void debug_broken_header(void);
+extern unsigned long raw_pages_allocd, header_pages_reserved;
+int toi_bio_chains_debug_info(char *buffer, int size);
+void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd);
+int toi_bio_scan_for_image(int quiet);
+int toi_bio_get_header_version(void);
+
+void close_resume_dev_t(int force);
+int open_resume_dev_t(int force, int quiet);
+
+struct toi_incremental_image_pointer_saved_data {
+ unsigned long block;
+ int chain;
+};
+
+struct toi_incremental_image_pointer {
+ struct toi_incremental_image_pointer_saved_data save;
+ struct block_device *bdev;
+ unsigned long block;
+};
+
+void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
+void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr);
diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c
new file mode 100644
index 000000000..ead874f8e
--- /dev/null
+++ b/kernel/power/tuxonice_bio_signature.c
@@ -0,0 +1,403 @@
+/*
+ * kernel/power/tuxonice_bio_signature.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ */
+
+#include <linux/fs_uuid.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_bio_internal.h"
+
+struct sig_data *toi_sig_data;
+
+/* Struct of swap header pages */
+
+struct old_sig_data {
+ dev_t device;
+ unsigned long sector;
+ int resume_attempted;
+ int orig_sig_type;
+};
+
+union diskpage {
+ union swap_header swh; /* swh.magic is the only member used */
+ struct sig_data sig_data;
+ struct old_sig_data old_sig_data;
+};
+
+union p_diskpage {
+ union diskpage *pointer;
+ char *ptr;
+ unsigned long address;
+};
+
+char *toi_cur_sig_page;
+char *toi_orig_sig_page;
+int have_image;
+int have_old_image;
+
+int get_signature_page(void)
+{
+ if (!toi_cur_sig_page) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0,
+ "Allocating current signature page.");
+ toi_cur_sig_page = (char *) toi_get_zeroed_page(38,
+ TOI_ATOMIC_GFP);
+ if (!toi_cur_sig_page) {
+ printk(KERN_ERR "Failed to allocate memory for the "
+ "current image signature.\n");
+ return -ENOMEM;
+ }
+
+ toi_sig_data = (struct sig_data *) toi_cur_sig_page;
+ }
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx,"
+ " sector %d.",
+ resume_block_device->bd_dev, resume_firstblock);
+
+ return toi_bio_ops.bdev_page_io(READ, resume_block_device,
+ resume_firstblock, virt_to_page(toi_cur_sig_page));
+}
+
+void forget_signature_page(void)
+{
+ if (toi_cur_sig_page) {
+ toi_sig_data = NULL;
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page"
+ " (%p).", toi_cur_sig_page);
+ toi_free_page(38, (unsigned long) toi_cur_sig_page);
+ toi_cur_sig_page = NULL;
+ }
+
+ if (toi_orig_sig_page) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page"
+ " (%p).", toi_orig_sig_page);
+ toi_free_page(38, (unsigned long) toi_orig_sig_page);
+ toi_orig_sig_page = NULL;
+ }
+}
+
+/*
+ * We need to ensure we use the signature page that's currently on disk,
+ * so as to not remove the image header. Post-atomic-restore, the orig sig
+ * page will be empty, so we can use that as our method of knowing that we
+ * need to load the on-disk signature and not use the non-image sig in
+ * memory. (We're going to powerdown after writing the change, so it's safe.
+ */
+int toi_bio_mark_resume_attempted(int flag)
+{
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.",
+ flag);
+ if (!toi_orig_sig_page) {
+ forget_signature_page();
+ get_signature_page();
+ }
+ toi_sig_data->resumed_before = flag;
+ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+ resume_firstblock, virt_to_page(toi_cur_sig_page));
+}
+
+int toi_bio_mark_have_image(void)
+{
+ int result = 0;
+ char buf[32];
+ struct fs_info *fs_info;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists.");
+ memcpy(toi_sig_data->sig, tuxonice_signature,
+ sizeof(tuxonice_signature));
+ toi_sig_data->have_image = 1;
+ toi_sig_data->resumed_before = 0;
+ toi_sig_data->header_dev_t = get_header_dev_t();
+ toi_sig_data->have_uuid = 0;
+
+ fs_info = fs_info_from_block_dev(get_header_bdev());
+ if (fs_info && !IS_ERR(fs_info)) {
+ memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16);
+ free_fs_info(fs_info);
+ } else
+ result = (int) PTR_ERR(fs_info);
+
+ if (!result) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.",
+ format_dev_t(buf, get_header_dev_t()));
+ toi_sig_data->have_uuid = 1;
+ } else
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for "
+ "dev_t %s.",
+ format_dev_t(buf, get_header_dev_t()));
+
+ toi_sig_data->first_header_block = get_headerblock();
+ have_image = 1;
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block "
+ "is %d.", toi_sig_data->header_dev_t,
+ toi_sig_data->first_header_block);
+
+ memcpy(toi_sig_data->sig2, tuxonice_signature,
+ sizeof(tuxonice_signature));
+ toi_sig_data->header_version = TOI_HEADER_VERSION;
+
+ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+ resume_firstblock, virt_to_page(toi_cur_sig_page));
+}
+
+int remove_old_signature(void)
+{
+ union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page;
+ char *orig_sig;
+ char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
+ int result;
+ struct block_device *header_bdev;
+ struct old_sig_data *old_sig_data =
+ &swap_header_page.pointer->old_sig_data;
+
+ header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1);
+ result = toi_bio_ops.bdev_page_io(READ, header_bdev,
+ old_sig_data->sector, virt_to_page(header_start));
+
+ if (result)
+ goto out;
+
+ /*
+ * TODO: Get the original contents of the first bytes of the swap
+ * header page.
+ */
+ if (!old_sig_data->orig_sig_type)
+ orig_sig = "SWAP-SPACE";
+ else
+ orig_sig = "SWAPSPACE2";
+
+ memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
+ memcpy(swap_header_page.ptr, header_start, 10);
+
+ result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+ resume_firstblock, virt_to_page(swap_header_page.ptr));
+
+out:
+ toi_close_bdev(header_bdev);
+ have_old_image = 0;
+ toi_free_page(38, (unsigned long) header_start);
+ return result;
+}
+
+/*
+ * toi_bio_restore_original_signature - restore the original signature
+ *
+ * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used.
+ * It will have the original signature page contents, stored in the image
+ * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain
+ * the contents that were loaded when we started the cycle.
+ */
+int toi_bio_restore_original_signature(void)
+{
+ char *use = toi_orig_sig_page ? toi_orig_sig_page : toi_cur_sig_page;
+
+ if (have_old_image)
+ return remove_old_signature();
+
+ if (!use) {
+ printk("toi_bio_restore_original_signature: No signature "
+ "page loaded.\n");
+ return 0;
+ }
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists.");
+ have_image = 0;
+ toi_sig_data->have_image = 0;
+ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+ resume_firstblock, virt_to_page(use));
+}
+
+/*
+ * check_for_signature - See whether we have an image.
+ *
+ * Returns 0 if no image, 1 if there is one, -1 if indeterminate.
+ */
+int toi_check_for_signature(void)
+{
+ union p_diskpage swap_header_page;
+ int type;
+ const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" };
+ const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" };
+ char *swap_header;
+
+ if (!toi_cur_sig_page) {
+ int result = get_signature_page();
+
+ if (result)
+ return result;
+ }
+
+ /*
+ * Start by looking for the binary header.
+ */
+ if (!memcmp(tuxonice_signature, toi_cur_sig_page,
+ sizeof(tuxonice_signature))) {
+ have_image = toi_sig_data->have_image;
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. "
+ "Have image is %d.", have_image);
+ if (have_image)
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is "
+ "%x. First block is %d.",
+ toi_sig_data->header_dev_t,
+ toi_sig_data->first_header_block);
+ return toi_sig_data->have_image;
+ }
+
+ /*
+ * Failing that, try old file allocator headers.
+ */
+
+ if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) {
+ have_image = 1;
+ return 1;
+ }
+
+ have_image = 0;
+
+ if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage)))
+ return 0;
+
+ /*
+ * Nope? How about swap?
+ */
+ swap_header_page = (union p_diskpage) toi_cur_sig_page;
+ swap_header = swap_header_page.pointer->swh.magic.magic;
+
+ /* Normal swapspace? */
+ for (type = 0; type < 2; type++)
+ if (!memcmp(normal_sigs[type], swap_header,
+ strlen(normal_sigs[type])))
+ return 0;
+
+ /* Swsusp or uswsusp? */
+ for (type = 0; type < 3; type++)
+ if (!memcmp(swsusp_sigs[type], swap_header,
+ strlen(swsusp_sigs[type])))
+ return 2;
+
+ /* Old TuxOnIce version? */
+ if (!memcmp(tuxonice_signature, swap_header,
+ sizeof(tuxonice_signature) - 1)) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce "
+ "signature.");
+ have_old_image = 1;
+ return 3;
+ }
+
+ return -1;
+}
+
+/*
+ * Image_exists
+ *
+ * Returns -1 if don't know, otherwise 0 (no) or 1 (yes).
+ */
+int toi_bio_image_exists(int quiet)
+{
+ int result;
+ char *msg = NULL;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists.");
+
+ if (!resume_dev_t) {
+ if (!quiet)
+ printk(KERN_INFO "Not even trying to read header "
+ "because resume_dev_t is not set.\n");
+ return -1;
+ }
+
+ if (open_resume_dev_t(0, quiet))
+ return -1;
+
+ result = toi_check_for_signature();
+
+ clear_toi_state(TOI_RESUMED_BEFORE);
+ if (toi_sig_data->resumed_before)
+ set_toi_state(TOI_RESUMED_BEFORE);
+
+ if (quiet || result == -ENOMEM)
+ return result;
+
+ if (result == -1)
+ msg = "TuxOnIce: Unable to find a signature."
+ " Could you have moved a swap file?\n";
+ else if (!result)
+ msg = "TuxOnIce: No image found.\n";
+ else if (result == 1)
+ msg = "TuxOnIce: Image found.\n";
+ else if (result == 2)
+ msg = "TuxOnIce: uswsusp or swsusp image found.\n";
+ else if (result == 3)
+ msg = "TuxOnIce: Old implementation's signature found.\n";
+
+ printk(KERN_INFO "%s", msg);
+
+ return result;
+}
+
+int toi_bio_scan_for_image(int quiet)
+{
+ struct block_device *bdev;
+ char default_name[255] = "";
+
+ if (!quiet)
+ printk(KERN_DEBUG "Scanning swap devices for TuxOnIce "
+ "signature...\n");
+ for (bdev = next_bdev_of_type(NULL, "swap"); bdev;
+ bdev = next_bdev_of_type(bdev, "swap")) {
+ int result;
+ char name[255] = "";
+ sprintf(name, "%u:%u", MAJOR(bdev->bd_dev),
+ MINOR(bdev->bd_dev));
+ if (!quiet)
+ printk(KERN_DEBUG "- Trying %s.\n", name);
+ resume_block_device = bdev;
+ resume_dev_t = bdev->bd_dev;
+
+ result = toi_check_for_signature();
+
+ resume_block_device = NULL;
+ resume_dev_t = MKDEV(0, 0);
+
+ if (!default_name[0])
+ strcpy(default_name, name);
+
+ if (result == 1) {
+ /* Got one! */
+ strcpy(resume_file, name);
+ next_bdev_of_type(bdev, NULL);
+ if (!quiet)
+ printk(KERN_DEBUG " ==> Image found on %s.\n",
+ resume_file);
+ return 1;
+ }
+ forget_signature_page();
+ }
+
+ if (!quiet)
+ printk(KERN_DEBUG "TuxOnIce scan: No image found.\n");
+ strcpy(resume_file, default_name);
+ return 0;
+}
+
+int toi_bio_get_header_version(void)
+{
+ return (memcmp(toi_sig_data->sig2, tuxonice_signature,
+ sizeof(tuxonice_signature))) ?
+ 0 : toi_sig_data->header_version;
+
+}
diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
new file mode 100644
index 000000000..0a6733ae0
--- /dev/null
+++ b/kernel/power/tuxonice_builtin.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+#include <linux/kernel.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/bio.h>
+#include <linux/root_dev.h>
+#include <linux/freezer.h>
+#include <linux/reboot.h>
+#include <linux/writeback.h>
+#include <linux/tty.h>
+#include <linux/crypto.h>
+#include <linux/cpu.h>
+#include <linux/ctype.h>
+#include <linux/kthread.h>
+#include "tuxonice_io.h"
+#include "tuxonice.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_pagedir.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_alloc.h"
+
+unsigned long toi_bootflags_mask;
+
+/*
+ * Highmem related functions (x86 only).
+ */
+
+#ifdef CONFIG_HIGHMEM
+
+/**
+ * copyback_high: Restore highmem pages.
+ *
+ * Highmem data and pbe lists are/can be stored in highmem.
+ * The format is slightly different to the lowmem pbe lists
+ * used for the assembly code: the last pbe in each page is
+ * a struct page * instead of struct pbe *, pointing to the
+ * next page where pbes are stored (or NULL if happens to be
+ * the end of the list). Since we don't want to generate
+ * unnecessary deltas against swsusp code, we use a cast
+ * instead of a union.
+ **/
+
+static void copyback_high(void)
+{
+ struct page *pbe_page = (struct page *) restore_highmem_pblist;
+ struct pbe *this_pbe, *first_pbe;
+ unsigned long *origpage, *copypage;
+ int pbe_index = 1;
+
+ if (!pbe_page)
+ return;
+
+ this_pbe = (struct pbe *) kmap_atomic(pbe_page);
+ first_pbe = this_pbe;
+
+ while (this_pbe) {
+ int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1;
+
+ origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address));
+ copypage = kmap_atomic((struct page *) this_pbe->address);
+
+ while (loop >= 0) {
+ *(origpage + loop) = *(copypage + loop);
+ loop--;
+ }
+
+ kunmap_atomic(origpage);
+ kunmap_atomic(copypage);
+
+ if (!this_pbe->next)
+ break;
+
+ if (pbe_index < PBES_PER_PAGE) {
+ this_pbe++;
+ pbe_index++;
+ } else {
+ pbe_page = (struct page *) this_pbe->next;
+ kunmap_atomic(first_pbe);
+ if (!pbe_page)
+ return;
+ this_pbe = (struct pbe *) kmap_atomic(pbe_page);
+ first_pbe = this_pbe;
+ pbe_index = 1;
+ }
+ }
+ kunmap_atomic(first_pbe);
+}
+
+#else /* CONFIG_HIGHMEM */
+static void copyback_high(void) { }
+#endif
+
+char toi_wait_for_keypress_dev_console(int timeout)
+{
+ int fd, this_timeout = 255, orig_kthread = 0;
+ char key = '\0';
+ struct termios t, t_backup;
+
+ /* We should be guaranteed /dev/console exists after populate_rootfs()
+ * in init/main.c.
+ */
+ fd = sys_open("/dev/console", O_RDONLY, 0);
+ if (fd < 0) {
+ printk(KERN_INFO "Couldn't open /dev/console.\n");
+ return key;
+ }
+
+ if (sys_ioctl(fd, TCGETS, (long)&t) < 0)
+ goto out_close;
+
+ memcpy(&t_backup, &t, sizeof(t));
+
+ t.c_lflag &= ~(ISIG|ICANON|ECHO);
+ t.c_cc[VMIN] = 0;
+
+new_timeout:
+ if (timeout > 0) {
+ this_timeout = timeout < 26 ? timeout : 25;
+ timeout -= this_timeout;
+ this_timeout *= 10;
+ }
+
+ t.c_cc[VTIME] = this_timeout;
+
+ if (sys_ioctl(fd, TCSETS, (long)&t) < 0)
+ goto out_restore;
+
+ if (current->flags & PF_KTHREAD) {
+ orig_kthread = (current->flags & PF_KTHREAD);
+ current->flags &= ~PF_KTHREAD;
+ }
+
+ while (1) {
+ if (sys_read(fd, &key, 1) <= 0) {
+ if (timeout)
+ goto new_timeout;
+ key = '\0';
+ break;
+ }
+ key = tolower(key);
+ if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) {
+ if (key == 'c') {
+ set_toi_state(TOI_CONTINUE_REQ);
+ break;
+ } else if (key == ' ')
+ break;
+ } else
+ break;
+ }
+ if (orig_kthread) {
+ current->flags |= PF_KTHREAD;
+ }
+
+out_restore:
+ sys_ioctl(fd, TCSETS, (long)&t_backup);
+out_close:
+ sys_close(fd);
+
+ return key;
+}
+
+struct toi_boot_kernel_data toi_bkd __nosavedata
+ __attribute__((aligned(PAGE_SIZE))) = {
+ MY_BOOT_KERNEL_DATA_VERSION,
+ 0,
+#ifdef CONFIG_TOI_REPLACE_SWSUSP
+ (1 << TOI_REPLACE_SWSUSP) |
+#endif
+ (1 << TOI_NO_FLUSHER_THREAD) |
+ (1 << TOI_PAGESET2_FULL),
+};
+
+struct block_device *toi_open_by_devnum(dev_t dev)
+{
+ struct block_device *bdev = bdget(dev);
+ int err = -ENOMEM;
+ if (bdev)
+ err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
+ return err ? ERR_PTR(err) : bdev;
+}
+
+/**
+ * toi_close_bdev: Close a swap bdev.
+ *
+ * int: The swap entry number to close.
+ */
+void toi_close_bdev(struct block_device *bdev)
+{
+ blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
+}
+
+int toi_wait = CONFIG_TOI_DEFAULT_WAIT;
+struct toi_core_fns *toi_core_fns;
+unsigned long toi_result;
+struct pagedir pagedir1 = {1};
+struct toi_cbw **toi_first_cbw;
+int toi_next_cbw;
+
+unsigned long toi_get_nonconflicting_page(void)
+{
+ return toi_core_fns->get_nonconflicting_page();
+}
+
+int toi_post_context_save(void)
+{
+ return toi_core_fns->post_context_save();
+}
+
+int try_tuxonice_hibernate(void)
+{
+ if (!toi_core_fns)
+ return -ENODEV;
+
+ return toi_core_fns->try_hibernate();
+}
+
+static int num_resume_calls;
+#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL
+static int ignore_late_initcall = 1;
+#else
+static int ignore_late_initcall;
+#endif
+
+int toi_translate_err_default = TOI_CONTINUE_REQ;
+
+void try_tuxonice_resume(void)
+{
+ if (!hibernation_available())
+ return;
+
+ /* Don't let it wrap around eventually */
+ if (num_resume_calls < 2)
+ num_resume_calls++;
+
+ if (num_resume_calls == 1 && ignore_late_initcall) {
+ printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n");
+ return;
+ }
+
+ if (toi_core_fns)
+ toi_core_fns->try_resume();
+ else
+ printk(KERN_INFO "TuxOnIce core not loaded yet.\n");
+}
+
+int toi_lowlevel_builtin(void)
+{
+ int error = 0;
+
+ save_processor_state();
+ error = swsusp_arch_suspend();
+ if (error)
+ printk(KERN_ERR "Error %d hibernating\n", error);
+
+ /* Restore control flow appears here */
+ if (!toi_in_hibernate) {
+ copyback_high();
+ set_toi_state(TOI_NOW_RESUMING);
+ }
+
+ restore_processor_state();
+ return error;
+}
+
+unsigned long toi_compress_bytes_in;
+unsigned long toi_compress_bytes_out;
+
+int toi_in_suspend(void)
+{
+ return in_suspend;
+}
+
+unsigned long toi_state = ((1 << TOI_BOOT_TIME) |
+ (1 << TOI_IGNORE_LOGLEVEL) |
+ (1 << TOI_IO_STOPPED));
+
+/* The number of hibernates we have started (some may have been cancelled) */
+unsigned int nr_hibernates;
+int toi_running;
+__nosavedata int toi_in_hibernate;
+__nosavedata struct pbe *restore_highmem_pblist;
+
+int toi_trace_allocs;
+
+void toi_read_lock_tasklist(void)
+{
+ read_lock(&tasklist_lock);
+}
+
+void toi_read_unlock_tasklist(void)
+{
+ read_unlock(&tasklist_lock);
+}
+
+#ifdef CONFIG_TOI_ZRAM_SUPPORT
+int (*toi_flag_zram_disks) (void);
+
+int toi_do_flag_zram_disks(void)
+{
+ return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0;
+}
+
+#endif
+
+/* toi_generate_free_page_map
+ *
+ * Description: This routine generates a bitmap of free pages from the
+ * lists used by the memory manager. We then use the bitmap
+ * to quickly calculate which pages to save and in which
+ * pagesets.
+ */
+void toi_generate_free_page_map(void)
+{
+ int order, cpu, t;
+ unsigned long flags, i;
+ struct zone *zone;
+ struct list_head *curr;
+ unsigned long pfn;
+ struct page *page;
+
+ for_each_populated_zone(zone) {
+
+ if (!zone->spanned_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ for (i = 0; i < zone->spanned_pages; i++) {
+ pfn = zone->zone_start_pfn + i;
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+
+ ClearPageNosaveFree(page);
+ }
+
+ for_each_migratetype_order(order, t) {
+ list_for_each(curr,
+ &zone->free_area[order].free_list[t]) {
+ unsigned long j;
+
+ pfn = page_to_pfn(list_entry(curr, struct page,
+ lru));
+ for (j = 0; j < (1UL << order); j++)
+ SetPageNosaveFree(pfn_to_page(pfn + j));
+ }
+ }
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_pageset *pset =
+ per_cpu_ptr(zone->pageset, cpu);
+ struct per_cpu_pages *pcp = &pset->pcp;
+ struct page *page;
+ int t;
+
+ for (t = 0; t < MIGRATE_PCPTYPES; t++)
+ list_for_each_entry(page, &pcp->lists[t], lru)
+ SetPageNosaveFree(page);
+ }
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+
+/* toi_size_of_free_region
+ *
+ * Description: Return the number of pages that are free, beginning with and
+ * including this one.
+ */
+int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn)
+{
+ unsigned long this_pfn = start_pfn,
+ end_pfn = zone_end_pfn(zone);
+
+ while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn)))
+ this_pfn++;
+
+ return this_pfn - start_pfn;
+}
+
+static int __init toi_wait_setup(char *str)
+{
+ int value;
+
+ if (sscanf(str, "=%d", &value)) {
+ if (value < -1 || value > 255)
+ printk(KERN_INFO "TuxOnIce_wait outside range -1 to "
+ "255.\n");
+ else
+ toi_wait = value;
+ }
+
+ return 1;
+}
+__setup("toi_wait", toi_wait_setup);
+
+static int __init toi_translate_retry_setup(char *str)
+{
+ toi_translate_err_default = 0;
+ return 1;
+}
+__setup("toi_translate_retry", toi_translate_retry_setup);
+
+static int __init toi_debug_setup(char *str)
+{
+ toi_bkd.toi_action |= (1 << TOI_LOGALL);
+ toi_bootflags_mask |= (1 << TOI_LOGALL);
+ toi_bkd.toi_debug_state = 255;
+ toi_bkd.toi_default_console_level = 7;
+ return 1;
+}
+__setup("toi_debug_setup", toi_debug_setup);
+
+static int __init toi_pause_setup(char *str)
+{
+ toi_bkd.toi_action |= (1 << TOI_PAUSE);
+ toi_bootflags_mask |= (1 << TOI_PAUSE);
+ return 1;
+}
+__setup("toi_pause", toi_pause_setup);
+
+#ifdef CONFIG_PM_DEBUG
+static int __init toi_trace_allocs_setup(char *str)
+{
+ int value;
+
+ if (sscanf(str, "=%d", &value))
+ toi_trace_allocs = value;
+
+ return 1;
+}
+__setup("toi_trace_allocs", toi_trace_allocs_setup);
+#endif
+
+static int __init toi_ignore_late_initcall_setup(char *str)
+{
+ int value;
+
+ if (sscanf(str, "=%d", &value))
+ ignore_late_initcall = value;
+
+ return 1;
+}
+__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup);
+
+static int __init toi_force_no_multithreaded_setup(char *str)
+{
+ int value;
+
+ toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO);
+ toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO);
+
+ if (sscanf(str, "=%d", &value) && value)
+ toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO);
+
+ return 1;
+}
+__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup);
+
+#ifdef CONFIG_KGDB
+static int __init toi_post_resume_breakpoint_setup(char *str)
+{
+ int value;
+
+ toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT);
+ toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT);
+ if (sscanf(str, "=%d", &value) && value)
+ toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT);
+
+ return 1;
+}
+__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup);
+#endif
+
+static int __init toi_disable_readahead_setup(char *str)
+{
+ int value;
+
+ toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD);
+ toi_bootflags_mask |= (1 << TOI_NO_READAHEAD);
+ if (sscanf(str, "=%d", &value) && value)
+ toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD);
+
+ return 1;
+}
+__setup("toi_no_readahead", toi_disable_readahead_setup);
diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h
new file mode 100644
index 000000000..9539818e0
--- /dev/null
+++ b/kernel/power/tuxonice_builtin.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+#include <asm/setup.h>
+
+extern struct toi_core_fns *toi_core_fns;
+extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out;
+extern unsigned int nr_hibernates;
+extern int toi_in_hibernate;
+
+extern __nosavedata struct pbe *restore_highmem_pblist;
+
+int toi_lowlevel_builtin(void);
+
+#ifdef CONFIG_HIGHMEM
+extern __nosavedata struct zone_data *toi_nosave_zone_list;
+extern __nosavedata unsigned long toi_nosave_max_pfn;
+#endif
+
+extern unsigned long toi_get_nonconflicting_page(void);
+extern int toi_post_context_save(void);
+
+extern char toi_wait_for_keypress_dev_console(int timeout);
+extern struct block_device *toi_open_by_devnum(dev_t dev);
+extern void toi_close_bdev(struct block_device *bdev);
+extern int toi_wait;
+extern int toi_translate_err_default;
+extern int toi_force_no_multithreaded;
+extern void toi_read_lock_tasklist(void);
+extern void toi_read_unlock_tasklist(void);
+extern int toi_in_suspend(void);
+extern void toi_generate_free_page_map(void);
+extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn);
+
+#ifdef CONFIG_TOI_ZRAM_SUPPORT
+extern int toi_do_flag_zram_disks(void);
+#else
+#define toi_do_flag_zram_disks() (0)
+#endif
diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c
new file mode 100644
index 000000000..8952c0fec
--- /dev/null
+++ b/kernel/power/tuxonice_checksum.c
@@ -0,0 +1,392 @@
+/*
+ * kernel/power/tuxonice_checksum.c
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data checksum routines for TuxOnIce,
+ * using cryptoapi. They are used to locate any modifications
+ * made to pageset 2 while we're saving it.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_io.h"
+#include "tuxonice_pageflags.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_pagedir.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_ui.h"
+
+static struct toi_module_ops toi_checksum_ops;
+
+/* Constant at the mo, but I might allow tuning later */
+static char toi_checksum_name[32] = "md4";
+/* Bytes per checksum */
+#define CHECKSUM_SIZE (16)
+
+#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE)
+
+struct cpu_context {
+ struct crypto_hash *transform;
+ struct hash_desc desc;
+ struct scatterlist sg[2];
+ char *buf;
+};
+
+static DEFINE_PER_CPU(struct cpu_context, contexts);
+static int pages_allocated;
+static unsigned long page_list;
+
+static int toi_num_resaved;
+
+static unsigned long this_checksum, next_page;
+static int checksum_count;
+
+static inline int checksum_pages_needed(void)
+{
+ return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE);
+}
+
+/* ---- Local buffer management ---- */
+
+/*
+ * toi_checksum_cleanup
+ *
+ * Frees memory allocated for our labours.
+ */
+static void toi_checksum_cleanup(int ending_cycle)
+{
+ int cpu;
+
+ if (ending_cycle) {
+ for_each_online_cpu(cpu) {
+ struct cpu_context *this = &per_cpu(contexts, cpu);
+ if (this->transform) {
+ crypto_free_hash(this->transform);
+ this->transform = NULL;
+ this->desc.tfm = NULL;
+ }
+
+ if (this->buf) {
+ toi_free_page(27, (unsigned long) this->buf);
+ this->buf = NULL;
+ }
+ }
+ }
+}
+
+/*
+ * toi_crypto_initialise
+ *
+ * Prepare to do some work by allocating buffers and transforms.
+ * Returns: Int: Zero. Even if we can't set up checksum, we still
+ * seek to hibernate.
+ */
+static int toi_checksum_initialise(int starting_cycle)
+{
+ int cpu;
+
+ if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled)
+ return 0;
+
+ if (!*toi_checksum_name) {
+ printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n");
+ return 1;
+ }
+
+ for_each_online_cpu(cpu) {
+ struct cpu_context *this = &per_cpu(contexts, cpu);
+ struct page *page;
+
+ this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0);
+ if (IS_ERR(this->transform)) {
+ printk(KERN_INFO "TuxOnIce: Failed to initialise the "
+ "%s checksum algorithm: %ld.\n",
+ toi_checksum_name, (long) this->transform);
+ this->transform = NULL;
+ return 1;
+ }
+
+ this->desc.tfm = this->transform;
+ this->desc.flags = 0;
+
+ page = toi_alloc_page(27, GFP_KERNEL);
+ if (!page)
+ return 1;
+ this->buf = page_address(page);
+ sg_init_one(&this->sg[0], this->buf, PAGE_SIZE);
+ }
+ return 0;
+}
+
+/*
+ * toi_checksum_print_debug_stats
+ * @buffer: Pointer to a buffer into which the debug info will be printed.
+ * @size: Size of the buffer.
+ *
+ * Print information to be recorded for debugging purposes into a buffer.
+ * Returns: Number of characters written to the buffer.
+ */
+
+static int toi_checksum_print_debug_stats(char *buffer, int size)
+{
+ int len;
+
+ if (!toi_checksum_ops.enabled)
+ return scnprintf(buffer, size,
+ "- Checksumming disabled.\n");
+
+ len = scnprintf(buffer, size, "- Checksum method is '%s'.\n",
+ toi_checksum_name);
+ len += scnprintf(buffer + len, size - len,
+ " %d pages resaved in atomic copy.\n", toi_num_resaved);
+ return len;
+}
+
+static int toi_checksum_memory_needed(void)
+{
+ return toi_checksum_ops.enabled ?
+ checksum_pages_needed() << PAGE_SHIFT : 0;
+}
+
+static int toi_checksum_storage_needed(void)
+{
+ if (toi_checksum_ops.enabled)
+ return strlen(toi_checksum_name) + sizeof(int) + 1;
+ else
+ return 0;
+}
+
+/*
+ * toi_checksum_save_config_info
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save informaton needed when reloading the image at resume time.
+ * Returns: Number of bytes used for saving our data.
+ */
+static int toi_checksum_save_config_info(char *buffer)
+{
+ int namelen = strlen(toi_checksum_name) + 1;
+ int total_len;
+
+ *((unsigned int *) buffer) = namelen;
+ strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen);
+ total_len = sizeof(unsigned int) + namelen;
+ return total_len;
+}
+
+/* toi_checksum_load_config_info
+ * @buffer: Pointer to the start of the data.
+ * @size: Number of bytes that were saved.
+ *
+ * Description: Reload information needed for dechecksuming the image at
+ * resume time.
+ */
+static void toi_checksum_load_config_info(char *buffer, int size)
+{
+ int namelen;
+
+ namelen = *((unsigned int *) (buffer));
+ strncpy(toi_checksum_name, buffer + sizeof(unsigned int),
+ namelen);
+ return;
+}
+
+/*
+ * Free Checksum Memory
+ */
+
+void free_checksum_pages(void)
+{
+ while (pages_allocated) {
+ unsigned long next = *((unsigned long *) page_list);
+ ClearPageNosave(virt_to_page(page_list));
+ toi_free_page(15, (unsigned long) page_list);
+ page_list = next;
+ pages_allocated--;
+ }
+}
+
+/*
+ * Allocate Checksum Memory
+ */
+
+int allocate_checksum_pages(void)
+{
+ int pages_needed = checksum_pages_needed();
+
+ if (!toi_checksum_ops.enabled)
+ return 0;
+
+ while (pages_allocated < pages_needed) {
+ unsigned long *new_page =
+ (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP);
+ if (!new_page) {
+ printk(KERN_ERR "Unable to allocate checksum pages.\n");
+ return -ENOMEM;
+ }
+ SetPageNosave(virt_to_page(new_page));
+ (*new_page) = page_list;
+ page_list = (unsigned long) new_page;
+ pages_allocated++;
+ }
+
+ next_page = (unsigned long) page_list;
+ checksum_count = 0;
+
+ return 0;
+}
+
+char *tuxonice_get_next_checksum(void)
+{
+ if (!toi_checksum_ops.enabled)
+ return NULL;
+
+ if (checksum_count % CHECKSUMS_PER_PAGE)
+ this_checksum += CHECKSUM_SIZE;
+ else {
+ this_checksum = next_page + sizeof(void *);
+ next_page = *((unsigned long *) next_page);
+ }
+
+ checksum_count++;
+ return (char *) this_checksum;
+}
+
+int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
+{
+ char *pa;
+ int result, cpu = smp_processor_id();
+ struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+ if (!toi_checksum_ops.enabled)
+ return 0;
+
+ pa = kmap(page);
+ memcpy(ctx->buf, pa, PAGE_SIZE);
+ kunmap(page);
+ result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
+ checksum_locn);
+ if (result)
+ printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest "
+ "returned %d.\n", result);
+ return result;
+}
+/*
+ * Calculate checksums
+ */
+
+void check_checksums(void)
+{
+ int index = 0, cpu = smp_processor_id();
+ char current_checksum[CHECKSUM_SIZE];
+ struct cpu_context *ctx = &per_cpu(contexts, cpu);
+ unsigned long pfn;
+
+ if (!toi_checksum_ops.enabled) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled.");
+ return;
+ }
+
+ next_page = (unsigned long) page_list;
+
+ toi_num_resaved = 0;
+ this_checksum = 0;
+
+ toi_trace_index++;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums.");
+ memory_bm_position_reset(pageset2_map);
+ for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != BM_END_OF_MAP;
+ pfn = memory_bm_next_pfn(pageset2_map, 0)) {
+ int ret, resave_needed = false;
+ char *pa;
+ struct page *page = pfn_to_page(pfn);
+
+ if (index < checksum_count) {
+ if (index % CHECKSUMS_PER_PAGE) {
+ this_checksum += CHECKSUM_SIZE;
+ } else {
+ this_checksum = next_page + sizeof(void *);
+ next_page = *((unsigned long *) next_page);
+ }
+
+ /* Done when IRQs disabled so must be atomic */
+ pa = kmap_atomic(page);
+ memcpy(ctx->buf, pa, PAGE_SIZE);
+ kunmap_atomic(pa);
+ ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE,
+ current_checksum);
+
+ if (ret) {
+ printk(KERN_INFO "Digest failed. Returned %d.\n", ret);
+ return;
+ }
+
+ resave_needed = memcmp(current_checksum, (char *) this_checksum,
+ CHECKSUM_SIZE);
+ } else {
+ resave_needed = true;
+ }
+
+ if (resave_needed) {
+ TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed);
+ SetPageResave(pfn_to_page(pfn));
+ toi_num_resaved++;
+ if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED))
+ set_abort_result(TOI_RESAVE_NEEDED);
+ }
+
+ index++;
+ }
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete.");
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0,
+ NULL),
+ SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_ABORT_ON_RESAVE_NEEDED, 0)
+};
+
+/*
+ * Ops structure.
+ */
+static struct toi_module_ops toi_checksum_ops = {
+ .type = MISC_MODULE,
+ .name = "checksumming",
+ .directory = "checksum",
+ .module = THIS_MODULE,
+ .initialise = toi_checksum_initialise,
+ .cleanup = toi_checksum_cleanup,
+ .print_debug_info = toi_checksum_print_debug_stats,
+ .save_config_info = toi_checksum_save_config_info,
+ .load_config_info = toi_checksum_load_config_info,
+ .memory_needed = toi_checksum_memory_needed,
+ .storage_needed = toi_checksum_storage_needed,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+int toi_checksum_init(void)
+{
+ int result = toi_register_module(&toi_checksum_ops);
+ return result;
+}
+
+void toi_checksum_exit(void)
+{
+ toi_unregister_module(&toi_checksum_ops);
+}
diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h
new file mode 100644
index 000000000..7d6478a6a
--- /dev/null
+++ b/kernel/power/tuxonice_checksum.h
@@ -0,0 +1,31 @@
+/*
+ * kernel/power/tuxonice_checksum.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data checksum routines for TuxOnIce,
+ * using cryptoapi. They are used to locate any modifications
+ * made to pageset 2 while we're saving it.
+ */
+
+#if defined(CONFIG_TOI_CHECKSUM)
+extern int toi_checksum_init(void);
+extern void toi_checksum_exit(void);
+void check_checksums(void);
+int allocate_checksum_pages(void);
+void free_checksum_pages(void);
+char *tuxonice_get_next_checksum(void);
+int tuxonice_calc_checksum(struct page *page, char *checksum_locn);
+#else
+static inline int toi_checksum_init(void) { return 0; }
+static inline void toi_checksum_exit(void) { }
+static inline void check_checksums(void) { };
+static inline int allocate_checksum_pages(void) { return 0; };
+static inline void free_checksum_pages(void) { };
+static inline char *tuxonice_get_next_checksum(void) { return NULL; };
+static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn)
+ { return 0; }
+#endif
+
diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c
new file mode 100644
index 000000000..cfe3383ab
--- /dev/null
+++ b/kernel/power/tuxonice_cluster.c
@@ -0,0 +1,1058 @@
+/*
+ * kernel/power/tuxonice_cluster.c
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains routines for cluster hibernation support.
+ *
+ * Based on ip autoconfiguration code in net/ipv4/ipconfig.c.
+ *
+ * How does it work?
+ *
+ * There is no 'master' node that tells everyone else what to do. All nodes
+ * send messages to the broadcast address/port, maintain a list of peers
+ * and figure out when to progress to the next step in hibernating or resuming.
+ * This makes us more fault tolerant when it comes to nodes coming and going
+ * (which may be more of an issue if we're hibernating when power supplies
+ * are being unreliable).
+ *
+ * At boot time, we start a ktuxonice thread that handles communication with
+ * other nodes. This node maintains a state machine that controls our progress
+ * through hibernating and resuming, keeping us in step with other nodes. Nodes
+ * are identified by their hw address.
+ *
+ * On startup, the node sends CLUSTER_PING on the configured interface's
+ * broadcast address, port $toi_cluster_port (see below) and begins to listen
+ * for other broadcast messages. CLUSTER_PING messages are repeated at
+ * intervals of 5 minutes, with a random offset to spread traffic out.
+ *
+ * A hibernation cycle is initiated from any node via
+ *
+ * echo > /sys/power/tuxonice/do_hibernate
+ *
+ * and (possibily) the hibernate script. At each step of the process, the node
+ * completes its work, and waits for all other nodes to signal completion of
+ * their work (or timeout) before progressing to the next step.
+ *
+ * Request/state Action before reply Possible reply Next state
+ * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP
+ * HIBERNATE|NACK INIT_0
+ *
+ * PREP prepare_image PREP|ACK IMAGE_WRITE
+ * PREP|NACK INIT_0
+ * ABORT RUNNING
+ *
+ * IO write image IO|ACK power off
+ * ABORT POST_RESUME
+ *
+ * (Boot time) check for image IMAGE|ACK RESUME_PREP
+ * (Note 1)
+ * IMAGE|NACK (Note 2)
+ *
+ * PREP prepare read image PREP|ACK IMAGE_READ
+ * PREP|NACK (As NACK_IMAGE)
+ *
+ * IO read image IO|ACK POST_RESUME
+ *
+ * POST_RESUME thaw, post-script RUNNING
+ *
+ * INIT_0 init 0
+ *
+ * Other messages:
+ *
+ * - PING: Request for all other live nodes to send a PONG. Used at startup to
+ * announce presence, when a node is suspected dead and periodically, in case
+ * segments of the network are [un]plugged.
+ *
+ * - PONG: Response to a PING.
+ *
+ * - ABORT: Request to cancel writing an image.
+ *
+ * - BYE: Notification that this node is shutting down.
+ *
+ * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that
+ * nodes which are slower to start up can get state synchronised. If a node
+ * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send
+ * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it
+ * must invalidate its image (if any) and boot normally.
+ *
+ * Note 2: May occur when one node lost power or powered off while others
+ * hibernated. This node waits for others to complete resuming (ACK_READ)
+ * before completing its boot, so that it appears as a fail node restarting.
+ *
+ * If any node has an image, then it also has a list of nodes that hibernated
+ * in synchronisation with it. The node will wait for other nodes to appear
+ * or timeout before beginning its restoration.
+ *
+ * If a node has no image, it needs to wait, in case other nodes which do have
+ * an image are going to resume, but are taking longer to announce their
+ * presence. For this reason, the user can specify a timeout value and a number
+ * of nodes detected before we just continue. (We might want to assume in a
+ * cluster of, say, 15 nodes, if 8 others have booted without finding an image,
+ * the remaining nodes will too. This might help in situations where some nodes
+ * are much slower to boot, or more subject to hardware failures or such like).
+ */
+
+#include <linux/suspend.h>
+#include <linux/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+
+#if 1
+#define PRINTK(a, b...) do { printk(a, ##b); } while (0)
+#else
+#define PRINTK(a, b...) do { } while (0)
+#endif
+
+static int loopback_mode;
+static int num_local_nodes = 1;
+#define MAX_LOCAL_NODES 8
+#define SADDR (loopback_mode ? b->sid : h->saddr)
+
+#define MYNAME "TuxOnIce Clustering"
+
+enum cluster_message {
+ MSG_ACK = 1,
+ MSG_NACK = 2,
+ MSG_PING = 4,
+ MSG_ABORT = 8,
+ MSG_BYE = 16,
+ MSG_HIBERNATE = 32,
+ MSG_IMAGE = 64,
+ MSG_IO = 128,
+ MSG_RUNNING = 256
+};
+
+static char *str_message(int message)
+{
+ switch (message) {
+ case 4:
+ return "Ping";
+ case 8:
+ return "Abort";
+ case 9:
+ return "Abort acked";
+ case 10:
+ return "Abort nacked";
+ case 16:
+ return "Bye";
+ case 17:
+ return "Bye acked";
+ case 18:
+ return "Bye nacked";
+ case 32:
+ return "Hibernate request";
+ case 33:
+ return "Hibernate ack";
+ case 34:
+ return "Hibernate nack";
+ case 64:
+ return "Image exists?";
+ case 65:
+ return "Image does exist";
+ case 66:
+ return "No image here";
+ case 128:
+ return "I/O";
+ case 129:
+ return "I/O okay";
+ case 130:
+ return "I/O failed";
+ case 256:
+ return "Running";
+ default:
+ printk(KERN_ERR "Unrecognised message %d.\n", message);
+ return "Unrecognised message (see dmesg)";
+ }
+}
+
+#define MSG_ACK_MASK (MSG_ACK | MSG_NACK)
+#define MSG_STATE_MASK (~MSG_ACK_MASK)
+
+struct node_info {
+ struct list_head member_list;
+ wait_queue_head_t member_events;
+ spinlock_t member_list_lock;
+ spinlock_t receive_lock;
+ int peer_count, ignored_peer_count;
+ struct toi_sysfs_data sysfs_data;
+ enum cluster_message current_message;
+};
+
+struct node_info node_array[MAX_LOCAL_NODES];
+
+struct cluster_member {
+ __be32 addr;
+ enum cluster_message message;
+ struct list_head list;
+ int ignore;
+};
+
+#define toi_cluster_port_send 3501
+#define toi_cluster_port_recv 3502
+
+static struct net_device *net_dev;
+static struct toi_module_ops toi_cluster_ops;
+
+static int toi_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev);
+
+static struct packet_type toi_cluster_packet_type = {
+ .type = __constant_htons(ETH_P_IP),
+ .func = toi_recv,
+};
+
+struct toi_pkt { /* BOOTP packet format */
+ struct iphdr iph; /* IP header */
+ struct udphdr udph; /* UDP header */
+ u8 htype; /* HW address type */
+ u8 hlen; /* HW address length */
+ __be32 xid; /* Transaction ID */
+ __be16 secs; /* Seconds since we started */
+ __be16 flags; /* Just what it says */
+ u8 hw_addr[16]; /* Sender's HW address */
+ u16 message; /* Message */
+ unsigned long sid; /* Source ID for loopback testing */
+};
+
+static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE;
+
+static int added_pack;
+
+static int others_have_image;
+
+/* Key used to allow multiple clusters on the same lan */
+static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY;
+static char pre_hibernate_script[255] =
+ CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE;
+static char post_hibernate_script[255] =
+ CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE;
+
+/* List of cluster members */
+static unsigned long continue_delay = 5 * HZ;
+static unsigned long cluster_message_timeout = 3 * HZ;
+
+/* === Membership list === */
+
+static void print_member_info(int index)
+{
+ struct cluster_member *this;
+
+ printk(KERN_INFO "==> Dumping node %d.\n", index);
+
+ list_for_each_entry(this, &node_array[index].member_list, list)
+ printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n",
+ NIPQUAD(this->addr),
+ str_message(this->message),
+ this->ignore ? "(Ignored)" : "");
+ printk(KERN_INFO "== Done ==\n");
+}
+
+static struct cluster_member *__find_member(int index, __be32 addr)
+{
+ struct cluster_member *this;
+
+ list_for_each_entry(this, &node_array[index].member_list, list) {
+ if (this->addr != addr)
+ continue;
+
+ return this;
+ }
+
+ return NULL;
+}
+
+static void set_ignore(int index, __be32 addr, struct cluster_member *this)
+{
+ if (this->ignore) {
+ PRINTK("Node %d already ignoring %d.%d.%d.%d.\n",
+ index, NIPQUAD(addr));
+ return;
+ }
+
+ PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n",
+ index, NIPQUAD(addr));
+ this->ignore = 1;
+ node_array[index].ignored_peer_count++;
+}
+
+static int __add_update_member(int index, __be32 addr, int message)
+{
+ struct cluster_member *this;
+
+ this = __find_member(index, addr);
+ if (this) {
+ if (this->message != message) {
+ this->message = message;
+ if ((message & MSG_NACK) &&
+ (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
+ set_ignore(index, addr, this);
+ PRINTK("Node %d sees node %d.%d.%d.%d now sending "
+ "%s.\n", index, NIPQUAD(addr),
+ str_message(message));
+ wake_up(&node_array[index].member_events);
+ }
+ return 0;
+ }
+
+ this = (struct cluster_member *) toi_kzalloc(36,
+ sizeof(struct cluster_member), GFP_KERNEL);
+
+ if (!this)
+ return -1;
+
+ this->addr = addr;
+ this->message = message;
+ this->ignore = 0;
+ INIT_LIST_HEAD(&this->list);
+
+ node_array[index].peer_count++;
+
+ PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index,
+ NIPQUAD(addr), str_message(message));
+
+ if ((message & MSG_NACK) &&
+ (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO)))
+ set_ignore(index, addr, this);
+ list_add_tail(&this->list, &node_array[index].member_list);
+ return 1;
+}
+
+static int add_update_member(int index, __be32 addr, int message)
+{
+ int result;
+ unsigned long flags;
+ spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+ result = __add_update_member(index, addr, message);
+ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+
+ print_member_info(index);
+
+ wake_up(&node_array[index].member_events);
+
+ return result;
+}
+
+static void del_member(int index, __be32 addr)
+{
+ struct cluster_member *this;
+ unsigned long flags;
+
+ spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+ this = __find_member(index, addr);
+
+ if (this) {
+ list_del_init(&this->list);
+ toi_kfree(36, this, sizeof(*this));
+ node_array[index].peer_count--;
+ }
+
+ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+}
+
+/* === Message transmission === */
+
+static void toi_send_if(int message, unsigned long my_id);
+
+/*
+ * Process received TOI packet.
+ */
+static int toi_recv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct toi_pkt *b;
+ struct iphdr *h;
+ int len, result, index;
+ unsigned long addr, message, ack;
+
+ /* Perform verifications before taking the lock. */
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ if (dev != net_dev)
+ goto drop;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
+ if (!pskb_may_pull(skb,
+ sizeof(struct iphdr) +
+ sizeof(struct udphdr)))
+ goto drop;
+
+ b = (struct toi_pkt *)skb_network_header(skb);
+ h = &b->iph;
+
+ if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+ goto drop;
+
+ /* Fragments are not supported */
+ if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
+ if (net_ratelimit())
+ printk(KERN_ERR "TuxOnIce: Ignoring fragmented "
+ "cluster message.\n");
+ goto drop;
+ }
+
+ if (skb->len < ntohs(h->tot_len))
+ goto drop;
+
+ if (ip_fast_csum((char *) h, h->ihl))
+ goto drop;
+
+ if (b->udph.source != htons(toi_cluster_port_send) ||
+ b->udph.dest != htons(toi_cluster_port_recv))
+ goto drop;
+
+ if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+ goto drop;
+
+ len = ntohs(b->udph.len) - sizeof(struct udphdr);
+
+ /* Ok the front looks good, make sure we can get at the rest. */
+ if (!pskb_may_pull(skb, skb->len))
+ goto drop;
+
+ b = (struct toi_pkt *)skb_network_header(skb);
+ h = &b->iph;
+
+ addr = SADDR;
+ PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n",
+ str_message(b->message), NIPQUAD(addr));
+
+ message = b->message & MSG_STATE_MASK;
+ ack = b->message & MSG_ACK_MASK;
+
+ for (index = 0; index < num_local_nodes; index++) {
+ int new_message = node_array[index].current_message,
+ old_message = new_message;
+
+ if (index == SADDR || !old_message) {
+ PRINTK("Ignoring node %d (offline or self).\n", index);
+ continue;
+ }
+
+ /* One message at a time, please. */
+ spin_lock(&node_array[index].receive_lock);
+
+ result = add_update_member(index, SADDR, b->message);
+ if (result == -1) {
+ printk(KERN_INFO "Failed to add new cluster member "
+ NIPQUAD_FMT ".\n",
+ NIPQUAD(addr));
+ goto drop_unlock;
+ }
+
+ switch (b->message & MSG_STATE_MASK) {
+ case MSG_PING:
+ break;
+ case MSG_ABORT:
+ break;
+ case MSG_BYE:
+ break;
+ case MSG_HIBERNATE:
+ /* Can I hibernate? */
+ new_message = MSG_HIBERNATE |
+ ((index & 1) ? MSG_NACK : MSG_ACK);
+ break;
+ case MSG_IMAGE:
+ /* Can I resume? */
+ new_message = MSG_IMAGE |
+ ((index & 1) ? MSG_NACK : MSG_ACK);
+ if (new_message != old_message)
+ printk(KERN_ERR "Setting whether I can resume "
+ "to %d.\n", new_message);
+ break;
+ case MSG_IO:
+ new_message = MSG_IO | MSG_ACK;
+ break;
+ case MSG_RUNNING:
+ break;
+ default:
+ if (net_ratelimit())
+ printk(KERN_ERR "Unrecognised TuxOnIce cluster"
+ " message %d from " NIPQUAD_FMT ".\n",
+ b->message, NIPQUAD(addr));
+ };
+
+ if (old_message != new_message) {
+ node_array[index].current_message = new_message;
+ printk(KERN_INFO ">>> Sending new message for node "
+ "%d.\n", index);
+ toi_send_if(new_message, index);
+ } else if (!ack) {
+ printk(KERN_INFO ">>> Resending message for node %d.\n",
+ index);
+ toi_send_if(new_message, index);
+ }
+drop_unlock:
+ spin_unlock(&node_array[index].receive_lock);
+ };
+
+drop:
+ /* Throw the packet out. */
+ kfree_skb(skb);
+
+ return 0;
+}
+
+/*
+ * Send cluster message to single interface.
+ */
+static void toi_send_if(int message, unsigned long my_id)
+{
+ struct sk_buff *skb;
+ struct toi_pkt *b;
+ int hh_len = LL_RESERVED_SPACE(net_dev);
+ struct iphdr *h;
+
+ /* Allocate packet */
+ skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL);
+ if (!skb)
+ return;
+ skb_reserve(skb, hh_len);
+ b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt));
+ memset(b, 0, sizeof(struct toi_pkt));
+
+ /* Construct IP header */
+ skb_reset_network_header(skb);
+ h = ip_hdr(skb);
+ h->version = 4;
+ h->ihl = 5;
+ h->tot_len = htons(sizeof(struct toi_pkt));
+ h->frag_off = htons(IP_DF);
+ h->ttl = 64;
+ h->protocol = IPPROTO_UDP;
+ h->daddr = htonl(INADDR_BROADCAST);
+ h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+ /* Construct UDP header */
+ b->udph.source = htons(toi_cluster_port_send);
+ b->udph.dest = htons(toi_cluster_port_recv);
+ b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr));
+ /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+ /* Construct message */
+ b->message = message;
+ b->sid = my_id;
+ b->htype = net_dev->type; /* can cause undefined behavior */
+ b->hlen = net_dev->addr_len;
+ memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len);
+ b->secs = htons(3); /* 3 seconds */
+
+ /* Chain packet down the line... */
+ skb->dev = net_dev;
+ skb->protocol = htons(ETH_P_IP);
+ if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol),
+ net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) ||
+ dev_queue_xmit(skb) < 0)
+ printk(KERN_INFO "E");
+}
+
+/* ========================================= */
+
+/* kTOICluster */
+
+static atomic_t num_cluster_threads;
+static DECLARE_WAIT_QUEUE_HEAD(clusterd_events);
+
+static int kTOICluster(void *data)
+{
+ unsigned long my_id;
+
+ my_id = atomic_add_return(1, &num_cluster_threads) - 1;
+ node_array[my_id].current_message = (unsigned long) data;
+
+ PRINTK("kTOICluster daemon %lu starting.\n", my_id);
+
+ current->flags |= PF_NOFREEZE;
+
+ while (node_array[my_id].current_message) {
+ toi_send_if(node_array[my_id].current_message, my_id);
+ sleep_on_timeout(&clusterd_events,
+ cluster_message_timeout);
+ PRINTK("Link state %lu is %d.\n", my_id,
+ node_array[my_id].current_message);
+ }
+
+ toi_send_if(MSG_BYE, my_id);
+ atomic_dec(&num_cluster_threads);
+ wake_up(&clusterd_events);
+
+ PRINTK("kTOICluster daemon %lu exiting.\n", my_id);
+ __set_current_state(TASK_RUNNING);
+ return 0;
+}
+
+static void kill_clusterd(void)
+{
+ int i;
+
+ for (i = 0; i < num_local_nodes; i++) {
+ if (node_array[i].current_message) {
+ PRINTK("Seeking to kill clusterd %d.\n", i);
+ node_array[i].current_message = 0;
+ }
+ }
+ wait_event(clusterd_events,
+ !atomic_read(&num_cluster_threads));
+ PRINTK("All cluster daemons have exited.\n");
+}
+
+static int peers_not_in_message(int index, int message, int precise)
+{
+ struct cluster_member *this;
+ unsigned long flags;
+ int result = 0;
+
+ spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+ list_for_each_entry(this, &node_array[index].member_list, list) {
+ if (this->ignore)
+ continue;
+
+ PRINTK("Peer %d.%d.%d.%d sending %s. "
+ "Seeking %s.\n",
+ NIPQUAD(this->addr),
+ str_message(this->message), str_message(message));
+ if ((precise ? this->message :
+ this->message & MSG_STATE_MASK) !=
+ message)
+ result++;
+ }
+ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+ PRINTK("%d peers in sought message.\n", result);
+ return result;
+}
+
+static void reset_ignored(int index)
+{
+ struct cluster_member *this;
+ unsigned long flags;
+
+ spin_lock_irqsave(&node_array[index].member_list_lock, flags);
+ list_for_each_entry(this, &node_array[index].member_list, list)
+ this->ignore = 0;
+ node_array[index].ignored_peer_count = 0;
+ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags);
+}
+
+static int peers_in_message(int index, int message, int precise)
+{
+ return node_array[index].peer_count -
+ node_array[index].ignored_peer_count -
+ peers_not_in_message(index, message, precise);
+}
+
+static int time_to_continue(int index, unsigned long start, int message)
+{
+ int first = peers_not_in_message(index, message, 0);
+ int second = peers_in_message(index, message, 1);
+
+ PRINTK("First part returns %d, second returns %d.\n", first, second);
+
+ if (!first && !second) {
+ PRINTK("All peers answered message %d.\n",
+ message);
+ return 1;
+ }
+
+ if (time_after(jiffies, start + continue_delay)) {
+ PRINTK("Timeout reached.\n");
+ return 1;
+ }
+
+ PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies,
+ start + continue_delay);
+ return 0;
+}
+
+void toi_initiate_cluster_hibernate(void)
+{
+ int result;
+ unsigned long start;
+
+ result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
+ if (result)
+ return;
+
+ toi_send_if(MSG_HIBERNATE, 0);
+
+ start = jiffies;
+ wait_event(node_array[0].member_events,
+ time_to_continue(0, start, MSG_HIBERNATE));
+
+ if (test_action_state(TOI_FREEZER_TEST)) {
+ toi_send_if(MSG_ABORT, 0);
+
+ start = jiffies;
+ wait_event(node_array[0].member_events,
+ time_to_continue(0, start, MSG_RUNNING));
+
+ do_toi_step(STEP_QUIET_CLEANUP);
+ return;
+ }
+
+ toi_send_if(MSG_IO, 0);
+
+ result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
+ if (result)
+ return;
+
+ /* This code runs at resume time too! */
+ if (toi_in_hibernate)
+ result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
+}
+
+/* toi_cluster_print_debug_stats
+ *
+ * Description: Print information to be recorded for debugging purposes into a
+ * buffer.
+ * Arguments: buffer: Pointer to a buffer into which the debug info will be
+ * printed.
+ * size: Size of the buffer.
+ * Returns: Number of characters written to the buffer.
+ */
+static int toi_cluster_print_debug_stats(char *buffer, int size)
+{
+ int len;
+
+ if (strlen(toi_cluster_iface))
+ len = scnprintf(buffer, size,
+ "- Cluster interface is '%s'.\n",
+ toi_cluster_iface);
+ else
+ len = scnprintf(buffer, size,
+ "- Cluster support is disabled.\n");
+ return len;
+}
+
+/* cluster_memory_needed
+ *
+ * Description: Tell the caller how much memory we need to operate during
+ * hibernate/resume.
+ * Returns: Unsigned long. Maximum number of bytes of memory required for
+ * operation.
+ */
+static int toi_cluster_memory_needed(void)
+{
+ return 0;
+}
+
+static int toi_cluster_storage_needed(void)
+{
+ return 1 + strlen(toi_cluster_iface);
+}
+
+/* toi_cluster_save_config_info
+ *
+ * Description: Save informaton needed when reloading the image at resume time.
+ * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE.
+ * Returns: Number of bytes used for saving our data.
+ */
+static int toi_cluster_save_config_info(char *buffer)
+{
+ strcpy(buffer, toi_cluster_iface);
+ return strlen(toi_cluster_iface + 1);
+}
+
+/* toi_cluster_load_config_info
+ *
+ * Description: Reload information needed for declustering the image at
+ * resume time.
+ * Arguments: Buffer: Pointer to the start of the data.
+ * Size: Number of bytes that were saved.
+ */
+static void toi_cluster_load_config_info(char *buffer, int size)
+{
+ strncpy(toi_cluster_iface, buffer, size);
+ return;
+}
+
+static void cluster_startup(void)
+{
+ int have_image = do_check_can_resume(), i;
+ unsigned long start = jiffies, initial_message;
+ struct task_struct *p;
+
+ initial_message = MSG_IMAGE;
+
+ have_image = 1;
+
+ for (i = 0; i < num_local_nodes; i++) {
+ PRINTK("Starting ktoiclusterd %d.\n", i);
+ p = kthread_create(kTOICluster, (void *) initial_message,
+ "ktoiclusterd/%d", i);
+ if (IS_ERR(p)) {
+ printk(KERN_ERR "Failed to start ktoiclusterd.\n");
+ return;
+ }
+
+ wake_up_process(p);
+ }
+
+ /* Wait for delay or someone else sending first message */
+ wait_event(node_array[0].member_events, time_to_continue(0, start,
+ MSG_IMAGE));
+
+ others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
+
+ printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
+ " %d.\n", have_image ? "" : "don't ", others_have_image);
+
+ if (have_image) {
+ int result;
+
+ /* Start to resume */
+ printk(KERN_INFO " === Starting to resume === \n");
+ node_array[0].current_message = MSG_IO;
+ toi_send_if(MSG_IO, 0);
+
+ /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
+ result = 0;
+
+ if (!result) {
+ /*
+ * Atomic restore - we'll come back in the hibernation
+ * path.
+ */
+
+ /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
+ result = 0;
+
+ /* do_toi_step(STEP_QUIET_CLEANUP); */
+ }
+
+ node_array[0].current_message |= MSG_NACK;
+
+ /* For debugging - disable for real life? */
+ wait_event(node_array[0].member_events,
+ time_to_continue(0, start, MSG_IO));
+ }
+
+ if (others_have_image) {
+ /* Wait for them to resume */
+ printk(KERN_INFO "Waiting for other nodes to resume.\n");
+ start = jiffies;
+ wait_event(node_array[0].member_events,
+ time_to_continue(0, start, MSG_RUNNING));
+ if (peers_not_in_message(0, MSG_RUNNING, 0))
+ printk(KERN_INFO "Timed out while waiting for other "
+ "nodes to resume.\n");
+ }
+
+ /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
+ * as appropriate.
+ *
+ * If we don't have an image:
+ * - Wait until someone else says they have one, or conditions are met
+ * for continuing to boot (n machines or t seconds).
+ * - If anyone has an image, wait for them to resume before continuing
+ * to boot.
+ *
+ * If we have an image:
+ * - Wait until conditions are met before continuing to resume (n
+ * machines or t seconds). Send RESUME_PREP and freeze processes.
+ * NACK_PREP if freezing fails (shouldn't) and follow logic for
+ * us having no image above. On success, wait for [N]ACK_PREP from
+ * other machines. Read image (including atomic restore) until done.
+ * Wait for ACK_READ from others (should never fail). Thaw processes
+ * and do post-resume. (The section after the atomic restore is done
+ * via the code for hibernating).
+ */
+
+ node_array[0].current_message = MSG_RUNNING;
+}
+
+/* toi_cluster_open_iface
+ *
+ * Description: Prepare to use an interface.
+ */
+
+static int toi_cluster_open_iface(void)
+{
+ struct net_device *dev;
+
+ rtnl_lock();
+
+ for_each_netdev(&init_net, dev) {
+ if (/* dev == &init_net.loopback_dev || */
+ strcmp(dev->name, toi_cluster_iface))
+ continue;
+
+ net_dev = dev;
+ break;
+ }
+
+ rtnl_unlock();
+
+ if (!net_dev) {
+ printk(KERN_ERR MYNAME ": Device %s not found.\n",
+ toi_cluster_iface);
+ return -ENODEV;
+ }
+
+ dev_add_pack(&toi_cluster_packet_type);
+ added_pack = 1;
+
+ loopback_mode = (net_dev == init_net.loopback_dev);
+ num_local_nodes = loopback_mode ? 8 : 1;
+
+ PRINTK("Loopback mode is %s. Number of local nodes is %d.\n",
+ loopback_mode ? "on" : "off", num_local_nodes);
+
+ cluster_startup();
+ return 0;
+}
+
+/* toi_cluster_close_iface
+ *
+ * Description: Stop using an interface.
+ */
+
+static int toi_cluster_close_iface(void)
+{
+ kill_clusterd();
+ if (added_pack) {
+ dev_remove_pack(&toi_cluster_packet_type);
+ added_pack = 0;
+ }
+ return 0;
+}
+
+static void write_side_effect(void)
+{
+ if (toi_cluster_ops.enabled) {
+ toi_cluster_open_iface();
+ set_toi_state(TOI_CLUSTER_MODE);
+ } else {
+ toi_cluster_close_iface();
+ clear_toi_state(TOI_CLUSTER_MODE);
+ }
+}
+
+static void node_write_side_effect(void)
+{
+}
+
+/*
+ * data for our sysfs entries.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0,
+ NULL),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0,
+ write_side_effect),
+ SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL),
+ SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script,
+ 256, 0, NULL),
+ SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script,
+ 256, 0, STRING),
+ SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ,
+ 0)
+};
+
+/*
+ * Ops structure.
+ */
+
+static struct toi_module_ops toi_cluster_ops = {
+ .type = FILTER_MODULE,
+ .name = "Cluster",
+ .directory = "cluster",
+ .module = THIS_MODULE,
+ .memory_needed = toi_cluster_memory_needed,
+ .print_debug_info = toi_cluster_print_debug_stats,
+ .save_config_info = toi_cluster_save_config_info,
+ .load_config_info = toi_cluster_load_config_info,
+ .storage_needed = toi_cluster_storage_needed,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+#ifdef MODULE
+#define INIT static __init
+#define EXIT static __exit
+#else
+#define INIT
+#define EXIT
+#endif
+
+INIT int toi_cluster_init(void)
+{
+ int temp = toi_register_module(&toi_cluster_ops), i;
+ struct kobject *kobj = toi_cluster_ops.dir_kobj;
+
+ for (i = 0; i < MAX_LOCAL_NODES; i++) {
+ node_array[i].current_message = 0;
+ INIT_LIST_HEAD(&node_array[i].member_list);
+ init_waitqueue_head(&node_array[i].member_events);
+ spin_lock_init(&node_array[i].member_list_lock);
+ spin_lock_init(&node_array[i].receive_lock);
+
+ /* Set up sysfs entry */
+ node_array[i].sysfs_data.attr.name = toi_kzalloc(8,
+ sizeof(node_array[i].sysfs_data.attr.name),
+ GFP_KERNEL);
+ sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d",
+ i);
+ node_array[i].sysfs_data.attr.mode = SYSFS_RW;
+ node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER;
+ node_array[i].sysfs_data.flags = 0;
+ node_array[i].sysfs_data.data.integer.variable =
+ (int *) &node_array[i].current_message;
+ node_array[i].sysfs_data.data.integer.minimum = 0;
+ node_array[i].sysfs_data.data.integer.maximum = INT_MAX;
+ node_array[i].sysfs_data.write_side_effect =
+ node_write_side_effect;
+ toi_register_sysfs_file(kobj, &node_array[i].sysfs_data);
+ }
+
+ toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0);
+
+ if (toi_cluster_ops.enabled)
+ toi_cluster_open_iface();
+
+ return temp;
+}
+
+EXIT void toi_cluster_exit(void)
+{
+ int i;
+ toi_cluster_close_iface();
+
+ for (i = 0; i < MAX_LOCAL_NODES; i++)
+ toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj,
+ &node_array[i].sysfs_data);
+ toi_unregister_module(&toi_cluster_ops);
+}
+
+static int __init toi_cluster_iface_setup(char *iface)
+{
+ toi_cluster_ops.enabled = (*iface &&
+ strcmp(iface, "off"));
+
+ if (toi_cluster_ops.enabled)
+ strncpy(toi_cluster_iface, iface, strlen(iface));
+}
+
+__setup("toi_cluster=", toi_cluster_iface_setup);
diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h
new file mode 100644
index 000000000..84356b304
--- /dev/null
+++ b/kernel/power/tuxonice_cluster.h
@@ -0,0 +1,18 @@
+/*
+ * kernel/power/tuxonice_cluster.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifdef CONFIG_TOI_CLUSTER
+extern int toi_cluster_init(void);
+extern void toi_cluster_exit(void);
+extern void toi_initiate_cluster_hibernate(void);
+#else
+static inline int toi_cluster_init(void) { return 0; }
+static inline void toi_cluster_exit(void) { }
+static inline void toi_initiate_cluster_hibernate(void) { }
+#endif
+
diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
new file mode 100644
index 000000000..d118568b7
--- /dev/null
+++ b/kernel/power/tuxonice_compress.c
@@ -0,0 +1,452 @@
+/*
+ * kernel/power/compression.c
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data compression routines for TuxOnIce,
+ * using cryptoapi.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+
+#include "tuxonice_builtin.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+
+static int toi_expected_compression;
+
+static struct toi_module_ops toi_compression_ops;
+static struct toi_module_ops *next_driver;
+
+static char toi_compressor_name[32] = "lzo";
+
+static DEFINE_MUTEX(stats_lock);
+
+struct cpu_context {
+ u8 *page_buffer;
+ struct crypto_comp *transform;
+ unsigned int len;
+ u8 *buffer_start;
+ u8 *output_buffer;
+};
+
+#define OUT_BUF_SIZE (2 * PAGE_SIZE)
+
+static DEFINE_PER_CPU(struct cpu_context, contexts);
+
+/*
+ * toi_crypto_prepare
+ *
+ * Prepare to do some work by allocating buffers and transforms.
+ */
+static int toi_compress_crypto_prepare(void)
+{
+ int cpu;
+
+ if (!*toi_compressor_name) {
+ printk(KERN_INFO "TuxOnIce: Compression enabled but no "
+ "compressor name set.\n");
+ return 1;
+ }
+
+ for_each_online_cpu(cpu) {
+ struct cpu_context *this = &per_cpu(contexts, cpu);
+ this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0);
+ if (IS_ERR(this->transform)) {
+ printk(KERN_INFO "TuxOnIce: Failed to initialise the "
+ "%s compression transform.\n",
+ toi_compressor_name);
+ this->transform = NULL;
+ return 1;
+ }
+
+ this->page_buffer =
+ (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
+
+ if (!this->page_buffer) {
+ printk(KERN_ERR
+ "Failed to allocate a page buffer for TuxOnIce "
+ "compression driver.\n");
+ return -ENOMEM;
+ }
+
+ this->output_buffer =
+ (char *) vmalloc_32(OUT_BUF_SIZE);
+
+ if (!this->output_buffer) {
+ printk(KERN_ERR
+ "Failed to allocate a output buffer for TuxOnIce "
+ "compression driver.\n");
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static int toi_compress_rw_cleanup(int writing)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ struct cpu_context *this = &per_cpu(contexts, cpu);
+ if (this->transform) {
+ crypto_free_comp(this->transform);
+ this->transform = NULL;
+ }
+
+ if (this->page_buffer)
+ toi_free_page(16, (unsigned long) this->page_buffer);
+
+ this->page_buffer = NULL;
+
+ if (this->output_buffer)
+ vfree(this->output_buffer);
+
+ this->output_buffer = NULL;
+ }
+
+ return 0;
+}
+
+/*
+ * toi_compress_init
+ */
+
+static int toi_compress_init(int toi_or_resume)
+{
+ if (!toi_or_resume)
+ return 0;
+
+ toi_compress_bytes_in = 0;
+ toi_compress_bytes_out = 0;
+
+ next_driver = toi_get_next_filter(&toi_compression_ops);
+
+ return next_driver ? 0 : -ECHILD;
+}
+
+/*
+ * toi_compress_rw_init()
+ */
+
+static int toi_compress_rw_init(int rw, int stream_number)
+{
+ if (toi_compress_crypto_prepare()) {
+ printk(KERN_ERR "Failed to initialise compression "
+ "algorithm.\n");
+ if (rw == READ) {
+ printk(KERN_INFO "Unable to read the image.\n");
+ return -ENODEV;
+ } else {
+ printk(KERN_INFO "Continuing without "
+ "compressing the image.\n");
+ toi_compression_ops.enabled = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * toi_compress_write_page()
+ *
+ * Compress a page of data, buffering output and passing on filled
+ * pages to the next module in the pipeline.
+ *
+ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
+ * data to be compressed.
+ *
+ * Returns: 0 on success. Otherwise the error is that returned by later
+ * modules, -ECHILD if we have a broken pipeline or -EIO if
+ * zlib errs.
+ */
+static int toi_compress_write_page(unsigned long index, int buf_type,
+ void *buffer_page, unsigned int buf_size)
+{
+ int ret = 0, cpu = smp_processor_id();
+ struct cpu_context *ctx = &per_cpu(contexts, cpu);
+ u8* output_buffer = buffer_page;
+ int output_len = buf_size;
+ int out_buf_type = buf_type;
+
+ if (ctx->transform) {
+
+ ctx->buffer_start = TOI_MAP(buf_type, buffer_page);
+ ctx->len = OUT_BUF_SIZE;
+
+ ret = crypto_comp_compress(ctx->transform,
+ ctx->buffer_start, buf_size,
+ ctx->output_buffer, &ctx->len);
+
+ TOI_UNMAP(buf_type, buffer_page);
+
+ toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
+ "CPU %d, index %lu: %d bytes",
+ cpu, index, ctx->len);
+
+ if (!ret && ctx->len < buf_size) { /* some compression */
+ output_buffer = ctx->output_buffer;
+ output_len = ctx->len;
+ out_buf_type = TOI_VIRT;
+ }
+
+ }
+
+ mutex_lock(&stats_lock);
+
+ toi_compress_bytes_in += buf_size;
+ toi_compress_bytes_out += output_len;
+
+ mutex_unlock(&stats_lock);
+
+ if (!ret)
+ ret = next_driver->write_page(index, out_buf_type,
+ output_buffer, output_len);
+
+ return ret;
+}
+
+/*
+ * toi_compress_read_page()
+ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Retrieve data from later modules and decompress it until the input buffer
+ * is filled.
+ * Zero if successful. Error condition from me or from downstream on failure.
+ */
+static int toi_compress_read_page(unsigned long *index, int buf_type,
+ void *buffer_page, unsigned int *buf_size)
+{
+ int ret, cpu = smp_processor_id();
+ unsigned int len;
+ unsigned int outlen = PAGE_SIZE;
+ char *buffer_start;
+ struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+ if (!ctx->transform)
+ return next_driver->read_page(index, TOI_PAGE, buffer_page,
+ buf_size);
+
+ /*
+ * All our reads must be synchronous - we can't decompress
+ * data that hasn't been read yet.
+ */
+
+ ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len);
+
+ buffer_start = kmap(buffer_page);
+
+ /* Error or uncompressed data */
+ if (ret || len == PAGE_SIZE) {
+ memcpy(buffer_start, ctx->page_buffer, len);
+ goto out;
+ }
+
+ ret = crypto_comp_decompress(
+ ctx->transform,
+ ctx->page_buffer,
+ len, buffer_start, &outlen);
+
+ toi_message(TOI_COMPRESS, TOI_VERBOSE, 0,
+ "CPU %d, index %lu: %d=>%d (%d).",
+ cpu, *index, len, outlen, ret);
+
+ if (ret)
+ abort_hibernate(TOI_FAILED_IO,
+ "Compress_read returned %d.\n", ret);
+ else if (outlen != PAGE_SIZE) {
+ abort_hibernate(TOI_FAILED_IO,
+ "Decompression yielded %d bytes instead of %ld.\n",
+ outlen, PAGE_SIZE);
+ printk(KERN_ERR "Decompression yielded %d bytes instead of "
+ "%ld.\n", outlen, PAGE_SIZE);
+ ret = -EIO;
+ *buf_size = outlen;
+ }
+out:
+ TOI_UNMAP(buf_type, buffer_page);
+ return ret;
+}
+
+/*
+ * toi_compress_print_debug_stats
+ * @buffer: Pointer to a buffer into which the debug info will be printed.
+ * @size: Size of the buffer.
+ *
+ * Print information to be recorded for debugging purposes into a buffer.
+ * Returns: Number of characters written to the buffer.
+ */
+
+static int toi_compress_print_debug_stats(char *buffer, int size)
+{
+ unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT,
+ pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
+ int len;
+
+ /* Output the compression ratio achieved. */
+ if (*toi_compressor_name)
+ len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
+ toi_compressor_name);
+ else
+ len = scnprintf(buffer, size, "- Compressor is not set.\n");
+
+ if (pages_in)
+ len += scnprintf(buffer+len, size - len, " Compressed "
+ "%lu bytes into %lu (%ld percent compression).\n",
+ toi_compress_bytes_in,
+ toi_compress_bytes_out,
+ (pages_in - pages_out) * 100 / pages_in);
+ return len;
+}
+
+/*
+ * toi_compress_compression_memory_needed
+ *
+ * Tell the caller how much memory we need to operate during hibernate/resume.
+ * Returns: Unsigned long. Maximum number of bytes of memory required for
+ * operation.
+ */
+static int toi_compress_memory_needed(void)
+{
+ return 2 * PAGE_SIZE;
+}
+
+static int toi_compress_storage_needed(void)
+{
+ return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
+ strlen(toi_compressor_name) + 1;
+}
+
+/*
+ * toi_compress_save_config_info
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save informaton needed when reloading the image at resume time.
+ * Returns: Number of bytes used for saving our data.
+ */
+static int toi_compress_save_config_info(char *buffer)
+{
+ int len = strlen(toi_compressor_name) + 1, offset = 0;
+
+ *((unsigned long *) buffer) = toi_compress_bytes_in;
+ offset += sizeof(unsigned long);
+ *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out;
+ offset += sizeof(unsigned long);
+ *((int *) (buffer + offset)) = toi_expected_compression;
+ offset += sizeof(int);
+ *((int *) (buffer + offset)) = len;
+ offset += sizeof(int);
+ strncpy(buffer + offset, toi_compressor_name, len);
+ return offset + len;
+}
+
+/* toi_compress_load_config_info
+ * @buffer: Pointer to the start of the data.
+ * @size: Number of bytes that were saved.
+ *
+ * Description: Reload information needed for decompressing the image at
+ * resume time.
+ */
+static void toi_compress_load_config_info(char *buffer, int size)
+{
+ int len, offset = 0;
+
+ toi_compress_bytes_in = *((unsigned long *) buffer);
+ offset += sizeof(unsigned long);
+ toi_compress_bytes_out = *((unsigned long *) (buffer + offset));
+ offset += sizeof(unsigned long);
+ toi_expected_compression = *((int *) (buffer + offset));
+ offset += sizeof(int);
+ len = *((int *) (buffer + offset));
+ offset += sizeof(int);
+ strncpy(toi_compressor_name, buffer + offset, len);
+}
+
+static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+ bkd->compress_bytes_in = toi_compress_bytes_in;
+ bkd->compress_bytes_out = toi_compress_bytes_out;
+}
+
+static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+ toi_compress_bytes_in = bkd->compress_bytes_in;
+ toi_compress_bytes_out = bkd->compress_bytes_out;
+}
+
+/*
+ * toi_expected_compression_ratio
+ *
+ * Description: Returns the expected ratio between data passed into this module
+ * and the amount of data output when writing.
+ * Returns: 100 if the module is disabled. Otherwise the value set by the
+ * user via our sysfs entry.
+ */
+
+static int toi_compress_expected_ratio(void)
+{
+ if (!toi_compression_ops.enabled)
+ return 100;
+ else
+ return 100 - toi_expected_compression;
+}
+
+/*
+ * data for our sysfs entries.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression,
+ 0, 99, 0, NULL),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0,
+ NULL),
+ SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL),
+};
+
+/*
+ * Ops structure.
+ */
+static struct toi_module_ops toi_compression_ops = {
+ .type = FILTER_MODULE,
+ .name = "compression",
+ .directory = "compression",
+ .module = THIS_MODULE,
+ .initialise = toi_compress_init,
+ .memory_needed = toi_compress_memory_needed,
+ .print_debug_info = toi_compress_print_debug_stats,
+ .save_config_info = toi_compress_save_config_info,
+ .load_config_info = toi_compress_load_config_info,
+ .storage_needed = toi_compress_storage_needed,
+ .expected_compression = toi_compress_expected_ratio,
+
+ .pre_atomic_restore = toi_compress_pre_atomic_restore,
+ .post_atomic_restore = toi_compress_post_atomic_restore,
+
+ .rw_init = toi_compress_rw_init,
+ .rw_cleanup = toi_compress_rw_cleanup,
+
+ .write_page = toi_compress_write_page,
+ .read_page = toi_compress_read_page,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+static __init int toi_compress_load(void)
+{
+ return toi_register_module(&toi_compression_ops);
+}
+
+late_initcall(toi_compress_load);
diff --git a/kernel/power/tuxonice_copy_before_write.c b/kernel/power/tuxonice_copy_before_write.c
new file mode 100644
index 000000000..dc02a4acf
--- /dev/null
+++ b/kernel/power/tuxonice_copy_before_write.c
@@ -0,0 +1,240 @@
+/*
+ * kernel/power/tuxonice_copy_before_write.c
+ *
+ * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines (apart from the fault handling code) to deal with allocating memory
+ * for copying pages before they are modified, restoring the contents and getting
+ * the contents written to disk.
+ */
+
+#include <linux/percpu-defs.h>
+#include <linux/sched.h>
+#include <linux/tuxonice.h>
+#include "tuxonice_alloc.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice.h"
+
+DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states);
+#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw))
+#define toi_cbw_pool_size 100
+
+static void _toi_free_cbw_data(struct toi_cbw_state *state)
+{
+ struct toi_cbw *page_ptr, *ptr, *next;
+
+ page_ptr = ptr = state->first;
+
+ while(ptr) {
+ next = ptr->next;
+
+ if (ptr->virt) {
+ toi__free_page(40, virt_to_page(ptr->virt));
+ }
+ if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) {
+ /* Must be on a new page - free the previous one. */
+ toi__free_page(40, virt_to_page(page_ptr));
+ page_ptr = ptr;
+ }
+ ptr = next;
+ }
+
+ if (page_ptr) {
+ toi__free_page(40, virt_to_page(page_ptr));
+ }
+
+ state->first = state->next = state->last = NULL;
+ state->size = 0;
+}
+
+void toi_free_cbw_data(void)
+{
+ int i;
+
+ for_each_online_cpu(i) {
+ struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
+
+ if (!state->first)
+ continue;
+
+ state->enabled = 0;
+
+ while (state->active) {
+ schedule();
+ }
+
+ _toi_free_cbw_data(state);
+ }
+}
+
+static int _toi_allocate_cbw_data(struct toi_cbw_state *state)
+{
+ while(state->size < toi_cbw_pool_size) {
+ int i;
+ struct toi_cbw *ptr;
+
+ ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL);
+
+ if (!ptr) {
+ return -ENOMEM;
+ }
+
+ if (!state->first) {
+ state->first = state->next = state->last = ptr;
+ }
+
+ for (i = 0; i < CBWS_PER_PAGE; i++) {
+ struct toi_cbw *cbw = &ptr[i];
+
+ cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL);
+ if (!cbw->virt) {
+ state->size += i;
+ printk("Out of memory allocating CBW pages.\n");
+ return -ENOMEM;
+ }
+
+ if (cbw == state->first)
+ continue;
+
+ state->last->next = cbw;
+ state->last = cbw;
+ }
+
+ state->size += CBWS_PER_PAGE;
+ }
+
+ state->enabled = 1;
+
+ return 0;
+}
+
+
+int toi_allocate_cbw_data(void)
+{
+ int i, result;
+
+ for_each_online_cpu(i) {
+ struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
+
+ result = _toi_allocate_cbw_data(state);
+
+ if (result)
+ return result;
+ }
+
+ return 0;
+}
+
+void toi_cbw_restore(void)
+{
+ if (!toi_keeping_image)
+ return;
+
+}
+
+void toi_cbw_write(void)
+{
+ if (!toi_keeping_image)
+ return;
+
+}
+
+/**
+ * toi_cbw_test_read - Test copy before write on one page
+ *
+ * Allocate copy before write buffers, then make one page only copy-before-write
+ * and attempt to write to it. We should then be able to retrieve the original
+ * version from the cbw buffer and the modified version from the page itself.
+ */
+static int toi_cbw_test_read(const char *buffer, int count)
+{
+ unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL);
+ char *original = "Original contents";
+ char *modified = "Modified material";
+ struct page *page = virt_to_page(virt);
+ int i, len = 0, found = 0, pfn = page_to_pfn(page);
+
+ if (!page) {
+ printk("toi_cbw_test_read: Unable to allocate a page for testing.\n");
+ return -ENOMEM;
+ }
+
+ memcpy((char *) virt, original, strlen(original));
+
+ if (toi_allocate_cbw_data()) {
+ printk("toi_cbw_test_read: Unable to allocate cbw data.\n");
+ return -ENOMEM;
+ }
+
+ toi_reset_dirtiness_one(pfn, 0);
+
+ SetPageTOI_CBW(page);
+
+ memcpy((char *) virt, modified, strlen(modified));
+
+ if (strncmp((char *) virt, modified, strlen(modified))) {
+ len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n");
+ }
+
+ for_each_online_cpu(i) {
+ struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i);
+ struct toi_cbw *ptr = state->first, *last_ptr = ptr;
+
+ if (!found) {
+ while (ptr) {
+ if (ptr->pfn == pfn) {
+ found = 1;
+ if (strncmp(ptr->virt, original, strlen(original))) {
+ len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n");
+ } else {
+ len += sprintf((char *) buffer + len, "Test passed. Buffer changed and original contents preserved.\n");
+ }
+ break;
+ }
+
+ last_ptr = ptr;
+ ptr = ptr->next;
+ }
+ }
+
+ if (!last_ptr)
+ len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i);
+ }
+
+ if (!found)
+ len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n");
+
+ toi_free_cbw_data();
+
+ return len;
+}
+
+/*
+ * This array contains entries that are automatically registered at
+ * boot. Modules and the console code register their own entries separately.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read,
+ NULL, SYSFS_NEEDS_SM_FOR_READ, NULL),
+};
+
+static struct toi_module_ops toi_cbw_ops = {
+ .type = MISC_HIDDEN_MODULE,
+ .name = "copy_before_write debugging",
+ .directory = "cbw",
+ .module = THIS_MODULE,
+ .early = 1,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+int toi_cbw_init(void)
+{
+ int result = toi_register_module(&toi_cbw_ops);
+ return result;
+}
diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c
new file mode 100644
index 000000000..3b558b220
--- /dev/null
+++ b/kernel/power/tuxonice_extent.c
@@ -0,0 +1,144 @@
+/*
+ * kernel/power/tuxonice_extent.c
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * These functions encapsulate the manipulation of storage metadata.
+ */
+
+#include <linux/suspend.h>
+#include "tuxonice_modules.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_ui.h"
+#include "tuxonice.h"
+
+/**
+ * toi_get_extent - return a free extent
+ *
+ * May fail, returning NULL instead.
+ **/
+static struct hibernate_extent *toi_get_extent(void)
+{
+ return (struct hibernate_extent *) toi_kzalloc(2,
+ sizeof(struct hibernate_extent), TOI_ATOMIC_GFP);
+}
+
+/**
+ * toi_put_extent_chain - free a chain of extents starting from value 'from'
+ * @chain: Chain to free.
+ *
+ * Note that 'from' is an extent value, and may be part way through an extent.
+ * In this case, the extent should be truncated (if necessary) and following
+ * extents freed.
+ **/
+void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from)
+{
+ struct hibernate_extent *this;
+
+ this = chain->first;
+
+ while (this) {
+ struct hibernate_extent *next = this->next;
+
+ // Delete the whole extent?
+ if (this->start >= from) {
+ chain->size -= (this->end - this->start + 1);
+ if (chain->first == this)
+ chain->first = next;
+ if (chain->last_touched == this)
+ chain->last_touched = NULL;
+ if (chain->current_extent == this)
+ chain->current_extent = NULL;
+ toi_kfree(2, this, sizeof(*this));
+ chain->num_extents--;
+ } else if (this->end >= from) {
+ // Delete part of the extent
+ chain->size -= (this->end - from + 1);
+ this->start = from;
+ }
+ this = next;
+ }
+}
+
+/**
+ * toi_put_extent_chain - free a whole chain of extents
+ * @chain: Chain to free.
+ **/
+void toi_put_extent_chain(struct hibernate_extent_chain *chain)
+{
+ toi_put_extent_chain_from(chain, 0);
+}
+
+/**
+ * toi_add_to_extent_chain - add an extent to an existing chain
+ * @chain: Chain to which the extend should be added
+ * @start: Start of the extent (first physical block)
+ * @end: End of the extent (last physical block)
+ *
+ * The chain information is updated if the insertion is successful.
+ **/
+int toi_add_to_extent_chain(struct hibernate_extent_chain *chain,
+ unsigned long start, unsigned long end)
+{
+ struct hibernate_extent *new_ext = NULL, *cur_ext = NULL;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0,
+ "Adding extent %lu-%lu to chain %p.\n", start, end, chain);
+
+ /* Find the right place in the chain */
+ if (chain->last_touched && chain->last_touched->start < start)
+ cur_ext = chain->last_touched;
+ else if (chain->first && chain->first->start < start)
+ cur_ext = chain->first;
+
+ if (cur_ext) {
+ while (cur_ext->next && cur_ext->next->start < start)
+ cur_ext = cur_ext->next;
+
+ if (cur_ext->end == (start - 1)) {
+ struct hibernate_extent *next_ext = cur_ext->next;
+ cur_ext->end = end;
+
+ /* Merge with the following one? */
+ if (next_ext && cur_ext->end + 1 == next_ext->start) {
+ cur_ext->end = next_ext->end;
+ cur_ext->next = next_ext->next;
+ toi_kfree(2, next_ext, sizeof(*next_ext));
+ chain->num_extents--;
+ }
+
+ chain->last_touched = cur_ext;
+ chain->size += (end - start + 1);
+
+ return 0;
+ }
+ }
+
+ new_ext = toi_get_extent();
+ if (!new_ext) {
+ printk(KERN_INFO "Error unable to append a new extent to the "
+ "chain.\n");
+ return -ENOMEM;
+ }
+
+ chain->num_extents++;
+ chain->size += (end - start + 1);
+ new_ext->start = start;
+ new_ext->end = end;
+
+ chain->last_touched = new_ext;
+
+ if (cur_ext) {
+ new_ext->next = cur_ext->next;
+ cur_ext->next = new_ext;
+ } else {
+ if (chain->first)
+ new_ext->next = chain->first;
+ chain->first = new_ext;
+ }
+
+ return 0;
+}
diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h
new file mode 100644
index 000000000..cf1289efc
--- /dev/null
+++ b/kernel/power/tuxonice_extent.h
@@ -0,0 +1,45 @@
+/*
+ * kernel/power/tuxonice_extent.h
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations related to extents. Extents are
+ * TuxOnIce's method of storing some of the metadata for the image.
+ * See tuxonice_extent.c for more info.
+ *
+ */
+
+#include "tuxonice_modules.h"
+
+#ifndef EXTENT_H
+#define EXTENT_H
+
+struct hibernate_extent {
+ unsigned long start, end;
+ struct hibernate_extent *next;
+};
+
+struct hibernate_extent_chain {
+ unsigned long size; /* size of the chain ie sum (max-min+1) */
+ int num_extents;
+ struct hibernate_extent *first, *last_touched;
+ struct hibernate_extent *current_extent;
+ unsigned long current_offset;
+};
+
+/* Simplify iterating through all the values in an extent chain */
+#define toi_extent_for_each(extent_chain, extentpointer, value) \
+if ((extent_chain)->first) \
+ for ((extentpointer) = (extent_chain)->first, (value) = \
+ (extentpointer)->start; \
+ ((extentpointer) && ((extentpointer)->next || (value) <= \
+ (extentpointer)->end)); \
+ (((value) == (extentpointer)->end) ? \
+ ((extentpointer) = (extentpointer)->next, (value) = \
+ ((extentpointer) ? (extentpointer)->start : 0)) : \
+ (value)++))
+
+extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from);
+#endif
diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
new file mode 100644
index 000000000..607246051
--- /dev/null
+++ b/kernel/power/tuxonice_file.c
@@ -0,0 +1,484 @@
+/*
+ * kernel/power/tuxonice_file.c
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file encapsulates functions for usage of a simple file as a
+ * backing store. It is based upon the swapallocator, and shares the
+ * same basic working. Here, though, we have nothing to do with
+ * swapspace, and only one device to worry about.
+ *
+ * The user can just
+ *
+ * echo TuxOnIce > /path/to/my_file
+ *
+ * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
+ *
+ * and
+ *
+ * echo /path/to/my_file > /sys/power/tuxonice/file/target
+ *
+ * then put what they find in /sys/power/tuxonice/resume
+ * as their resume= parameter in lilo.conf (and rerun lilo if using it).
+ *
+ * Having done this, they're ready to hibernate and resume.
+ *
+ * TODO:
+ * - File resizing.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/fs_uuid.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_io.h"
+
+#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
+
+static struct toi_module_ops toi_fileops;
+
+static struct file *target_file;
+static struct block_device *toi_file_target_bdev;
+static unsigned long pages_available, pages_allocated;
+static char toi_file_target[256];
+static struct inode *target_inode;
+static int file_target_priority;
+static int used_devt;
+static int target_claim;
+static dev_t toi_file_dev_t;
+static int sig_page_index;
+
+/* For test_toi_file_target */
+static struct toi_bdev_info *file_chain;
+
+static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num)
+{
+ int j;
+ sector_t last = 0;
+
+ for (j = 0; j < dev_info->blocks_per_page; j++) {
+ sector_t this = bmap(target_inode,
+ page_num * dev_info->blocks_per_page + j);
+
+ if (!this || (last && (last + 1) != this))
+ break;
+
+ last = this;
+ }
+
+ return j == dev_info->blocks_per_page;
+}
+
+static unsigned long get_usable_pages(struct toi_bdev_info *dev_info)
+{
+ unsigned long result = 0;
+ struct block_device *bdev = dev_info->bdev;
+ int i;
+
+ switch (target_inode->i_mode & S_IFMT) {
+ case S_IFSOCK:
+ case S_IFCHR:
+ case S_IFIFO: /* Socket, Char, Fifo */
+ return -1;
+ case S_IFREG: /* Regular file: current size - holes + free
+ space on part */
+ for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) {
+ if (has_contiguous_blocks(dev_info, i))
+ result++;
+ }
+ break;
+ case S_IFBLK: /* Block device */
+ if (!bdev->bd_disk) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0,
+ "bdev->bd_disk null.");
+ return 0;
+ }
+
+ result = (bdev->bd_part ?
+ bdev->bd_part->nr_sects :
+ get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9);
+ }
+
+
+ return result;
+}
+
+static int toi_file_register_storage(void)
+{
+ struct toi_bdev_info *devinfo;
+ int result = 0;
+ struct fs_info *fs_info;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage.");
+ if (!strlen(toi_file_target)) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: "
+ "No target filename set.");
+ return 0;
+ }
+
+ target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0);
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.",
+ toi_file_target, target_file);
+
+ if (IS_ERR(target_file) || !target_file) {
+ target_file = NULL;
+ toi_file_dev_t = name_to_dev_t(toi_file_target);
+ if (!toi_file_dev_t) {
+ struct kstat stat;
+ int error = vfs_stat(toi_file_target, &stat);
+ printk(KERN_INFO "Open file %s returned %p and "
+ "name_to_devt failed.\n",
+ toi_file_target, target_file);
+ if (error) {
+ printk(KERN_INFO "Stating the file also failed."
+ " Nothing more we can do.\n");
+ return 0;
+ } else
+ toi_file_dev_t = stat.rdev;
+ }
+
+ toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t);
+ if (IS_ERR(toi_file_target_bdev)) {
+ printk(KERN_INFO "Got a dev_num (%lx) but failed to "
+ "open it.\n",
+ (unsigned long) toi_file_dev_t);
+ toi_file_target_bdev = NULL;
+ return 0;
+ }
+ used_devt = 1;
+ target_inode = toi_file_target_bdev->bd_inode;
+ } else
+ target_inode = target_file->f_mapping->host;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target.");
+ if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
+ S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
+ printk(KERN_INFO "File support works with regular files,"
+ " character files and block devices.\n");
+ /* Cleanup routine will undo the above */
+ return 0;
+ }
+
+ if (!used_devt) {
+ if (S_ISBLK(target_inode->i_mode)) {
+ toi_file_target_bdev = I_BDEV(target_inode);
+ if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE |
+ FMODE_READ, NULL))
+ target_claim = 1;
+ } else
+ toi_file_target_bdev = target_inode->i_sb->s_bdev;
+ if (!toi_file_target_bdev) {
+ printk(KERN_INFO "%s is not a valid file allocator "
+ "target.\n", toi_file_target);
+ return 0;
+ }
+ toi_file_dev_t = toi_file_target_bdev->bd_dev;
+ }
+
+ devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC);
+ if (!devinfo) {
+ printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n");
+ return -ENOMEM;
+ }
+
+ devinfo->bdev = toi_file_target_bdev;
+ devinfo->allocator = &toi_fileops;
+ devinfo->allocator_index = 0;
+
+ fs_info = fs_info_from_block_dev(toi_file_target_bdev);
+ if (fs_info && !IS_ERR(fs_info)) {
+ memcpy(devinfo->uuid, &fs_info->uuid, 16);
+ free_fs_info(fs_info);
+ } else
+ result = (int) PTR_ERR(fs_info);
+
+ /* Unlike swap code, only complain if fs_info_from_block_dev returned
+ * -ENOMEM. The 'file' might be a full partition, so might validly not
+ * have an identifiable type, UUID etc.
+ */
+ if (result)
+ printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n",
+ result);
+ devinfo->dev_t = toi_file_dev_t;
+ devinfo->prio = file_target_priority;
+ devinfo->bmap_shift = target_inode->i_blkbits - 9;
+ devinfo->blocks_per_page =
+ (1 << (PAGE_SHIFT - target_inode->i_blkbits));
+ sprintf(devinfo->name, "file %s", toi_file_target);
+ file_chain = devinfo;
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap "
+ "shift is %d. Blocks per page %d.",
+ devinfo->dev_t, devinfo->prio, devinfo->bmap_shift,
+ devinfo->blocks_per_page);
+
+ /* Keep one aside for the signature */
+ pages_available = get_usable_pages(devinfo) - 1;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu "
+ "pages.", pages_available);
+
+ toi_bio_ops.register_storage(devinfo);
+ return 0;
+}
+
+static unsigned long toi_file_storage_available(void)
+{
+ return pages_available;
+}
+
+static int toi_file_allocate_storage(struct toi_bdev_info *chain,
+ unsigned long request)
+{
+ unsigned long available = pages_available - pages_allocated;
+ unsigned long to_add = min(available, request);
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated "
+ "is %lu. Allocating %lu pages from file.",
+ pages_available, pages_allocated, to_add);
+ pages_allocated += to_add;
+
+ return to_add;
+}
+
+/**
+ * __populate_block_list - add an extent to the chain
+ * @min: Start of the extent (first physical block = sector)
+ * @max: End of the extent (last physical block = sector)
+ *
+ * If TOI_TEST_BIO is set, print a debug message, outputting the min and max
+ * fs block numbers.
+ **/
+static int __populate_block_list(struct toi_bdev_info *chain, int min, int max)
+{
+ if (test_action_state(TOI_TEST_BIO))
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.",
+ min << chain->bmap_shift,
+ ((max + 1) << chain->bmap_shift) - 1);
+
+ return toi_add_to_extent_chain(&chain->blocks, min, max);
+}
+
+static int get_main_pool_phys_params(struct toi_bdev_info *chain)
+{
+ int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0;
+ unsigned long pages_mapped = 0;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks.");
+
+ if (chain->blocks.first)
+ toi_put_extent_chain(&chain->blocks);
+
+ if (!target_is_normal_file()) {
+ result = (pages_available > 0) ?
+ __populate_block_list(chain, chain->blocks_per_page,
+ (pages_allocated + 1) *
+ chain->blocks_per_page - 1) : 0;
+ return result;
+ }
+
+ /*
+ * FIXME: We are assuming the first page is contiguous. Is that
+ * assumption always right?
+ */
+
+ for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
+ sector_t new_sector;
+
+ if (!has_contiguous_blocks(chain, i))
+ continue;
+
+ if (!have_sig_page) {
+ have_sig_page = 1;
+ sig_page_index = i;
+ continue;
+ }
+
+ pages_mapped++;
+
+ /* Ignore first page - it has the header */
+ if (pages_mapped == 1)
+ continue;
+
+ new_sector = bmap(target_inode, (i * chain->blocks_per_page));
+
+ /*
+ * I'd love to be able to fill in holes and resize
+ * files, but not yet...
+ */
+
+ if (new_sector == extent_max + 1)
+ extent_max += chain->blocks_per_page;
+ else {
+ if (extent_min > -1) {
+ result = __populate_block_list(chain,
+ extent_min, extent_max);
+ if (result)
+ return result;
+ }
+
+ extent_min = new_sector;
+ extent_max = extent_min +
+ chain->blocks_per_page - 1;
+ }
+
+ if (pages_mapped == pages_allocated)
+ break;
+ }
+
+ if (extent_min > -1) {
+ result = __populate_block_list(chain, extent_min, extent_max);
+ if (result)
+ return result;
+ }
+
+ return 0;
+}
+
+static void toi_file_free_storage(struct toi_bdev_info *chain)
+{
+ pages_allocated = 0;
+ file_chain = NULL;
+}
+
+/**
+ * toi_file_print_debug_stats - print debug info
+ * @buffer: Buffer to data to populate
+ * @size: Size of the buffer
+ **/
+static int toi_file_print_debug_stats(char *buffer, int size)
+{
+ int len = scnprintf(buffer, size, "- File Allocator active.\n");
+
+ len += scnprintf(buffer+len, size-len, " Storage available for "
+ "image: %lu pages.\n", pages_available);
+
+ return len;
+}
+
+static void toi_file_cleanup(int finishing_cycle)
+{
+ if (toi_file_target_bdev) {
+ if (target_claim) {
+ blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
+ target_claim = 0;
+ }
+
+ if (used_devt) {
+ blkdev_put(toi_file_target_bdev,
+ FMODE_READ | FMODE_NDELAY);
+ used_devt = 0;
+ }
+ toi_file_target_bdev = NULL;
+ target_inode = NULL;
+ }
+
+ if (target_file) {
+ filp_close(target_file, NULL);
+ target_file = NULL;
+ }
+
+ pages_available = 0;
+}
+
+/**
+ * test_toi_file_target - sysfs callback for /sys/power/tuxonince/file/target
+ *
+ * Test wheter the target file is valid for hibernating.
+ **/
+static void test_toi_file_target(void)
+{
+ int result = toi_file_register_storage();
+ sector_t sector;
+ char buf[50];
+ struct fs_info *fs_info;
+
+ if (result || !file_chain)
+ return;
+
+ /* This doesn't mean we're in business. Is any storage available? */
+ if (!pages_available)
+ goto out;
+
+ toi_file_allocate_storage(file_chain, 1);
+ result = get_main_pool_phys_params(file_chain);
+ if (result)
+ goto out;
+
+
+ sector = bmap(target_inode, sig_page_index *
+ file_chain->blocks_per_page) << file_chain->bmap_shift;
+
+ /* Use the uuid, or the dev_t if that fails */
+ fs_info = fs_info_from_block_dev(toi_file_target_bdev);
+ if (!fs_info || IS_ERR(fs_info)) {
+ bdevname(toi_file_target_bdev, buf);
+ sprintf(resume_file, "/dev/%s:%llu", buf,
+ (unsigned long long) sector);
+ } else {
+ int i;
+ hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0);
+
+ /* Remove the spaces */
+ for (i = 1; i < 16; i++) {
+ buf[2 * i] = buf[3 * i];
+ buf[2 * i + 1] = buf[3 * i + 1];
+ }
+ buf[32] = 0;
+ sprintf(resume_file, "UUID=%s:0x%llx", buf,
+ (unsigned long long) sector);
+ free_fs_info(fs_info);
+ }
+
+ toi_attempt_to_parse_resume_device(0);
+out:
+ toi_file_free_storage(file_chain);
+ toi_bio_ops.free_storage();
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256,
+ SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL),
+ SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095,
+ 4096, 0, NULL),
+};
+
+static struct toi_bio_allocator_ops toi_bio_fileops = {
+ .register_storage = toi_file_register_storage,
+ .storage_available = toi_file_storage_available,
+ .allocate_storage = toi_file_allocate_storage,
+ .bmap = get_main_pool_phys_params,
+ .free_storage = toi_file_free_storage,
+};
+
+static struct toi_module_ops toi_fileops = {
+ .type = BIO_ALLOCATOR_MODULE,
+ .name = "file storage",
+ .directory = "file",
+ .module = THIS_MODULE,
+ .print_debug_info = toi_file_print_debug_stats,
+ .cleanup = toi_file_cleanup,
+ .bio_allocator_ops = &toi_bio_fileops,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+static __init int toi_file_load(void)
+{
+ return toi_register_module(&toi_fileops);
+}
+
+late_initcall(toi_file_load);
diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
new file mode 100644
index 000000000..bdcd832f3
--- /dev/null
+++ b/kernel/power/tuxonice_highlevel.c
@@ -0,0 +1,1413 @@
+/*
+ * kernel/power/tuxonice_highlevel.c
+ */
+/** \mainpage TuxOnIce.
+ *
+ * TuxOnIce provides support for saving and restoring an image of
+ * system memory to an arbitrary storage device, either on the local computer,
+ * or across some network. The support is entirely OS based, so TuxOnIce
+ * works without requiring BIOS, APM or ACPI support. The vast majority of the
+ * code is also architecture independant, so it should be very easy to port
+ * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem
+ * and preemption. Initramfses and initrds are also supported.
+ *
+ * TuxOnIce uses a modular design, in which the method of storing the image is
+ * completely abstracted from the core code, as are transformations on the data
+ * such as compression and/or encryption (multiple 'modules' can be used to
+ * provide arbitrary combinations of functionality). The user interface is also
+ * modular, so that arbitrarily simple or complex interfaces can be used to
+ * provide anything from debugging information through to eye candy.
+ *
+ * \section Copyright
+ *
+ * TuxOnIce is released under the GPLv2.
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu><BR>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz><BR>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr><BR>
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)<BR>
+ *
+ * \section Credits
+ *
+ * Nigel would like to thank the following people for their work:
+ *
+ * Bernard Blackham <bernard@blackham.com.au><BR>
+ * Web page & Wiki administration, some coding. A person without whom
+ * TuxOnIce would not be where it is.
+ *
+ * Michael Frank <mhf@linuxmail.org><BR>
+ * Extensive testing and help with improving stability. I was constantly
+ * amazed by the quality and quantity of Michael's help.
+ *
+ * Pavel Machek <pavel@ucw.cz><BR>
+ * Modifications, defectiveness pointing, being with Gabor at the very
+ * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and
+ * 2.5.17. Even though Pavel and I disagree on the direction suspend to
+ * disk should take, I appreciate the valuable work he did in helping Gabor
+ * get the concept working.
+ *
+ * ..and of course the myriads of TuxOnIce users who have helped diagnose
+ * and fix bugs, made suggestions on how to improve the code, proofread
+ * documentation, and donated time and money.
+ *
+ * Thanks also to corporate sponsors:
+ *
+ * <B>Redhat.</B>Sometime employer from May 2006 (my fault, not Redhat's!).
+ *
+ * <B>Cyclades.com.</B> Nigel's employers from Dec 2004 until May 2006, who
+ * allowed him to work on TuxOnIce and PM related issues on company time.
+ *
+ * <B>LinuxFund.org.</B> Sponsored Nigel's work on TuxOnIce for four months Oct
+ * 2003 to Jan 2004.
+ *
+ * <B>LAC Linux.</B> Donated P4 hardware that enabled development and ongoing
+ * maintenance of SMP and Highmem support.
+ *
+ * <B>OSDL.</B> Provided access to various hardware configurations, make
+ * occasional small donations to the project.
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/freezer.h>
+#include <generated/utsrelease.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/writeback.h>
+#include <linux/uaccess.h> /* for get/set_fs & KERNEL_DS on i386 */
+#include <linux/bio.h>
+#include <linux/kgdb.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_atomic_copy.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_cluster.h"
+
+/*! Pageset metadata. */
+struct pagedir pagedir2 = {2};
+
+static mm_segment_t oldfs;
+static DEFINE_MUTEX(tuxonice_in_use);
+static int block_dump_save;
+
+int toi_trace_index;
+
+/* Binary signature if an image is present */
+char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c";
+
+unsigned long boot_kernel_data_buffer;
+
+static char *result_strings[] = {
+ "Hibernation was aborted",
+ "The user requested that we cancel the hibernation",
+ "No storage was available",
+ "Insufficient storage was available",
+ "Freezing filesystems and/or tasks failed",
+ "A pre-existing image was used",
+ "We would free memory, but image size limit doesn't allow this",
+ "Unable to free enough memory to hibernate",
+ "Unable to obtain the Power Management Semaphore",
+ "A device suspend/resume returned an error",
+ "A system device suspend/resume returned an error",
+ "The extra pages allowance is too small",
+ "We were unable to successfully prepare an image",
+ "TuxOnIce module initialisation failed",
+ "TuxOnIce module cleanup failed",
+ "I/O errors were encountered",
+ "Ran out of memory",
+ "An error was encountered while reading the image",
+ "Platform preparation failed",
+ "CPU Hotplugging failed",
+ "Architecture specific preparation failed",
+ "Pages needed resaving, but we were told to abort if this happens",
+ "We can't hibernate at the moment (invalid resume= or filewriter "
+ "target?)",
+ "A hibernation preparation notifier chain member cancelled the "
+ "hibernation",
+ "Pre-snapshot preparation failed",
+ "Pre-restore preparation failed",
+ "Failed to disable usermode helpers",
+ "Can't resume from alternate image",
+ "Header reservation too small",
+ "Device Power Management Preparation failed",
+};
+
+/**
+ * toi_finish_anything - cleanup after doing anything
+ * @hibernate_or_resume: Whether finishing a cycle or attempt at
+ * resuming.
+ *
+ * This is our basic clean-up routine, matching start_anything below. We
+ * call cleanup routines, drop module references and restore process fs and
+ * cpus allowed masks, together with the global block_dump variable's value.
+ **/
+void toi_finish_anything(int hibernate_or_resume)
+{
+ toi_running = 0;
+ toi_cleanup_modules(hibernate_or_resume);
+ toi_put_modules();
+ if (hibernate_or_resume) {
+ block_dump = block_dump_save;
+ set_cpus_allowed_ptr(current, cpu_all_mask);
+ toi_alloc_print_debug_stats();
+ atomic_inc(&snapshot_device_available);
+ unlock_system_sleep();
+ }
+
+ set_fs(oldfs);
+ mutex_unlock(&tuxonice_in_use);
+}
+
+/**
+ * toi_start_anything - basic initialisation for TuxOnIce
+ * @toi_or_resume: Whether starting a cycle or attempt at resuming.
+ *
+ * Our basic initialisation routine. Take references on modules, use the
+ * kernel segment, recheck resume= if no active allocator is set, initialise
+ * modules, save and reset block_dump and ensure we're running on CPU0.
+ **/
+int toi_start_anything(int hibernate_or_resume)
+{
+ mutex_lock(&tuxonice_in_use);
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+
+ toi_trace_index = 0;
+
+ if (hibernate_or_resume) {
+ lock_system_sleep();
+
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0))
+ goto snapshotdevice_unavailable;
+ }
+
+ if (hibernate_or_resume == SYSFS_HIBERNATE)
+ toi_print_modules();
+
+ if (toi_get_modules()) {
+ printk(KERN_INFO "TuxOnIce: Get modules failed!\n");
+ goto prehibernate_err;
+ }
+
+ if (hibernate_or_resume) {
+ block_dump_save = block_dump;
+ block_dump = 0;
+ set_cpus_allowed_ptr(current,
+ cpumask_of(cpumask_first(cpu_online_mask)));
+ }
+
+ if (toi_initialise_modules_early(hibernate_or_resume))
+ goto early_init_err;
+
+ if (!toiActiveAllocator)
+ toi_attempt_to_parse_resume_device(!hibernate_or_resume);
+
+ if (!toi_initialise_modules_late(hibernate_or_resume)) {
+ toi_running = 1; /* For the swsusp code we use :< */
+ return 0;
+ }
+
+ toi_cleanup_modules(hibernate_or_resume);
+early_init_err:
+ if (hibernate_or_resume) {
+ block_dump_save = block_dump;
+ set_cpus_allowed_ptr(current, cpu_all_mask);
+ }
+ toi_put_modules();
+prehibernate_err:
+ if (hibernate_or_resume)
+ atomic_inc(&snapshot_device_available);
+snapshotdevice_unavailable:
+ if (hibernate_or_resume)
+ mutex_unlock(&pm_mutex);
+ set_fs(oldfs);
+ mutex_unlock(&tuxonice_in_use);
+ return -EBUSY;
+}
+
+/*
+ * Nosave page tracking.
+ *
+ * Here rather than in prepare_image because we want to do it once only at the
+ * start of a cycle.
+ */
+
+/**
+ * mark_nosave_pages - set up our Nosave bitmap
+ *
+ * Build a bitmap of Nosave pages from the list. The bitmap allows faster
+ * use when preparing the image.
+ **/
+static void mark_nosave_pages(void)
+{
+ struct nosave_region *region;
+
+ list_for_each_entry(region, &nosave_regions, list) {
+ unsigned long pfn;
+
+ for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
+ if (pfn_valid(pfn)) {
+ SetPageNosave(pfn_to_page(pfn));
+ }
+ }
+}
+
+/**
+ * allocate_bitmaps - allocate bitmaps used to record page states
+ *
+ * Allocate the bitmaps we use to record the various TuxOnIce related
+ * page states.
+ **/
+static int allocate_bitmaps(void)
+{
+ if (toi_alloc_bitmap(&pageset1_map) ||
+ toi_alloc_bitmap(&pageset1_copy_map) ||
+ toi_alloc_bitmap(&pageset2_map) ||
+ toi_alloc_bitmap(&io_map) ||
+ toi_alloc_bitmap(&nosave_map) ||
+ toi_alloc_bitmap(&free_map) ||
+ toi_alloc_bitmap(&compare_map) ||
+ toi_alloc_bitmap(&page_resave_map))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * free_bitmaps - free the bitmaps used to record page states
+ *
+ * Free the bitmaps allocated above. It is not an error to call
+ * memory_bm_free on a bitmap that isn't currently allocated.
+ **/
+static void free_bitmaps(void)
+{
+ toi_free_bitmap(&pageset1_map);
+ toi_free_bitmap(&pageset1_copy_map);
+ toi_free_bitmap(&pageset2_map);
+ toi_free_bitmap(&io_map);
+ toi_free_bitmap(&nosave_map);
+ toi_free_bitmap(&free_map);
+ toi_free_bitmap(&compare_map);
+ toi_free_bitmap(&page_resave_map);
+}
+
+/**
+ * io_MB_per_second - return the number of MB/s read or written
+ * @write: Whether to return the speed at which we wrote.
+ *
+ * Calculate the number of megabytes per second that were read or written.
+ **/
+static int io_MB_per_second(int write)
+{
+ return (toi_bkd.toi_io_time[write][1]) ?
+ MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ /
+ toi_bkd.toi_io_time[write][1] : 0;
+}
+
+#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \
+ count - len - 1, ## a); } while (0)
+
+/**
+ * get_debug_info - fill a buffer with debugging information
+ * @buffer: The buffer to be filled.
+ * @count: The size of the buffer, in bytes.
+ *
+ * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
+ * either printk or return via sysfs.
+ **/
+static int get_toi_debug_info(const char *buffer, int count)
+{
+ int len = 0, i, first_result = 1;
+
+ SNPRINTF("TuxOnIce debugging info:\n");
+ SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n");
+ SNPRINTF("- Kernel Version : " UTS_RELEASE "\n");
+ SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__);
+ SNPRINTF("- Attempt number : %d\n", nr_hibernates);
+ SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n",
+ toi_result,
+ toi_bkd.toi_action,
+ toi_bkd.toi_debug_state,
+ toi_bkd.toi_default_console_level,
+ image_size_limit,
+ toi_poweroff_method);
+ SNPRINTF("- Overall expected compression percentage: %d.\n",
+ 100 - toi_expected_compression_ratio());
+ len += toi_print_module_debug_info(((char *) buffer) + len,
+ count - len - 1);
+ if (toi_bkd.toi_io_time[0][1]) {
+ if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) {
+ SNPRINTF("- I/O speed: Write %ld KB/s",
+ (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
+ toi_bkd.toi_io_time[0][1]));
+ if (toi_bkd.toi_io_time[1][1])
+ SNPRINTF(", Read %ld KB/s",
+ (KB((unsigned long)
+ toi_bkd.toi_io_time[1][0]) * HZ /
+ toi_bkd.toi_io_time[1][1]));
+ } else {
+ SNPRINTF("- I/O speed: Write %ld MB/s",
+ (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ /
+ toi_bkd.toi_io_time[0][1]));
+ if (toi_bkd.toi_io_time[1][1])
+ SNPRINTF(", Read %ld MB/s",
+ (MB((unsigned long)
+ toi_bkd.toi_io_time[1][0]) * HZ /
+ toi_bkd.toi_io_time[1][1]));
+ }
+ SNPRINTF(".\n");
+ } else
+ SNPRINTF("- No I/O speed stats available.\n");
+ SNPRINTF("- Extra pages : %lu used/%lu.\n",
+ extra_pd1_pages_used, extra_pd1_pages_allowance);
+
+ for (i = 0; i < TOI_NUM_RESULT_STATES; i++)
+ if (test_result_state(i)) {
+ SNPRINTF("%s: %s.\n", first_result ?
+ "- Result " :
+ " ",
+ result_strings[i]);
+ first_result = 0;
+ }
+ if (first_result)
+ SNPRINTF("- Result : %s.\n", nr_hibernates ?
+ "Succeeded" :
+ "No hibernation attempts so far");
+ return len;
+}
+
+#ifdef CONFIG_TOI_INCREMENTAL
+/**
+ * get_toi_page_state - fill a buffer with page state information
+ * @buffer: The buffer to be filled.
+ * @count: The size of the buffer, in bytes.
+ *
+ * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will
+ * either printk or return via sysfs.
+ **/
+static int get_toi_page_state(const char *buffer, int count)
+{
+ int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0;
+ int len = 0;
+ struct zone *zone;
+ int allocated_bitmaps = 0;
+
+ set_cpus_allowed_ptr(current,
+ cpumask_of(cpumask_first(cpu_online_mask)));
+
+ if (!free_map) {
+ BUG_ON(toi_alloc_bitmap(&free_map));
+ allocated_bitmaps = 1;
+ }
+
+ toi_generate_free_page_map();
+
+ for_each_populated_zone(zone) {
+ unsigned long loop;
+
+ total += zone->spanned_pages;
+
+ for (loop = 0; loop < zone->spanned_pages; loop++) {
+ unsigned long pfn = zone->zone_start_pfn + loop;
+ struct page *page;
+ int chunk_size;
+
+ if (!pfn_valid(pfn)) {
+ continue;
+ }
+
+ chunk_size = toi_size_of_free_region(zone, pfn);
+ if (chunk_size) {
+ /*
+ * If the page gets allocated, it will be need
+ * saving in an image.
+ * Don't bother with explicitly removing any
+ * RO protection applied below.
+ * We'll SetPageTOI_Dirty(page) if/when it
+ * gets allocated.
+ */
+ free += chunk_size;
+ loop += chunk_size - 1;
+ continue;
+ }
+
+ page = pfn_to_page(pfn);
+
+ if (PageTOI_Untracked(page)) {
+ untracked++;
+ } else if (PageTOI_RO(page)) {
+ ro++;
+ } else if (PageTOI_Dirty(page)) {
+ dirty++;
+ } else {
+ printk("Page %ld state 'other'.\n", pfn);
+ other++;
+ }
+ }
+ }
+
+ if (allocated_bitmaps) {
+ toi_free_bitmap(&free_map);
+ }
+
+ set_cpus_allowed_ptr(current, cpu_all_mask);
+
+ SNPRINTF("TuxOnIce page breakdown:\n");
+ SNPRINTF("- Free : %d\n", free);
+ SNPRINTF("- Untracked : %d\n", untracked);
+ SNPRINTF("- Read only : %d\n", ro);
+ SNPRINTF("- Dirty : %d\n", dirty);
+ SNPRINTF("- Other : %d\n", other);
+ SNPRINTF("- Invalid : %d\n", invalid);
+ SNPRINTF("- Total : %d\n", total);
+ return len;
+}
+#endif
+
+/**
+ * do_cleanup - cleanup after attempting to hibernate or resume
+ * @get_debug_info: Whether to allocate and return debugging info.
+ *
+ * Cleanup after attempting to hibernate or resume, possibly getting
+ * debugging info as we do so.
+ **/
+static void do_cleanup(int get_debug_info, int restarting)
+{
+ int i = 0;
+ char *buffer = NULL;
+
+ trap_non_toi_io = 0;
+
+ if (get_debug_info)
+ toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up...");
+
+ free_checksum_pages();
+
+ toi_cbw_restore();
+ toi_free_cbw_data();
+
+ if (get_debug_info)
+ buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP);
+
+ if (buffer)
+ i = get_toi_debug_info(buffer, PAGE_SIZE);
+
+ toi_free_extra_pagedir_memory();
+
+ pagedir1.size = 0;
+ pagedir2.size = 0;
+ set_highmem_size(pagedir1, 0);
+ set_highmem_size(pagedir2, 0);
+
+ if (boot_kernel_data_buffer) {
+ if (!test_toi_state(TOI_BOOT_KERNEL))
+ toi_free_page(37, boot_kernel_data_buffer);
+ boot_kernel_data_buffer = 0;
+ }
+
+ if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) {
+ unlock_device_hotplug();
+ clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
+ }
+
+ clear_toi_state(TOI_BOOT_KERNEL);
+ if (current->flags & PF_SUSPEND_TASK)
+ thaw_processes();
+
+ if (!restarting)
+ toi_stop_other_threads();
+
+ if (toi_keeping_image &&
+ !test_result_state(TOI_ABORTED)) {
+ toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
+ "TuxOnIce: Not invalidating the image due "
+ "to Keep Image or Incremental Image being enabled.");
+ set_result_state(TOI_KEPT_IMAGE);
+
+ /*
+ * For an incremental image, free unused storage so
+ * swap (if any) can be used for normal system operation,
+ * if so desired.
+ */
+
+ toiActiveAllocator->free_unused_storage();
+ } else
+ if (toiActiveAllocator)
+ toiActiveAllocator->remove_image();
+
+ free_bitmaps();
+ usermodehelper_enable();
+
+ if (test_toi_state(TOI_NOTIFIERS_PREPARE)) {
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+ clear_toi_state(TOI_NOTIFIERS_PREPARE);
+ }
+
+ if (buffer && i) {
+ /* Printk can only handle 1023 bytes, including
+ * its level mangling. */
+ for (i = 0; i < 3; i++)
+ printk(KERN_ERR "%s", buffer + (1023 * i));
+ toi_free_page(20, (unsigned long) buffer);
+ }
+
+ if (!restarting)
+ toi_cleanup_console();
+
+ free_attention_list();
+
+ if (!restarting)
+ toi_deactivate_storage(0);
+
+ clear_toi_state(TOI_IGNORE_LOGLEVEL);
+ clear_toi_state(TOI_TRYING_TO_RESUME);
+ clear_toi_state(TOI_NOW_RESUMING);
+}
+
+/**
+ * check_still_keeping_image - we kept an image; check whether to reuse it.
+ *
+ * We enter this routine when we have kept an image. If the user has said they
+ * want to still keep it, all we need to do is powerdown. If powering down
+ * means hibernating to ram and the power doesn't run out, we'll return 1.
+ * If we do power off properly or the battery runs out, we'll resume via the
+ * normal paths.
+ *
+ * If the user has said they want to remove the previously kept image, we
+ * remove it, and return 0. We'll then store a new image.
+ **/
+static int check_still_keeping_image(void)
+{
+ if (toi_keeping_image) {
+ if (!test_action_state(TOI_INCREMENTAL_IMAGE)) {
+ printk(KERN_INFO "Image already stored: powering down "
+ "immediately.");
+ do_toi_step(STEP_HIBERNATE_POWERDOWN);
+ return 1;
+ }
+ /**
+ * Incremental image - need to write new part.
+ * We detect that we're writing an incremental image by looking
+ * at test_result_state(TOI_KEPT_IMAGE)
+ **/
+ return 0;
+ }
+
+ printk(KERN_INFO "Invalidating previous image.\n");
+ toiActiveAllocator->remove_image();
+
+ return 0;
+}
+
+/**
+ * toi_init - prepare to hibernate to disk
+ *
+ * Initialise variables & data structures, in preparation for
+ * hibernating to disk.
+ **/
+static int toi_init(int restarting)
+{
+ int result, i, j;
+
+ toi_result = 0;
+
+ printk(KERN_INFO "Initiating a hibernation cycle.\n");
+
+ nr_hibernates++;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < 2; j++)
+ toi_bkd.toi_io_time[i][j] = 0;
+
+ if (!test_toi_state(TOI_CAN_HIBERNATE) ||
+ allocate_bitmaps())
+ return 1;
+
+ mark_nosave_pages();
+
+ if (!restarting)
+ toi_prepare_console();
+
+ result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ if (result) {
+ set_result_state(TOI_NOTIFIERS_PREPARE_FAILED);
+ return 1;
+ }
+ set_toi_state(TOI_NOTIFIERS_PREPARE);
+
+ if (!restarting) {
+ printk(KERN_ERR "Starting other threads.");
+ toi_start_other_threads();
+ }
+
+ result = usermodehelper_disable();
+ if (result) {
+ printk(KERN_ERR "TuxOnIce: Failed to disable usermode "
+ "helpers\n");
+ set_result_state(TOI_USERMODE_HELPERS_ERR);
+ return 1;
+ }
+
+ boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP);
+ if (!boot_kernel_data_buffer) {
+ printk(KERN_ERR "TuxOnIce: Failed to allocate "
+ "boot_kernel_data_buffer.\n");
+ set_result_state(TOI_OUT_OF_MEMORY);
+ return 1;
+ }
+
+ toi_allocate_cbw_data();
+
+ return 0;
+}
+
+/**
+ * can_hibernate - perform basic 'Can we hibernate?' tests
+ *
+ * Perform basic tests that must pass if we're going to be able to hibernate:
+ * Can we get the pm_mutex? Is resume= valid (we need to know where to write
+ * the image header).
+ **/
+static int can_hibernate(void)
+{
+ if (!test_toi_state(TOI_CAN_HIBERNATE))
+ toi_attempt_to_parse_resume_device(0);
+
+ if (!test_toi_state(TOI_CAN_HIBERNATE)) {
+ printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n"
+ "This may be because you haven't put something along "
+ "the lines of\n\nresume=swap:/dev/hda1\n\n"
+ "in lilo.conf or equivalent. (Where /dev/hda1 is your "
+ "swap partition).\n");
+ set_abort_result(TOI_CANT_SUSPEND);
+ return 0;
+ }
+
+ if (strlen(alt_resume_param)) {
+ attempt_to_parse_alt_resume_param();
+
+ if (!strlen(alt_resume_param)) {
+ printk(KERN_INFO "Alternate resume parameter now "
+ "invalid. Aborting.\n");
+ set_abort_result(TOI_CANT_USE_ALT_RESUME);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/**
+ * do_post_image_write - having written an image, figure out what to do next
+ *
+ * After writing an image, we might load an alternate image or power down.
+ * Powering down might involve hibernating to ram, in which case we also
+ * need to handle reloading pageset2.
+ **/
+static int do_post_image_write(void)
+{
+ /* If switching images fails, do normal powerdown */
+ if (alt_resume_param[0])
+ do_toi_step(STEP_RESUME_ALT_IMAGE);
+
+ toi_power_down();
+
+ barrier();
+ mb();
+ return 0;
+}
+
+/**
+ * __save_image - do the hard work of saving the image
+ *
+ * High level routine for getting the image saved. The key assumptions made
+ * are that processes have been frozen and sufficient memory is available.
+ *
+ * We also exit through here at resume time, coming back from toi_hibernate
+ * after the atomic restore. This is the reason for the toi_in_hibernate
+ * test.
+ **/
+static int __save_image(void)
+{
+ int temp_result, did_copy = 0;
+
+ toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image..");
+
+ toi_message(TOI_ANY_SECTION, TOI_LOW, 1,
+ " - Final values: %d and %d.",
+ pagedir1.size, pagedir2.size);
+
+ toi_cond_pause(1, "About to write pagedir2.");
+
+ temp_result = write_pageset(&pagedir2);
+
+ if (temp_result == -1 || test_result_state(TOI_ABORTED))
+ return 1;
+
+ toi_cond_pause(1, "About to copy pageset 1.");
+
+ if (test_result_state(TOI_ABORTED))
+ return 1;
+
+ toi_deactivate_storage(1);
+
+ toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore.");
+
+ toi_in_hibernate = 1;
+
+ if (toi_go_atomic(PMSG_FREEZE, 1))
+ goto Failed;
+
+ temp_result = toi_hibernate();
+
+#ifdef CONFIG_KGDB
+ if (test_action_state(TOI_POST_RESUME_BREAKPOINT))
+ kgdb_breakpoint();
+#endif
+
+ if (!temp_result)
+ did_copy = 1;
+
+ /* We return here at resume time too! */
+ toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result);
+
+Failed:
+ if (toi_activate_storage(1))
+ panic("Failed to reactivate our storage.");
+
+ /* Resume time? */
+ if (!toi_in_hibernate) {
+ copyback_post();
+ return 0;
+ }
+
+ /* Nope. Hibernating. So, see if we can save the image... */
+
+ if (temp_result || test_result_state(TOI_ABORTED)) {
+ if (did_copy)
+ goto abort_reloading_pagedir_two;
+ else
+ return 1;
+ }
+
+ toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size,
+ NULL);
+
+ if (test_result_state(TOI_ABORTED))
+ goto abort_reloading_pagedir_two;
+
+ toi_cond_pause(1, "About to write pageset1.");
+
+ toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1");
+
+ temp_result = write_pageset(&pagedir1);
+
+ /* We didn't overwrite any memory, so no reread needs to be done. */
+ if (test_action_state(TOI_TEST_FILTER_SPEED) ||
+ test_action_state(TOI_TEST_BIO))
+ return 1;
+
+ if (temp_result == 1 || test_result_state(TOI_ABORTED))
+ goto abort_reloading_pagedir_two;
+
+ toi_cond_pause(1, "About to write header.");
+
+ if (test_result_state(TOI_ABORTED))
+ goto abort_reloading_pagedir_two;
+
+ temp_result = write_image_header();
+
+ if (!temp_result && !test_result_state(TOI_ABORTED))
+ return 0;
+
+abort_reloading_pagedir_two:
+ temp_result = read_pageset2(1);
+
+ /* If that failed, we're sunk. Panic! */
+ if (temp_result)
+ panic("Attempt to reload pagedir 2 while aborting "
+ "a hibernate failed.");
+
+ return 1;
+}
+
+static void map_ps2_pages(int enable)
+{
+ unsigned long pfn = 0;
+
+ memory_bm_position_reset(pageset2_map);
+ pfn = memory_bm_next_pfn(pageset2_map, 0);
+
+ while (pfn != BM_END_OF_MAP) {
+ struct page *page = pfn_to_page(pfn);
+ kernel_map_pages(page, 1, enable);
+ pfn = memory_bm_next_pfn(pageset2_map, 0);
+ }
+}
+
+/**
+ * do_save_image - save the image and handle the result
+ *
+ * Save the prepared image. If we fail or we're in the path returning
+ * from the atomic restore, cleanup.
+ **/
+static int do_save_image(void)
+{
+ int result;
+ map_ps2_pages(0);
+ result = __save_image();
+ map_ps2_pages(1);
+ return result;
+}
+
+/**
+ * do_prepare_image - try to prepare an image
+ *
+ * Seek to initialise and prepare an image to be saved. On failure,
+ * cleanup.
+ **/
+static int do_prepare_image(void)
+{
+ int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL);
+
+ if (!restarting && toi_activate_storage(0))
+ return 1;
+
+ /*
+ * If kept image and still keeping image and hibernating to RAM, (non
+ * incremental image case) we will return 1 after hibernating and
+ * resuming (provided the power doesn't run out. In that case, we skip
+ * directly to cleaning up and exiting.
+ */
+
+ if (!can_hibernate() ||
+ (test_result_state(TOI_KEPT_IMAGE) &&
+ check_still_keeping_image()))
+ return 1;
+
+ if (toi_init(restarting) || toi_prepare_image() ||
+ test_result_state(TOI_ABORTED))
+ return 1;
+
+ trap_non_toi_io = 1;
+
+ return 0;
+}
+
+/**
+ * do_check_can_resume - find out whether an image has been stored
+ *
+ * Read whether an image exists. We use the same routine as the
+ * image_exists sysfs entry, and just look to see whether the
+ * first character in the resulting buffer is a '1'.
+ **/
+int do_check_can_resume(void)
+{
+ int result = -1;
+
+ if (toi_activate_storage(0))
+ return -1;
+
+ if (!test_toi_state(TOI_RESUME_DEVICE_OK))
+ toi_attempt_to_parse_resume_device(1);
+
+ if (toiActiveAllocator)
+ result = toiActiveAllocator->image_exists(1);
+
+ toi_deactivate_storage(0);
+ return result;
+}
+
+/**
+ * do_load_atomic_copy - load the first part of an image, if it exists
+ *
+ * Check whether we have an image. If one exists, do sanity checking
+ * (possibly invalidating the image or even rebooting if the user
+ * requests that) before loading it into memory in preparation for the
+ * atomic restore.
+ *
+ * If and only if we have an image loaded and ready to restore, we return 1.
+ **/
+static int do_load_atomic_copy(void)
+{
+ int read_image_result = 0;
+
+ if (sizeof(swp_entry_t) != sizeof(long)) {
+ printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size"
+ " of long. Please report this!\n");
+ return 1;
+ }
+
+ if (!resume_file[0])
+ printk(KERN_WARNING "TuxOnIce: "
+ "You need to use a resume= command line parameter to "
+ "tell TuxOnIce where to look for an image.\n");
+
+ toi_activate_storage(0);
+
+ if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) &&
+ !toi_attempt_to_parse_resume_device(0)) {
+ /*
+ * Without a usable storage device we can do nothing -
+ * even if noresume is given
+ */
+
+ if (!toiNumAllocators)
+ printk(KERN_ALERT "TuxOnIce: "
+ "No storage allocators have been registered.\n");
+ else
+ printk(KERN_ALERT "TuxOnIce: "
+ "Missing or invalid storage location "
+ "(resume= parameter). Please correct and "
+ "rerun lilo (or equivalent) before "
+ "hibernating.\n");
+ toi_deactivate_storage(0);
+ return 1;
+ }
+
+ if (allocate_bitmaps())
+ return 1;
+
+ read_image_result = read_pageset1(); /* non fatal error ignored */
+
+ if (test_toi_state(TOI_NORESUME_SPECIFIED))
+ clear_toi_state(TOI_NORESUME_SPECIFIED);
+
+ toi_deactivate_storage(0);
+
+ if (read_image_result)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * prepare_restore_load_alt_image - save & restore alt image variables
+ *
+ * Save and restore the pageset1 maps, when loading an alternate image.
+ **/
+static void prepare_restore_load_alt_image(int prepare)
+{
+ static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save;
+
+ if (prepare) {
+ pageset1_map_save = pageset1_map;
+ pageset1_map = NULL;
+ pageset1_copy_map_save = pageset1_copy_map;
+ pageset1_copy_map = NULL;
+ set_toi_state(TOI_LOADING_ALT_IMAGE);
+ toi_reset_alt_image_pageset2_pfn();
+ } else {
+ toi_free_bitmap(&pageset1_map);
+ pageset1_map = pageset1_map_save;
+ toi_free_bitmap(&pageset1_copy_map);
+ pageset1_copy_map = pageset1_copy_map_save;
+ clear_toi_state(TOI_NOW_RESUMING);
+ clear_toi_state(TOI_LOADING_ALT_IMAGE);
+ }
+}
+
+/**
+ * do_toi_step - perform a step in hibernating or resuming
+ *
+ * Perform a step in hibernating or resuming an image. This abstraction
+ * is in preparation for implementing cluster support, and perhaps replacing
+ * uswsusp too (haven't looked whether that's possible yet).
+ **/
+int do_toi_step(int step)
+{
+ switch (step) {
+ case STEP_HIBERNATE_PREPARE_IMAGE:
+ return do_prepare_image();
+ case STEP_HIBERNATE_SAVE_IMAGE:
+ return do_save_image();
+ case STEP_HIBERNATE_POWERDOWN:
+ return do_post_image_write();
+ case STEP_RESUME_CAN_RESUME:
+ return do_check_can_resume();
+ case STEP_RESUME_LOAD_PS1:
+ return do_load_atomic_copy();
+ case STEP_RESUME_DO_RESTORE:
+ /*
+ * If we succeed, this doesn't return.
+ * Instead, we return from do_save_image() in the
+ * hibernated kernel.
+ */
+ return toi_atomic_restore();
+ case STEP_RESUME_ALT_IMAGE:
+ printk(KERN_INFO "Trying to resume alternate image.\n");
+ toi_in_hibernate = 0;
+ save_restore_alt_param(SAVE, NOQUIET);
+ prepare_restore_load_alt_image(1);
+ if (!do_check_can_resume()) {
+ printk(KERN_INFO "Nothing to resume from.\n");
+ goto out;
+ }
+ if (!do_load_atomic_copy())
+ toi_atomic_restore();
+
+ printk(KERN_INFO "Failed to load image.\n");
+out:
+ prepare_restore_load_alt_image(0);
+ save_restore_alt_param(RESTORE, NOQUIET);
+ break;
+ case STEP_CLEANUP:
+ do_cleanup(1, 0);
+ break;
+ case STEP_QUIET_CLEANUP:
+ do_cleanup(0, 0);
+ break;
+ }
+
+ return 0;
+}
+
+/* -- Functions for kickstarting a hibernate or resume --- */
+
+/**
+ * toi_try_resume - try to do the steps in resuming
+ *
+ * Check if we have an image and if so try to resume. Clear the status
+ * flags too.
+ **/
+void toi_try_resume(void)
+{
+ set_toi_state(TOI_TRYING_TO_RESUME);
+ resume_attempted = 1;
+
+ current->flags |= PF_MEMALLOC;
+ toi_start_other_threads();
+
+ if (do_toi_step(STEP_RESUME_CAN_RESUME) &&
+ !do_toi_step(STEP_RESUME_LOAD_PS1))
+ do_toi_step(STEP_RESUME_DO_RESTORE);
+
+ toi_stop_other_threads();
+ do_cleanup(0, 0);
+
+ current->flags &= ~PF_MEMALLOC;
+
+ clear_toi_state(TOI_IGNORE_LOGLEVEL);
+ clear_toi_state(TOI_TRYING_TO_RESUME);
+ clear_toi_state(TOI_NOW_RESUMING);
+}
+
+/**
+ * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume
+ *
+ * Wrapper for when __toi_try_resume is called from swsusp resume path,
+ * rather than from echo > /sys/power/tuxonice/do_resume.
+ **/
+static void toi_sys_power_disk_try_resume(void)
+{
+ resume_attempted = 1;
+
+ /*
+ * There's a comment in kernel/power/disk.c that indicates
+ * we should be able to use mutex_lock_nested below. That
+ * doesn't seem to cut it, though, so let's just turn lockdep
+ * off for now.
+ */
+ lockdep_off();
+
+ if (toi_start_anything(SYSFS_RESUMING))
+ goto out;
+
+ toi_try_resume();
+
+ /*
+ * For initramfs, we have to clear the boot time
+ * flag after trying to resume
+ */
+ clear_toi_state(TOI_BOOT_TIME);
+
+ toi_finish_anything(SYSFS_RESUMING);
+out:
+ lockdep_on();
+}
+
+/**
+ * toi_try_hibernate - try to start a hibernation cycle
+ *
+ * Start a hibernation cycle, coming in from either
+ * echo > /sys/power/tuxonice/do_suspend
+ *
+ * or
+ *
+ * echo disk > /sys/power/state
+ *
+ * In the later case, we come in without pm_sem taken; in the
+ * former, it has been taken.
+ **/
+int toi_try_hibernate(void)
+{
+ int result = 0, sys_power_disk = 0, retries = 0;
+
+ if (!mutex_is_locked(&tuxonice_in_use)) {
+ /* Came in via /sys/power/disk */
+ if (toi_start_anything(SYSFS_HIBERNATING))
+ return -EBUSY;
+ sys_power_disk = 1;
+ }
+
+ current->flags |= PF_MEMALLOC;
+
+ if (test_toi_state(TOI_CLUSTER_MODE)) {
+ toi_initiate_cluster_hibernate();
+ goto out;
+ }
+
+prepare:
+ result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE);
+
+ if (result)
+ goto out;
+
+ if (test_action_state(TOI_FREEZER_TEST))
+ goto out_restore_gfp_mask;
+
+ result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE);
+
+ if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) {
+ if (retries < 2) {
+ do_cleanup(0, 1);
+ retries++;
+ clear_result_state(TOI_ABORTED);
+ extra_pd1_pages_allowance = extra_pd1_pages_used + 500;
+ printk(KERN_INFO "Automatically adjusting the extra"
+ " pages allowance to %ld and restarting.\n",
+ extra_pd1_pages_allowance);
+ pm_restore_gfp_mask();
+ goto prepare;
+ }
+
+ printk(KERN_INFO "Adjusted extra pages allowance twice and "
+ "still couldn't hibernate successfully. Giving up.");
+ }
+
+ /* This code runs at resume time too! */
+ if (!result && toi_in_hibernate)
+ result = do_toi_step(STEP_HIBERNATE_POWERDOWN);
+
+out_restore_gfp_mask:
+ pm_restore_gfp_mask();
+out:
+ do_cleanup(1, 0);
+ current->flags &= ~PF_MEMALLOC;
+
+ if (sys_power_disk)
+ toi_finish_anything(SYSFS_HIBERNATING);
+
+ return result;
+}
+
+/*
+ * channel_no: If !0, -c <channel_no> is added to args (userui).
+ */
+int toi_launch_userspace_program(char *command, int channel_no,
+ int wait, int debug)
+{
+ int retval;
+ static char *envp[] = {
+ "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL };
+ static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
+ };
+ char *channel = NULL;
+ int arg = 0, size;
+ char test_read[255];
+ char *orig_posn = command;
+
+ if (!strlen(orig_posn))
+ return 1;
+
+ if (channel_no) {
+ channel = toi_kzalloc(4, 6, GFP_KERNEL);
+ if (!channel) {
+ printk(KERN_INFO "Failed to allocate memory in "
+ "preparing to launch userspace program.\n");
+ return 1;
+ }
+ }
+
+ /* Up to 6 args supported */
+ while (arg < 6) {
+ sscanf(orig_posn, "%s", test_read);
+ size = strlen(test_read);
+ if (!(size))
+ break;
+ argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP);
+ strcpy(argv[arg], test_read);
+ orig_posn += size + 1;
+ *test_read = 0;
+ arg++;
+ }
+
+ if (channel_no) {
+ sprintf(channel, "-c%d", channel_no);
+ argv[arg] = channel;
+ } else
+ arg--;
+
+ if (debug) {
+ argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP);
+ strcpy(argv[arg], "--debug");
+ }
+
+ retval = call_usermodehelper(argv[0], argv, envp, wait);
+
+ /*
+ * If the program reports an error, retval = 256. Don't complain
+ * about that here.
+ */
+ if (retval && retval != 256)
+ printk(KERN_ERR "Failed to launch userspace program '%s': "
+ "Error %d\n", command, retval);
+
+ {
+ int i;
+ for (i = 0; i < arg; i++)
+ if (argv[i] && argv[i] != channel)
+ toi_kfree(5, argv[i], sizeof(*argv[i]));
+ }
+
+ toi_kfree(4, channel, sizeof(*channel));
+
+ return retval;
+}
+
+/*
+ * This array contains entries that are automatically registered at
+ * boot. Modules and the console code register their own entries separately.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_LONG("extra_pages_allowance", SYSFS_RW,
+ &extra_pd1_pages_allowance, 0, LONG_MAX, 0),
+ SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read,
+ image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL),
+ SYSFS_STRING("resume", SYSFS_RW, resume_file, 255,
+ SYSFS_NEEDS_SM_FOR_WRITE,
+ attempt_to_parse_resume_device2),
+ SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255,
+ SYSFS_NEEDS_SM_FOR_WRITE,
+ attempt_to_parse_alt_resume_param),
+ SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0,
+ NULL),
+ SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_IGNORE_ROOTFS, 0),
+ SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2,
+ INT_MAX, 0),
+ SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0),
+ SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_NO_MULTITHREADED_IO, 0),
+ SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_NO_FLUSHER_THREAD, 0),
+ SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_PAGESET2_FULL, 0),
+ SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0),
+ SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_REPLACE_SWSUSP, 0),
+ SYSFS_STRING("resume_commandline", SYSFS_RW,
+ toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0,
+ NULL),
+ SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL),
+ SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_FREEZER_TEST, 0),
+ SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0),
+ SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_TEST_FILTER_SPEED, 0),
+ SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_NO_PAGESET2, 0),
+ SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_NO_PS2_IF_UNNEEDED, 0),
+ SYSFS_STRING("binary_signature", SYSFS_READONLY,
+ tuxonice_signature, 9, 0, NULL),
+ SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0,
+ NULL),
+#ifdef CONFIG_KGDB
+ SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_POST_RESUME_BREAKPOINT, 0),
+#endif
+ SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_NO_READAHEAD, 0),
+ SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_TRACE_DEBUG_ON, 0),
+#ifdef CONFIG_TOI_KEEP_IMAGE
+ SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE,
+ 0),
+#endif
+#ifdef CONFIG_TOI_INCREMENTAL
+ SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0,
+ NULL),
+ SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_INCREMENTAL_IMAGE, 1),
+#endif
+};
+
+static struct toi_core_fns my_fns = {
+ .get_nonconflicting_page = __toi_get_nonconflicting_page,
+ .post_context_save = __toi_post_context_save,
+ .try_hibernate = toi_try_hibernate,
+ .try_resume = toi_sys_power_disk_try_resume,
+};
+
+/**
+ * core_load - initialisation of TuxOnIce core
+ *
+ * Initialise the core, beginning with sysfs. Checksum and so on are part of
+ * the core, but have their own initialisation routines because they either
+ * aren't compiled in all the time or have their own subdirectories.
+ **/
+static __init int core_load(void)
+{
+ int i,
+ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
+
+ printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION
+ " (http://tuxonice.net)\n");
+
+ if (!hibernation_available()) {
+ printk(KERN_INFO "TuxOnIce disabled due to request for hibernation"
+ " to be disabled in this kernel.\n");
+ return 1;
+ }
+
+ if (toi_sysfs_init())
+ return 1;
+
+ for (i = 0; i < numfiles; i++)
+ toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
+
+ toi_core_fns = &my_fns;
+
+ if (toi_alloc_init())
+ return 1;
+ if (toi_checksum_init())
+ return 1;
+ if (toi_usm_init())
+ return 1;
+ if (toi_ui_init())
+ return 1;
+ if (toi_poweroff_init())
+ return 1;
+ if (toi_cluster_init())
+ return 1;
+ if (toi_cbw_init())
+ return 1;
+
+ return 0;
+}
+
+late_initcall(core_load);
diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c
new file mode 100644
index 000000000..c5a09789e
--- /dev/null
+++ b/kernel/power/tuxonice_incremental.c
@@ -0,0 +1,402 @@
+/*
+ * kernel/power/tuxonice_incremental.c
+ *
+ * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains routines related to storing incremental images - that
+ * is, retaining an image after an initial cycle and then storing incremental
+ * changes on subsequent hibernations.
+ *
+ * Based in part on on...
+ *
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/mm.h>
+#include <linux/tuxonice.h>
+#include <linux/sched.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+#include "tuxonice_pageflags.h"
+#include "tuxonice_builtin.h"
+#include "power.h"
+
+int toi_do_incremental_initcall;
+
+extern void kdb_init(int level);
+extern noinline void kgdb_breakpoint(void);
+
+#undef pr_debug
+#if 0
+#define pr_debug(a, b...) do { printk(a, ##b); } while(0)
+#else
+#define pr_debug(a, b...) do { } while(0)
+#endif
+
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+
+/*
+ * This function gets called on a break in a continuous series
+ * of PTE entries; the next one is different so we need to
+ * print what we collected so far.
+ */
+static void note_page(void *addr)
+{
+ static struct page *lastpage;
+ struct page *page;
+
+ page = virt_to_page(addr);
+
+ if (page != lastpage) {
+ unsigned int level;
+ pte_t *pte = lookup_address((unsigned long) addr, &level);
+ struct page *pt_page2 = pte_page(*pte);
+ //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2));
+ SetPageTOI_Untracked(pt_page2);
+ lastpage = page;
+ }
+}
+
+static void walk_pte_level(pmd_t addr)
+{
+ int i;
+ pte_t *start;
+
+ start = (pte_t *) pmd_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ note_page(start);
+ start++;
+ }
+}
+
+#if PTRS_PER_PMD > 1
+
+static void walk_pmd_level(pud_t addr)
+{
+ int i;
+ pmd_t *start;
+
+ start = (pmd_t *) pud_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(*start)) {
+ if (pmd_large(*start) || !pmd_present(*start))
+ note_page(start);
+ else
+ walk_pte_level(*start);
+ } else
+ note_page(start);
+ start++;
+ }
+}
+
+#else
+#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a)))
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a) pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(pgd_t addr)
+{
+ int i;
+ pud_t *start;
+
+ start = (pud_t *) pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (!pud_none(*start)) {
+ if (pud_large(*start) || !pud_present(*start))
+ note_page(start);
+ else
+ walk_pmd_level(*start);
+ } else
+ note_page(start);
+
+ start++;
+ }
+}
+
+#else
+#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a)))
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#endif
+
+/*
+ * Not static in the original at the time of writing, so needs renaming here.
+ */
+static void toi_ptdump_walk_pgd_level(pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+ pgd_t *start = swapper_pg_dir;
+#endif
+ int i;
+ if (pgd) {
+ start = pgd;
+ }
+
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (!pgd_none(*start)) {
+ if (pgd_large(*start) || !pgd_present(*start))
+ note_page(start);
+ else
+ walk_pud_level(*start);
+ } else
+ note_page(start);
+
+ start++;
+ }
+
+ /* Flush out the last page */
+ note_page(start);
+}
+
+#ifdef CONFIG_PARAVIRT
+extern struct pv_info pv_info;
+
+static void toi_set_paravirt_ops_untracked(void) {
+ int i;
+
+ unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)),
+ pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end));
+ //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end);
+ for (i = pvpfn; i <= pvpfn_end; i++) {
+ SetPageTOI_Untracked(pfn_to_page(i));
+ }
+}
+#else
+#define toi_set_paravirt_ops_untracked() { do { } while(0) }
+#endif
+
+extern void toi_mark_per_cpus_pages_untracked(void);
+
+void toi_untrack_stack(unsigned long *stack)
+{
+ int i;
+ struct page *stack_page = virt_to_page(stack);
+
+ for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) {
+ pr_debug("Untrack stack page %p.\n", page_address(stack_page + i));
+ SetPageTOI_Untracked(stack_page + i);
+ }
+}
+void toi_untrack_process(struct task_struct *p)
+{
+ SetPageTOI_Untracked(virt_to_page(p));
+ pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p)));
+
+ toi_untrack_stack(p->stack);
+}
+
+void toi_generate_untracked_map(void)
+{
+ struct task_struct *p, *t;
+ struct page *page;
+ pte_t *pte;
+ int i;
+ unsigned int level;
+ static int been_here = 0;
+
+ if (been_here)
+ return;
+
+ been_here = 1;
+
+ /* Pagetable pages */
+ toi_ptdump_walk_pgd_level(NULL);
+
+ /* Printk buffer - not normally needed but can be helpful for debugging. */
+ //toi_set_logbuf_untracked();
+
+ /* Paravirt ops */
+ toi_set_paravirt_ops_untracked();
+
+ /* Task structs and stacks */
+ for_each_process_thread(p, t) {
+ toi_untrack_process(p);
+ //toi_untrack_stack((unsigned long *) t->thread.sp);
+ }
+
+ for (i = 0; i < NR_CPUS; i++) {
+ struct task_struct *idle = idle_task(i);
+
+ if (idle) {
+ pr_debug("Untrack idle process for CPU %d.\n", i);
+ toi_untrack_process(idle);
+ }
+
+ /* IRQ stack */
+ pr_debug("Untrack IRQ stack for CPU %d.\n", i);
+ toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i));
+ }
+
+ /* Per CPU data */
+ //pr_debug("Untracking per CPU variable pages.\n");
+ toi_mark_per_cpus_pages_untracked();
+
+ /* Init stack - for bringing up secondary CPUs */
+ page = virt_to_page(init_stack);
+ for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) {
+ SetPageTOI_Untracked(page + i);
+ }
+
+ pte = lookup_address((unsigned long) &mmu_cr4_features, &level);
+ SetPageTOI_Untracked(pte_page(*pte));
+ SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features));
+}
+
+/**
+ * toi_reset_dirtiness_one
+ */
+
+void toi_reset_dirtiness_one(unsigned long pfn, int verbose)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ /**
+ * Don't worry about whether the Dirty flag is
+ * already set. If this is our first call, it
+ * won't be.
+ */
+
+ preempt_disable();
+
+ ClearPageTOI_Dirty(page);
+ SetPageTOI_RO(page);
+ if (verbose)
+ printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page));
+
+ set_memory_ro((unsigned long) page_address(page), 1);
+
+ preempt_enable();
+}
+
+/**
+ * TuxOnIce's incremental image support works by marking all memory apart from
+ * the page tables read-only, then in the page-faults that result enabling
+ * writing if appropriate and flagging the page as dirty. Free pages are also
+ * marked as dirty and not protected so that if allocated, they will be included
+ * in the image without further processing.
+ *
+ * toi_reset_dirtiness is called when and image exists and incremental images are
+ * enabled, and each time we resume thereafter. It is not invoked on a fresh boot.
+ *
+ * This routine should be called from a single-cpu-running context to avoid races in setting
+ * page dirty/read only flags.
+ *
+ * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it!
+ *
+ * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c.
+ **/
+
+int toi_reset_dirtiness(int verbose)
+{
+ struct zone *zone;
+ unsigned long loop;
+ int allocated_map = 0;
+
+ toi_generate_untracked_map();
+
+ if (!free_map) {
+ if (!toi_alloc_bitmap(&free_map))
+ return -ENOMEM;
+ allocated_map = 1;
+ }
+
+ toi_generate_free_page_map();
+
+ pr_debug(KERN_EMERG "Reset dirtiness.\n");
+ for_each_populated_zone(zone) {
+ // 64 bit only. No need to worry about highmem.
+ for (loop = 0; loop < zone->spanned_pages; loop++) {
+ unsigned long pfn = zone->zone_start_pfn + loop;
+ struct page *page;
+ int chunk_size;
+
+ if (!pfn_valid(pfn)) {
+ continue;
+ }
+
+ chunk_size = toi_size_of_free_region(zone, pfn);
+ if (chunk_size) {
+ loop += chunk_size - 1;
+ continue;
+ }
+
+ page = pfn_to_page(pfn);
+
+ if (PageNosave(page) || !saveable_page(zone, pfn)) {
+ continue;
+ }
+
+ if (PageTOI_Untracked(page)) {
+ continue;
+ }
+
+ /**
+ * Do we need to (re)protect the page?
+ * If it is already protected (PageTOI_RO), there is
+ * nothing to do - skip the following.
+ * If it is marked as dirty (PageTOI_Dirty), it was
+ * either free and has been allocated or has been
+ * written to and marked dirty. Reset the dirty flag
+ * and (re)apply the protection.
+ */
+ if (!PageTOI_RO(page)) {
+ toi_reset_dirtiness_one(pfn, verbose);
+ }
+ }
+ }
+
+ pr_debug(KERN_EMERG "Done resetting dirtiness.\n");
+
+ if (allocated_map) {
+ toi_free_bitmap(&free_map);
+ }
+ return 0;
+}
+
+static int toi_reset_dirtiness_initcall(void)
+{
+ if (toi_do_incremental_initcall) {
+ pr_info("TuxOnIce: Enabling dirty page tracking.\n");
+ toi_reset_dirtiness(0);
+ }
+ return 1;
+}
+extern void toi_generate_untracked_map(void);
+
+// Leave early_initcall for pages to register untracked sections.
+early_initcall(toi_reset_dirtiness_initcall);
+
+static int __init toi_incremental_initcall_setup(char *str)
+{
+ int value;
+
+ if (sscanf(str, "=%d", &value) && value)
+ toi_do_incremental_initcall = value;
+
+ return 1;
+}
+__setup("toi_incremental_initcall", toi_incremental_initcall_setup);
diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c
new file mode 100644
index 000000000..91b0c4fd0
--- /dev/null
+++ b/kernel/power/tuxonice_io.c
@@ -0,0 +1,1932 @@
+/*
+ * kernel/power/tuxonice_io.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains high level IO routines for hibernating.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/version.h>
+#include <linux/utsname.h>
+#include <linux/mount.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/cpu.h>
+#include <linux/fs_struct.h>
+#include <linux/bio.h>
+#include <linux/fs_uuid.h>
+#include <linux/kmod.h>
+#include <asm/tlbflush.h>
+
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_pageflags.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_alloc.h"
+char alt_resume_param[256];
+
+/* Version read from image header at resume */
+static int toi_image_header_version;
+
+#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \
+ if (likely(toi_image_header_version >= VERS)) \
+ if (toiActiveAllocator->rw_header_chunk(READ, NULL, \
+ (char *) &VAR, sizeof(VAR))) { \
+ abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \
+ ERR_ACT; \
+ } \
+} while(0) \
+
+/* Variables shared between threads and updated under the mutex */
+static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result;
+static int io_index, io_nextupdate, io_pc, io_pc_step;
+static DEFINE_MUTEX(io_mutex);
+static DEFINE_PER_CPU(struct page *, last_sought);
+static DEFINE_PER_CPU(struct page *, last_high_page);
+static DEFINE_PER_CPU(char *, checksum_locn);
+static DEFINE_PER_CPU(struct pbe *, last_low_page);
+static atomic_t io_count;
+atomic_t toi_io_workers;
+
+static int using_flusher;
+
+DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher);
+
+int toi_bio_queue_flusher_should_finish;
+
+int toi_max_workers;
+
+static char *image_version_error = "The image header version is newer than " \
+ "this kernel supports.";
+
+struct toi_module_ops *first_filter;
+
+static atomic_t toi_num_other_threads;
+static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue);
+enum toi_worker_commands {
+ TOI_IO_WORKER_STOP,
+ TOI_IO_WORKER_RUN,
+ TOI_IO_WORKER_EXIT
+};
+static enum toi_worker_commands toi_worker_command;
+
+/**
+ * toi_attempt_to_parse_resume_device - determine if we can hibernate
+ *
+ * Can we hibernate, using the current resume= parameter?
+ **/
+int toi_attempt_to_parse_resume_device(int quiet)
+{
+ struct list_head *Allocator;
+ struct toi_module_ops *thisAllocator;
+ int result, returning = 0;
+
+ if (toi_activate_storage(0))
+ return 0;
+
+ toiActiveAllocator = NULL;
+ clear_toi_state(TOI_RESUME_DEVICE_OK);
+ clear_toi_state(TOI_CAN_RESUME);
+ clear_result_state(TOI_ABORTED);
+
+ if (!toiNumAllocators) {
+ if (!quiet)
+ printk(KERN_INFO "TuxOnIce: No storage allocators have "
+ "been registered. Hibernating will be "
+ "disabled.\n");
+ goto cleanup;
+ }
+
+ list_for_each(Allocator, &toiAllocators) {
+ thisAllocator = list_entry(Allocator, struct toi_module_ops,
+ type_list);
+
+ /*
+ * Not sure why you'd want to disable an allocator, but
+ * we should honour the flag if we're providing it
+ */
+ if (!thisAllocator->enabled)
+ continue;
+
+ result = thisAllocator->parse_sig_location(
+ resume_file, (toiNumAllocators == 1),
+ quiet);
+
+ switch (result) {
+ case -EINVAL:
+ /* For this allocator, but not a valid
+ * configuration. Error already printed. */
+ goto cleanup;
+
+ case 0:
+ /* For this allocator and valid. */
+ toiActiveAllocator = thisAllocator;
+
+ set_toi_state(TOI_RESUME_DEVICE_OK);
+ set_toi_state(TOI_CAN_RESUME);
+ returning = 1;
+ goto cleanup;
+ }
+ }
+ if (!quiet)
+ printk(KERN_INFO "TuxOnIce: No matching enabled allocator "
+ "found. Resuming disabled.\n");
+cleanup:
+ toi_deactivate_storage(0);
+ return returning;
+}
+
+void attempt_to_parse_resume_device2(void)
+{
+ toi_prepare_usm();
+ toi_attempt_to_parse_resume_device(0);
+ toi_cleanup_usm();
+}
+
+void save_restore_alt_param(int replace, int quiet)
+{
+ static char resume_param_save[255];
+ static unsigned long toi_state_save;
+
+ if (replace) {
+ toi_state_save = toi_state;
+ strcpy(resume_param_save, resume_file);
+ strcpy(resume_file, alt_resume_param);
+ } else {
+ strcpy(resume_file, resume_param_save);
+ toi_state = toi_state_save;
+ }
+ toi_attempt_to_parse_resume_device(quiet);
+}
+
+void attempt_to_parse_alt_resume_param(void)
+{
+ int ok = 0;
+
+ /* Temporarily set resume_param to the poweroff value */
+ if (!strlen(alt_resume_param))
+ return;
+
+ printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n");
+ save_restore_alt_param(SAVE, NOQUIET);
+ if (test_toi_state(TOI_CAN_RESUME))
+ ok = 1;
+
+ printk(KERN_INFO "=== Done ===\n");
+ save_restore_alt_param(RESTORE, QUIET);
+
+ /* If not ok, clear the string */
+ if (ok)
+ return;
+
+ printk(KERN_INFO "Can't resume from that location; clearing "
+ "alt_resume_param.\n");
+ alt_resume_param[0] = '\0';
+}
+
+/**
+ * noresume_reset_modules - reset data structures in case of non resuming
+ *
+ * When we read the start of an image, modules (and especially the
+ * active allocator) might need to reset data structures if we
+ * decide to remove the image rather than resuming from it.
+ **/
+static void noresume_reset_modules(void)
+{
+ struct toi_module_ops *this_filter;
+
+ list_for_each_entry(this_filter, &toi_filters, type_list)
+ if (this_filter->noresume_reset)
+ this_filter->noresume_reset();
+
+ if (toiActiveAllocator && toiActiveAllocator->noresume_reset)
+ toiActiveAllocator->noresume_reset();
+}
+
+/**
+ * fill_toi_header - fill the hibernate header structure
+ * @struct toi_header: Header data structure to be filled.
+ **/
+static int fill_toi_header(struct toi_header *sh)
+{
+ int i, error;
+
+ error = init_header((struct swsusp_info *) sh);
+ if (error)
+ return error;
+
+ sh->pagedir = pagedir1;
+ sh->pageset_2_size = pagedir2.size;
+ sh->param0 = toi_result;
+ sh->param1 = toi_bkd.toi_action;
+ sh->param2 = toi_bkd.toi_debug_state;
+ sh->param3 = toi_bkd.toi_default_console_level;
+ sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev;
+ for (i = 0; i < 4; i++)
+ sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2];
+ sh->bkd = boot_kernel_data_buffer;
+ return 0;
+}
+
+/**
+ * rw_init_modules - initialize modules
+ * @rw: Whether we are reading of writing an image.
+ * @which: Section of the image being processed.
+ *
+ * Iterate over modules, preparing the ones that will be used to read or write
+ * data.
+ **/
+static int rw_init_modules(int rw, int which)
+{
+ struct toi_module_ops *this_module;
+ /* Initialise page transformers */
+ list_for_each_entry(this_module, &toi_filters, type_list) {
+ if (!this_module->enabled)
+ continue;
+ if (this_module->rw_init && this_module->rw_init(rw, which)) {
+ abort_hibernate(TOI_FAILED_MODULE_INIT,
+ "Failed to initialize the %s filter.",
+ this_module->name);
+ return 1;
+ }
+ }
+
+ /* Initialise allocator */
+ if (toiActiveAllocator->rw_init(rw, which)) {
+ abort_hibernate(TOI_FAILED_MODULE_INIT,
+ "Failed to initialise the allocator.");
+ return 1;
+ }
+
+ /* Initialise other modules */
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled ||
+ this_module->type == FILTER_MODULE ||
+ this_module->type == WRITER_MODULE)
+ continue;
+ if (this_module->rw_init && this_module->rw_init(rw, which)) {
+ set_abort_result(TOI_FAILED_MODULE_INIT);
+ printk(KERN_INFO "Setting aborted flag due to module "
+ "init failure.\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * rw_cleanup_modules - cleanup modules
+ * @rw: Whether we are reading of writing an image.
+ *
+ * Cleanup components after reading or writing a set of pages.
+ * Only the allocator may fail.
+ **/
+static int rw_cleanup_modules(int rw)
+{
+ struct toi_module_ops *this_module;
+ int result = 0;
+
+ /* Cleanup other modules */
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled ||
+ this_module->type == FILTER_MODULE ||
+ this_module->type == WRITER_MODULE)
+ continue;
+ if (this_module->rw_cleanup)
+ result |= this_module->rw_cleanup(rw);
+ }
+
+ /* Flush data and cleanup */
+ list_for_each_entry(this_module, &toi_filters, type_list) {
+ if (!this_module->enabled)
+ continue;
+ if (this_module->rw_cleanup)
+ result |= this_module->rw_cleanup(rw);
+ }
+
+ result |= toiActiveAllocator->rw_cleanup(rw);
+
+ return result;
+}
+
+static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high)
+{
+ int index, min, max;
+ struct page *high_page = NULL,
+ **my_last_high_page = raw_cpu_ptr(&last_high_page),
+ **my_last_sought = raw_cpu_ptr(&last_sought);
+ struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page);
+ void *compare;
+
+ if (is_high) {
+ if (*my_last_sought && *my_last_high_page &&
+ *my_last_sought < orig_page)
+ high_page = *my_last_high_page;
+ else
+ high_page = (struct page *) restore_highmem_pblist;
+ this = (struct pbe *) kmap(high_page);
+ compare = orig_page;
+ } else {
+ if (*my_last_sought && *my_last_low_page &&
+ *my_last_sought < orig_page)
+ this = *my_last_low_page;
+ else
+ this = restore_pblist;
+ compare = page_address(orig_page);
+ }
+
+ *my_last_sought = orig_page;
+
+ /* Locate page containing pbe */
+ while (this[PBES_PER_PAGE - 1].next &&
+ this[PBES_PER_PAGE - 1].orig_address < compare) {
+ if (is_high) {
+ struct page *next_high_page = (struct page *)
+ this[PBES_PER_PAGE - 1].next;
+ kunmap(high_page);
+ this = kmap(next_high_page);
+ high_page = next_high_page;
+ } else
+ this = this[PBES_PER_PAGE - 1].next;
+ }
+
+ /* Do a binary search within the page */
+ min = 0;
+ max = PBES_PER_PAGE;
+ index = PBES_PER_PAGE / 2;
+ while (max - min) {
+ if (!this[index].orig_address ||
+ this[index].orig_address > compare)
+ max = index;
+ else if (this[index].orig_address == compare) {
+ if (is_high) {
+ struct page *page = this[index].address;
+ *my_last_high_page = high_page;
+ kunmap(high_page);
+ return page;
+ }
+ *my_last_low_page = this;
+ return virt_to_page(this[index].address);
+ } else
+ min = index;
+ index = ((max + min) / 2);
+ };
+
+ if (is_high)
+ kunmap(high_page);
+
+ abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for"
+ " orig page %p. This[min].orig_address=%p.\n", orig_page,
+ this[index].orig_address);
+ return NULL;
+}
+
+/**
+ * write_next_page - write the next page in a pageset
+ * @data_pfn: The pfn where the next data to write is located.
+ * @my_io_index: The index of the page in the pageset.
+ * @write_pfn: The pfn number to write in the image (where the data belongs).
+ *
+ * Get the pfn of the next page to write, map the page if necessary and do the
+ * write.
+ **/
+static int write_next_page(unsigned long *data_pfn, int *my_io_index,
+ unsigned long *write_pfn)
+{
+ struct page *page;
+ char **my_checksum_locn = raw_cpu_ptr(&checksum_locn);
+ int result = 0, was_present;
+
+ *data_pfn = memory_bm_next_pfn(io_map, 0);
+
+ /* Another thread could have beaten us to it. */
+ if (*data_pfn == BM_END_OF_MAP) {
+ if (atomic_read(&io_count)) {
+ printk(KERN_INFO "Ran out of pfns but io_count is "
+ "still %d.\n", atomic_read(&io_count));
+ BUG();
+ }
+ mutex_unlock(&io_mutex);
+ return -ENODATA;
+ }
+
+ *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
+
+ memory_bm_clear_bit(io_map, 0, *data_pfn);
+ page = pfn_to_page(*data_pfn);
+
+ was_present = kernel_page_present(page);
+ if (!was_present)
+ kernel_map_pages(page, 1, 1);
+
+ if (io_pageset == 1)
+ *write_pfn = memory_bm_next_pfn(pageset1_map, 0);
+ else {
+ *write_pfn = *data_pfn;
+ *my_checksum_locn = tuxonice_get_next_checksum();
+ }
+
+ TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index);
+
+ mutex_unlock(&io_mutex);
+
+ if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn))
+ return 1;
+
+ result = first_filter->write_page(*write_pfn, TOI_PAGE, page,
+ PAGE_SIZE);
+
+ if (!was_present)
+ kernel_map_pages(page, 1, 0);
+
+ return result;
+}
+
+/**
+ * read_next_page - read the next page in a pageset
+ * @my_io_index: The index of the page in the pageset.
+ * @write_pfn: The pfn in which the data belongs.
+ *
+ * Read a page of the image into our buffer. It can happen (here and in the
+ * write routine) that threads don't get run until after other CPUs have done
+ * all the work. This was the cause of the long standing issue with
+ * occasionally getting -ENODATA errors at the end of reading the image. We
+ * therefore need to check there's actually a page to read before trying to
+ * retrieve one.
+ **/
+
+static int read_next_page(int *my_io_index, unsigned long *write_pfn,
+ struct page *buffer)
+{
+ unsigned int buf_size = PAGE_SIZE;
+ unsigned long left = atomic_read(&io_count);
+
+ if (!left)
+ return -ENODATA;
+
+ /* Start off assuming the page we read isn't resaved */
+ *my_io_index = io_finish_at - atomic_sub_return(1, &io_count);
+
+ mutex_unlock(&io_mutex);
+
+ /*
+ * Are we aborting? If so, don't submit any more I/O as
+ * resetting the resume_attempted flag (from ui.c) will
+ * clear the bdev flags, making this thread oops.
+ */
+ if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
+ atomic_dec(&toi_io_workers);
+ if (!atomic_read(&toi_io_workers)) {
+ /*
+ * So we can be sure we'll have memory for
+ * marking that we haven't resumed.
+ */
+ rw_cleanup_modules(READ);
+ set_toi_state(TOI_IO_STOPPED);
+ }
+ while (1)
+ schedule();
+ }
+
+ /*
+ * See toi_bio_read_page in tuxonice_bio.c:
+ * read the next page in the image.
+ */
+ return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size);
+}
+
+static void use_read_page(unsigned long write_pfn, struct page *buffer)
+{
+ struct page *final_page = pfn_to_page(write_pfn),
+ *copy_page = final_page;
+ char *virt, *buffer_virt;
+ int was_present, cpu = smp_processor_id();
+ unsigned long idx = 0;
+
+ if (io_pageset == 1 && (!pageset1_copy_map ||
+ !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) {
+ int is_high = PageHighMem(final_page);
+ copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high);
+ }
+
+ if (!memory_bm_test_bit(io_map, cpu, write_pfn)) {
+ int test = !memory_bm_test_bit(io_map, cpu, write_pfn);
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test);
+ mutex_lock(&io_mutex);
+ idx = atomic_add_return(1, &io_count);
+ mutex_unlock(&io_mutex);
+ return;
+ }
+
+ virt = kmap(copy_page);
+ buffer_virt = kmap(buffer);
+ was_present = kernel_page_present(copy_page);
+ if (!was_present)
+ kernel_map_pages(copy_page, 1, 1);
+ memcpy(virt, buffer_virt, PAGE_SIZE);
+ if (!was_present)
+ kernel_map_pages(copy_page, 1, 0);
+ kunmap(copy_page);
+ kunmap(buffer);
+ memory_bm_clear_bit(io_map, cpu, write_pfn);
+ TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset);
+}
+
+static unsigned long status_update(int writing, unsigned long done,
+ unsigned long ticks)
+{
+ int cs_index = writing ? 0 : 1;
+ unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks;
+ unsigned long msec = jiffies_to_msecs(abs(ticks_so_far));
+ unsigned long pgs_per_s, estimate = 0, pages_left;
+
+ if (msec) {
+ pages_left = io_barmax - done;
+ pgs_per_s = 1000 * done / msec;
+ if (pgs_per_s)
+ estimate = DIV_ROUND_UP(pages_left, pgs_per_s);
+ }
+
+ if (estimate && ticks > HZ / 2)
+ return toi_update_status(done, io_barmax,
+ " %d/%d MB (%lu sec left)",
+ MB(done+1), MB(io_barmax), estimate);
+
+ return toi_update_status(done, io_barmax, " %d/%d MB",
+ MB(done+1), MB(io_barmax));
+}
+
+/**
+ * worker_rw_loop - main loop to read/write pages
+ *
+ * The main I/O loop for reading or writing pages. The io_map bitmap is used to
+ * track the pages to read/write.
+ * If we are reading, the pages are loaded to their final (mapped) pfn.
+ * Data is non zero iff this is a thread started via start_other_threads.
+ * In that case, we stay in here until told to quit.
+ **/
+static int worker_rw_loop(void *data)
+{
+ unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4,
+ jif_index = 1, start_time = jiffies, thread_num;
+ int result = 0, my_io_index = 0, last_worker;
+ struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP);
+ cpumask_var_t orig_mask;
+
+ if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) {
+ printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data);
+ result = -ENOMEM;
+ goto out;
+ }
+
+ cpumask_copy(orig_mask, tsk_cpus_allowed(current));
+
+ current->flags |= PF_NOFREEZE;
+
+top:
+ mutex_lock(&io_mutex);
+ thread_num = atomic_read(&toi_io_workers);
+
+ cpumask_copy(tsk_cpus_allowed(current), orig_mask);
+ schedule();
+
+ atomic_inc(&toi_io_workers);
+
+ while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) &&
+ !(io_write && test_result_state(TOI_ABORTED)) &&
+ toi_worker_command == TOI_IO_WORKER_RUN) {
+ if (!thread_num && jiffies > next_jiffies) {
+ next_jiffies += HZ / 4;
+ if (toiActiveAllocator->update_throughput_throttle)
+ toiActiveAllocator->update_throughput_throttle(
+ jif_index);
+ jif_index++;
+ }
+
+ /*
+ * What page to use? If reading, don't know yet which page's
+ * data will be read, so always use the buffer. If writing,
+ * use the copy (Pageset1) or original page (Pageset2), but
+ * always write the pfn of the original page.
+ */
+ if (io_write)
+ result = write_next_page(&data_pfn, &my_io_index,
+ &write_pfn);
+ else /* Reading */
+ result = read_next_page(&my_io_index, &write_pfn,
+ buffer);
+
+ if (result) {
+ mutex_lock(&io_mutex);
+ /* Nothing to do? */
+ if (result == -ENODATA) {
+ toi_message(TOI_IO, TOI_VERBOSE, 0,
+ "Thread %d has no more work.",
+ smp_processor_id());
+ break;
+ }
+
+ io_result = result;
+
+ if (io_write) {
+ printk(KERN_INFO "Write chunk returned %d.\n",
+ result);
+ abort_hibernate(TOI_FAILED_IO,
+ "Failed to write a chunk of the "
+ "image.");
+ break;
+ }
+
+ if (io_pageset == 1) {
+ printk(KERN_ERR "\nBreaking out of I/O loop "
+ "because of result code %d.\n", result);
+ break;
+ }
+ panic("Read chunk returned (%d)", result);
+ }
+
+ /*
+ * Discard reads of resaved pages while reading ps2
+ * and unwanted pages while rereading ps2 when aborting.
+ */
+ if (!io_write) {
+ if (!PageResave(pfn_to_page(write_pfn)))
+ use_read_page(write_pfn, buffer);
+ else {
+ mutex_lock(&io_mutex);
+ toi_message(TOI_IO, TOI_VERBOSE, 0,
+ "Resaved %ld.", write_pfn);
+ atomic_inc(&io_count);
+ mutex_unlock(&io_mutex);
+ }
+ }
+
+ if (!thread_num) {
+ if(my_io_index + io_base > io_nextupdate)
+ io_nextupdate = status_update(io_write,
+ my_io_index + io_base,
+ jiffies - start_time);
+
+ if (my_io_index > io_pc) {
+ printk(KERN_CONT "...%d%%", 20 * io_pc_step);
+ io_pc_step++;
+ io_pc = io_finish_at * io_pc_step / 5;
+ }
+ }
+
+ toi_cond_pause(0, NULL);
+
+ /*
+ * Subtle: If there's less I/O still to be done than threads
+ * running, quit. This stops us doing I/O beyond the end of
+ * the image when reading.
+ *
+ * Possible race condition. Two threads could do the test at
+ * the same time; one should exit and one should continue.
+ * Therefore we take the mutex before comparing and exiting.
+ */
+
+ mutex_lock(&io_mutex);
+ }
+
+ last_worker = atomic_dec_and_test(&toi_io_workers);
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers));
+ mutex_unlock(&io_mutex);
+
+ if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) {
+ /* Were we the last thread and we're using a flusher thread? */
+ if (last_worker && using_flusher) {
+ toiActiveAllocator->finish_all_io();
+ }
+ /* First, if we're doing I/O, wait for it to finish */
+ wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN);
+ /* Then wait to be told what to do next */
+ wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP);
+ if (toi_worker_command == TOI_IO_WORKER_RUN)
+ goto top;
+ }
+
+ if (thread_num)
+ atomic_dec(&toi_num_other_threads);
+
+out:
+ toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num);
+ toi__free_page(28, buffer);
+ free_cpumask_var(orig_mask);
+
+ return result;
+}
+
+int toi_start_other_threads(void)
+{
+ int cpu;
+ struct task_struct *p;
+ int to_start = (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1;
+ unsigned long num_started = 0;
+
+ if (test_action_state(TOI_NO_MULTITHREADED_IO))
+ return 0;
+
+ toi_worker_command = TOI_IO_WORKER_STOP;
+
+ for_each_online_cpu(cpu) {
+ if (num_started == to_start)
+ break;
+
+ if (cpu == smp_processor_id())
+ continue;
+
+ p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1,
+ cpu_to_node(cpu), "ktoi_io/%d", cpu);
+ if (IS_ERR(p)) {
+ printk(KERN_ERR "ktoi_io for %i failed\n", cpu);
+ continue;
+ }
+ kthread_bind(p, cpu);
+ p->flags |= PF_MEMALLOC;
+ wake_up_process(p);
+ num_started++;
+ atomic_inc(&toi_num_other_threads);
+ }
+
+ toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started);
+ return num_started;
+}
+
+void toi_stop_other_threads(void)
+{
+ toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads.");
+ toi_worker_command = TOI_IO_WORKER_EXIT;
+ wake_up(&toi_worker_wait_queue);
+}
+
+/**
+ * do_rw_loop - main highlevel function for reading or writing pages
+ *
+ * Create the io_map bitmap and call worker_rw_loop to perform I/O operations.
+ **/
+static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags,
+ int base, int barmax, int pageset)
+{
+ int index = 0, cpu, result = 0, workers_started;
+ unsigned long pfn, next;
+
+ first_filter = toi_get_next_filter(NULL);
+
+ if (!finish_at)
+ return 0;
+
+ io_write = write;
+ io_finish_at = finish_at;
+ io_base = base;
+ io_barmax = barmax;
+ io_pageset = pageset;
+ io_index = 0;
+ io_pc = io_finish_at / 5;
+ io_pc_step = 1;
+ io_result = 0;
+ io_nextupdate = base + 1;
+ toi_bio_queue_flusher_should_finish = 0;
+
+ for_each_online_cpu(cpu) {
+ per_cpu(last_sought, cpu) = NULL;
+ per_cpu(last_low_page, cpu) = NULL;
+ per_cpu(last_high_page, cpu) = NULL;
+ }
+
+ /* Ensure all bits clear */
+ memory_bm_clear(io_map);
+
+ memory_bm_position_reset(io_map);
+ next = memory_bm_next_pfn(io_map, 0);
+
+ BUG_ON(next != BM_END_OF_MAP);
+
+ /* Set the bits for the pages to write */
+ memory_bm_position_reset(pageflags);
+
+ pfn = memory_bm_next_pfn(pageflags, 0);
+ toi_trace_index++;
+
+ while (pfn != BM_END_OF_MAP && index < finish_at) {
+ TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at);
+ memory_bm_set_bit(io_map, 0, pfn);
+ pfn = memory_bm_next_pfn(pageflags, 0);
+ index++;
+ }
+
+ BUG_ON(next != BM_END_OF_MAP || index < finish_at);
+
+ memory_bm_position_reset(io_map);
+ toi_trace_index++;
+
+ atomic_set(&io_count, finish_at);
+
+ memory_bm_position_reset(pageset1_map);
+
+ mutex_lock(&io_mutex);
+
+ clear_toi_state(TOI_IO_STOPPED);
+
+ using_flusher = (atomic_read(&toi_num_other_threads) &&
+ toiActiveAllocator->io_flusher &&
+ !test_action_state(TOI_NO_FLUSHER_THREAD));
+
+ workers_started = atomic_read(&toi_num_other_threads);
+
+ memory_bm_position_reset(io_map);
+ memory_bm_position_reset(pageset1_copy_map);
+
+ toi_worker_command = TOI_IO_WORKER_RUN;
+ wake_up(&toi_worker_wait_queue);
+
+ mutex_unlock(&io_mutex);
+
+ if (using_flusher)
+ result = toiActiveAllocator->io_flusher(write);
+ else
+ worker_rw_loop(NULL);
+
+ while (atomic_read(&toi_io_workers))
+ schedule();
+
+ printk(KERN_CONT "\n");
+
+ toi_worker_command = TOI_IO_WORKER_STOP;
+ wake_up(&toi_worker_wait_queue);
+
+ if (unlikely(test_toi_state(TOI_STOP_RESUME))) {
+ if (!atomic_read(&toi_io_workers)) {
+ rw_cleanup_modules(READ);
+ set_toi_state(TOI_IO_STOPPED);
+ }
+ while (1)
+ schedule();
+ }
+ set_toi_state(TOI_IO_STOPPED);
+
+ if (!io_result && !result && !test_result_state(TOI_ABORTED)) {
+ unsigned long next;
+
+ toi_update_status(io_base + io_finish_at, io_barmax,
+ " %d/%d MB ",
+ MB(io_base + io_finish_at), MB(io_barmax));
+
+ memory_bm_position_reset(io_map);
+ next = memory_bm_next_pfn(io_map, 0);
+ if (next != BM_END_OF_MAP) {
+ printk(KERN_INFO "Finished I/O loop but still work to "
+ "do?\nFinish at = %d. io_count = %d.\n",
+ finish_at, atomic_read(&io_count));
+ printk(KERN_INFO "I/O bitmap still records work to do."
+ "%ld.\n", next);
+ BUG();
+ do {
+ cpu_relax();
+ } while (0);
+ }
+ }
+
+ return io_result ? io_result : result;
+}
+
+/**
+ * write_pageset - write a pageset to disk.
+ * @pagedir: Which pagedir to write.
+ *
+ * Returns:
+ * Zero on success or -1 on failure.
+ **/
+int write_pageset(struct pagedir *pagedir)
+{
+ int finish_at, base = 0;
+ int barmax = pagedir1.size + pagedir2.size;
+ long error = 0;
+ struct memory_bitmap *pageflags;
+ unsigned long start_time, end_time;
+
+ /*
+ * Even if there is nothing to read or write, the allocator
+ * may need the init/cleanup for it's housekeeping. (eg:
+ * Pageset1 may start where pageset2 ends when writing).
+ */
+ finish_at = pagedir->size;
+
+ if (pagedir->id == 1) {
+ toi_prepare_status(DONT_CLEAR_BAR,
+ "Writing kernel & process data...");
+ base = pagedir2.size;
+ if (test_action_state(TOI_TEST_FILTER_SPEED) ||
+ test_action_state(TOI_TEST_BIO))
+ pageflags = pageset1_map;
+ else
+ pageflags = pageset1_copy_map;
+ } else {
+ toi_prepare_status(DONT_CLEAR_BAR, "Writing caches...");
+ pageflags = pageset2_map;
+ }
+
+ start_time = jiffies;
+
+ if (rw_init_modules(WRITE, pagedir->id)) {
+ abort_hibernate(TOI_FAILED_MODULE_INIT,
+ "Failed to initialise modules for writing.");
+ error = 1;
+ }
+
+ if (!error)
+ error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax,
+ pagedir->id);
+
+ if (rw_cleanup_modules(WRITE) && !error) {
+ abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
+ "Failed to cleanup after writing.");
+ error = 1;
+ }
+
+ end_time = jiffies;
+
+ if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
+ toi_bkd.toi_io_time[0][0] += finish_at,
+ toi_bkd.toi_io_time[0][1] += (end_time - start_time);
+ }
+
+ return error;
+}
+
+/**
+ * read_pageset - highlevel function to read a pageset from disk
+ * @pagedir: pageset to read
+ * @overwrittenpagesonly: Whether to read the whole pageset or
+ * only part of it.
+ *
+ * Returns:
+ * Zero on success or -1 on failure.
+ **/
+static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly)
+{
+ int result = 0, base = 0;
+ int finish_at = pagedir->size;
+ int barmax = pagedir1.size + pagedir2.size;
+ struct memory_bitmap *pageflags;
+ unsigned long start_time, end_time;
+
+ if (pagedir->id == 1) {
+ toi_prepare_status(DONT_CLEAR_BAR,
+ "Reading kernel & process data...");
+ pageflags = pageset1_map;
+ } else {
+ toi_prepare_status(DONT_CLEAR_BAR, "Reading caches...");
+ if (overwrittenpagesonly) {
+ barmax = min(pagedir1.size, pagedir2.size);
+ finish_at = min(pagedir1.size, pagedir2.size);
+ } else
+ base = pagedir1.size;
+ pageflags = pageset2_map;
+ }
+
+ start_time = jiffies;
+
+ if (rw_init_modules(READ, pagedir->id)) {
+ toiActiveAllocator->remove_image();
+ result = 1;
+ } else
+ result = do_rw_loop(READ, finish_at, pageflags, base, barmax,
+ pagedir->id);
+
+ if (rw_cleanup_modules(READ) && !result) {
+ abort_hibernate(TOI_FAILED_MODULE_CLEANUP,
+ "Failed to cleanup after reading.");
+ result = 1;
+ }
+
+ /* Statistics */
+ end_time = jiffies;
+
+ if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) {
+ toi_bkd.toi_io_time[1][0] += finish_at,
+ toi_bkd.toi_io_time[1][1] += (end_time - start_time);
+ }
+
+ return result;
+}
+
+/**
+ * write_module_configs - store the modules configuration
+ *
+ * The configuration for each module is stored in the image header.
+ * Returns: Int
+ * Zero on success, Error value otherwise.
+ **/
+static int write_module_configs(void)
+{
+ struct toi_module_ops *this_module;
+ char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP);
+ int len, index = 1;
+ struct toi_module_header toi_module_header;
+
+ if (!buffer) {
+ printk(KERN_INFO "Failed to allocate a buffer for saving "
+ "module configuration info.\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * We have to know which data goes with which module, so we at
+ * least write a length of zero for a module. Note that we are
+ * also assuming every module's config data takes <= PAGE_SIZE.
+ */
+
+ /* For each module (in registration order) */
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled || !this_module->storage_needed ||
+ (this_module->type == WRITER_MODULE &&
+ toiActiveAllocator != this_module))
+ continue;
+
+ /* Get the data from the module */
+ len = 0;
+ if (this_module->save_config_info)
+ len = this_module->save_config_info(buffer);
+
+ /* Save the details of the module */
+ toi_module_header.enabled = this_module->enabled;
+ toi_module_header.type = this_module->type;
+ toi_module_header.index = index++;
+ strncpy(toi_module_header.name, this_module->name,
+ sizeof(toi_module_header.name));
+ toiActiveAllocator->rw_header_chunk(WRITE,
+ this_module,
+ (char *) &toi_module_header,
+ sizeof(toi_module_header));
+
+ /* Save the size of the data and any data returned */
+ toiActiveAllocator->rw_header_chunk(WRITE,
+ this_module,
+ (char *) &len, sizeof(int));
+ if (len)
+ toiActiveAllocator->rw_header_chunk(
+ WRITE, this_module, buffer, len);
+ }
+
+ /* Write a blank header to terminate the list */
+ toi_module_header.name[0] = '\0';
+ toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+ (char *) &toi_module_header, sizeof(toi_module_header));
+
+ toi_free_page(22, (unsigned long) buffer);
+ return 0;
+}
+
+/**
+ * read_one_module_config - read and configure one module
+ *
+ * Read the configuration for one module, and configure the module
+ * to match if it is loaded.
+ *
+ * Returns: Int
+ * Zero on success, Error value otherwise.
+ **/
+static int read_one_module_config(struct toi_module_header *header)
+{
+ struct toi_module_ops *this_module;
+ int result, len;
+ char *buffer;
+
+ /* Find the module */
+ this_module = toi_find_module_given_name(header->name);
+
+ if (!this_module) {
+ if (header->enabled) {
+ toi_early_boot_message(1, TOI_CONTINUE_REQ,
+ "It looks like we need module %s for reading "
+ "the image but it hasn't been registered.\n",
+ header->name);
+ if (!(test_toi_state(TOI_CONTINUE_REQ)))
+ return -EINVAL;
+ } else
+ printk(KERN_INFO "Module %s configuration data found, "
+ "but the module hasn't registered. Looks like "
+ "it was disabled, so we're ignoring its data.",
+ header->name);
+ }
+
+ /* Get the length of the data (if any) */
+ result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len,
+ sizeof(int));
+ if (result) {
+ printk(KERN_ERR "Failed to read the length of the module %s's"
+ " configuration data.\n",
+ header->name);
+ return -EINVAL;
+ }
+
+ /* Read any data and pass to the module (if we found one) */
+ if (!len)
+ return 0;
+
+ buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP);
+
+ if (!buffer) {
+ printk(KERN_ERR "Failed to allocate a buffer for reloading "
+ "module configuration info.\n");
+ return -ENOMEM;
+ }
+
+ toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len);
+
+ if (!this_module)
+ goto out;
+
+ if (!this_module->save_config_info)
+ printk(KERN_ERR "Huh? Module %s appears to have a "
+ "save_config_info, but not a load_config_info "
+ "function!\n", this_module->name);
+ else
+ this_module->load_config_info(buffer, len);
+
+ /*
+ * Now move this module to the tail of its lists. This will put it in
+ * order. Any new modules will end up at the top of the lists. They
+ * should have been set to disabled when loaded (people will
+ * normally not edit an initrd to load a new module and then hibernate
+ * without using it!).
+ */
+
+ toi_move_module_tail(this_module);
+
+ this_module->enabled = header->enabled;
+
+out:
+ toi_free_page(23, (unsigned long) buffer);
+ return 0;
+}
+
+/**
+ * read_module_configs - reload module configurations from the image header.
+ *
+ * Returns: Int
+ * Zero on success or an error code.
+ **/
+static int read_module_configs(void)
+{
+ int result = 0;
+ struct toi_module_header toi_module_header;
+ struct toi_module_ops *this_module;
+
+ /* All modules are initially disabled. That way, if we have a module
+ * loaded now that wasn't loaded when we hibernated, it won't be used
+ * in trying to read the data.
+ */
+ list_for_each_entry(this_module, &toi_modules, module_list)
+ this_module->enabled = 0;
+
+ /* Get the first module header */
+ result = toiActiveAllocator->rw_header_chunk(READ, NULL,
+ (char *) &toi_module_header,
+ sizeof(toi_module_header));
+ if (result) {
+ printk(KERN_ERR "Failed to read the next module header.\n");
+ return -EINVAL;
+ }
+
+ /* For each module (in registration order) */
+ while (toi_module_header.name[0]) {
+ result = read_one_module_config(&toi_module_header);
+
+ if (result)
+ return -EINVAL;
+
+ /* Get the next module header */
+ result = toiActiveAllocator->rw_header_chunk(READ, NULL,
+ (char *) &toi_module_header,
+ sizeof(toi_module_header));
+
+ if (result) {
+ printk(KERN_ERR "Failed to read the next module "
+ "header.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev)
+{
+ return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 0 : 1;
+}
+
+int fs_info_space_needed(void)
+{
+ const struct super_block *sb;
+ int result = sizeof(int);
+
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ struct fs_info *fs;
+
+ if (!sb->s_bdev)
+ continue;
+
+ fs = fs_info_from_block_dev(sb->s_bdev);
+ if (save_fs_info(fs, sb->s_bdev))
+ result += 16 + sizeof(dev_t) + sizeof(int) +
+ fs->last_mount_size;
+ free_fs_info(fs);
+ }
+ return result;
+}
+
+static int fs_info_num_to_save(void)
+{
+ const struct super_block *sb;
+ int to_save = 0;
+
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ struct fs_info *fs;
+
+ if (!sb->s_bdev)
+ continue;
+
+ fs = fs_info_from_block_dev(sb->s_bdev);
+ if (save_fs_info(fs, sb->s_bdev))
+ to_save++;
+ free_fs_info(fs);
+ }
+
+ return to_save;
+}
+
+static int fs_info_save(void)
+{
+ const struct super_block *sb;
+ int to_save = fs_info_num_to_save();
+
+ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save,
+ sizeof(int))) {
+ abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info"
+ " to save.");
+ return -EIO;
+ }
+
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ struct fs_info *fs;
+
+ if (!sb->s_bdev)
+ continue;
+
+ fs = fs_info_from_block_dev(sb->s_bdev);
+ if (save_fs_info(fs, sb->s_bdev)) {
+ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+ &fs->uuid[0], 16)) {
+ abort_hibernate(TOI_FAILED_IO, "Failed to "
+ "write uuid.");
+ return -EIO;
+ }
+ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+ (char *) &fs->dev_t, sizeof(dev_t))) {
+ abort_hibernate(TOI_FAILED_IO, "Failed to "
+ "write dev_t.");
+ return -EIO;
+ }
+ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+ (char *) &fs->last_mount_size, sizeof(int))) {
+ abort_hibernate(TOI_FAILED_IO, "Failed to "
+ "write last mount length.");
+ return -EIO;
+ }
+ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+ fs->last_mount, fs->last_mount_size)) {
+ abort_hibernate(TOI_FAILED_IO, "Failed to "
+ "write uuid.");
+ return -EIO;
+ }
+ }
+ free_fs_info(fs);
+ }
+ return 0;
+}
+
+static int fs_info_load_and_check_one(void)
+{
+ char uuid[16], *last_mount;
+ int result = 0, ln;
+ dev_t dev_t;
+ struct block_device *dev;
+ struct fs_info *fs_info, seek;
+
+ if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) {
+ abort_hibernate(TOI_FAILED_IO, "Failed to read uuid.");
+ return -EIO;
+ }
+
+ read_if_version(3, dev_t, "uuid dev_t field", return -EIO);
+
+ if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln,
+ sizeof(int))) {
+ abort_hibernate(TOI_FAILED_IO,
+ "Failed to read last mount size.");
+ return -EIO;
+ }
+
+ last_mount = kzalloc(ln, GFP_KERNEL);
+
+ if (!last_mount)
+ return -ENOMEM;
+
+ if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) {
+ abort_hibernate(TOI_FAILED_IO,
+ "Failed to read last mount timestamp.");
+ result = -EIO;
+ goto out_lmt;
+ }
+
+ strncpy((char *) &seek.uuid, uuid, 16);
+ seek.dev_t = dev_t;
+ seek.last_mount_size = ln;
+ seek.last_mount = last_mount;
+ dev_t = blk_lookup_fs_info(&seek);
+ if (!dev_t)
+ goto out_lmt;
+
+ dev = toi_open_by_devnum(dev_t);
+
+ fs_info = fs_info_from_block_dev(dev);
+ if (fs_info && !IS_ERR(fs_info)) {
+ if (ln != fs_info->last_mount_size) {
+ printk(KERN_EMERG "Found matching uuid but last mount "
+ "time lengths differ?! "
+ "(%d vs %d).\n", ln,
+ fs_info->last_mount_size);
+ result = -EINVAL;
+ } else {
+ char buf[BDEVNAME_SIZE];
+ result = !!memcmp(fs_info->last_mount, last_mount, ln);
+ if (result)
+ printk(KERN_EMERG "Last mount time for %s has "
+ "changed!\n", bdevname(dev, buf));
+ }
+ }
+ toi_close_bdev(dev);
+ free_fs_info(fs_info);
+out_lmt:
+ kfree(last_mount);
+ return result;
+}
+
+static int fs_info_load_and_check(void)
+{
+ int to_do, result = 0;
+
+ if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do,
+ sizeof(int))) {
+ abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info "
+ "to load.");
+ return -EIO;
+ }
+
+ while(to_do--)
+ result |= fs_info_load_and_check_one();
+
+ return result;
+}
+
+/**
+ * write_image_header - write the image header after write the image proper
+ *
+ * Returns: Int
+ * Zero on success, error value otherwise.
+ **/
+int write_image_header(void)
+{
+ int ret;
+ int total = pagedir1.size + pagedir2.size+2;
+ char *header_buffer = NULL;
+
+ /* Now prepare to write the header */
+ ret = toiActiveAllocator->write_header_init();
+ if (ret) {
+ abort_hibernate(TOI_FAILED_MODULE_INIT,
+ "Active allocator's write_header_init"
+ " function failed.");
+ goto write_image_header_abort;
+ }
+
+ /* Get a buffer */
+ header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
+ if (!header_buffer) {
+ abort_hibernate(TOI_OUT_OF_MEMORY,
+ "Out of memory when trying to get page for header!");
+ goto write_image_header_abort;
+ }
+
+ /* Write hibernate header */
+ if (fill_toi_header((struct toi_header *) header_buffer)) {
+ abort_hibernate(TOI_OUT_OF_MEMORY,
+ "Failure to fill header information!");
+ goto write_image_header_abort;
+ }
+
+ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+ header_buffer, sizeof(struct toi_header))) {
+ abort_hibernate(TOI_OUT_OF_MEMORY,
+ "Failure to write header info.");
+ goto write_image_header_abort;
+ }
+
+ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
+ (char *) &toi_max_workers, sizeof(toi_max_workers))) {
+ abort_hibernate(TOI_OUT_OF_MEMORY,
+ "Failure to number of workers to use.");
+ goto write_image_header_abort;
+ }
+
+ /* Write filesystem info */
+ if (fs_info_save())
+ goto write_image_header_abort;
+
+ /* Write module configurations */
+ ret = write_module_configs();
+ if (ret) {
+ abort_hibernate(TOI_FAILED_IO,
+ "Failed to write module configs.");
+ goto write_image_header_abort;
+ }
+
+ if (memory_bm_write(pageset1_map,
+ toiActiveAllocator->rw_header_chunk)) {
+ abort_hibernate(TOI_FAILED_IO,
+ "Failed to write bitmaps.");
+ goto write_image_header_abort;
+ }
+
+ /* Flush data and let allocator cleanup */
+ if (toiActiveAllocator->write_header_cleanup()) {
+ abort_hibernate(TOI_FAILED_IO,
+ "Failed to cleanup writing header.");
+ goto write_image_header_abort_no_cleanup;
+ }
+
+ if (test_result_state(TOI_ABORTED))
+ goto write_image_header_abort_no_cleanup;
+
+ toi_update_status(total, total, NULL);
+
+out:
+ if (header_buffer)
+ toi_free_page(24, (unsigned long) header_buffer);
+ return ret;
+
+write_image_header_abort:
+ toiActiveAllocator->write_header_cleanup();
+write_image_header_abort_no_cleanup:
+ ret = -1;
+ goto out;
+}
+
+/**
+ * sanity_check - check the header
+ * @sh: the header which was saved at hibernate time.
+ *
+ * Perform a few checks, seeking to ensure that the kernel being
+ * booted matches the one hibernated. They need to match so we can
+ * be _sure_ things will work. It is not absolutely impossible for
+ * resuming from a different kernel to work, just not assured.
+ **/
+static char *sanity_check(struct toi_header *sh)
+{
+ char *reason = check_image_kernel((struct swsusp_info *) sh);
+
+ if (reason)
+ return reason;
+
+ if (!test_action_state(TOI_IGNORE_ROOTFS)) {
+ const struct super_block *sb;
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if ((!(sb->s_flags & MS_RDONLY)) &&
+ (sb->s_type->fs_flags & FS_REQUIRES_DEV))
+ return "Device backed fs has been mounted "
+ "rw prior to resume or initrd/ramfs "
+ "is mounted rw.";
+ }
+ }
+
+ return NULL;
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(freeze_wait);
+
+#define FREEZE_IN_PROGRESS (~0)
+
+static int freeze_result;
+
+static void do_freeze(struct work_struct *dummy)
+{
+ freeze_result = freeze_processes();
+ wake_up(&freeze_wait);
+ trap_non_toi_io = 1;
+}
+
+static DECLARE_WORK(freeze_work, do_freeze);
+
+/**
+ * __read_pageset1 - test for the existence of an image and attempt to load it
+ *
+ * Returns: Int
+ * Zero if image found and pageset1 successfully loaded.
+ * Error if no image found or loaded.
+ **/
+static int __read_pageset1(void)
+{
+ int i, result = 0;
+ char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP),
+ *sanity_error = NULL;
+ struct toi_header *toi_header;
+
+ if (!header_buffer) {
+ printk(KERN_INFO "Unable to allocate a page for reading the "
+ "signature.\n");
+ return -ENOMEM;
+ }
+
+ /* Check for an image */
+ result = toiActiveAllocator->image_exists(1);
+ if (result == 3) {
+ result = -ENODATA;
+ toi_early_boot_message(1, 0, "The signature from an older "
+ "version of TuxOnIce has been detected.");
+ goto out_remove_image;
+ }
+
+ if (result != 1) {
+ result = -ENODATA;
+ noresume_reset_modules();
+ printk(KERN_INFO "TuxOnIce: No image found.\n");
+ goto out;
+ }
+
+ /*
+ * Prepare the active allocator for reading the image header. The
+ * activate allocator might read its own configuration.
+ *
+ * NB: This call may never return because there might be a signature
+ * for a different image such that we warn the user and they choose
+ * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the
+ * location of the image might be unavailable if it was stored on a
+ * network connection).
+ */
+
+ result = toiActiveAllocator->read_header_init();
+ if (result) {
+ printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the "
+ "image header.\n");
+ goto out_remove_image;
+ }
+
+ /* Check for noresume command line option */
+ if (test_toi_state(TOI_NORESUME_SPECIFIED)) {
+ printk(KERN_INFO "TuxOnIce: Noresume on command line. Removed "
+ "image.\n");
+ goto out_remove_image;
+ }
+
+ /* Check whether we've resumed before */
+ if (test_toi_state(TOI_RESUMED_BEFORE)) {
+ toi_early_boot_message(1, 0, NULL);
+ if (!(test_toi_state(TOI_CONTINUE_REQ))) {
+ printk(KERN_INFO "TuxOnIce: Tried to resume before: "
+ "Invalidated image.\n");
+ goto out_remove_image;
+ }
+ }
+
+ clear_toi_state(TOI_CONTINUE_REQ);
+
+ toi_image_header_version = toiActiveAllocator->get_header_version();
+
+ if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) {
+ toi_early_boot_message(1, 0, image_version_error);
+ if (!(test_toi_state(TOI_CONTINUE_REQ))) {
+ printk(KERN_INFO "TuxOnIce: Header version too new: "
+ "Invalidated image.\n");
+ goto out_remove_image;
+ }
+ }
+
+ /* Read hibernate header */
+ result = toiActiveAllocator->rw_header_chunk(READ, NULL,
+ header_buffer, sizeof(struct toi_header));
+ if (result < 0) {
+ printk(KERN_ERR "TuxOnIce: Failed to read the image "
+ "signature.\n");
+ goto out_remove_image;
+ }
+
+ toi_header = (struct toi_header *) header_buffer;
+
+ /*
+ * NB: This call may also result in a reboot rather than returning.
+ */
+
+ sanity_error = sanity_check(toi_header);
+ if (sanity_error) {
+ toi_early_boot_message(1, TOI_CONTINUE_REQ,
+ sanity_error);
+ printk(KERN_INFO "TuxOnIce: Sanity check failed.\n");
+ goto out_remove_image;
+ }
+
+ /*
+ * We have an image and it looks like it will load okay.
+ *
+ * Get metadata from header. Don't override commandline parameters.
+ *
+ * We don't need to save the image size limit because it's not used
+ * during resume and will be restored with the image anyway.
+ */
+
+ memcpy((char *) &pagedir1,
+ (char *) &toi_header->pagedir, sizeof(pagedir1));
+ toi_result = toi_header->param0;
+ if (!toi_bkd.toi_debug_state) {
+ toi_bkd.toi_action =
+ (toi_header->param1 & ~toi_bootflags_mask) |
+ (toi_bkd.toi_action & toi_bootflags_mask);
+ toi_bkd.toi_debug_state = toi_header->param2;
+ toi_bkd.toi_default_console_level = toi_header->param3;
+ }
+ clear_toi_state(TOI_IGNORE_LOGLEVEL);
+ pagedir2.size = toi_header->pageset_2_size;
+ for (i = 0; i < 4; i++)
+ toi_bkd.toi_io_time[i/2][i%2] =
+ toi_header->io_time[i/2][i%2];
+
+ set_toi_state(TOI_BOOT_KERNEL);
+ boot_kernel_data_buffer = toi_header->bkd;
+
+ read_if_version(1, toi_max_workers, "TuxOnIce max workers",
+ goto out_remove_image);
+
+ /* Read filesystem info */
+ if (fs_info_load_and_check()) {
+ printk(KERN_EMERG "TuxOnIce: File system mount time checks "
+ "failed. Refusing to corrupt your filesystems!\n");
+ goto out_remove_image;
+ }
+
+ /* Read module configurations */
+ result = read_module_configs();
+ if (result) {
+ pagedir1.size = 0;
+ pagedir2.size = 0;
+ printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module "
+ "configurations.\n");
+ clear_action_state(TOI_KEEP_IMAGE);
+ goto out_remove_image;
+ }
+
+ toi_prepare_console();
+
+ set_toi_state(TOI_NOW_RESUMING);
+
+ result = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ if (result)
+ goto out_notifier_call_chain;;
+
+ if (usermodehelper_disable())
+ goto out_enable_usermodehelper;
+
+ current->flags |= PF_NOFREEZE;
+ freeze_result = FREEZE_IN_PROGRESS;
+
+ schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work);
+
+ toi_cond_pause(1, "About to read original pageset1 locations.");
+
+ /*
+ * See _toi_rw_header_chunk in tuxonice_bio.c:
+ * Initialize pageset1_map by reading the map from the image.
+ */
+ if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk))
+ goto out_thaw;
+
+ /*
+ * See toi_rw_cleanup in tuxonice_bio.c:
+ * Clean up after reading the header.
+ */
+ result = toiActiveAllocator->read_header_cleanup();
+ if (result) {
+ printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the "
+ "image header.\n");
+ goto out_thaw;
+ }
+
+ toi_cond_pause(1, "About to read pagedir.");
+
+ /*
+ * Get the addresses of pages into which we will load the kernel to
+ * be copied back and check if they conflict with the ones we are using.
+ */
+ if (toi_get_pageset1_load_addresses()) {
+ printk(KERN_INFO "TuxOnIce: Failed to get load addresses for "
+ "pageset1.\n");
+ goto out_thaw;
+ }
+
+ /* Read the original kernel back */
+ toi_cond_pause(1, "About to read pageset 1.");
+
+ /* Given the pagemap, read back the data from disk */
+ if (read_pageset(&pagedir1, 0)) {
+ toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1.");
+ result = -EIO;
+ goto out_thaw;
+ }
+
+ toi_cond_pause(1, "About to restore original kernel.");
+ result = 0;
+
+ if (!toi_keeping_image &&
+ toiActiveAllocator->mark_resume_attempted)
+ toiActiveAllocator->mark_resume_attempted(1);
+
+ wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
+out:
+ current->flags &= ~PF_NOFREEZE;
+ toi_free_page(25, (unsigned long) header_buffer);
+ return result;
+
+out_thaw:
+ wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS);
+ trap_non_toi_io = 0;
+ thaw_processes();
+out_enable_usermodehelper:
+ usermodehelper_enable();
+out_notifier_call_chain:
+ pm_notifier_call_chain(PM_POST_RESTORE);
+ toi_cleanup_console();
+out_remove_image:
+ result = -EINVAL;
+ if (!toi_keeping_image)
+ toiActiveAllocator->remove_image();
+ toiActiveAllocator->read_header_cleanup();
+ noresume_reset_modules();
+ goto out;
+}
+
+/**
+ * read_pageset1 - highlevel function to read the saved pages
+ *
+ * Attempt to read the header and pageset1 of a hibernate image.
+ * Handle the outcome, complaining where appropriate.
+ **/
+int read_pageset1(void)
+{
+ int error;
+
+ error = __read_pageset1();
+
+ if (error && error != -ENODATA && error != -EINVAL &&
+ !test_result_state(TOI_ABORTED))
+ abort_hibernate(TOI_IMAGE_ERROR,
+ "TuxOnIce: Error %d resuming\n", error);
+
+ return error;
+}
+
+/**
+ * get_have_image_data - check the image header
+ **/
+static char *get_have_image_data(void)
+{
+ char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP);
+ struct toi_header *toi_header;
+
+ if (!output_buffer) {
+ printk(KERN_INFO "Output buffer null.\n");
+ return NULL;
+ }
+
+ /* Check for an image */
+ if (!toiActiveAllocator->image_exists(1) ||
+ toiActiveAllocator->read_header_init() ||
+ toiActiveAllocator->rw_header_chunk(READ, NULL,
+ output_buffer, sizeof(struct toi_header))) {
+ sprintf(output_buffer, "0\n");
+ /*
+ * From an initrd/ramfs, catting have_image and
+ * getting a result of 0 is sufficient.
+ */
+ clear_toi_state(TOI_BOOT_TIME);
+ goto out;
+ }
+
+ toi_header = (struct toi_header *) output_buffer;
+
+ sprintf(output_buffer, "1\n%s\n%s\n",
+ toi_header->uts.machine,
+ toi_header->uts.version);
+
+ /* Check whether we've resumed before */
+ if (test_toi_state(TOI_RESUMED_BEFORE))
+ strcat(output_buffer, "Resumed before.\n");
+
+out:
+ noresume_reset_modules();
+ return output_buffer;
+}
+
+/**
+ * read_pageset2 - read second part of the image
+ * @overwrittenpagesonly: Read only pages which would have been
+ * verwritten by pageset1?
+ *
+ * Read in part or all of pageset2 of an image, depending upon
+ * whether we are hibernating and have only overwritten a portion
+ * with pageset1 pages, or are resuming and need to read them
+ * all.
+ *
+ * Returns: Int
+ * Zero if no error, otherwise the error value.
+ **/
+int read_pageset2(int overwrittenpagesonly)
+{
+ int result = 0;
+
+ if (!pagedir2.size)
+ return 0;
+
+ result = read_pageset(&pagedir2, overwrittenpagesonly);
+
+ toi_cond_pause(1, "Pagedir 2 read.");
+
+ return result;
+}
+
+/**
+ * image_exists_read - has an image been found?
+ * @page: Output buffer
+ *
+ * Store 0 or 1 in page, depending on whether an image is found.
+ * Incoming buffer is PAGE_SIZE and result is guaranteed
+ * to be far less than that, so we don't worry about
+ * overflow.
+ **/
+int image_exists_read(const char *page, int count)
+{
+ int len = 0;
+ char *result;
+
+ if (toi_activate_storage(0))
+ return count;
+
+ if (!test_toi_state(TOI_RESUME_DEVICE_OK))
+ toi_attempt_to_parse_resume_device(0);
+
+ if (!toiActiveAllocator) {
+ len = sprintf((char *) page, "-1\n");
+ } else {
+ result = get_have_image_data();
+ if (result) {
+ len = sprintf((char *) page, "%s", result);
+ toi_free_page(26, (unsigned long) result);
+ }
+ }
+
+ toi_deactivate_storage(0);
+
+ return len;
+}
+
+/**
+ * image_exists_write - invalidate an image if one exists
+ **/
+int image_exists_write(const char *buffer, int count)
+{
+ if (toi_activate_storage(0))
+ return count;
+
+ if (toiActiveAllocator && toiActiveAllocator->image_exists(1))
+ toiActiveAllocator->remove_image();
+
+ toi_deactivate_storage(0);
+
+ clear_result_state(TOI_KEPT_IMAGE);
+
+ return count;
+}
diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h
new file mode 100644
index 000000000..56645a5c6
--- /dev/null
+++ b/kernel/power/tuxonice_io.h
@@ -0,0 +1,72 @@
+/*
+ * kernel/power/tuxonice_io.h
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains high level IO routines for hibernating.
+ *
+ */
+
+#include <linux/utsname.h>
+#include "tuxonice_pagedir.h"
+
+/* Non-module data saved in our image header */
+struct toi_header {
+ /*
+ * Mirror struct swsusp_info, but without
+ * the page aligned attribute
+ */
+ struct new_utsname uts;
+ u32 version_code;
+ unsigned long num_physpages;
+ int cpus;
+ unsigned long image_pages;
+ unsigned long pages;
+ unsigned long size;
+
+ /* Our own data */
+ unsigned long orig_mem_free;
+ int page_size;
+ int pageset_2_size;
+ int param0;
+ int param1;
+ int param2;
+ int param3;
+ int progress0;
+ int progress1;
+ int progress2;
+ int progress3;
+ int io_time[2][2];
+ struct pagedir pagedir;
+ dev_t root_fs;
+ unsigned long bkd; /* Boot kernel data locn */
+};
+
+extern int write_pageset(struct pagedir *pagedir);
+extern int write_image_header(void);
+extern int read_pageset1(void);
+extern int read_pageset2(int overwrittenpagesonly);
+
+extern int toi_attempt_to_parse_resume_device(int quiet);
+extern void attempt_to_parse_resume_device2(void);
+extern void attempt_to_parse_alt_resume_param(void);
+int image_exists_read(const char *page, int count);
+int image_exists_write(const char *buffer, int count);
+extern void save_restore_alt_param(int replace, int quiet);
+extern atomic_t toi_io_workers;
+
+/* Args to save_restore_alt_param */
+#define RESTORE 0
+#define SAVE 1
+
+#define NOQUIET 0
+#define QUIET 1
+
+extern wait_queue_head_t toi_io_queue_flusher;
+extern int toi_bio_queue_flusher_should_finish;
+
+int fs_info_space_needed(void);
+
+extern int toi_max_workers;
diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c
new file mode 100644
index 000000000..18f22bdb6
--- /dev/null
+++ b/kernel/power/tuxonice_modules.c
@@ -0,0 +1,520 @@
+/*
+ * kernel/power/tuxonice_modules.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_ui.h"
+
+LIST_HEAD(toi_filters);
+LIST_HEAD(toiAllocators);
+
+LIST_HEAD(toi_modules);
+
+struct toi_module_ops *toiActiveAllocator;
+
+static int toi_num_filters;
+int toiNumAllocators, toi_num_modules;
+
+/*
+ * toi_header_storage_for_modules
+ *
+ * Returns the amount of space needed to store configuration
+ * data needed by the modules prior to copying back the original
+ * kernel. We can exclude data for pageset2 because it will be
+ * available anyway once the kernel is copied back.
+ */
+long toi_header_storage_for_modules(void)
+{
+ struct toi_module_ops *this_module;
+ int bytes = 0;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled ||
+ (this_module->type == WRITER_MODULE &&
+ toiActiveAllocator != this_module))
+ continue;
+ if (this_module->storage_needed) {
+ int this = this_module->storage_needed() +
+ sizeof(struct toi_module_header) +
+ sizeof(int);
+ this_module->header_requested = this;
+ bytes += this;
+ }
+ }
+
+ /* One more for the empty terminator */
+ return bytes + sizeof(struct toi_module_header);
+}
+
+void print_toi_header_storage_for_modules(void)
+{
+ struct toi_module_ops *this_module;
+ int bytes = 0;
+
+ printk(KERN_DEBUG "Header storage:\n");
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled ||
+ (this_module->type == WRITER_MODULE &&
+ toiActiveAllocator != this_module))
+ continue;
+ if (this_module->storage_needed) {
+ int this = this_module->storage_needed() +
+ sizeof(struct toi_module_header) +
+ sizeof(int);
+ this_module->header_requested = this;
+ bytes += this;
+ printk(KERN_DEBUG "+ %16s : %-4d/%d.\n",
+ this_module->name,
+ this_module->header_used, this);
+ }
+ }
+
+ printk(KERN_DEBUG "+ empty terminator : %zu.\n",
+ sizeof(struct toi_module_header));
+ printk(KERN_DEBUG " ====\n");
+ printk(KERN_DEBUG " %zu\n",
+ bytes + sizeof(struct toi_module_header));
+}
+
+/*
+ * toi_memory_for_modules
+ *
+ * Returns the amount of memory requested by modules for
+ * doing their work during the cycle.
+ */
+
+long toi_memory_for_modules(int print_parts)
+{
+ long bytes = 0, result;
+ struct toi_module_ops *this_module;
+
+ if (print_parts)
+ printk(KERN_INFO "Memory for modules:\n===================\n");
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ int this;
+ if (!this_module->enabled)
+ continue;
+ if (this_module->memory_needed) {
+ this = this_module->memory_needed();
+ if (print_parts)
+ printk(KERN_INFO "%10d bytes (%5ld pages) for "
+ "module '%s'.\n", this,
+ DIV_ROUND_UP(this, PAGE_SIZE),
+ this_module->name);
+ bytes += this;
+ }
+ }
+
+ result = DIV_ROUND_UP(bytes, PAGE_SIZE);
+ if (print_parts)
+ printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result);
+
+ return result;
+}
+
+/*
+ * toi_expected_compression_ratio
+ *
+ * Returns the compression ratio expected when saving the image.
+ */
+
+int toi_expected_compression_ratio(void)
+{
+ int ratio = 100;
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled)
+ continue;
+ if (this_module->expected_compression)
+ ratio = ratio * this_module->expected_compression()
+ / 100;
+ }
+
+ return ratio;
+}
+
+/* toi_find_module_given_dir
+ * Functionality : Return a module (if found), given a pointer
+ * to its directory name
+ */
+
+static struct toi_module_ops *toi_find_module_given_dir(char *name)
+{
+ struct toi_module_ops *this_module, *found_module = NULL;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!strcmp(name, this_module->directory)) {
+ found_module = this_module;
+ break;
+ }
+ }
+
+ return found_module;
+}
+
+/* toi_find_module_given_name
+ * Functionality : Return a module (if found), given a pointer
+ * to its name
+ */
+
+struct toi_module_ops *toi_find_module_given_name(char *name)
+{
+ struct toi_module_ops *this_module, *found_module = NULL;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!strcmp(name, this_module->name)) {
+ found_module = this_module;
+ break;
+ }
+ }
+
+ return found_module;
+}
+
+/*
+ * toi_print_module_debug_info
+ * Functionality : Get debugging info from modules into a buffer.
+ */
+int toi_print_module_debug_info(char *buffer, int buffer_size)
+{
+ struct toi_module_ops *this_module;
+ int len = 0;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled)
+ continue;
+ if (this_module->print_debug_info) {
+ int result;
+ result = this_module->print_debug_info(buffer + len,
+ buffer_size - len);
+ len += result;
+ }
+ }
+
+ /* Ensure null terminated */
+ buffer[buffer_size] = 0;
+
+ return len;
+}
+
+/*
+ * toi_register_module
+ *
+ * Register a module.
+ */
+int toi_register_module(struct toi_module_ops *module)
+{
+ int i;
+ struct kobject *kobj;
+
+ if (!hibernation_available())
+ return -ENODEV;
+
+ module->enabled = 1;
+
+ if (toi_find_module_given_name(module->name)) {
+ printk(KERN_INFO "TuxOnIce: Trying to load module %s,"
+ " which is already registered.\n",
+ module->name);
+ return -EBUSY;
+ }
+
+ switch (module->type) {
+ case FILTER_MODULE:
+ list_add_tail(&module->type_list, &toi_filters);
+ toi_num_filters++;
+ break;
+ case WRITER_MODULE:
+ list_add_tail(&module->type_list, &toiAllocators);
+ toiNumAllocators++;
+ break;
+ case MISC_MODULE:
+ case MISC_HIDDEN_MODULE:
+ case BIO_ALLOCATOR_MODULE:
+ break;
+ default:
+ printk(KERN_ERR "Hmmm. Module '%s' has an invalid type."
+ " It has been ignored.\n", module->name);
+ return -EINVAL;
+ }
+ list_add_tail(&module->module_list, &toi_modules);
+ toi_num_modules++;
+
+ if ((!module->directory && !module->shared_directory) ||
+ !module->sysfs_data || !module->num_sysfs_entries)
+ return 0;
+
+ /*
+ * Modules may share a directory, but those with shared_dir
+ * set must be loaded (via symbol dependencies) after parents
+ * and unloaded beforehand.
+ */
+ if (module->shared_directory) {
+ struct toi_module_ops *shared =
+ toi_find_module_given_dir(module->shared_directory);
+ if (!shared) {
+ printk(KERN_ERR "TuxOnIce: Module %s wants to share "
+ "%s's directory but %s isn't loaded.\n",
+ module->name, module->shared_directory,
+ module->shared_directory);
+ toi_unregister_module(module);
+ return -ENODEV;
+ }
+ kobj = shared->dir_kobj;
+ } else {
+ if (!strncmp(module->directory, "[ROOT]", 6))
+ kobj = tuxonice_kobj;
+ else
+ kobj = make_toi_sysdir(module->directory);
+ }
+ module->dir_kobj = kobj;
+ for (i = 0; i < module->num_sysfs_entries; i++) {
+ int result = toi_register_sysfs_file(kobj,
+ &module->sysfs_data[i]);
+ if (result)
+ return result;
+ }
+ return 0;
+}
+
+/*
+ * toi_unregister_module
+ *
+ * Remove a module.
+ */
+void toi_unregister_module(struct toi_module_ops *module)
+{
+ int i;
+
+ if (module->dir_kobj)
+ for (i = 0; i < module->num_sysfs_entries; i++)
+ toi_unregister_sysfs_file(module->dir_kobj,
+ &module->sysfs_data[i]);
+
+ if (!module->shared_directory && module->directory &&
+ strncmp(module->directory, "[ROOT]", 6))
+ remove_toi_sysdir(module->dir_kobj);
+
+ switch (module->type) {
+ case FILTER_MODULE:
+ list_del(&module->type_list);
+ toi_num_filters--;
+ break;
+ case WRITER_MODULE:
+ list_del(&module->type_list);
+ toiNumAllocators--;
+ if (toiActiveAllocator == module) {
+ toiActiveAllocator = NULL;
+ clear_toi_state(TOI_CAN_RESUME);
+ clear_toi_state(TOI_CAN_HIBERNATE);
+ }
+ break;
+ case MISC_MODULE:
+ case MISC_HIDDEN_MODULE:
+ case BIO_ALLOCATOR_MODULE:
+ break;
+ default:
+ printk(KERN_ERR "Module '%s' has an invalid type."
+ " It has been ignored.\n", module->name);
+ return;
+ }
+ list_del(&module->module_list);
+ toi_num_modules--;
+}
+
+/*
+ * toi_move_module_tail
+ *
+ * Rearrange modules when reloading the config.
+ */
+void toi_move_module_tail(struct toi_module_ops *module)
+{
+ switch (module->type) {
+ case FILTER_MODULE:
+ if (toi_num_filters > 1)
+ list_move_tail(&module->type_list, &toi_filters);
+ break;
+ case WRITER_MODULE:
+ if (toiNumAllocators > 1)
+ list_move_tail(&module->type_list, &toiAllocators);
+ break;
+ case MISC_MODULE:
+ case MISC_HIDDEN_MODULE:
+ case BIO_ALLOCATOR_MODULE:
+ break;
+ default:
+ printk(KERN_ERR "Module '%s' has an invalid type."
+ " It has been ignored.\n", module->name);
+ return;
+ }
+ if ((toi_num_filters + toiNumAllocators) > 1)
+ list_move_tail(&module->module_list, &toi_modules);
+}
+
+/*
+ * toi_initialise_modules
+ *
+ * Get ready to do some work!
+ */
+int toi_initialise_modules(int starting_cycle, int early)
+{
+ struct toi_module_ops *this_module;
+ int result;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ this_module->header_requested = 0;
+ this_module->header_used = 0;
+ if (!this_module->enabled)
+ continue;
+ if (this_module->early != early)
+ continue;
+ if (this_module->initialise) {
+ result = this_module->initialise(starting_cycle);
+ if (result) {
+ toi_cleanup_modules(starting_cycle);
+ return result;
+ }
+ this_module->initialised = 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * toi_cleanup_modules
+ *
+ * Tell modules the work is done.
+ */
+void toi_cleanup_modules(int finishing_cycle)
+{
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (!this_module->enabled || !this_module->initialised)
+ continue;
+ if (this_module->cleanup)
+ this_module->cleanup(finishing_cycle);
+ this_module->initialised = 0;
+ }
+}
+
+/*
+ * toi_pre_atomic_restore_modules
+ *
+ * Get ready to do some work!
+ */
+void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
+{
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (this_module->enabled && this_module->pre_atomic_restore)
+ this_module->pre_atomic_restore(bkd);
+ }
+}
+
+/*
+ * toi_post_atomic_restore_modules
+ *
+ * Get ready to do some work!
+ */
+void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd)
+{
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (this_module->enabled && this_module->post_atomic_restore)
+ this_module->post_atomic_restore(bkd);
+ }
+}
+
+/*
+ * toi_get_next_filter
+ *
+ * Get the next filter in the pipeline.
+ */
+struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought)
+{
+ struct toi_module_ops *last_filter = NULL, *this_filter = NULL;
+
+ list_for_each_entry(this_filter, &toi_filters, type_list) {
+ if (!this_filter->enabled)
+ continue;
+ if ((last_filter == filter_sought) || (!filter_sought))
+ return this_filter;
+ last_filter = this_filter;
+ }
+
+ return toiActiveAllocator;
+}
+
+/**
+ * toi_show_modules: Printk what support is loaded.
+ */
+void toi_print_modules(void)
+{
+ struct toi_module_ops *this_module;
+ int prev = 0;
+
+ printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for");
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ if (this_module->type == MISC_HIDDEN_MODULE)
+ continue;
+ printk("%s %s%s%s", prev ? "," : "",
+ this_module->enabled ? "" : "[",
+ this_module->name,
+ this_module->enabled ? "" : "]");
+ prev = 1;
+ }
+
+ printk(".\n");
+}
+
+/* toi_get_modules
+ *
+ * Take a reference to modules so they can't go away under us.
+ */
+
+int toi_get_modules(void)
+{
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list) {
+ struct toi_module_ops *this_module2;
+
+ if (try_module_get(this_module->module))
+ continue;
+
+ /* Failed! Reverse gets and return error */
+ list_for_each_entry(this_module2, &toi_modules,
+ module_list) {
+ if (this_module == this_module2)
+ return -EINVAL;
+ module_put(this_module2->module);
+ }
+ }
+ return 0;
+}
+
+/* toi_put_modules
+ *
+ * Release our references to modules we used.
+ */
+
+void toi_put_modules(void)
+{
+ struct toi_module_ops *this_module;
+
+ list_for_each_entry(this_module, &toi_modules, module_list)
+ module_put(this_module->module);
+}
diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h
new file mode 100644
index 000000000..34ffe2ee3
--- /dev/null
+++ b/kernel/power/tuxonice_modules.h
@@ -0,0 +1,212 @@
+/*
+ * kernel/power/tuxonice_modules.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations for modules. Modules are additions to
+ * TuxOnIce that provide facilities such as image compression or
+ * encryption, backends for storage of the image and user interfaces.
+ *
+ */
+
+#ifndef TOI_MODULES_H
+#define TOI_MODULES_H
+
+/* This is the maximum size we store in the image header for a module name */
+#define TOI_MAX_MODULE_NAME_LENGTH 30
+
+struct toi_boot_kernel_data;
+
+/* Per-module metadata */
+struct toi_module_header {
+ char name[TOI_MAX_MODULE_NAME_LENGTH];
+ int enabled;
+ int type;
+ int index;
+ int data_length;
+ unsigned long signature;
+};
+
+enum {
+ FILTER_MODULE,
+ WRITER_MODULE,
+ BIO_ALLOCATOR_MODULE,
+ MISC_MODULE,
+ MISC_HIDDEN_MODULE,
+};
+
+enum {
+ TOI_ASYNC,
+ TOI_SYNC
+};
+
+enum {
+ TOI_VIRT,
+ TOI_PAGE,
+};
+
+#define TOI_MAP(type, addr) \
+ (type == TOI_PAGE ? kmap(addr) : addr)
+
+#define TOI_UNMAP(type, addr) \
+ do { \
+ if (type == TOI_PAGE) \
+ kunmap(addr); \
+ } while(0)
+
+struct toi_module_ops {
+ /* Functions common to all modules */
+ int type;
+ char *name;
+ char *directory;
+ char *shared_directory;
+ struct kobject *dir_kobj;
+ struct module *module;
+ int enabled, early, initialised;
+ struct list_head module_list;
+
+ /* List of filters or allocators */
+ struct list_head list, type_list;
+
+ /*
+ * Requirements for memory and storage in
+ * the image header..
+ */
+ int (*memory_needed) (void);
+ int (*storage_needed) (void);
+
+ int header_requested, header_used;
+
+ int (*expected_compression) (void);
+
+ /*
+ * Debug info
+ */
+ int (*print_debug_info) (char *buffer, int size);
+ int (*save_config_info) (char *buffer);
+ void (*load_config_info) (char *buffer, int len);
+
+ /*
+ * Initialise & cleanup - general routines called
+ * at the start and end of a cycle.
+ */
+ int (*initialise) (int starting_cycle);
+ void (*cleanup) (int finishing_cycle);
+
+ void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd);
+ void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd);
+
+ /*
+ * Calls for allocating storage (allocators only).
+ *
+ * Header space is requested separately and cannot fail, but the
+ * reservation is only applied when main storage is allocated.
+ * The header space reservation is thus always set prior to
+ * requesting the allocation of storage - and prior to querying
+ * how much storage is available.
+ */
+
+ unsigned long (*storage_available) (void);
+ void (*reserve_header_space) (unsigned long space_requested);
+ int (*register_storage) (void);
+ int (*allocate_storage) (unsigned long space_requested);
+ unsigned long (*storage_allocated) (void);
+ void (*free_unused_storage) (void);
+
+ /*
+ * Routines used in image I/O.
+ */
+ int (*rw_init) (int rw, int stream_number);
+ int (*rw_cleanup) (int rw);
+ int (*write_page) (unsigned long index, int buf_type, void *buf,
+ unsigned int buf_size);
+ int (*read_page) (unsigned long *index, int buf_type, void *buf,
+ unsigned int *buf_size);
+ int (*io_flusher) (int rw);
+
+ /* Reset module if image exists but reading aborted */
+ void (*noresume_reset) (void);
+
+ /* Read and write the metadata */
+ int (*write_header_init) (void);
+ int (*write_header_cleanup) (void);
+
+ int (*read_header_init) (void);
+ int (*read_header_cleanup) (void);
+
+ /* To be called after read_header_init */
+ int (*get_header_version) (void);
+
+ int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
+ char *buffer_start, int buffer_size);
+
+ int (*rw_header_chunk_noreadahead) (int rw,
+ struct toi_module_ops *owner, char *buffer_start,
+ int buffer_size);
+
+ /* Attempt to parse an image location */
+ int (*parse_sig_location) (char *buffer, int only_writer, int quiet);
+
+ /* Throttle I/O according to throughput */
+ void (*update_throughput_throttle) (int jif_index);
+
+ /* Flush outstanding I/O */
+ int (*finish_all_io) (void);
+
+ /* Determine whether image exists that we can restore */
+ int (*image_exists) (int quiet);
+
+ /* Mark the image as having tried to resume */
+ int (*mark_resume_attempted) (int);
+
+ /* Destroy image if one exists */
+ int (*remove_image) (void);
+
+ /* Sysfs Data */
+ struct toi_sysfs_data *sysfs_data;
+ int num_sysfs_entries;
+
+ /* Block I/O allocator */
+ struct toi_bio_allocator_ops *bio_allocator_ops;
+};
+
+extern int toi_num_modules, toiNumAllocators;
+
+extern struct toi_module_ops *toiActiveAllocator;
+extern struct list_head toi_filters, toiAllocators, toi_modules;
+
+extern void toi_prepare_console_modules(void);
+extern void toi_cleanup_console_modules(void);
+
+extern struct toi_module_ops *toi_find_module_given_name(char *name);
+extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *);
+
+extern int toi_register_module(struct toi_module_ops *module);
+extern void toi_move_module_tail(struct toi_module_ops *module);
+
+extern long toi_header_storage_for_modules(void);
+extern long toi_memory_for_modules(int print_parts);
+extern void print_toi_header_storage_for_modules(void);
+extern int toi_expected_compression_ratio(void);
+
+extern int toi_print_module_debug_info(char *buffer, int buffer_size);
+extern int toi_register_module(struct toi_module_ops *module);
+extern void toi_unregister_module(struct toi_module_ops *module);
+
+extern int toi_initialise_modules(int starting_cycle, int early);
+#define toi_initialise_modules_early(starting) \
+ toi_initialise_modules(starting, 1)
+#define toi_initialise_modules_late(starting) \
+ toi_initialise_modules(starting, 0)
+extern void toi_cleanup_modules(int finishing_cycle);
+
+extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
+extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd);
+
+extern void toi_print_modules(void);
+
+int toi_get_modules(void);
+void toi_put_modules(void);
+#endif
diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
new file mode 100644
index 000000000..0db58af8b
--- /dev/null
+++ b/kernel/power/tuxonice_netlink.c
@@ -0,0 +1,324 @@
+/*
+ * kernel/power/tuxonice_netlink.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Functions for communicating with a userspace helper via netlink.
+ */
+
+#include <linux/suspend.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include "tuxonice_netlink.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+
+static struct user_helper_data *uhd_list;
+
+/*
+ * Refill our pool of SKBs for use in emergencies (eg, when eating memory and
+ * none can be allocated).
+ */
+static void toi_fill_skb_pool(struct user_helper_data *uhd)
+{
+ while (uhd->pool_level < uhd->pool_limit) {
+ struct sk_buff *new_skb =
+ alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
+
+ if (!new_skb)
+ break;
+
+ new_skb->next = uhd->emerg_skbs;
+ uhd->emerg_skbs = new_skb;
+ uhd->pool_level++;
+ }
+}
+
+/*
+ * Try to allocate a single skb. If we can't get one, try to use one from
+ * our pool.
+ */
+static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
+{
+ struct sk_buff *skb =
+ alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
+
+ if (skb)
+ return skb;
+
+ skb = uhd->emerg_skbs;
+ if (skb) {
+ uhd->pool_level--;
+ uhd->emerg_skbs = skb->next;
+ skb->next = NULL;
+ }
+
+ return skb;
+}
+
+void toi_send_netlink_message(struct user_helper_data *uhd,
+ int type, void *params, size_t len)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ void *dest;
+ struct task_struct *t;
+
+ if (uhd->pid == -1)
+ return;
+
+ if (uhd->debug)
+ printk(KERN_ERR "toi_send_netlink_message: Send "
+ "message type %d.\n", type);
+
+ skb = toi_get_skb(uhd);
+ if (!skb) {
+ printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
+ return;
+ }
+
+ nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0);
+ uhd->sock_seq++;
+
+ dest = NLMSG_DATA(nlh);
+ if (params && len > 0)
+ memcpy(dest, params, len);
+
+ netlink_unicast(uhd->nl, skb, uhd->pid, 0);
+
+ toi_read_lock_tasklist();
+ t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
+ if (!t) {
+ toi_read_unlock_tasklist();
+ if (uhd->pid > -1)
+ printk(KERN_INFO "Hmm. Can't find the userspace task"
+ " %d.\n", uhd->pid);
+ return;
+ }
+ wake_up_process(t);
+ toi_read_unlock_tasklist();
+
+ yield();
+}
+
+static void send_whether_debugging(struct user_helper_data *uhd)
+{
+ static u8 is_debugging = 1;
+
+ toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
+ &is_debugging, sizeof(u8));
+}
+
+/*
+ * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we
+ * are hibernating.
+ */
+static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid)
+{
+ struct task_struct *t;
+
+ if (uhd->debug)
+ printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid);
+
+ toi_read_lock_tasklist();
+ t = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (!t) {
+ toi_read_unlock_tasklist();
+ printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
+ pid);
+ return -EINVAL;
+ }
+
+ t->flags |= PF_NOFREEZE;
+
+ toi_read_unlock_tasklist();
+ uhd->pid = pid;
+
+ toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
+
+ return 0;
+}
+
+/*
+ * Called when the userspace process has informed us that it's ready to roll.
+ */
+static int nl_ready(struct user_helper_data *uhd, u32 version)
+{
+ if (version != uhd->interface_version) {
+ printk(KERN_INFO "%s userspace process using invalid interface"
+ " version (%d - kernel wants %d). Trying to "
+ "continue without it.\n",
+ uhd->name, version, uhd->interface_version);
+ if (uhd->not_ready)
+ uhd->not_ready();
+ return -EINVAL;
+ }
+
+ complete(&uhd->wait_for_process);
+
+ return 0;
+}
+
+void toi_netlink_close_complete(struct user_helper_data *uhd)
+{
+ if (uhd->nl) {
+ netlink_kernel_release(uhd->nl);
+ uhd->nl = NULL;
+ }
+
+ while (uhd->emerg_skbs) {
+ struct sk_buff *next = uhd->emerg_skbs->next;
+ kfree_skb(uhd->emerg_skbs);
+ uhd->emerg_skbs = next;
+ }
+
+ uhd->pid = -1;
+}
+
+static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
+ struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int type = nlh->nlmsg_type;
+ int *data;
+ int err;
+
+ if (uhd->debug)
+ printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n",
+ type);
+
+ /* Let the more specific handler go first. It returns
+ * 1 for valid messages that it doesn't know. */
+ err = uhd->rcv_msg(skb, nlh);
+ if (err != 1)
+ return err;
+
+ /* Only allow one task to receive NOFREEZE privileges */
+ if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) {
+ printk(KERN_INFO "Received extra nofreeze me requests.\n");
+ return -EBUSY;
+ }
+
+ data = NLMSG_DATA(nlh);
+
+ switch (type) {
+ case NETLINK_MSG_NOFREEZE_ME:
+ return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
+ case NETLINK_MSG_GET_DEBUGGING:
+ send_whether_debugging(uhd);
+ return 0;
+ case NETLINK_MSG_READY:
+ if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) {
+ printk(KERN_INFO "Invalid ready mesage.\n");
+ if (uhd->not_ready)
+ uhd->not_ready();
+ return -EINVAL;
+ }
+ return nl_ready(uhd, (u32) *data);
+ case NETLINK_MSG_CLEANUP:
+ toi_netlink_close_complete(uhd);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void toi_user_rcv_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr *nlh;
+ struct user_helper_data *uhd = uhd_list;
+
+ while (uhd && uhd->netlink_id != skb->sk->sk_protocol)
+ uhd = uhd->next;
+
+ if (!uhd)
+ return;
+
+ while (skb->len >= NLMSG_SPACE(0)) {
+ u32 rlen;
+
+ nlh = (struct nlmsghdr *) skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return;
+
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+
+ err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
+ if (err)
+ netlink_ack(skb, nlh, err);
+ else if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ skb_pull(skb, rlen);
+ }
+}
+
+static int netlink_prepare(struct user_helper_data *uhd)
+{
+ struct netlink_kernel_cfg cfg = {
+ .groups = 0,
+ .input = toi_user_rcv_skb,
+ };
+
+ uhd->next = uhd_list;
+ uhd_list = uhd;
+
+ uhd->sock_seq = 0x42c0ffee;
+ uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg);
+ if (!uhd->nl) {
+ printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
+ uhd->name);
+ return -ENOMEM;
+ }
+
+ toi_fill_skb_pool(uhd);
+
+ return 0;
+}
+
+void toi_netlink_close(struct user_helper_data *uhd)
+{
+ struct task_struct *t;
+
+ toi_read_lock_tasklist();
+ t = find_task_by_pid_ns(uhd->pid, &init_pid_ns);
+ if (t)
+ t->flags &= ~PF_NOFREEZE;
+ toi_read_unlock_tasklist();
+
+ toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
+}
+int toi_netlink_setup(struct user_helper_data *uhd)
+{
+ /* In case userui didn't cleanup properly on us */
+ toi_netlink_close_complete(uhd);
+
+ if (netlink_prepare(uhd) < 0) {
+ printk(KERN_INFO "Netlink prepare failed.\n");
+ return 1;
+ }
+
+ if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
+ UMH_WAIT_EXEC, uhd->debug) < 0) {
+ printk(KERN_INFO "Launch userspace program failed.\n");
+ toi_netlink_close_complete(uhd);
+ return 1;
+ }
+
+ /* Wait 2 seconds for the userspace process to make contact */
+ wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
+
+ if (uhd->pid == -1) {
+ printk(KERN_INFO "%s: Failed to contact userspace process.\n",
+ uhd->name);
+ toi_netlink_close_complete(uhd);
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h
new file mode 100644
index 000000000..89e154599
--- /dev/null
+++ b/kernel/power/tuxonice_netlink.h
@@ -0,0 +1,62 @@
+/*
+ * kernel/power/tuxonice_netlink.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Declarations for functions for communicating with a userspace helper
+ * via netlink.
+ */
+
+#include <linux/netlink.h>
+#include <net/sock.h>
+
+#define NETLINK_MSG_BASE 0x10
+
+#define NETLINK_MSG_READY 0x10
+#define NETLINK_MSG_NOFREEZE_ME 0x16
+#define NETLINK_MSG_GET_DEBUGGING 0x19
+#define NETLINK_MSG_CLEANUP 0x24
+#define NETLINK_MSG_NOFREEZE_ACK 0x27
+#define NETLINK_MSG_IS_DEBUGGING 0x28
+
+struct user_helper_data {
+ int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh);
+ void (*not_ready) (void);
+ struct sock *nl;
+ u32 sock_seq;
+ pid_t pid;
+ char *comm;
+ char program[256];
+ int pool_level;
+ int pool_limit;
+ struct sk_buff *emerg_skbs;
+ int skb_size;
+ int netlink_id;
+ char *name;
+ struct user_helper_data *next;
+ struct completion wait_for_process;
+ u32 interface_version;
+ int must_init;
+ int debug;
+};
+
+#ifdef CONFIG_NET
+int toi_netlink_setup(struct user_helper_data *uhd);
+void toi_netlink_close(struct user_helper_data *uhd);
+void toi_send_netlink_message(struct user_helper_data *uhd,
+ int type, void *params, size_t len);
+void toi_netlink_close_complete(struct user_helper_data *uhd);
+#else
+static inline int toi_netlink_setup(struct user_helper_data *uhd)
+{
+ return 0;
+}
+
+static inline void toi_netlink_close(struct user_helper_data *uhd) { };
+static inline void toi_send_netlink_message(struct user_helper_data *uhd,
+ int type, void *params, size_t len) { };
+static inline void toi_netlink_close_complete(struct user_helper_data *uhd)
+ { };
+#endif
diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c
new file mode 100644
index 000000000..9ea185af1
--- /dev/null
+++ b/kernel/power/tuxonice_pagedir.c
@@ -0,0 +1,345 @@
+/*
+ * kernel/power/tuxonice_pagedir.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for handling pagesets.
+ * Note that pbes aren't actually stored as such. They're stored as
+ * bitmaps and extents.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <asm/tlbflush.h>
+
+#include "tuxonice_pageflags.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_pagedir.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice.h"
+#include "tuxonice_builtin.h"
+#include "tuxonice_alloc.h"
+
+static int ptoi_pfn;
+static struct pbe *this_low_pbe;
+static struct pbe **last_low_pbe_ptr;
+
+void toi_reset_alt_image_pageset2_pfn(void)
+{
+ memory_bm_position_reset(pageset2_map);
+}
+
+static struct page *first_conflicting_page;
+
+/*
+ * free_conflicting_pages
+ */
+
+static void free_conflicting_pages(void)
+{
+ while (first_conflicting_page) {
+ struct page *next =
+ *((struct page **) kmap(first_conflicting_page));
+ kunmap(first_conflicting_page);
+ toi__free_page(29, first_conflicting_page);
+ first_conflicting_page = next;
+ }
+}
+
+/* __toi_get_nonconflicting_page
+ *
+ * Description: Gets order zero pages that won't be overwritten
+ * while copying the original pages.
+ */
+
+struct page *___toi_get_nonconflicting_page(int can_be_highmem)
+{
+ struct page *page;
+ gfp_t flags = TOI_ATOMIC_GFP;
+ if (can_be_highmem)
+ flags |= __GFP_HIGHMEM;
+
+
+ if (test_toi_state(TOI_LOADING_ALT_IMAGE) &&
+ pageset2_map && ptoi_pfn) {
+ do {
+ ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0);
+ if (ptoi_pfn != BM_END_OF_MAP) {
+ page = pfn_to_page(ptoi_pfn);
+ if (!PagePageset1(page) &&
+ (can_be_highmem || !PageHighMem(page)))
+ return page;
+ }
+ } while (ptoi_pfn);
+ }
+
+ do {
+ page = toi_alloc_page(29, flags | __GFP_ZERO);
+ if (!page) {
+ printk(KERN_INFO "Failed to get nonconflicting "
+ "page.\n");
+ return NULL;
+ }
+ if (PagePageset1(page)) {
+ struct page **next = (struct page **) kmap(page);
+ *next = first_conflicting_page;
+ first_conflicting_page = page;
+ kunmap(page);
+ }
+ } while (PagePageset1(page));
+
+ return page;
+}
+
+unsigned long __toi_get_nonconflicting_page(void)
+{
+ struct page *page = ___toi_get_nonconflicting_page(0);
+ return page ? (unsigned long) page_address(page) : 0;
+}
+
+static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe,
+ int highmem)
+{
+ if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1))
+ + 2 * sizeof(struct pbe)) > PAGE_SIZE) {
+ struct page *new_page =
+ ___toi_get_nonconflicting_page(highmem);
+ if (!new_page)
+ return ERR_PTR(-ENOMEM);
+ this_pbe = (struct pbe *) kmap(new_page);
+ memset(this_pbe, 0, PAGE_SIZE);
+ *page_ptr = new_page;
+ } else
+ this_pbe++;
+
+ return this_pbe;
+}
+
+/**
+ * get_pageset1_load_addresses - generate pbes for conflicting pages
+ *
+ * We check here that pagedir & pages it points to won't collide
+ * with pages where we're going to restore from the loaded pages
+ * later.
+ *
+ * Returns:
+ * Zero on success, one if couldn't find enough pages (shouldn't
+ * happen).
+ **/
+int toi_get_pageset1_load_addresses(void)
+{
+ int pfn, highallocd = 0, lowallocd = 0;
+ int low_needed = pagedir1.size - get_highmem_size(pagedir1);
+ int high_needed = get_highmem_size(pagedir1);
+ int low_pages_for_highmem = 0;
+ gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM;
+ struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL,
+ *low_pbe_page, *last_low_pbe_page = NULL;
+ struct pbe **last_high_pbe_ptr = &restore_highmem_pblist,
+ *this_high_pbe = NULL;
+ unsigned long orig_low_pfn, orig_high_pfn;
+ int high_pbes_done = 0, low_pbes_done = 0;
+ int low_direct = 0, high_direct = 0, result = 0, i;
+ int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0;
+
+ toi_trace_index++;
+
+ memory_bm_position_reset(pageset1_map);
+ memory_bm_position_reset(pageset1_copy_map);
+
+ last_low_pbe_ptr = &restore_pblist;
+
+ /* First, allocate pages for the start of our pbe lists. */
+ if (high_needed) {
+ high_pbe_page = ___toi_get_nonconflicting_page(1);
+ if (!high_pbe_page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ this_high_pbe = (struct pbe *) kmap(high_pbe_page);
+ memset(this_high_pbe, 0, PAGE_SIZE);
+ }
+
+ low_pbe_page = ___toi_get_nonconflicting_page(0);
+ if (!low_pbe_page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ this_low_pbe = (struct pbe *) page_address(low_pbe_page);
+
+ /*
+ * Next, allocate the number of pages we need.
+ */
+
+ i = low_needed + high_needed;
+
+ do {
+ int is_high;
+
+ if (i == low_needed)
+ flags &= ~__GFP_HIGHMEM;
+
+ page = toi_alloc_page(30, flags);
+ BUG_ON(!page);
+
+ SetPagePageset1Copy(page);
+ is_high = PageHighMem(page);
+
+ if (PagePageset1(page)) {
+ if (is_high)
+ high_direct++;
+ else
+ low_direct++;
+ } else {
+ if (is_high)
+ highallocd++;
+ else
+ lowallocd++;
+ }
+ } while (--i);
+
+ high_needed -= high_direct;
+ low_needed -= low_direct;
+
+ /*
+ * Do we need to use some lowmem pages for the copies of highmem
+ * pages?
+ */
+ if (high_needed > highallocd) {
+ low_pages_for_highmem = high_needed - highallocd;
+ high_needed -= low_pages_for_highmem;
+ low_needed += low_pages_for_highmem;
+ }
+
+ /*
+ * Now generate our pbes (which will be used for the atomic restore),
+ * and free unneeded pages.
+ */
+ memory_bm_position_reset(pageset1_copy_map);
+ for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP;
+ pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) {
+ int is_high;
+ page = pfn_to_page(pfn);
+ is_high = PageHighMem(page);
+
+ if (PagePageset1(page))
+ continue;
+
+ /* Nope. We're going to use this page. Add a pbe. */
+ if (is_high || low_pages_for_highmem) {
+ struct page *orig_page;
+ high_pbes_done++;
+ if (!is_high)
+ low_pages_for_highmem--;
+ do {
+ orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0);
+ BUG_ON(orig_high_pfn == BM_END_OF_MAP);
+ orig_page = pfn_to_page(orig_high_pfn);
+ } while (!PageHighMem(orig_page) ||
+ PagePageset1Copy(orig_page));
+
+ this_high_pbe->orig_address = (void *) orig_high_pfn;
+ this_high_pbe->address = page;
+ this_high_pbe->next = NULL;
+ toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p",
+ high_page, high_offset, page, orig_high_pfn, orig_page);
+ if (last_high_pbe_page != high_pbe_page) {
+ *last_high_pbe_ptr =
+ (struct pbe *) high_pbe_page;
+ if (last_high_pbe_page) {
+ kunmap(last_high_pbe_page);
+ high_page++;
+ high_offset = 0;
+ } else
+ high_offset++;
+ last_high_pbe_page = high_pbe_page;
+ } else {
+ *last_high_pbe_ptr = this_high_pbe;
+ high_offset++;
+ }
+ last_high_pbe_ptr = &this_high_pbe->next;
+ this_high_pbe = get_next_pbe(&high_pbe_page,
+ this_high_pbe, 1);
+ if (IS_ERR(this_high_pbe)) {
+ printk(KERN_INFO
+ "This high pbe is an error.\n");
+ return -ENOMEM;
+ }
+ } else {
+ struct page *orig_page;
+ low_pbes_done++;
+ do {
+ orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0);
+ BUG_ON(orig_low_pfn == BM_END_OF_MAP);
+ orig_page = pfn_to_page(orig_low_pfn);
+ } while (PageHighMem(orig_page) ||
+ PagePageset1Copy(orig_page));
+
+ this_low_pbe->orig_address = page_address(orig_page);
+ this_low_pbe->address = page_address(page);
+ this_low_pbe->next = NULL;
+ toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p",
+ low_page, low_offset, this_low_pbe->orig_address,
+ orig_low_pfn, this_low_pbe->address);
+ TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address);
+ *last_low_pbe_ptr = this_low_pbe;
+ last_low_pbe_ptr = &this_low_pbe->next;
+ this_low_pbe = get_next_pbe(&low_pbe_page,
+ this_low_pbe, 0);
+ if (low_pbe_page != last_low_pbe_page) {
+ if (last_low_pbe_page) {
+ low_page++;
+ low_offset = 0;
+ } else {
+ low_offset++;
+ }
+ last_low_pbe_page = low_pbe_page;
+ } else
+ low_offset++;
+ if (IS_ERR(this_low_pbe)) {
+ printk(KERN_INFO "this_low_pbe is an error.\n");
+ return -ENOMEM;
+ }
+ }
+ }
+
+ if (high_pbe_page)
+ kunmap(high_pbe_page);
+
+ if (last_high_pbe_page != high_pbe_page) {
+ if (last_high_pbe_page)
+ kunmap(last_high_pbe_page);
+ toi__free_page(29, high_pbe_page);
+ }
+
+ free_conflicting_pages();
+
+out:
+ return result;
+}
+
+int add_boot_kernel_data_pbe(void)
+{
+ this_low_pbe->address = (char *) __toi_get_nonconflicting_page();
+ if (!this_low_pbe->address) {
+ printk(KERN_INFO "Failed to get bkd atomic restore buffer.");
+ return -ENOMEM;
+ }
+
+ toi_bkd.size = sizeof(toi_bkd);
+ memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd));
+
+ *last_low_pbe_ptr = this_low_pbe;
+ this_low_pbe->orig_address = (char *) boot_kernel_data_buffer;
+ this_low_pbe->next = NULL;
+ return 0;
+}
diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h
new file mode 100644
index 000000000..80d1a3d8c
--- /dev/null
+++ b/kernel/power/tuxonice_pagedir.h
@@ -0,0 +1,50 @@
+/*
+ * kernel/power/tuxonice_pagedir.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Declarations for routines for handling pagesets.
+ */
+
+#ifndef KERNEL_POWER_PAGEDIR_H
+#define KERNEL_POWER_PAGEDIR_H
+
+/* Pagedir
+ *
+ * Contains the metadata for a set of pages saved in the image.
+ */
+
+struct pagedir {
+ int id;
+ unsigned long size;
+#ifdef CONFIG_HIGHMEM
+ unsigned long size_high;
+#endif
+};
+
+#ifdef CONFIG_HIGHMEM
+#define get_highmem_size(pagedir) (pagedir.size_high)
+#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0)
+#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0)
+#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high)
+#else
+#define get_highmem_size(pagedir) (0)
+#define set_highmem_size(pagedir, sz) do { } while (0)
+#define inc_highmem_size(pagedir) do { } while (0)
+#define get_lowmem_size(pagedir) (pagedir.size)
+#endif
+
+extern struct pagedir pagedir1, pagedir2;
+
+extern void toi_copy_pageset1(void);
+
+extern int toi_get_pageset1_load_addresses(void);
+
+extern unsigned long __toi_get_nonconflicting_page(void);
+struct page *___toi_get_nonconflicting_page(int can_be_highmem);
+
+extern void toi_reset_alt_image_pageset2_pfn(void);
+extern int add_boot_kernel_data_pbe(void);
+#endif
diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c
new file mode 100644
index 000000000..307d09f33
--- /dev/null
+++ b/kernel/power/tuxonice_pageflags.c
@@ -0,0 +1,18 @@
+/*
+ * kernel/power/tuxonice_pageflags.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for serialising and relocating pageflags in which we
+ * store our image metadata.
+ */
+
+#include "tuxonice_pageflags.h"
+#include "power.h"
+
+int toi_pageflags_space_needed(void)
+{
+ return memory_bm_space_needed(pageset1_map);
+}
diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h
new file mode 100644
index 000000000..30ee577c3
--- /dev/null
+++ b/kernel/power/tuxonice_pageflags.h
@@ -0,0 +1,106 @@
+/*
+ * kernel/power/tuxonice_pageflags.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H
+#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H
+
+struct memory_bitmap;
+void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+void memory_bm_clear(struct memory_bitmap *bm);
+
+int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn);
+void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
+unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index);
+unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index);
+void memory_bm_position_reset(struct memory_bitmap *bm);
+void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+int toi_alloc_bitmap(struct memory_bitmap **bm);
+void toi_free_bitmap(struct memory_bitmap **bm);
+void memory_bm_clear(struct memory_bitmap *bm);
+void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
+void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
+int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn);
+int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
+void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn);
+
+struct toi_module_ops;
+int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
+ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
+int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
+ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
+int memory_bm_space_needed(struct memory_bitmap *bm);
+
+extern struct memory_bitmap *pageset1_map;
+extern struct memory_bitmap *pageset1_copy_map;
+extern struct memory_bitmap *pageset2_map;
+extern struct memory_bitmap *page_resave_map;
+extern struct memory_bitmap *io_map;
+extern struct memory_bitmap *nosave_map;
+extern struct memory_bitmap *free_map;
+extern struct memory_bitmap *compare_map;
+
+#define PagePageset1(page) \
+ (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPagePageset1(page) \
+ (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPagePageset1(page) \
+ (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PagePageset1Copy(page) \
+ (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPagePageset1Copy(page) \
+ (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPagePageset1Copy(page) \
+ (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PagePageset2(page) \
+ (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPagePageset2(page) \
+ (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPagePageset2(page) \
+ (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageWasRW(page) \
+ (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define SetPageWasRW(page) \
+ (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageWasRW(page) \
+ (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageResave(page) (page_resave_map ? \
+ memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageResave(page) \
+ (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageResave(page) \
+ (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageNosave(page) (nosave_map ? \
+ memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageNosave(page) \
+ (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageNosave(page) \
+ (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageNosaveFree(page) (free_map ? \
+ memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageNosaveFree(page) \
+ (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageNosaveFree(page) \
+ (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page)))
+
+#define PageCompareChanged(page) (compare_map ? \
+ memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0)
+#define SetPageCompareChanged(page) \
+ (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
+#define ClearPageCompareChanged(page) \
+ (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page)))
+
+extern void save_pageflags(struct memory_bitmap *pagemap);
+extern int load_pageflags(struct memory_bitmap *pagemap);
+extern int toi_pageflags_space_needed(void);
+#endif
diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c
new file mode 100644
index 000000000..f8e969625
--- /dev/null
+++ b/kernel/power/tuxonice_power_off.c
@@ -0,0 +1,286 @@
+/*
+ * kernel/power/tuxonice_power_off.c
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Support for powering down.
+ */
+
+#include <linux/device.h>
+#include <linux/suspend.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+#include <linux/reboot.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/fs.h>
+#include "tuxonice.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_io.h"
+
+unsigned long toi_poweroff_method; /* 0 - Kernel power off */
+
+static int wake_delay;
+static char lid_state_file[256], wake_alarm_dir[256];
+static struct file *lid_file, *alarm_file, *epoch_file;
+static int post_wake_state = -1;
+
+static int did_suspend_to_both;
+
+/*
+ * __toi_power_down
+ * Functionality : Powers down or reboots the computer once the image
+ * has been written to disk.
+ * Key Assumptions : Able to reboot/power down via code called or that
+ * the warning emitted if the calls fail will be visible
+ * to the user (ie printk resumes devices).
+ */
+
+static void __toi_power_down(int method)
+{
+ int error;
+
+ toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." :
+ "Powering down.");
+
+ if (test_result_state(TOI_ABORTED))
+ goto out;
+
+ if (test_action_state(TOI_REBOOT))
+ kernel_restart(NULL);
+
+ switch (method) {
+ case 0:
+ break;
+ case 3:
+ /*
+ * Re-read the overwritten part of pageset2 to make post-resume
+ * faster.
+ */
+ if (read_pageset2(1))
+ panic("Attempt to reload pagedir 2 failed. "
+ "Try rebooting.");
+
+ pm_prepare_console();
+
+ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+ if (!error) {
+ pm_restore_gfp_mask();
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ pm_restrict_gfp_mask();
+ if (!error)
+ did_suspend_to_both = 1;
+ }
+ pm_notifier_call_chain(PM_POST_SUSPEND);
+ pm_restore_console();
+
+ /* Success - we're now post-resume-from-ram */
+ if (did_suspend_to_both)
+ return;
+
+ /* Failed to suspend to ram - do normal power off */
+ break;
+ case 4:
+ /*
+ * If succeeds, doesn't return. If fails, do a simple
+ * powerdown.
+ */
+ hibernation_platform_enter();
+ break;
+ case 5:
+ /* Historic entry only now */
+ break;
+ }
+
+ if (method && method != 5)
+ toi_cond_pause(1,
+ "Falling back to alternate power off method.");
+
+ if (test_result_state(TOI_ABORTED))
+ goto out;
+
+ if (pm_power_off)
+ kernel_power_off();
+ kernel_halt();
+ toi_cond_pause(1, "Powerdown failed.");
+ while (1)
+ cpu_relax();
+
+out:
+ if (read_pageset2(1))
+ panic("Attempt to reload pagedir 2 failed. Try rebooting.");
+ return;
+}
+
+#define CLOSE_FILE(file) \
+ if (file) { \
+ filp_close(file, NULL); file = NULL; \
+ }
+
+static void powerdown_cleanup(int toi_or_resume)
+{
+ if (!toi_or_resume)
+ return;
+
+ CLOSE_FILE(lid_file);
+ CLOSE_FILE(alarm_file);
+ CLOSE_FILE(epoch_file);
+}
+
+static void open_file(char *format, char *arg, struct file **var, int mode,
+ char *desc)
+{
+ char buf[256];
+
+ if (strlen(arg)) {
+ sprintf(buf, format, arg);
+ *var = filp_open(buf, mode, 0);
+ if (IS_ERR(*var) || !*var) {
+ printk(KERN_INFO "Failed to open %s file '%s' (%p).\n",
+ desc, buf, *var);
+ *var = NULL;
+ }
+ }
+}
+
+static int powerdown_init(int toi_or_resume)
+{
+ if (!toi_or_resume)
+ return 0;
+
+ did_suspend_to_both = 0;
+
+ open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file,
+ O_RDONLY, "lid");
+
+ if (strlen(wake_alarm_dir)) {
+ open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir,
+ &alarm_file, O_WRONLY, "alarm");
+
+ open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir,
+ &epoch_file, O_RDONLY, "epoch");
+ }
+
+ return 0;
+}
+
+static int lid_closed(void)
+{
+ char array[25];
+ ssize_t size;
+ loff_t pos = 0;
+
+ if (!lid_file)
+ return 0;
+
+ size = vfs_read(lid_file, (char __user *) array, 25, &pos);
+ if ((int) size < 1) {
+ printk(KERN_INFO "Failed to read lid state file (%d).\n",
+ (int) size);
+ return 0;
+ }
+
+ if (!strcmp(array, "state: closed\n"))
+ return 1;
+
+ return 0;
+}
+
+static void write_alarm_file(int value)
+{
+ ssize_t size;
+ char buf[40];
+ loff_t pos = 0;
+
+ if (!alarm_file)
+ return;
+
+ sprintf(buf, "%d\n", value);
+
+ size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos);
+
+ if (size < 0)
+ printk(KERN_INFO "Error %d writing alarm value %s.\n",
+ (int) size, buf);
+}
+
+/**
+ * toi_check_resleep: See whether to powerdown again after waking.
+ *
+ * After waking, check whether we should powerdown again in a (usually
+ * different) way. We only do this if the lid switch is still closed.
+ */
+void toi_check_resleep(void)
+{
+ /* We only return if we suspended to ram and woke. */
+ if (lid_closed() && post_wake_state >= 0)
+ __toi_power_down(post_wake_state);
+}
+
+void toi_power_down(void)
+{
+ if (alarm_file && wake_delay) {
+ char array[25];
+ loff_t pos = 0;
+ size_t size = vfs_read(epoch_file, (char __user *) array, 25,
+ &pos);
+
+ if (((int) size) < 1)
+ printk(KERN_INFO "Failed to read epoch file (%d).\n",
+ (int) size);
+ else {
+ unsigned long since_epoch;
+ if (!kstrtoul(array, 0, &since_epoch)) {
+ /* Clear any wakeup time. */
+ write_alarm_file(0);
+
+ /* Set new wakeup time. */
+ write_alarm_file(since_epoch + wake_delay);
+ }
+ }
+ }
+
+ __toi_power_down(toi_poweroff_method);
+
+ toi_check_resleep();
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+#if defined(CONFIG_ACPI)
+ SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL),
+ SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL),
+ SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL),
+ SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0,
+ NULL),
+ SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0),
+ SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both,
+ 0, 0, 0, NULL)
+#endif
+};
+
+static struct toi_module_ops powerdown_ops = {
+ .type = MISC_HIDDEN_MODULE,
+ .name = "poweroff",
+ .initialise = powerdown_init,
+ .cleanup = powerdown_cleanup,
+ .directory = "[ROOT]",
+ .module = THIS_MODULE,
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+int toi_poweroff_init(void)
+{
+ return toi_register_module(&powerdown_ops);
+}
+
+void toi_poweroff_exit(void)
+{
+ toi_unregister_module(&powerdown_ops);
+}
diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h
new file mode 100644
index 000000000..6e1d8bb39
--- /dev/null
+++ b/kernel/power/tuxonice_power_off.h
@@ -0,0 +1,24 @@
+/*
+ * kernel/power/tuxonice_power_off.h
+ *
+ * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Support for the powering down.
+ */
+
+int toi_pm_state_finish(void);
+void toi_power_down(void);
+extern unsigned long toi_poweroff_method;
+int toi_poweroff_init(void);
+void toi_poweroff_exit(void);
+void toi_check_resleep(void);
+
+extern int platform_begin(int platform_mode);
+extern int platform_pre_snapshot(int platform_mode);
+extern void platform_leave(int platform_mode);
+extern void platform_end(int platform_mode);
+extern void platform_finish(int platform_mode);
+extern int platform_pre_restore(int platform_mode);
+extern void platform_restore_cleanup(int platform_mode);
diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c
new file mode 100644
index 000000000..e0593252f
--- /dev/null
+++ b/kernel/power/tuxonice_prepare_image.c
@@ -0,0 +1,1080 @@
+/*
+ * kernel/power/tuxonice_prepare_image.c
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * We need to eat memory until we can:
+ * 1. Perform the save without changing anything (RAM_NEEDED < #pages)
+ * 2. Fit it all in available space (toiActiveAllocator->available_space() >=
+ * main_storage_needed())
+ * 3. Reload the pagedir and pageset1 to places that don't collide with their
+ * final destinations, not knowing to what extent the resumed kernel will
+ * overlap with the one loaded at boot time. I think the resumed kernel
+ * should overlap completely, but I don't want to rely on this as it is
+ * an unproven assumption. We therefore assume there will be no overlap at
+ * all (worse case).
+ * 4. Meet the user's requested limit (if any) on the size of the image.
+ * The limit is in MB, so pages/256 (assuming 4K pages).
+ *
+ */
+
+#include <linux/highmem.h>
+#include <linux/freezer.h>
+#include <linux/hardirq.h>
+#include <linux/mmzone.h>
+#include <linux/console.h>
+#include <linux/tuxonice.h>
+
+#include "tuxonice_pageflags.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_checksum.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_atomic_copy.h"
+#include "tuxonice_builtin.h"
+
+static unsigned long num_nosave, main_storage_allocated, storage_limit,
+ header_storage_needed;
+unsigned long extra_pd1_pages_allowance =
+ CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE;
+long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT;
+static int no_ps2_needed;
+
+struct attention_list {
+ struct task_struct *task;
+ struct attention_list *next;
+};
+
+static struct attention_list *attention_list;
+
+#define PAGESET1 0
+#define PAGESET2 1
+
+void free_attention_list(void)
+{
+ struct attention_list *last = NULL;
+
+ while (attention_list) {
+ last = attention_list;
+ attention_list = attention_list->next;
+ toi_kfree(6, last, sizeof(*last));
+ }
+}
+
+static int build_attention_list(void)
+{
+ int i, task_count = 0;
+ struct task_struct *p;
+ struct attention_list *next;
+
+ /*
+ * Count all userspace process (with task->mm) marked PF_NOFREEZE.
+ */
+ toi_read_lock_tasklist();
+ for_each_process(p)
+ if ((p->flags & PF_NOFREEZE) || p == current)
+ task_count++;
+ toi_read_unlock_tasklist();
+
+ /*
+ * Allocate attention list structs.
+ */
+ for (i = 0; i < task_count; i++) {
+ struct attention_list *this =
+ toi_kzalloc(6, sizeof(struct attention_list),
+ TOI_WAIT_GFP);
+ if (!this) {
+ printk(KERN_INFO "Failed to allocate slab for "
+ "attention list.\n");
+ free_attention_list();
+ return 1;
+ }
+ this->next = NULL;
+ if (attention_list)
+ this->next = attention_list;
+ attention_list = this;
+ }
+
+ next = attention_list;
+ toi_read_lock_tasklist();
+ for_each_process(p)
+ if ((p->flags & PF_NOFREEZE) || p == current) {
+ next->task = p;
+ next = next->next;
+ }
+ toi_read_unlock_tasklist();
+ return 0;
+}
+
+static void pageset2_full(void)
+{
+ struct zone *zone;
+ struct page *page;
+ unsigned long flags;
+ int i;
+
+ toi_trace_index++;
+
+ for_each_populated_zone(zone) {
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ for_each_lru(i) {
+ if (!zone_page_state(zone, NR_LRU_BASE + i))
+ continue;
+
+ list_for_each_entry(page, &zone->lruvec.lists[i], lru) {
+ struct address_space *mapping;
+
+ mapping = page_mapping(page);
+ if (!mapping || !mapping->host ||
+ !(mapping->host->i_flags & S_ATOMIC_COPY)) {
+ if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
+ TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified.");
+ } else {
+ TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full.");
+ SetPagePageset2(page);
+ }
+ }
+ }
+ }
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ }
+}
+
+/*
+ * toi_mark_task_as_pageset
+ * Functionality : Marks all the saveable pages belonging to a given process
+ * as belonging to a particular pageset.
+ */
+
+static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+
+ mm = t->active_mm;
+
+ if (!mm || !mm->mmap)
+ return;
+
+ toi_trace_index++;
+
+ if (!irqs_disabled())
+ down_read(&mm->mmap_sem);
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long posn;
+
+ if (!vma->vm_start ||
+ vma->vm_flags & VM_PFNMAP)
+ continue;
+
+ for (posn = vma->vm_start; posn < vma->vm_end;
+ posn += PAGE_SIZE) {
+ struct page *page = follow_page(vma, posn, 0);
+ struct address_space *mapping;
+
+ if (!page || !pfn_valid(page_to_pfn(page)))
+ continue;
+
+ mapping = page_mapping(page);
+ if (mapping && mapping->host &&
+ mapping->host->i_flags & S_ATOMIC_COPY && pageset2)
+ continue;
+
+ if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
+ TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2);
+ continue;
+ }
+
+ if (pageset2) {
+ TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1");
+ SetPagePageset2(page);
+ } else {
+ TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2");
+ ClearPagePageset2(page);
+ SetPagePageset1(page);
+ }
+ }
+ }
+
+ if (!irqs_disabled())
+ up_read(&mm->mmap_sem);
+}
+
+static void mark_tasks(int pageset)
+{
+ struct task_struct *p;
+
+ toi_read_lock_tasklist();
+ for_each_process(p) {
+ if (!p->mm)
+ continue;
+
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ toi_mark_task_as_pageset(p, pageset);
+ }
+ toi_read_unlock_tasklist();
+
+}
+
+/* mark_pages_for_pageset2
+ *
+ * Description: Mark unshared pages in processes not needed for hibernate as
+ * being able to be written out in a separate pagedir.
+ * HighMem pages are simply marked as pageset2. They won't be
+ * needed during hibernate.
+ */
+
+static void toi_mark_pages_for_pageset2(void)
+{
+ struct attention_list *this = attention_list;
+
+ memory_bm_clear(pageset2_map);
+
+ if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed)
+ return;
+
+ if (test_action_state(TOI_PAGESET2_FULL))
+ pageset2_full();
+ else
+ mark_tasks(PAGESET2);
+
+ /*
+ * Because the tasks in attention_list are ones related to hibernating,
+ * we know that they won't go away under us.
+ */
+
+ while (this) {
+ if (!test_result_state(TOI_ABORTED))
+ toi_mark_task_as_pageset(this->task, PAGESET1);
+ this = this->next;
+ }
+}
+
+/*
+ * The atomic copy of pageset1 is stored in pageset2 pages.
+ * But if pageset1 is larger (normally only just after boot),
+ * we need to allocate extra pages to store the atomic copy.
+ * The following data struct and functions are used to handle
+ * the allocation and freeing of that memory.
+ */
+
+static unsigned long extra_pages_allocated;
+
+struct extras {
+ struct page *page;
+ int order;
+ struct extras *next;
+};
+
+static struct extras *extras_list;
+
+/* toi_free_extra_pagedir_memory
+ *
+ * Description: Free previously allocated extra pagedir memory.
+ */
+void toi_free_extra_pagedir_memory(void)
+{
+ /* Free allocated pages */
+ while (extras_list) {
+ struct extras *this = extras_list;
+ int i;
+
+ extras_list = this->next;
+
+ for (i = 0; i < (1 << this->order); i++)
+ ClearPageNosave(this->page + i);
+
+ toi_free_pages(9, this->page, this->order);
+ toi_kfree(7, this, sizeof(*this));
+ }
+
+ extra_pages_allocated = 0;
+}
+
+/* toi_allocate_extra_pagedir_memory
+ *
+ * Description: Allocate memory for making the atomic copy of pagedir1 in the
+ * case where it is bigger than pagedir2.
+ * Arguments: int num_to_alloc: Number of extra pages needed.
+ * Result: int. Number of extra pages we now have allocated.
+ */
+static int toi_allocate_extra_pagedir_memory(int extra_pages_needed)
+{
+ int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated;
+ gfp_t flags = TOI_ATOMIC_GFP;
+
+ if (num_to_alloc < 1)
+ return 0;
+
+ order = fls(num_to_alloc);
+ if (order >= MAX_ORDER)
+ order = MAX_ORDER - 1;
+
+ while (num_to_alloc) {
+ struct page *newpage;
+ unsigned long virt;
+ struct extras *extras_entry;
+
+ while ((1 << order) > num_to_alloc)
+ order--;
+
+ extras_entry = (struct extras *) toi_kzalloc(7,
+ sizeof(struct extras), TOI_ATOMIC_GFP);
+
+ if (!extras_entry)
+ return extra_pages_allocated;
+
+ virt = toi_get_free_pages(9, flags, order);
+ while (!virt && order) {
+ order--;
+ virt = toi_get_free_pages(9, flags, order);
+ }
+
+ if (!virt) {
+ toi_kfree(7, extras_entry, sizeof(*extras_entry));
+ return extra_pages_allocated;
+ }
+
+ newpage = virt_to_page(virt);
+
+ extras_entry->page = newpage;
+ extras_entry->order = order;
+ extras_entry->next = extras_list;
+
+ extras_list = extras_entry;
+
+ for (j = 0; j < (1 << order); j++) {
+ SetPageNosave(newpage + j);
+ SetPagePageset1Copy(newpage + j);
+ }
+
+ extra_pages_allocated += (1 << order);
+ num_to_alloc -= (1 << order);
+ }
+
+ return extra_pages_allocated;
+}
+
+/*
+ * real_nr_free_pages: Count pcp pages for a zone type or all zones
+ * (-1 for all, otherwise zone_idx() result desired).
+ */
+unsigned long real_nr_free_pages(unsigned long zone_idx_mask)
+{
+ struct zone *zone;
+ int result = 0, cpu;
+
+ /* PCP lists */
+ for_each_populated_zone(zone) {
+ if (!(zone_idx_mask & (1 << zone_idx(zone))))
+ continue;
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_pageset *pset =
+ per_cpu_ptr(zone->pageset, cpu);
+ struct per_cpu_pages *pcp = &pset->pcp;
+ result += pcp->count;
+ }
+
+ result += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ return result;
+}
+
+/*
+ * Discover how much extra memory will be required by the drivers
+ * when they're asked to hibernate. We can then ensure that amount
+ * of memory is available when we really want it.
+ */
+static void get_extra_pd1_allowance(void)
+{
+ unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final;
+
+ toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
+
+ if (toi_go_atomic(PMSG_FREEZE, 1))
+ return;
+
+ final = real_nr_free_pages(all_zones_mask);
+ toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0);
+
+ extra_pd1_pages_allowance = (orig_num_free > final) ?
+ orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE :
+ MIN_EXTRA_PAGES_ALLOWANCE;
+}
+
+/*
+ * Amount of storage needed, possibly taking into account the
+ * expected compression ratio and possibly also ignoring our
+ * allowance for extra pages.
+ */
+static unsigned long main_storage_needed(int use_ecr,
+ int ignore_extra_pd1_allow)
+{
+ return (pagedir1.size + pagedir2.size +
+ (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
+ (use_ecr ? toi_expected_compression_ratio() : 100) / 100;
+}
+
+/*
+ * Storage needed for the image header, in bytes until the return.
+ */
+unsigned long get_header_storage_needed(void)
+{
+ unsigned long bytes = sizeof(struct toi_header) +
+ toi_header_storage_for_modules() +
+ toi_pageflags_space_needed() +
+ fs_info_space_needed();
+
+ return DIV_ROUND_UP(bytes, PAGE_SIZE);
+}
+
+/*
+ * When freeing memory, pages from either pageset might be freed.
+ *
+ * When seeking to free memory to be able to hibernate, for every ps1 page
+ * freed, we need 2 less pages for the atomic copy because there is one less
+ * page to copy and one more page into which data can be copied.
+ *
+ * Freeing ps2 pages saves us nothing directly. No more memory is available
+ * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but
+ * that's too much work to figure out.
+ *
+ * => ps1_to_free functions
+ *
+ * Of course if we just want to reduce the image size, because of storage
+ * limitations or an image size limit either ps will do.
+ *
+ * => any_to_free function
+ */
+
+static unsigned long lowpages_usable_for_highmem_copy(void)
+{
+ unsigned long needed = get_lowmem_size(pagedir1) +
+ extra_pd1_pages_allowance + MIN_FREE_RAM +
+ toi_memory_for_modules(0),
+ available = get_lowmem_size(pagedir2) +
+ real_nr_free_low_pages() + extra_pages_allocated;
+
+ return available > needed ? available - needed : 0;
+}
+
+static unsigned long highpages_ps1_to_free(void)
+{
+ unsigned long need = get_highmem_size(pagedir1),
+ available = get_highmem_size(pagedir2) +
+ real_nr_free_high_pages() +
+ lowpages_usable_for_highmem_copy();
+
+ return need > available ? DIV_ROUND_UP(need - available, 2) : 0;
+}
+
+static unsigned long lowpages_ps1_to_free(void)
+{
+ unsigned long needed = get_lowmem_size(pagedir1) +
+ extra_pd1_pages_allowance + MIN_FREE_RAM +
+ toi_memory_for_modules(0),
+ available = get_lowmem_size(pagedir2) +
+ real_nr_free_low_pages() + extra_pages_allocated;
+
+ return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0;
+}
+
+static unsigned long current_image_size(void)
+{
+ return pagedir1.size + pagedir2.size + header_storage_needed;
+}
+
+static unsigned long storage_still_required(void)
+{
+ unsigned long needed = main_storage_needed(1, 1);
+ return needed > storage_limit ? needed - storage_limit : 0;
+}
+
+static unsigned long ram_still_required(void)
+{
+ unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) +
+ 2 * extra_pd1_pages_allowance,
+ available = real_nr_free_low_pages() + extra_pages_allocated;
+ return needed > available ? needed - available : 0;
+}
+
+unsigned long any_to_free(int use_image_size_limit)
+{
+ int use_soft_limit = use_image_size_limit && image_size_limit > 0;
+ unsigned long current_size = current_image_size(),
+ soft_limit = use_soft_limit ? (image_size_limit << 8) : 0,
+ to_free = use_soft_limit ? (current_size > soft_limit ?
+ current_size - soft_limit : 0) : 0,
+ storage_limit = storage_still_required(),
+ ram_limit = ram_still_required(),
+ first_max = max(to_free, storage_limit);
+
+ return max(first_max, ram_limit);
+}
+
+static int need_pageset2(void)
+{
+ return (real_nr_free_low_pages() + extra_pages_allocated -
+ 2 * extra_pd1_pages_allowance - MIN_FREE_RAM -
+ toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size;
+}
+
+/* amount_needed
+ *
+ * Calculates the amount by which the image size needs to be reduced to meet
+ * our constraints.
+ */
+static unsigned long amount_needed(int use_image_size_limit)
+{
+ return max(highpages_ps1_to_free() + lowpages_ps1_to_free(),
+ any_to_free(use_image_size_limit));
+}
+
+static int image_not_ready(int use_image_size_limit)
+{
+ toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
+ "Amount still needed (%lu) > 0:%u,"
+ " Storage allocd: %lu < %lu: %u.\n",
+ amount_needed(use_image_size_limit),
+ (amount_needed(use_image_size_limit) > 0),
+ main_storage_allocated,
+ main_storage_needed(1, 1),
+ main_storage_allocated < main_storage_needed(1, 1));
+
+ toi_cond_pause(0, NULL);
+
+ return (amount_needed(use_image_size_limit) > 0) ||
+ main_storage_allocated < main_storage_needed(1, 1);
+}
+
+static void display_failure_reason(int tries_exceeded)
+{
+ unsigned long storage_required = storage_still_required(),
+ ram_required = ram_still_required(),
+ high_ps1 = highpages_ps1_to_free(),
+ low_ps1 = lowpages_ps1_to_free();
+
+ printk(KERN_INFO "Failed to prepare the image because...\n");
+
+ if (!storage_limit) {
+ printk(KERN_INFO "- You need some storage available to be "
+ "able to hibernate.\n");
+ return;
+ }
+
+ if (tries_exceeded)
+ printk(KERN_INFO "- The maximum number of iterations was "
+ "reached without successfully preparing the "
+ "image.\n");
+
+ if (storage_required) {
+ printk(KERN_INFO " - We need at least %lu pages of storage "
+ "(ignoring the header), but only have %lu.\n",
+ main_storage_needed(1, 1),
+ main_storage_allocated);
+ set_abort_result(TOI_INSUFFICIENT_STORAGE);
+ }
+
+ if (ram_required) {
+ printk(KERN_INFO " - We need %lu more free pages of low "
+ "memory.\n", ram_required);
+ printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM);
+ printk(KERN_INFO " + Reqd. by modules : %8lu\n",
+ toi_memory_for_modules(0));
+ printk(KERN_INFO " + 2 * extra allow : %8lu\n",
+ 2 * extra_pd1_pages_allowance);
+ printk(KERN_INFO " - Currently free : %8lu\n",
+ real_nr_free_low_pages());
+ printk(KERN_INFO " - Pages allocd : %8lu\n",
+ extra_pages_allocated);
+ printk(KERN_INFO " : ========\n");
+ printk(KERN_INFO " Still needed : %8lu\n",
+ ram_required);
+
+ /* Print breakdown of memory needed for modules */
+ toi_memory_for_modules(1);
+ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
+ }
+
+ if (high_ps1) {
+ printk(KERN_INFO "- We need to free %lu highmem pageset 1 "
+ "pages.\n", high_ps1);
+ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
+ }
+
+ if (low_ps1) {
+ printk(KERN_INFO " - We need to free %ld lowmem pageset 1 "
+ "pages.\n", low_ps1);
+ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY);
+ }
+}
+
+static void display_stats(int always, int sub_extra_pd1_allow)
+{
+ char buffer[255];
+ snprintf(buffer, 254,
+ "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). "
+ "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). "
+ "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n",
+
+ /* Free */
+ real_nr_free_pages(all_zones_mask),
+ real_nr_free_low_pages(),
+
+ /* Sets */
+ pagedir1.size, pagedir1.size - get_highmem_size(pagedir1),
+ pagedir2.size, pagedir2.size - get_highmem_size(pagedir2),
+
+ /* Nosave */
+ num_nosave, extra_pages_allocated,
+ num_nosave - extra_pages_allocated,
+
+ /* Storage */
+ main_storage_allocated,
+ storage_limit,
+ main_storage_needed(1, sub_extra_pd1_allow),
+ main_storage_needed(1, 1),
+
+ /* Needed */
+ lowpages_ps1_to_free(), highpages_ps1_to_free(),
+ any_to_free(1),
+ MIN_FREE_RAM, toi_memory_for_modules(0),
+ extra_pd1_pages_allowance,
+ image_size_limit,
+
+ need_pageset2() ? "yes" : "no");
+
+ if (always)
+ printk("%s", buffer);
+ else
+ toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer);
+}
+
+/* flag_image_pages
+ *
+ * This routine generates our lists of pages to be stored in each
+ * pageset. Since we store the data using extents, and adding new
+ * extents might allocate a new extent page, this routine may well
+ * be called more than once.
+ */
+static void flag_image_pages(int atomic_copy)
+{
+ int num_free = 0, num_unmodified = 0;
+ unsigned long loop;
+ struct zone *zone;
+
+ pagedir1.size = 0;
+ pagedir2.size = 0;
+
+ set_highmem_size(pagedir1, 0);
+ set_highmem_size(pagedir2, 0);
+
+ num_nosave = 0;
+ toi_trace_index++;
+
+ memory_bm_clear(pageset1_map);
+
+ toi_generate_free_page_map();
+
+ /*
+ * Pages not to be saved are marked Nosave irrespective of being
+ * reserved.
+ */
+ for_each_populated_zone(zone) {
+ int highmem = is_highmem(zone);
+
+ for (loop = 0; loop < zone->spanned_pages; loop++) {
+ unsigned long pfn = zone->zone_start_pfn + loop;
+ struct page *page;
+ int chunk_size;
+
+ if (!pfn_valid(pfn)) {
+ TOI_TRACE_DEBUG(pfn, "_Flag Invalid");
+ continue;
+ }
+
+ chunk_size = toi_size_of_free_region(zone, pfn);
+ if (chunk_size) {
+ unsigned long y;
+ for (y = pfn; y < pfn + chunk_size; y++) {
+ page = pfn_to_page(y);
+ TOI_TRACE_DEBUG(y, "_Flag Free");
+ ClearPagePageset1(page);
+ ClearPagePageset2(page);
+ }
+ num_free += chunk_size;
+ loop += chunk_size - 1;
+ continue;
+ }
+
+ page = pfn_to_page(pfn);
+
+ if (PageNosave(page)) {
+ char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave";
+ TOI_TRACE_DEBUG(pfn, "_Flag %s", desc);
+ num_nosave++;
+ continue;
+ }
+
+ page = highmem ? saveable_highmem_page(zone, pfn) :
+ saveable_page(zone, pfn);
+
+ if (!page) {
+ TOI_TRACE_DEBUG(pfn, "_Flag Nosave2");
+ num_nosave++;
+ continue;
+ }
+
+ if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) {
+ TOI_TRACE_DEBUG(pfn, "_Unmodified");
+ num_unmodified++;
+ continue;
+ }
+
+ if (PagePageset2(page)) {
+ pagedir2.size++;
+ TOI_TRACE_DEBUG(pfn, "_Flag PS2");
+ if (PageHighMem(page))
+ inc_highmem_size(pagedir2);
+ else
+ SetPagePageset1Copy(page);
+ if (PageResave(page)) {
+ SetPagePageset1(page);
+ ClearPagePageset1Copy(page);
+ pagedir1.size++;
+ if (PageHighMem(page))
+ inc_highmem_size(pagedir1);
+ }
+ } else {
+ pagedir1.size++;
+ TOI_TRACE_DEBUG(pfn, "_Flag PS1");
+ SetPagePageset1(page);
+ if (PageHighMem(page))
+ inc_highmem_size(pagedir1);
+ }
+ }
+ }
+
+ if (!atomic_copy)
+ toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0,
+ "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)"
+ " + Unmodified (%d) + NumFree (%d) = %d.\n",
+ pagedir1.size, pagedir2.size, num_nosave, num_unmodified,
+ num_free, pagedir1.size + pagedir2.size + num_nosave + num_free);
+}
+
+void toi_recalculate_image_contents(int atomic_copy)
+{
+ memory_bm_clear(pageset1_map);
+ if (!atomic_copy) {
+ unsigned long pfn;
+ memory_bm_position_reset(pageset2_map);
+ for (pfn = memory_bm_next_pfn(pageset2_map, 0);
+ pfn != BM_END_OF_MAP;
+ pfn = memory_bm_next_pfn(pageset2_map, 0))
+ ClearPagePageset1Copy(pfn_to_page(pfn));
+ /* Need to call this before getting pageset1_size! */
+ toi_mark_pages_for_pageset2();
+ }
+ memory_bm_position_reset(pageset2_map);
+ flag_image_pages(atomic_copy);
+
+ if (!atomic_copy) {
+ storage_limit = toiActiveAllocator->storage_available();
+ display_stats(0, 0);
+ }
+}
+
+int try_allocate_extra_memory(void)
+{
+ unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance -
+ get_lowmem_size(pagedir2);
+ if (wanted > extra_pages_allocated) {
+ unsigned long got = toi_allocate_extra_pagedir_memory(wanted);
+ if (wanted < got) {
+ toi_message(TOI_EAT_MEMORY, TOI_LOW, 1,
+ "Want %d extra pages for pageset1, got %d.\n",
+ wanted, got);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* update_image
+ *
+ * Allocate [more] memory and storage for the image.
+ */
+static void update_image(int ps2_recalc)
+{
+ int old_header_req;
+ unsigned long seek;
+
+ if (try_allocate_extra_memory())
+ return;
+
+ if (ps2_recalc)
+ goto recalc;
+
+ thaw_kernel_threads();
+
+ /*
+ * Allocate remaining storage space, if possible, up to the
+ * maximum we know we'll need. It's okay to allocate the
+ * maximum if the writer is the swapwriter, but
+ * we don't want to grab all available space on an NFS share.
+ * We therefore ignore the expected compression ratio here,
+ * thereby trying to allocate the maximum image size we could
+ * need (assuming compression doesn't expand the image), but
+ * don't complain if we can't get the full amount we're after.
+ */
+
+ do {
+ int result;
+
+ old_header_req = header_storage_needed;
+ toiActiveAllocator->reserve_header_space(header_storage_needed);
+
+ /* How much storage is free with the reservation applied? */
+ storage_limit = toiActiveAllocator->storage_available();
+ seek = min(storage_limit, main_storage_needed(0, 0));
+
+ result = toiActiveAllocator->allocate_storage(seek);
+ if (result)
+ printk("Failed to allocate storage (%d).\n", result);
+
+ main_storage_allocated =
+ toiActiveAllocator->storage_allocated();
+
+ /* Need more header because more storage allocated? */
+ header_storage_needed = get_header_storage_needed();
+
+ } while (header_storage_needed > old_header_req);
+
+ if (freeze_kernel_threads())
+ set_abort_result(TOI_FREEZING_FAILED);
+
+recalc:
+ toi_recalculate_image_contents(0);
+}
+
+/* attempt_to_freeze
+ *
+ * Try to freeze processes.
+ */
+
+static int attempt_to_freeze(void)
+{
+ int result;
+
+ /* Stop processes before checking again */
+ toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
+ "filesystems.");
+ result = freeze_processes();
+
+ if (result)
+ set_abort_result(TOI_FREEZING_FAILED);
+
+ result = freeze_kernel_threads();
+
+ if (result)
+ set_abort_result(TOI_FREEZING_FAILED);
+
+ return result;
+}
+
+/* eat_memory
+ *
+ * Try to free some memory, either to meet hard or soft constraints on the image
+ * characteristics.
+ *
+ * Hard constraints:
+ * - Pageset1 must be < half of memory;
+ * - We must have enough memory free at resume time to have pageset1
+ * be able to be loaded in pages that don't conflict with where it has to
+ * be restored.
+ * Soft constraints
+ * - User specificied image size limit.
+ */
+static void eat_memory(void)
+{
+ unsigned long amount_wanted = 0;
+ int did_eat_memory = 0;
+
+ /*
+ * Note that if we have enough storage space and enough free memory, we
+ * may exit without eating anything. We give up when the last 10
+ * iterations ate no extra pages because we're not going to get much
+ * more anyway, but the few pages we get will take a lot of time.
+ *
+ * We freeze processes before beginning, and then unfreeze them if we
+ * need to eat memory until we think we have enough. If our attempts
+ * to freeze fail, we give up and abort.
+ */
+
+ amount_wanted = amount_needed(1);
+
+ switch (image_size_limit) {
+ case -1: /* Don't eat any memory */
+ if (amount_wanted > 0) {
+ set_abort_result(TOI_WOULD_EAT_MEMORY);
+ return;
+ }
+ break;
+ case -2: /* Free caches only */
+ drop_pagecache();
+ toi_recalculate_image_contents(0);
+ amount_wanted = amount_needed(1);
+ break;
+ default:
+ break;
+ }
+
+ if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
+ image_size_limit != -1) {
+ unsigned long request = amount_wanted;
+ unsigned long high_req = max(highpages_ps1_to_free(),
+ any_to_free(1));
+ unsigned long low_req = lowpages_ps1_to_free();
+ unsigned long got = 0;
+
+ toi_prepare_status(CLEAR_BAR,
+ "Seeking to free %ldMB of memory.",
+ MB(amount_wanted));
+
+ thaw_kernel_threads();
+
+ /*
+ * Ask for too many because shrink_memory_mask doesn't
+ * currently return enough most of the time.
+ */
+
+ if (low_req)
+ got = shrink_memory_mask(low_req, GFP_KERNEL);
+ if (high_req)
+ shrink_memory_mask(high_req - got, GFP_HIGHUSER);
+
+ did_eat_memory = 1;
+
+ toi_recalculate_image_contents(0);
+
+ amount_wanted = amount_needed(1);
+
+ printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &"
+ " %ld pages from anywhere, got %ld.\n",
+ high_req, low_req,
+ request - amount_wanted);
+
+ toi_cond_pause(0, NULL);
+
+ if (freeze_kernel_threads())
+ set_abort_result(TOI_FREEZING_FAILED);
+ }
+
+ if (did_eat_memory)
+ toi_recalculate_image_contents(0);
+}
+
+/* toi_prepare_image
+ *
+ * Entry point to the whole image preparation section.
+ *
+ * We do four things:
+ * - Freeze processes;
+ * - Ensure image size constraints are met;
+ * - Complete all the preparation for saving the image,
+ * including allocation of storage. The only memory
+ * that should be needed when we're finished is that
+ * for actually storing the image (and we know how
+ * much is needed for that because the modules tell
+ * us).
+ * - Make sure that all dirty buffers are written out.
+ */
+#define MAX_TRIES 2
+int toi_prepare_image(void)
+{
+ int result = 1, tries = 1;
+
+ main_storage_allocated = 0;
+ no_ps2_needed = 0;
+
+ if (attempt_to_freeze())
+ return 1;
+
+ lock_device_hotplug();
+ set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED);
+
+ if (!extra_pd1_pages_allowance)
+ get_extra_pd1_allowance();
+
+ storage_limit = toiActiveAllocator->storage_available();
+
+ if (!storage_limit) {
+ printk(KERN_INFO "No storage available. Didn't try to prepare "
+ "an image.\n");
+ display_failure_reason(0);
+ set_abort_result(TOI_NOSTORAGE_AVAILABLE);
+ return 1;
+ }
+
+ if (build_attention_list()) {
+ abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
+ "Unable to successfully prepare the image.\n");
+ return 1;
+ }
+
+ toi_recalculate_image_contents(0);
+
+ do {
+ toi_prepare_status(CLEAR_BAR,
+ "Preparing Image. Try %d.", tries);
+
+ eat_memory();
+
+ if (test_result_state(TOI_ABORTED))
+ break;
+
+ update_image(0);
+
+ tries++;
+
+ } while (image_not_ready(1) && tries <= MAX_TRIES &&
+ !test_result_state(TOI_ABORTED));
+
+ result = image_not_ready(0);
+
+ /* TODO: Handle case where need to remove existing image and resave
+ * instead of adding to incremental image. */
+
+ if (!test_result_state(TOI_ABORTED)) {
+ if (result) {
+ display_stats(1, 0);
+ display_failure_reason(tries > MAX_TRIES);
+ abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE,
+ "Unable to successfully prepare the image.\n");
+ } else {
+ /* Pageset 2 needed? */
+ if (!need_pageset2() &&
+ test_action_state(TOI_NO_PS2_IF_UNNEEDED)) {
+ no_ps2_needed = 1;
+ toi_recalculate_image_contents(0);
+ update_image(1);
+ }
+
+ toi_cond_pause(1, "Image preparation complete.");
+ }
+ }
+
+ return result ? result : allocate_checksum_pages();
+}
diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h
new file mode 100644
index 000000000..af6769ee2
--- /dev/null
+++ b/kernel/power/tuxonice_prepare_image.h
@@ -0,0 +1,38 @@
+/*
+ * kernel/power/tuxonice_prepare_image.h
+ *
+ * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <asm/sections.h>
+
+extern int toi_prepare_image(void);
+extern void toi_recalculate_image_contents(int storage_available);
+extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask);
+extern long image_size_limit;
+extern void toi_free_extra_pagedir_memory(void);
+extern unsigned long extra_pd1_pages_allowance;
+extern void free_attention_list(void);
+
+#define MIN_FREE_RAM 100
+#define MIN_EXTRA_PAGES_ALLOWANCE 500
+
+#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1))
+#ifdef CONFIG_HIGHMEM
+#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM))
+#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \
+ (1 << ZONE_HIGHMEM)))
+#else
+#define real_nr_free_high_pages() (0)
+#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask))
+
+/* For eat_memory function */
+#define ZONE_HIGHMEM (MAX_NR_ZONES + 1)
+#endif
+
+unsigned long get_header_storage_needed(void);
+unsigned long any_to_free(int use_image_size_limit);
+int try_allocate_extra_memory(void);
diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c
new file mode 100644
index 000000000..710e48dee
--- /dev/null
+++ b/kernel/power/tuxonice_prune.c
@@ -0,0 +1,406 @@
+/*
+ * kernel/power/tuxonice_prune.c
+ *
+ * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file implements a TuxOnIce module that seeks to prune the
+ * amount of data written to disk. It builds a table of hashes
+ * of the uncompressed data, and writes the pfn of the previous page
+ * with the same contents instead of repeating the data when a match
+ * is found.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <crypto/hash.h>
+
+#include "tuxonice_builtin.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+
+/*
+ * We never write a page bigger than PAGE_SIZE, so use a large number
+ * to indicate that data is a PFN.
+ */
+#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100)
+
+static unsigned long toi_pruned_pages;
+
+static struct toi_module_ops toi_prune_ops;
+static struct toi_module_ops *next_driver;
+
+static char toi_prune_hash_algo_name[32] = "sha1";
+
+static DEFINE_MUTEX(stats_lock);
+
+struct cpu_context {
+ struct shash_desc desc;
+ char *digest;
+};
+
+#define OUT_BUF_SIZE (2 * PAGE_SIZE)
+
+static DEFINE_PER_CPU(struct cpu_context, contexts);
+
+/*
+ * toi_crypto_prepare
+ *
+ * Prepare to do some work by allocating buffers and transforms.
+ */
+static int toi_prune_crypto_prepare(void)
+{
+ int cpu, ret, digestsize;
+
+ if (!*toi_prune_hash_algo_name) {
+ printk(KERN_INFO "TuxOnIce: Pruning enabled but no "
+ "hash algorithm set.\n");
+ return 1;
+ }
+
+ for_each_online_cpu(cpu) {
+ struct cpu_context *this = &per_cpu(contexts, cpu);
+ this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0);
+ if (IS_ERR(this->desc.tfm)) {
+ printk(KERN_INFO "TuxOnIce: Failed to allocate the "
+ "%s prune hash algorithm.\n",
+ toi_prune_hash_algo_name);
+ this->desc.tfm = NULL;
+ return 1;
+ }
+
+ if (!digestsize)
+ digestsize = crypto_shash_digestsize(this->desc.tfm);
+
+ this->digest = kmalloc(digestsize, GFP_KERNEL);
+ if (!this->digest) {
+ printk(KERN_INFO "TuxOnIce: Failed to allocate space "
+ "for digest output.\n");
+ crypto_free_shash(this->desc.tfm);
+ this->desc.tfm = NULL;
+ }
+
+ this->desc.flags = 0;
+
+ ret = crypto_shash_init(&this->desc);
+ if (ret < 0) {
+ printk(KERN_INFO "TuxOnIce: Failed to initialise the "
+ "%s prune hash algorithm.\n",
+ toi_prune_hash_algo_name);
+ kfree(this->digest);
+ this->digest = NULL;
+ crypto_free_shash(this->desc.tfm);
+ this->desc.tfm = NULL;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int toi_prune_rw_cleanup(int writing)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ struct cpu_context *this = &per_cpu(contexts, cpu);
+ if (this->desc.tfm) {
+ crypto_free_shash(this->desc.tfm);
+ this->desc.tfm = NULL;
+ }
+
+ if (this->digest) {
+ kfree(this->digest);
+ this->digest = NULL;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * toi_prune_init
+ */
+
+static int toi_prune_init(int toi_or_resume)
+{
+ if (!toi_or_resume)
+ return 0;
+
+ toi_pruned_pages = 0;
+
+ next_driver = toi_get_next_filter(&toi_prune_ops);
+
+ return next_driver ? 0 : -ECHILD;
+}
+
+/*
+ * toi_prune_rw_init()
+ */
+
+static int toi_prune_rw_init(int rw, int stream_number)
+{
+ if (toi_prune_crypto_prepare()) {
+ printk(KERN_ERR "Failed to initialise prune "
+ "algorithm.\n");
+ if (rw == READ) {
+ printk(KERN_INFO "Unable to read the image.\n");
+ return -ENODEV;
+ } else {
+ printk(KERN_INFO "Continuing without "
+ "pruning the image.\n");
+ toi_prune_ops.enabled = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * toi_prune_write_page()
+ *
+ * Compress a page of data, buffering output and passing on filled
+ * pages to the next module in the pipeline.
+ *
+ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
+ * data to be checked.
+ *
+ * Returns: 0 on success. Otherwise the error is that returned by later
+ * modules, -ECHILD if we have a broken pipeline or -EIO if
+ * zlib errs.
+ */
+static int toi_prune_write_page(unsigned long index, int buf_type,
+ void *buffer_page, unsigned int buf_size)
+{
+ int ret = 0, cpu = smp_processor_id(), write_data = 1;
+ struct cpu_context *ctx = &per_cpu(contexts, cpu);
+ u8* output_buffer = buffer_page;
+ int output_len = buf_size;
+ int out_buf_type = buf_type;
+ void *buffer_start;
+ u32 buf[4];
+
+ if (ctx->desc.tfm) {
+
+ buffer_start = TOI_MAP(buf_type, buffer_page);
+ ctx->len = OUT_BUF_SIZE;
+
+ ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest);
+ if (ret) {
+ printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret);
+ } else {
+ mutex_lock(&stats_lock);
+
+ toi_pruned_pages++;
+
+ mutex_unlock(&stats_lock);
+
+ }
+
+ TOI_UNMAP(buf_type, buffer_page);
+ }
+
+ if (write_data)
+ ret = next_driver->write_page(index, out_buf_type,
+ output_buffer, output_len);
+ else
+ ret = next_driver->write_page(index, out_buf_type,
+ output_buffer, output_len);
+
+ return ret;
+}
+
+/*
+ * toi_prune_read_page()
+ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Retrieve data from later modules or from a previously loaded page and
+ * fill the input buffer.
+ * Zero if successful. Error condition from me or from downstream on failure.
+ */
+static int toi_prune_read_page(unsigned long *index, int buf_type,
+ void *buffer_page, unsigned int *buf_size)
+{
+ int ret, cpu = smp_processor_id();
+ unsigned int len;
+ char *buffer_start;
+ struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+ if (!ctx->desc.tfm)
+ return next_driver->read_page(index, TOI_PAGE, buffer_page,
+ buf_size);
+
+ /*
+ * All our reads must be synchronous - we can't handle
+ * data that hasn't been read yet.
+ */
+
+ ret = next_driver->read_page(index, buf_type, buffer_page, &len);
+
+ if (len == PRUNE_DATA_IS_PFN) {
+ buffer_start = kmap(buffer_page);
+ }
+
+ return ret;
+}
+
+/*
+ * toi_prune_print_debug_stats
+ * @buffer: Pointer to a buffer into which the debug info will be printed.
+ * @size: Size of the buffer.
+ *
+ * Print information to be recorded for debugging purposes into a buffer.
+ * Returns: Number of characters written to the buffer.
+ */
+
+static int toi_prune_print_debug_stats(char *buffer, int size)
+{
+ int len;
+
+ /* Output the number of pages pruned. */
+ if (*toi_prune_hash_algo_name)
+ len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
+ toi_prune_hash_algo_name);
+ else
+ len = scnprintf(buffer, size, "- Compressor is not set.\n");
+
+ if (toi_pruned_pages)
+ len += scnprintf(buffer+len, size - len, " Pruned "
+ "%lu pages).\n",
+ toi_pruned_pages);
+ return len;
+}
+
+/*
+ * toi_prune_memory_needed
+ *
+ * Tell the caller how much memory we need to operate during hibernate/resume.
+ * Returns: Unsigned long. Maximum number of bytes of memory required for
+ * operation.
+ */
+static int toi_prune_memory_needed(void)
+{
+ return 2 * PAGE_SIZE;
+}
+
+static int toi_prune_storage_needed(void)
+{
+ return 2 * sizeof(unsigned long) + 2 * sizeof(int) +
+ strlen(toi_prune_hash_algo_name) + 1;
+}
+
+/*
+ * toi_prune_save_config_info
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save informaton needed when reloading the image at resume time.
+ * Returns: Number of bytes used for saving our data.
+ */
+static int toi_prune_save_config_info(char *buffer)
+{
+ int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0;
+
+ *((unsigned long *) buffer) = toi_pruned_pages;
+ offset += sizeof(unsigned long);
+ *((int *) (buffer + offset)) = len;
+ offset += sizeof(int);
+ strncpy(buffer + offset, toi_prune_hash_algo_name, len);
+ return offset + len;
+}
+
+/* toi_prune_load_config_info
+ * @buffer: Pointer to the start of the data.
+ * @size: Number of bytes that were saved.
+ *
+ * Description: Reload information needed for passing back to the
+ * resumed kernel.
+ */
+static void toi_prune_load_config_info(char *buffer, int size)
+{
+ int len, offset = 0;
+
+ toi_pruned_pages = *((unsigned long *) buffer);
+ offset += sizeof(unsigned long);
+ len = *((int *) (buffer + offset));
+ offset += sizeof(int);
+ strncpy(toi_prune_hash_algo_name, buffer + offset, len);
+}
+
+static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+ bkd->pruned_pages = toi_pruned_pages;
+}
+
+static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+ toi_pruned_pages = bkd->pruned_pages;
+}
+
+/*
+ * toi_expected_ratio
+ *
+ * Description: Returns the expected ratio between data passed into this module
+ * and the amount of data output when writing.
+ * Returns: 100 - we have no idea how many pages will be pruned.
+ */
+
+static int toi_prune_expected_ratio(void)
+{
+ return 100;
+}
+
+/*
+ * data for our sysfs entries.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0,
+ NULL),
+ SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL),
+};
+
+/*
+ * Ops structure.
+ */
+static struct toi_module_ops toi_prune_ops = {
+ .type = FILTER_MODULE,
+ .name = "prune",
+ .directory = "prune",
+ .module = THIS_MODULE,
+ .initialise = toi_prune_init,
+ .memory_needed = toi_prune_memory_needed,
+ .print_debug_info = toi_prune_print_debug_stats,
+ .save_config_info = toi_prune_save_config_info,
+ .load_config_info = toi_prune_load_config_info,
+ .storage_needed = toi_prune_storage_needed,
+ .expected_compression = toi_prune_expected_ratio,
+
+ .pre_atomic_restore = toi_prune_pre_atomic_restore,
+ .post_atomic_restore = toi_prune_post_atomic_restore,
+
+ .rw_init = toi_prune_rw_init,
+ .rw_cleanup = toi_prune_rw_cleanup,
+
+ .write_page = toi_prune_write_page,
+ .read_page = toi_prune_read_page,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+static __init int toi_prune_load(void)
+{
+ return toi_register_module(&toi_prune_ops);
+}
+
+late_initcall(toi_prune_load);
diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
new file mode 100644
index 000000000..e99f6e24f
--- /dev/null
+++ b/kernel/power/tuxonice_storage.c
@@ -0,0 +1,282 @@
+/*
+ * kernel/power/tuxonice_storage.c
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for talking to a userspace program that manages storage.
+ *
+ * The kernel side:
+ * - starts the userspace program;
+ * - sends messages telling it when to open and close the connection;
+ * - tells it when to quit;
+ *
+ * The user space side:
+ * - passes messages regarding status;
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_ui.h"
+
+static struct user_helper_data usm_helper_data;
+static struct toi_module_ops usm_ops;
+static int message_received, usm_prepare_count;
+static int storage_manager_last_action, storage_manager_action;
+
+static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int type;
+ int *data;
+
+ type = nlh->nlmsg_type;
+
+ /* A control message: ignore them */
+ if (type < NETLINK_MSG_BASE)
+ return 0;
+
+ /* Unknown message: reply with EINVAL */
+ if (type >= USM_MSG_MAX)
+ return -EINVAL;
+
+ /* All operations require privileges, even GET */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* Only allow one task to receive NOFREEZE privileges */
+ if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
+ return -EBUSY;
+
+ data = (int *) NLMSG_DATA(nlh);
+
+ switch (type) {
+ case USM_MSG_SUCCESS:
+ case USM_MSG_FAILED:
+ message_received = type;
+ complete(&usm_helper_data.wait_for_process);
+ break;
+ default:
+ printk(KERN_INFO "Storage manager doesn't recognise "
+ "message %d.\n", type);
+ }
+
+ return 1;
+}
+
+#ifdef CONFIG_NET
+static int activations;
+
+int toi_activate_storage(int force)
+{
+ int tries = 1;
+
+ if (usm_helper_data.pid == -1 || !usm_ops.enabled)
+ return 0;
+
+ message_received = 0;
+ activations++;
+
+ if (activations > 1 && !force)
+ return 0;
+
+ while ((!message_received || message_received == USM_MSG_FAILED) &&
+ tries < 2) {
+ toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
+ "%d.\n", tries);
+
+ init_completion(&usm_helper_data.wait_for_process);
+
+ toi_send_netlink_message(&usm_helper_data,
+ USM_MSG_CONNECT,
+ NULL, 0);
+
+ /* Wait 2 seconds for the userspace process to make contact */
+ wait_for_completion_timeout(&usm_helper_data.wait_for_process,
+ 2*HZ);
+
+ tries++;
+ }
+
+ return 0;
+}
+
+int toi_deactivate_storage(int force)
+{
+ if (usm_helper_data.pid == -1 || !usm_ops.enabled)
+ return 0;
+
+ message_received = 0;
+ activations--;
+
+ if (activations && !force)
+ return 0;
+
+ init_completion(&usm_helper_data.wait_for_process);
+
+ toi_send_netlink_message(&usm_helper_data,
+ USM_MSG_DISCONNECT,
+ NULL, 0);
+
+ wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
+
+ if (!message_received || message_received == USM_MSG_FAILED) {
+ printk(KERN_INFO "Returning failure disconnecting storage.\n");
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+static void storage_manager_simulate(void)
+{
+ printk(KERN_INFO "--- Storage manager simulate ---\n");
+ toi_prepare_usm();
+ schedule();
+ printk(KERN_INFO "--- Activate storage 1 ---\n");
+ toi_activate_storage(1);
+ schedule();
+ printk(KERN_INFO "--- Deactivate storage 1 ---\n");
+ toi_deactivate_storage(1);
+ schedule();
+ printk(KERN_INFO "--- Cleanup usm ---\n");
+ toi_cleanup_usm();
+ schedule();
+ printk(KERN_INFO "--- Storage manager simulate ends ---\n");
+}
+
+static int usm_storage_needed(void)
+{
+ return sizeof(int) + strlen(usm_helper_data.program) + 1;
+}
+
+static int usm_save_config_info(char *buf)
+{
+ int len = strlen(usm_helper_data.program);
+ memcpy(buf, usm_helper_data.program, len + 1);
+ return sizeof(int) + len + 1;
+}
+
+static void usm_load_config_info(char *buf, int size)
+{
+ /* Don't load the saved path if one has already been set */
+ if (usm_helper_data.program[0])
+ return;
+
+ memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf));
+}
+
+static int usm_memory_needed(void)
+{
+ /* ball park figure of 32 pages */
+ return 32 * PAGE_SIZE;
+}
+
+/* toi_prepare_usm
+ */
+int toi_prepare_usm(void)
+{
+ usm_prepare_count++;
+
+ if (usm_prepare_count > 1 || !usm_ops.enabled)
+ return 0;
+
+ usm_helper_data.pid = -1;
+
+ if (!*usm_helper_data.program)
+ return 0;
+
+ toi_netlink_setup(&usm_helper_data);
+
+ if (usm_helper_data.pid == -1)
+ printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
+ " start it.\n");
+
+ toi_activate_storage(0);
+
+ return usm_helper_data.pid != -1;
+}
+
+void toi_cleanup_usm(void)
+{
+ usm_prepare_count--;
+
+ if (usm_helper_data.pid > -1 && !usm_prepare_count) {
+ toi_deactivate_storage(0);
+ toi_netlink_close(&usm_helper_data);
+ }
+}
+
+static void storage_manager_activate(void)
+{
+ if (storage_manager_action == storage_manager_last_action)
+ return;
+
+ if (storage_manager_action)
+ toi_prepare_usm();
+ else
+ toi_cleanup_usm();
+
+ storage_manager_last_action = storage_manager_action;
+}
+
+/*
+ * User interface specific /sys/power/tuxonice entries.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate),
+ SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL),
+ SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0,
+ NULL),
+ SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1,
+ 0, storage_manager_activate)
+};
+
+static struct toi_module_ops usm_ops = {
+ .type = MISC_MODULE,
+ .name = "usm",
+ .directory = "storage_manager",
+ .module = THIS_MODULE,
+ .storage_needed = usm_storage_needed,
+ .save_config_info = usm_save_config_info,
+ .load_config_info = usm_load_config_info,
+ .memory_needed = usm_memory_needed,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* toi_usm_sysfs_init
+ * Description: Boot time initialisation for user interface.
+ */
+int toi_usm_init(void)
+{
+ usm_helper_data.nl = NULL;
+ usm_helper_data.program[0] = '\0';
+ usm_helper_data.pid = -1;
+ usm_helper_data.skb_size = 0;
+ usm_helper_data.pool_limit = 6;
+ usm_helper_data.netlink_id = NETLINK_TOI_USM;
+ usm_helper_data.name = "userspace storage manager";
+ usm_helper_data.rcv_msg = usm_user_rcv_msg;
+ usm_helper_data.interface_version = 2;
+ usm_helper_data.must_init = 0;
+ init_completion(&usm_helper_data.wait_for_process);
+
+ return toi_register_module(&usm_ops);
+}
+
+void toi_usm_exit(void)
+{
+ toi_netlink_close_complete(&usm_helper_data);
+ toi_unregister_module(&usm_ops);
+}
diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
new file mode 100644
index 000000000..1ed9ab156
--- /dev/null
+++ b/kernel/power/tuxonice_storage.h
@@ -0,0 +1,45 @@
+/*
+ * kernel/power/tuxonice_storage.h
+ *
+ * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifdef CONFIG_NET
+int toi_prepare_usm(void);
+void toi_cleanup_usm(void);
+
+int toi_activate_storage(int force);
+int toi_deactivate_storage(int force);
+extern int toi_usm_init(void);
+extern void toi_usm_exit(void);
+#else
+static inline int toi_usm_init(void) { return 0; }
+static inline void toi_usm_exit(void) { }
+
+static inline int toi_activate_storage(int force)
+{
+ return 0;
+}
+
+static inline int toi_deactivate_storage(int force)
+{
+ return 0;
+}
+
+static inline int toi_prepare_usm(void) { return 0; }
+static inline void toi_cleanup_usm(void) { }
+#endif
+
+enum {
+ USM_MSG_BASE = 0x10,
+
+ /* Kernel -> Userspace */
+ USM_MSG_CONNECT = 0x30,
+ USM_MSG_DISCONNECT = 0x31,
+ USM_MSG_SUCCESS = 0x40,
+ USM_MSG_FAILED = 0x41,
+
+ USM_MSG_MAX,
+};
diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
new file mode 100644
index 000000000..ce3215033
--- /dev/null
+++ b/kernel/power/tuxonice_swap.c
@@ -0,0 +1,474 @@
+/*
+ * kernel/power/tuxonice_swap.c
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file encapsulates functions for usage of swap space as a
+ * backing store.
+ */
+
+#include <linux/suspend.h>
+#include <linux/blkdev.h>
+#include <linux/swapops.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+#include <linux/fs_uuid.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_bio.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+
+static struct toi_module_ops toi_swapops;
+
+/* For swapfile automatically swapon/off'd. */
+static char swapfilename[255] = "";
+static int toi_swapon_status;
+
+/* Swap Pages */
+static unsigned long swap_allocated;
+
+static struct sysinfo swapinfo;
+
+static int is_ram_backed(struct swap_info_struct *si)
+{
+ if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) ||
+ !strncmp(si->bdev->bd_disk->disk_name, "zram", 4))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
+ *
+ * Activate the given swapfile if it wasn't already enabled. Remember whether
+ * we really did swapon it for swapoffing later.
+ */
+static void enable_swapfile(void)
+{
+ int activateswapresult = -EINVAL;
+
+ if (swapfilename[0]) {
+ /* Attempt to swap on with maximum priority */
+ activateswapresult = sys_swapon(swapfilename, 0xFFFF);
+ if (activateswapresult && activateswapresult != -EBUSY)
+ printk(KERN_ERR "TuxOnIce: The swapfile/partition "
+ "specified by /sys/power/tuxonice/swap/swapfile"
+ " (%s) could not be turned on (error %d). "
+ "Attempting to continue.\n",
+ swapfilename, activateswapresult);
+ if (!activateswapresult)
+ toi_swapon_status = 1;
+ }
+}
+
+/**
+ * disable_swapfile: Swapoff any file swaponed at the start of the cycle.
+ *
+ * If we did successfully swapon a file at the start of the cycle, swapoff
+ * it now (finishing up).
+ */
+static void disable_swapfile(void)
+{
+ if (!toi_swapon_status)
+ return;
+
+ sys_swapoff(swapfilename);
+ toi_swapon_status = 0;
+}
+
+static int add_blocks_to_extent_chain(struct toi_bdev_info *chain,
+ unsigned long start, unsigned long end)
+{
+ if (test_action_state(TOI_TEST_BIO))
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to "
+ "chain %p.", start << chain->bmap_shift,
+ end << chain->bmap_shift, chain);
+
+ return toi_add_to_extent_chain(&chain->blocks, start, end);
+}
+
+
+static int get_main_pool_phys_params(struct toi_bdev_info *chain)
+{
+ struct hibernate_extent *extentpointer = NULL;
+ unsigned long address, extent_min = 0, extent_max = 0;
+ int empty = 1;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for "
+ "chain %d.", chain->allocator_index);
+
+ if (!chain->allocations.first)
+ return 0;
+
+ if (chain->blocks.first)
+ toi_put_extent_chain(&chain->blocks);
+
+ toi_extent_for_each(&chain->allocations, extentpointer, address) {
+ swp_entry_t swap_address = (swp_entry_t) { address };
+ struct block_device *bdev;
+ sector_t new_sector = map_swap_entry(swap_address, &bdev);
+
+ if (empty) {
+ empty = 0;
+ extent_min = extent_max = new_sector;
+ continue;
+ }
+
+ if (new_sector == extent_max + 1) {
+ extent_max++;
+ continue;
+ }
+
+ if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
+ printk(KERN_ERR "Out of memory while making block "
+ "chains.\n");
+ return -ENOMEM;
+ }
+
+ extent_min = new_sector;
+ extent_max = new_sector;
+ }
+
+ if (!empty &&
+ add_blocks_to_extent_chain(chain, extent_min, extent_max)) {
+ printk(KERN_ERR "Out of memory while making block chains.\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Like si_swapinfo, except that we don't include ram backed swap (compcache!)
+ * and don't need to use the spinlocks (userspace is stopped when this
+ * function is called).
+ */
+void si_swapinfo_no_compcache(void)
+{
+ unsigned int i;
+
+ si_swapinfo(&swapinfo);
+ swapinfo.freeswap = 0;
+ swapinfo.totalswap = 0;
+
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ struct swap_info_struct *si = get_swap_info_struct(i);
+ if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) {
+ swapinfo.totalswap += si->inuse_pages;
+ swapinfo.freeswap += si->pages - si->inuse_pages;
+ }
+ }
+}
+/*
+ * We can't just remember the value from allocation time, because other
+ * processes might have allocated swap in the mean time.
+ */
+static unsigned long toi_swap_storage_available(void)
+{
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available.");
+ si_swapinfo_no_compcache();
+ return swapinfo.freeswap + swap_allocated;
+}
+
+static int toi_swap_initialise(int starting_cycle)
+{
+ if (!starting_cycle)
+ return 0;
+
+ enable_swapfile();
+ return 0;
+}
+
+static void toi_swap_cleanup(int ending_cycle)
+{
+ if (!ending_cycle)
+ return;
+
+ disable_swapfile();
+}
+
+static void toi_swap_free_storage(struct toi_bdev_info *chain)
+{
+ /* Free swap entries */
+ struct hibernate_extent *extentpointer;
+ unsigned long extentvalue;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.",
+ chain);
+
+ swap_allocated -= chain->allocations.size;
+ toi_extent_for_each(&chain->allocations, extentpointer, extentvalue)
+ swap_free((swp_entry_t) { extentvalue });
+
+ toi_put_extent_chain(&chain->allocations);
+}
+
+static void free_swap_range(unsigned long min, unsigned long max)
+{
+ int j;
+
+ for (j = min; j <= max; j++)
+ swap_free((swp_entry_t) { j });
+ swap_allocated -= (max - min + 1);
+}
+
+/*
+ * Allocation of a single swap type. Swap priorities are handled at the higher
+ * level.
+ */
+static int toi_swap_allocate_storage(struct toi_bdev_info *chain,
+ unsigned long request)
+{
+ unsigned long gotten = 0;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to"
+ " allocate %lu pages from device %d.", request,
+ chain->allocator_index);
+
+ while (gotten < request) {
+ swp_entry_t start, end;
+ if (0) {
+ /* Broken at the moment for SSDs */
+ get_swap_range_of_type(chain->allocator_index, &start, &end,
+ request - gotten + 1);
+ } else {
+ start = end = get_swap_page_of_type(chain->allocator_index);
+ }
+ if (start.val) {
+ int added = end.val - start.val + 1;
+ if (toi_add_to_extent_chain(&chain->allocations,
+ start.val, end.val)) {
+ printk(KERN_INFO "Failed to allocate extent for "
+ "%lu-%lu.\n", start.val, end.val);
+ free_swap_range(start.val, end.val);
+ break;
+ }
+ gotten += added;
+ swap_allocated += added;
+ } else
+ break;
+ }
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten);
+ return gotten;
+}
+
+static int toi_swap_register_storage(void)
+{
+ int i, result = 0;
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage.");
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ struct swap_info_struct *si = get_swap_info_struct(i);
+ struct toi_bdev_info *devinfo;
+ unsigned char *p;
+ unsigned char buf[256];
+ struct fs_info *fs_info;
+
+ if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si))
+ continue;
+
+ devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info),
+ GFP_ATOMIC);
+ if (!devinfo) {
+ printk("Failed to allocate devinfo struct for swap "
+ "device %d.\n", i);
+ return -ENOMEM;
+ }
+
+ devinfo->bdev = si->bdev;
+ devinfo->allocator = &toi_swapops;
+ devinfo->allocator_index = i;
+
+ fs_info = fs_info_from_block_dev(si->bdev);
+ if (fs_info && !IS_ERR(fs_info)) {
+ memcpy(devinfo->uuid, &fs_info->uuid, 16);
+ free_fs_info(fs_info);
+ } else
+ result = (int) PTR_ERR(fs_info);
+
+ if (!fs_info)
+ printk("fs_info from block dev returned %d.\n", result);
+ devinfo->dev_t = si->bdev->bd_dev;
+ devinfo->prio = si->prio;
+ devinfo->bmap_shift = 3;
+ devinfo->blocks_per_page = 1;
+
+ p = d_path(&si->swap_file->f_path, buf, sizeof(buf));
+ sprintf(devinfo->name, "swap on %s", p);
+
+ toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:"
+ " Device %d (%lx), prio %d.", i,
+ (unsigned long) devinfo->dev_t, devinfo->prio);
+ toi_bio_ops.register_storage(devinfo);
+ }
+
+ return 0;
+}
+
+static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used)
+{
+ struct hibernate_extent *extentpointer = NULL;
+ unsigned long extentvalue;
+ unsigned long i = 0, first_freed = 0;
+
+ toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) {
+ i++;
+ if (i > used) {
+ swap_free((swp_entry_t) { extentvalue });
+ if (!first_freed)
+ first_freed = extentvalue;
+ }
+ }
+
+ return first_freed;
+}
+
+/*
+ * workspace_size
+ *
+ * Description:
+ * Returns the number of bytes of RAM needed for this
+ * code to do its work. (Used when calculating whether
+ * we have enough memory to be able to hibernate & resume).
+ *
+ */
+static int toi_swap_memory_needed(void)
+{
+ return 1;
+}
+
+/*
+ * Print debug info
+ *
+ * Description:
+ */
+static int toi_swap_print_debug_stats(char *buffer, int size)
+{
+ int len = 0;
+
+ len = scnprintf(buffer, size, "- Swap Allocator enabled.\n");
+ if (swapfilename[0])
+ len += scnprintf(buffer+len, size-len,
+ " Attempting to automatically swapon: %s.\n",
+ swapfilename);
+
+ si_swapinfo_no_compcache();
+
+ len += scnprintf(buffer+len, size-len,
+ " Swap available for image: %lu pages.\n",
+ swapinfo.freeswap + swap_allocated);
+
+ return len;
+}
+
+static int header_locations_read_sysfs(const char *page, int count)
+{
+ int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
+ struct inode *swapf = NULL;
+ int zone;
+ char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
+ char *path, *output = (char *) page;
+ int path_len;
+
+ if (!page)
+ return 0;
+
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ struct swap_info_struct *si = get_swap_info_struct(i);
+
+ if (!si || !(si->flags & SWP_WRITEOK))
+ continue;
+
+ if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
+ haveswap = 1;
+ if (!printedpartitionsmessage) {
+ len += sprintf(output + len,
+ "For swap partitions, simply use the "
+ "format: resume=swap:/dev/hda1.\n");
+ printedpartitionsmessage = 1;
+ }
+ } else {
+ path_len = 0;
+
+ path = d_path(&si->swap_file->f_path, path_page,
+ PAGE_SIZE);
+ path_len = snprintf(path_page, PAGE_SIZE, "%s", path);
+
+ haveswap = 1;
+ swapf = si->swap_file->f_mapping->host;
+ zone = bmap(swapf, 0);
+ if (!zone) {
+ len += sprintf(output + len,
+ "Swapfile %s has been corrupted. Reuse"
+ " mkswap on it and try again.\n",
+ path_page);
+ } else {
+ char name_buffer[BDEVNAME_SIZE];
+ len += sprintf(output + len,
+ "For swapfile `%s`,"
+ " use resume=swap:/dev/%s:0x%x.\n",
+ path_page,
+ bdevname(si->bdev, name_buffer),
+ zone << (swapf->i_blkbits - 9));
+ }
+ }
+ }
+
+ if (!haveswap)
+ len = sprintf(output, "You need to turn on swap partitions "
+ "before examining this file.\n");
+
+ toi_free_page(10, (unsigned long) path_page);
+ return len;
+}
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL),
+ SYSFS_CUSTOM("headerlocations", SYSFS_READONLY,
+ header_locations_read_sysfs, NULL, 0, NULL),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0,
+ attempt_to_parse_resume_device2),
+};
+
+static struct toi_bio_allocator_ops toi_bio_swapops = {
+ .register_storage = toi_swap_register_storage,
+ .storage_available = toi_swap_storage_available,
+ .allocate_storage = toi_swap_allocate_storage,
+ .bmap = get_main_pool_phys_params,
+ .free_storage = toi_swap_free_storage,
+ .free_unused_storage = toi_swap_free_unused_storage,
+};
+
+static struct toi_module_ops toi_swapops = {
+ .type = BIO_ALLOCATOR_MODULE,
+ .name = "swap storage",
+ .directory = "swap",
+ .module = THIS_MODULE,
+ .memory_needed = toi_swap_memory_needed,
+ .print_debug_info = toi_swap_print_debug_stats,
+ .initialise = toi_swap_initialise,
+ .cleanup = toi_swap_cleanup,
+ .bio_allocator_ops = &toi_bio_swapops,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+static __init int toi_swap_load(void)
+{
+ return toi_register_module(&toi_swapops);
+}
+
+late_initcall(toi_swap_load);
diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c
new file mode 100644
index 000000000..79c9315b6
--- /dev/null
+++ b/kernel/power/tuxonice_sysfs.c
@@ -0,0 +1,333 @@
+/*
+ * kernel/power/tuxonice_sysfs.c
+ *
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains support for sysfs entries for tuning TuxOnIce.
+ *
+ * We have a generic handler that deals with the most common cases, and
+ * hooks for special handlers to use.
+ */
+
+#include <linux/suspend.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_alloc.h"
+
+static int toi_sysfs_initialised;
+
+static void toi_initialise_sysfs(void);
+
+static struct toi_sysfs_data sysfs_params[];
+
+#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr)
+
+static void toi_main_wrapper(void)
+{
+ toi_try_hibernate();
+}
+
+static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *page)
+{
+ struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
+ int len = 0;
+ int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ;
+
+ if (full_prep && toi_start_anything(0))
+ return -EBUSY;
+
+ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
+ toi_prepare_usm();
+
+ switch (sysfs_data->type) {
+ case TOI_SYSFS_DATA_CUSTOM:
+ len = (sysfs_data->data.special.read_sysfs) ?
+ (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE)
+ : 0;
+ break;
+ case TOI_SYSFS_DATA_BIT:
+ len = sprintf(page, "%d\n",
+ -test_bit(sysfs_data->data.bit.bit,
+ sysfs_data->data.bit.bit_vector));
+ break;
+ case TOI_SYSFS_DATA_INTEGER:
+ len = sprintf(page, "%d\n",
+ *(sysfs_data->data.integer.variable));
+ break;
+ case TOI_SYSFS_DATA_LONG:
+ len = sprintf(page, "%ld\n",
+ *(sysfs_data->data.a_long.variable));
+ break;
+ case TOI_SYSFS_DATA_UL:
+ len = sprintf(page, "%lu\n",
+ *(sysfs_data->data.ul.variable));
+ break;
+ case TOI_SYSFS_DATA_STRING:
+ len = sprintf(page, "%s\n",
+ sysfs_data->data.string.variable);
+ break;
+ }
+
+ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ)
+ toi_cleanup_usm();
+
+ if (full_prep)
+ toi_finish_anything(0);
+
+ return len;
+}
+
+#define BOUND(_variable, _type) do { \
+ if (*_variable < sysfs_data->data._type.minimum) \
+ *_variable = sysfs_data->data._type.minimum; \
+ else if (*_variable > sysfs_data->data._type.maximum) \
+ *_variable = sysfs_data->data._type.maximum; \
+} while (0)
+
+static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *my_buf, size_t count)
+{
+ int assigned_temp_buffer = 0, result = count;
+ struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr);
+
+ if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME)))
+ return -EBUSY;
+
+ ((char *) my_buf)[count] = 0;
+
+ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
+ toi_prepare_usm();
+
+ switch (sysfs_data->type) {
+ case TOI_SYSFS_DATA_CUSTOM:
+ if (sysfs_data->data.special.write_sysfs)
+ result = (sysfs_data->data.special.write_sysfs)(my_buf,
+ count);
+ break;
+ case TOI_SYSFS_DATA_BIT:
+ {
+ unsigned long value;
+ result = kstrtoul(my_buf, 0, &value);
+ if (result)
+ break;
+ if (value)
+ set_bit(sysfs_data->data.bit.bit,
+ (sysfs_data->data.bit.bit_vector));
+ else
+ clear_bit(sysfs_data->data.bit.bit,
+ (sysfs_data->data.bit.bit_vector));
+ }
+ break;
+ case TOI_SYSFS_DATA_INTEGER:
+ {
+ long temp;
+ result = kstrtol(my_buf, 0, &temp);
+ if (result)
+ break;
+ *(sysfs_data->data.integer.variable) = (int) temp;
+ BOUND(sysfs_data->data.integer.variable, integer);
+ break;
+ }
+ case TOI_SYSFS_DATA_LONG:
+ {
+ long *variable =
+ sysfs_data->data.a_long.variable;
+ result = kstrtol(my_buf, 0, variable);
+ if (result)
+ break;
+ BOUND(variable, a_long);
+ break;
+ }
+ case TOI_SYSFS_DATA_UL:
+ {
+ unsigned long *variable =
+ sysfs_data->data.ul.variable;
+ result = kstrtoul(my_buf, 0, variable);
+ if (result)
+ break;
+ BOUND(variable, ul);
+ break;
+ }
+ break;
+ case TOI_SYSFS_DATA_STRING:
+ {
+ int copy_len = count;
+ char *variable =
+ sysfs_data->data.string.variable;
+
+ if (sysfs_data->data.string.max_length &&
+ (copy_len > sysfs_data->data.string.max_length))
+ copy_len = sysfs_data->data.string.max_length;
+
+ if (!variable) {
+ variable = (char *) toi_get_zeroed_page(31,
+ TOI_ATOMIC_GFP);
+ sysfs_data->data.string.variable = variable;
+ assigned_temp_buffer = 1;
+ }
+ strncpy(variable, my_buf, copy_len);
+ if (copy_len && my_buf[copy_len - 1] == '\n')
+ variable[count - 1] = 0;
+ variable[count] = 0;
+ }
+ break;
+ }
+
+ if (!result)
+ result = count;
+
+ /* Side effect routine? */
+ if (result == count && sysfs_data->write_side_effect)
+ sysfs_data->write_side_effect();
+
+ /* Free temporary buffers */
+ if (assigned_temp_buffer) {
+ toi_free_page(31,
+ (unsigned long) sysfs_data->data.string.variable);
+ sysfs_data->data.string.variable = NULL;
+ }
+
+ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE)
+ toi_cleanup_usm();
+
+ toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME);
+
+ return result;
+}
+
+static struct sysfs_ops toi_sysfs_ops = {
+ .show = &toi_attr_show,
+ .store = &toi_attr_store,
+};
+
+static struct kobj_type toi_ktype = {
+ .sysfs_ops = &toi_sysfs_ops,
+};
+
+struct kobject *tuxonice_kobj;
+
+/* Non-module sysfs entries.
+ *
+ * This array contains entries that are automatically registered at
+ * boot. Modules and the console code register their own entries separately.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL,
+ SYSFS_HIBERNATING, toi_main_wrapper),
+ SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL,
+ SYSFS_RESUMING, toi_try_resume)
+};
+
+void remove_toi_sysdir(struct kobject *kobj)
+{
+ if (!kobj)
+ return;
+
+ kobject_put(kobj);
+}
+
+struct kobject *make_toi_sysdir(char *name)
+{
+ struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj);
+
+ if (!kobj) {
+ printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs "
+ "dir!\n");
+ return NULL;
+ }
+
+ kobj->ktype = &toi_ktype;
+
+ return kobj;
+}
+
+/* toi_register_sysfs_file
+ *
+ * Helper for registering a new /sysfs/tuxonice entry.
+ */
+
+int toi_register_sysfs_file(
+ struct kobject *kobj,
+ struct toi_sysfs_data *toi_sysfs_data)
+{
+ int result;
+
+ if (!toi_sysfs_initialised)
+ toi_initialise_sysfs();
+
+ result = sysfs_create_file(kobj, &toi_sysfs_data->attr);
+ if (result)
+ printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s "
+ "returned %d.\n",
+ toi_sysfs_data->attr.name, result);
+ kobj->ktype = &toi_ktype;
+
+ return result;
+}
+
+/* toi_unregister_sysfs_file
+ *
+ * Helper for removing unwanted /sys/power/tuxonice entries.
+ *
+ */
+void toi_unregister_sysfs_file(struct kobject *kobj,
+ struct toi_sysfs_data *toi_sysfs_data)
+{
+ sysfs_remove_file(kobj, &toi_sysfs_data->attr);
+}
+
+void toi_cleanup_sysfs(void)
+{
+ int i,
+ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
+
+ if (!toi_sysfs_initialised)
+ return;
+
+ for (i = 0; i < numfiles; i++)
+ toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
+
+ kobject_put(tuxonice_kobj);
+ toi_sysfs_initialised = 0;
+}
+
+/* toi_initialise_sysfs
+ *
+ * Initialise the /sysfs/tuxonice directory.
+ */
+
+static void toi_initialise_sysfs(void)
+{
+ int i;
+ int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data);
+
+ if (toi_sysfs_initialised)
+ return;
+
+ /* Make our TuxOnIce directory a child of /sys/power */
+ tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj);
+ if (!tuxonice_kobj)
+ return;
+
+ toi_sysfs_initialised = 1;
+
+ for (i = 0; i < numfiles; i++)
+ toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]);
+}
+
+int toi_sysfs_init(void)
+{
+ toi_initialise_sysfs();
+ return 0;
+}
+
+void toi_sysfs_exit(void)
+{
+ toi_cleanup_sysfs();
+}
diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h
new file mode 100644
index 000000000..5b331b19a
--- /dev/null
+++ b/kernel/power/tuxonice_sysfs.h
@@ -0,0 +1,137 @@
+/*
+ * kernel/power/tuxonice_sysfs.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/sysfs.h>
+
+struct toi_sysfs_data {
+ struct attribute attr;
+ int type;
+ int flags;
+ union {
+ struct {
+ unsigned long *bit_vector;
+ int bit;
+ } bit;
+ struct {
+ int *variable;
+ int minimum;
+ int maximum;
+ } integer;
+ struct {
+ long *variable;
+ long minimum;
+ long maximum;
+ } a_long;
+ struct {
+ unsigned long *variable;
+ unsigned long minimum;
+ unsigned long maximum;
+ } ul;
+ struct {
+ char *variable;
+ int max_length;
+ } string;
+ struct {
+ int (*read_sysfs) (const char *buffer, int count);
+ int (*write_sysfs) (const char *buffer, int count);
+ void *data;
+ } special;
+ } data;
+
+ /* Side effects routine. Used, eg, for reparsing the
+ * resume= entry when it changes */
+ void (*write_side_effect) (void);
+ struct list_head sysfs_data_list;
+};
+
+enum {
+ TOI_SYSFS_DATA_NONE = 1,
+ TOI_SYSFS_DATA_CUSTOM,
+ TOI_SYSFS_DATA_BIT,
+ TOI_SYSFS_DATA_INTEGER,
+ TOI_SYSFS_DATA_UL,
+ TOI_SYSFS_DATA_LONG,
+ TOI_SYSFS_DATA_STRING
+};
+
+#define SYSFS_WRITEONLY 0200
+#define SYSFS_READONLY 0444
+#define SYSFS_RW 0644
+
+#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \
+ .attr = {.name = _name , .mode = _mode }, \
+ .type = TOI_SYSFS_DATA_BIT, \
+ .flags = _flags, \
+ .data = { .bit = { .bit_vector = _ul, .bit = _bit } } }
+
+#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \
+ .attr = {.name = _name , .mode = _mode }, \
+ .type = TOI_SYSFS_DATA_INTEGER, \
+ .flags = _flags, \
+ .data = { .integer = { .variable = _int, .minimum = _min, \
+ .maximum = _max } }, \
+ .write_side_effect = _wse }
+
+#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \
+ .attr = {.name = _name , .mode = _mode }, \
+ .type = TOI_SYSFS_DATA_UL, \
+ .flags = _flags, \
+ .data = { .ul = { .variable = _ul, .minimum = _min, \
+ .maximum = _max } } }
+
+#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \
+ .attr = {.name = _name , .mode = _mode }, \
+ .type = TOI_SYSFS_DATA_LONG, \
+ .flags = _flags, \
+ .data = { .a_long = { .variable = _long, .minimum = _min, \
+ .maximum = _max } } }
+
+#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \
+ .attr = {.name = _name , .mode = _mode }, \
+ .type = TOI_SYSFS_DATA_STRING, \
+ .flags = _flags, \
+ .data = { .string = { .variable = _string, .max_length = _max_len } }, \
+ .write_side_effect = _wse }
+
+#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \
+ .attr = {.name = _name , .mode = _mode }, \
+ .type = TOI_SYSFS_DATA_CUSTOM, \
+ .flags = _flags, \
+ .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \
+ .write_side_effect = _wse }
+
+#define SYSFS_NONE(_name, _wse) { \
+ .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \
+ .type = TOI_SYSFS_DATA_NONE, \
+ .write_side_effect = _wse, \
+}
+
+/* Flags */
+#define SYSFS_NEEDS_SM_FOR_READ 1
+#define SYSFS_NEEDS_SM_FOR_WRITE 2
+#define SYSFS_HIBERNATE 4
+#define SYSFS_RESUME 8
+#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME)
+#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE)
+#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE)
+#define SYSFS_NEEDS_SM_FOR_BOTH \
+ (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE)
+
+int toi_register_sysfs_file(struct kobject *kobj,
+ struct toi_sysfs_data *toi_sysfs_data);
+void toi_unregister_sysfs_file(struct kobject *kobj,
+ struct toi_sysfs_data *toi_sysfs_data);
+
+extern struct kobject *tuxonice_kobj;
+
+struct kobject *make_toi_sysdir(char *name);
+void remove_toi_sysdir(struct kobject *obj);
+extern void toi_cleanup_sysfs(void);
+
+extern int toi_sysfs_init(void);
+extern void toi_sysfs_exit(void);
diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c
new file mode 100644
index 000000000..c405f9b9a
--- /dev/null
+++ b/kernel/power/tuxonice_ui.c
@@ -0,0 +1,247 @@
+/*
+ * kernel/power/tuxonice_ui.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
+ * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2002-2003 Florent Chabaud <fchabaud@free.fr>
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for TuxOnIce's user interface.
+ *
+ * The user interface code talks to a userspace program via a
+ * netlink socket.
+ *
+ * The kernel side:
+ * - starts the userui program;
+ * - sends text messages and progress bar status;
+ *
+ * The user space side:
+ * - passes messages regarding user requests (abort, toggle reboot etc)
+ *
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/reboot.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_power_off.h"
+#include "tuxonice_builtin.h"
+
+static char local_printf_buf[1024]; /* Same as printk - should be safe */
+struct ui_ops *toi_current_ui;
+
+/**
+ * toi_wait_for_keypress - Wait for keypress via userui or /dev/console.
+ *
+ * @timeout: Maximum time to wait.
+ *
+ * Wait for a keypress, either from userui or /dev/console if userui isn't
+ * available. The non-userui path is particularly for at boot-time, prior
+ * to userui being started, when we have an important warning to give to
+ * the user.
+ */
+static char toi_wait_for_keypress(int timeout)
+{
+ if (toi_current_ui && toi_current_ui->wait_for_key(timeout))
+ return ' ';
+
+ return toi_wait_for_keypress_dev_console(timeout);
+}
+
+/* toi_early_boot_message()
+ * Description: Handle errors early in the process of booting.
+ * The user may press C to continue booting, perhaps
+ * invalidating the image, or space to reboot.
+ * This works from either the serial console or normally
+ * attached keyboard.
+ *
+ * Note that we come in here from init, while the kernel is
+ * locked. If we want to get events from the serial console,
+ * we need to temporarily unlock the kernel.
+ *
+ * toi_early_boot_message may also be called post-boot.
+ * In this case, it simply printks the message and returns.
+ *
+ * Arguments: int Whether we are able to erase the image.
+ * int default_answer. What to do when we timeout. This
+ * will normally be continue, but the user might
+ * provide command line options (__setup) to override
+ * particular cases.
+ * Char *. Pointer to a string explaining why we're moaning.
+ */
+
+#define say(message, a...) printk(KERN_EMERG message, ##a)
+
+void toi_early_boot_message(int message_detail, int default_answer,
+ char *warning_reason, ...)
+{
+#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
+ unsigned long orig_state = get_toi_state(), continue_req = 0;
+ unsigned long orig_loglevel = console_loglevel;
+ int can_ask = 1;
+#else
+ int can_ask = 0;
+#endif
+
+ va_list args;
+ int printed_len;
+
+ if (!toi_wait) {
+ set_toi_state(TOI_CONTINUE_REQ);
+ can_ask = 0;
+ }
+
+ if (warning_reason) {
+ va_start(args, warning_reason);
+ printed_len = vsnprintf(local_printf_buf,
+ sizeof(local_printf_buf),
+ warning_reason,
+ args);
+ va_end(args);
+ }
+
+ if (!test_toi_state(TOI_BOOT_TIME)) {
+ printk("TuxOnIce: %s\n", local_printf_buf);
+ return;
+ }
+
+ if (!can_ask) {
+ continue_req = !!default_answer;
+ goto post_ask;
+ }
+
+#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE)
+ console_loglevel = 7;
+
+ say("=== TuxOnIce ===\n\n");
+ if (warning_reason) {
+ say("BIG FAT WARNING!! %s\n\n", local_printf_buf);
+ switch (message_detail) {
+ case 0:
+ say("If you continue booting, note that any image WILL"
+ "NOT BE REMOVED.\nTuxOnIce is unable to do so "
+ "because the appropriate modules aren't\n"
+ "loaded. You should manually remove the image "
+ "to avoid any\npossibility of corrupting your "
+ "filesystem(s) later.\n");
+ break;
+ case 1:
+ say("If you want to use the current TuxOnIce image, "
+ "reboot and try\nagain with the same kernel "
+ "that you hibernated from. If you want\n"
+ "to forget that image, continue and the image "
+ "will be erased.\n");
+ break;
+ }
+ say("Press SPACE to reboot or C to continue booting with "
+ "this kernel\n\n");
+ if (toi_wait > 0)
+ say("Default action if you don't select one in %d "
+ "seconds is: %s.\n",
+ toi_wait,
+ default_answer == TOI_CONTINUE_REQ ?
+ "continue booting" : "reboot");
+ } else {
+ say("BIG FAT WARNING!!\n\n"
+ "You have tried to resume from this image before.\n"
+ "If it failed once, it may well fail again.\n"
+ "Would you like to remove the image and boot "
+ "normally?\nThis will be equivalent to entering "
+ "noresume on the\nkernel command line.\n\n"
+ "Press SPACE to remove the image or C to continue "
+ "resuming.\n\n");
+ if (toi_wait > 0)
+ say("Default action if you don't select one in %d "
+ "seconds is: %s.\n", toi_wait,
+ !!default_answer ?
+ "continue resuming" : "remove the image");
+ }
+ console_loglevel = orig_loglevel;
+
+ set_toi_state(TOI_SANITY_CHECK_PROMPT);
+ clear_toi_state(TOI_CONTINUE_REQ);
+
+ if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */
+ continue_req = !!default_answer;
+ else
+ continue_req = test_toi_state(TOI_CONTINUE_REQ);
+
+#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */
+
+post_ask:
+ if ((warning_reason) && (!continue_req))
+ kernel_restart(NULL);
+
+ restore_toi_state(orig_state);
+ if (continue_req)
+ set_toi_state(TOI_CONTINUE_REQ);
+}
+
+#undef say
+
+/*
+ * User interface specific /sys/power/tuxonice entries.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
+ SYSFS_INT("default_console_level", SYSFS_RW,
+ &toi_bkd.toi_default_console_level, 0, 7, 0, NULL),
+ SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0,
+ 1 << 30, 0),
+ SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL,
+ 0)
+#endif
+};
+
+static struct toi_module_ops userui_ops = {
+ .type = MISC_HIDDEN_MODULE,
+ .name = "printk ui",
+ .directory = "user_interface",
+ .module = THIS_MODULE,
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+int toi_register_ui_ops(struct ui_ops *this_ui)
+{
+ if (toi_current_ui) {
+ printk(KERN_INFO "Only one TuxOnIce user interface module can "
+ "be loaded at a time.");
+ return -EBUSY;
+ }
+
+ toi_current_ui = this_ui;
+
+ return 0;
+}
+
+void toi_remove_ui_ops(struct ui_ops *this_ui)
+{
+ if (toi_current_ui != this_ui)
+ return;
+
+ toi_current_ui = NULL;
+}
+
+/* toi_console_sysfs_init
+ * Description: Boot time initialisation for user interface.
+ */
+
+int toi_ui_init(void)
+{
+ return toi_register_module(&userui_ops);
+}
+
+void toi_ui_exit(void)
+{
+ toi_unregister_module(&userui_ops);
+}
diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h
new file mode 100644
index 000000000..d71c607f6
--- /dev/null
+++ b/kernel/power/tuxonice_ui.h
@@ -0,0 +1,97 @@
+/*
+ * kernel/power/tuxonice_ui.h
+ *
+ * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ */
+
+enum {
+ DONT_CLEAR_BAR,
+ CLEAR_BAR
+};
+
+enum {
+ /* Userspace -> Kernel */
+ USERUI_MSG_ABORT = 0x11,
+ USERUI_MSG_SET_STATE = 0x12,
+ USERUI_MSG_GET_STATE = 0x13,
+ USERUI_MSG_GET_DEBUG_STATE = 0x14,
+ USERUI_MSG_SET_DEBUG_STATE = 0x15,
+ USERUI_MSG_SPACE = 0x18,
+ USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A,
+ USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B,
+ USERUI_MSG_GET_LOGLEVEL = 0x1C,
+ USERUI_MSG_SET_LOGLEVEL = 0x1D,
+ USERUI_MSG_PRINTK = 0x1E,
+
+ /* Kernel -> Userspace */
+ USERUI_MSG_MESSAGE = 0x21,
+ USERUI_MSG_PROGRESS = 0x22,
+ USERUI_MSG_POST_ATOMIC_RESTORE = 0x25,
+
+ USERUI_MSG_MAX,
+};
+
+struct userui_msg_params {
+ u32 a, b, c, d;
+ char text[255];
+};
+
+struct ui_ops {
+ char (*wait_for_key) (int timeout);
+ u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...);
+ void (*prepare_status) (int clearbar, const char *fmt, ...);
+ void (*cond_pause) (int pause, char *message);
+ void (*abort)(int result_code, const char *fmt, ...);
+ void (*prepare)(void);
+ void (*cleanup)(void);
+ void (*message)(u32 section, u32 level, u32 normally_logged,
+ const char *fmt, ...);
+};
+
+extern struct ui_ops *toi_current_ui;
+
+#define toi_update_status(val, max, fmt, args...) \
+ (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \
+ max)
+
+#define toi_prepare_console(void) \
+ do { if (toi_current_ui) \
+ (toi_current_ui->prepare)(); \
+ } while (0)
+
+#define toi_cleanup_console(void) \
+ do { if (toi_current_ui) \
+ (toi_current_ui->cleanup)(); \
+ } while (0)
+
+#define abort_hibernate(result, fmt, args...) \
+ do { if (toi_current_ui) \
+ (toi_current_ui->abort)(result, fmt, ##args); \
+ else { \
+ set_abort_result(result); \
+ } \
+ } while (0)
+
+#define toi_cond_pause(pause, message) \
+ do { if (toi_current_ui) \
+ (toi_current_ui->cond_pause)(pause, message); \
+ } while (0)
+
+#define toi_prepare_status(clear, fmt, args...) \
+ do { if (toi_current_ui) \
+ (toi_current_ui->prepare_status)(clear, fmt, ##args); \
+ else \
+ printk(KERN_INFO fmt "%s", ##args, "\n"); \
+ } while (0)
+
+#define toi_message(sn, lev, log, fmt, a...) \
+do { \
+ if (toi_current_ui && (!sn || test_debug_state(sn))) \
+ toi_current_ui->message(sn, lev, log, fmt, ##a); \
+} while (0)
+
+__exit void toi_ui_cleanup(void);
+extern int toi_ui_init(void);
+extern void toi_ui_exit(void);
+extern int toi_register_ui_ops(struct ui_ops *this_ui);
+extern void toi_remove_ui_ops(struct ui_ops *this_ui);
diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
new file mode 100644
index 000000000..edc885c72
--- /dev/null
+++ b/kernel/power/tuxonice_userui.c
@@ -0,0 +1,658 @@
+/*
+ * kernel/power/user_ui.c
+ *
+ * Copyright (C) 2005-2007 Bernard Blackham
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for TuxOnIce's user interface.
+ *
+ * The user interface code talks to a userspace program via a
+ * netlink socket.
+ *
+ * The kernel side:
+ * - starts the userui program;
+ * - sends text messages and progress bar status;
+ *
+ * The user space side:
+ * - passes messages regarding user requests (abort, toggle reboot etc)
+ *
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/console.h>
+#include <linux/ctype.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>
+#include <linux/reboot.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/vt.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_power_off.h"
+
+static char local_printf_buf[1024]; /* Same as printk - should be safe */
+
+static struct user_helper_data ui_helper_data;
+static struct toi_module_ops userui_ops;
+static int orig_kmsg;
+
+static char lastheader[512];
+static int lastheader_message_len;
+static int ui_helper_changed; /* Used at resume-time so don't overwrite value
+ set from initrd/ramfs. */
+
+/* Number of distinct progress amounts that userspace can display */
+static int progress_granularity = 30;
+
+static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
+static int userui_wait_should_wake;
+
+#define toi_stop_waiting_for_userui_key() \
+{ \
+ userui_wait_should_wake = true; \
+ wake_up_interruptible(&userui_wait_for_key); \
+}
+
+/**
+ * ui_nl_set_state - Update toi_action based on a message from userui.
+ *
+ * @n: The bit (1 << bit) to set.
+ */
+static void ui_nl_set_state(int n)
+{
+ /* Only let them change certain settings */
+ static const u32 toi_action_mask =
+ (1 << TOI_REBOOT) | (1 << TOI_PAUSE) |
+ (1 << TOI_LOGALL) |
+ (1 << TOI_SINGLESTEP) |
+ (1 << TOI_PAUSE_NEAR_PAGESET_END);
+ static unsigned long new_action;
+
+ new_action = (toi_bkd.toi_action & (~toi_action_mask)) |
+ (n & toi_action_mask);
+
+ printk(KERN_DEBUG "n is %x. Action flags being changed from %lx "
+ "to %lx.", n, toi_bkd.toi_action, new_action);
+ toi_bkd.toi_action = new_action;
+
+ if (!test_action_state(TOI_PAUSE) &&
+ !test_action_state(TOI_SINGLESTEP))
+ toi_stop_waiting_for_userui_key();
+}
+
+/**
+ * userui_post_atomic_restore - Tell userui that atomic restore just happened.
+ *
+ * Tell userui that atomic restore just occured, so that it can do things like
+ * redrawing the screen, re-getting settings and so on.
+ */
+static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd)
+{
+ toi_send_netlink_message(&ui_helper_data,
+ USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0);
+}
+
+/**
+ * userui_storage_needed - Report how much memory in image header is needed.
+ */
+static int userui_storage_needed(void)
+{
+ return sizeof(ui_helper_data.program) + 1 + sizeof(int);
+}
+
+/**
+ * userui_save_config_info - Fill buffer with config info for image header.
+ *
+ * @buf: Buffer into which to put the config info we want to save.
+ */
+static int userui_save_config_info(char *buf)
+{
+ *((int *) buf) = progress_granularity;
+ memcpy(buf + sizeof(int), ui_helper_data.program,
+ sizeof(ui_helper_data.program));
+ return sizeof(ui_helper_data.program) + sizeof(int) + 1;
+}
+
+/**
+ * userui_load_config_info - Restore config info from buffer.
+ *
+ * @buf: Buffer containing header info loaded.
+ * @size: Size of data loaded for this module.
+ */
+static void userui_load_config_info(char *buf, int size)
+{
+ progress_granularity = *((int *) buf);
+ size -= sizeof(int);
+
+ /* Don't load the saved path if one has already been set */
+ if (ui_helper_changed)
+ return;
+
+ if (size > sizeof(ui_helper_data.program))
+ size = sizeof(ui_helper_data.program);
+
+ memcpy(ui_helper_data.program, buf + sizeof(int), size);
+ ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0';
+}
+
+/**
+ * set_ui_program_set: Record that userui program was changed.
+ *
+ * Side effect routine for when the userui program is set. In an initrd or
+ * ramfs, the user may set a location for the userui program. If this happens,
+ * we don't want to reload the value that was saved in the image header. This
+ * routine allows us to flag that we shouldn't restore the program name from
+ * the image header.
+ */
+static void set_ui_program_set(void)
+{
+ ui_helper_changed = 1;
+}
+
+/**
+ * userui_memory_needed - Tell core how much memory to reserve for us.
+ */
+static int userui_memory_needed(void)
+{
+ /* ball park figure of 128 pages */
+ return 128 * PAGE_SIZE;
+}
+
+/**
+ * userui_update_status - Update the progress bar and (if on) in-bar message.
+ *
+ * @value: Current progress percentage numerator.
+ * @maximum: Current progress percentage denominator.
+ * @fmt: Message to be displayed in the middle of the progress bar.
+ *
+ * Note that a NULL message does not mean that any previous message is erased!
+ * For that, you need toi_prepare_status with clearbar on.
+ *
+ * Returns an unsigned long, being the next numerator (as determined by the
+ * maximum and progress granularity) where status needs to be updated.
+ * This is to reduce unnecessary calls to update_status.
+ */
+static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...)
+{
+ static u32 last_step = 9999;
+ struct userui_msg_params msg;
+ u32 this_step, next_update;
+ int bitshift;
+
+ if (ui_helper_data.pid == -1)
+ return 0;
+
+ if ((!maximum) || (!progress_granularity))
+ return maximum;
+
+ if (value < 0)
+ value = 0;
+
+ if (value > maximum)
+ value = maximum;
+
+ /* Try to avoid math problems - we can't do 64 bit math here
+ * (and shouldn't need it - anyone got screen resolution
+ * of 65536 pixels or more?) */
+ bitshift = fls(maximum) - 16;
+ if (bitshift > 0) {
+ u32 temp_maximum = maximum >> bitshift;
+ u32 temp_value = value >> bitshift;
+ this_step = (u32)
+ (temp_value * progress_granularity / temp_maximum);
+ next_update = (((this_step + 1) * temp_maximum /
+ progress_granularity) + 1) << bitshift;
+ } else {
+ this_step = (u32) (value * progress_granularity / maximum);
+ next_update = ((this_step + 1) * maximum /
+ progress_granularity) + 1;
+ }
+
+ if (this_step == last_step)
+ return next_update;
+
+ memset(&msg, 0, sizeof(msg));
+
+ msg.a = this_step;
+ msg.b = progress_granularity;
+
+ if (fmt) {
+ va_list args;
+ va_start(args, fmt);
+ vsnprintf(msg.text, sizeof(msg.text), fmt, args);
+ va_end(args);
+ msg.text[sizeof(msg.text)-1] = '\0';
+ }
+
+ toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
+ &msg, sizeof(msg));
+ last_step = this_step;
+
+ return next_update;
+}
+
+/**
+ * userui_message - Display a message without necessarily logging it.
+ *
+ * @section: Type of message. Messages can be filtered by type.
+ * @level: Degree of importance of the message. Lower values = higher priority.
+ * @normally_logged: Whether logged even if log_everything is off.
+ * @fmt: Message (and parameters).
+ *
+ * This function is intended to do the same job as printk, but without normally
+ * logging what is printed. The point is to be able to get debugging info on
+ * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
+ *
+ * It may be called from an interrupt context - can't sleep!
+ */
+static void userui_message(u32 section, u32 level, u32 normally_logged,
+ const char *fmt, ...)
+{
+ struct userui_msg_params msg;
+
+ if ((level) && (level > console_loglevel))
+ return;
+
+ memset(&msg, 0, sizeof(msg));
+
+ msg.a = section;
+ msg.b = level;
+ msg.c = normally_logged;
+
+ if (fmt) {
+ va_list args;
+ va_start(args, fmt);
+ vsnprintf(msg.text, sizeof(msg.text), fmt, args);
+ va_end(args);
+ msg.text[sizeof(msg.text)-1] = '\0';
+ }
+
+ if (test_action_state(TOI_LOGALL))
+ printk(KERN_INFO "%s\n", msg.text);
+
+ toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
+ &msg, sizeof(msg));
+}
+
+/**
+ * wait_for_key_via_userui - Wait for userui to receive a keypress.
+ */
+static void wait_for_key_via_userui(void)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&userui_wait_for_key, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake);
+ userui_wait_should_wake = false;
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&userui_wait_for_key, &wait);
+}
+
+/**
+ * userui_prepare_status - Display high level messages.
+ *
+ * @clearbar: Whether to clear the progress bar.
+ * @fmt...: New message for the title.
+ *
+ * Prepare the 'nice display', drawing the header and version, along with the
+ * current action and perhaps also resetting the progress bar.
+ */
+static void userui_prepare_status(int clearbar, const char *fmt, ...)
+{
+ va_list args;
+
+ if (fmt) {
+ va_start(args, fmt);
+ lastheader_message_len = vsnprintf(lastheader, 512, fmt, args);
+ va_end(args);
+ }
+
+ if (clearbar)
+ toi_update_status(0, 1, NULL);
+
+ if (ui_helper_data.pid == -1)
+ printk(KERN_EMERG "%s\n", lastheader);
+ else
+ toi_message(0, TOI_STATUS, 1, lastheader, NULL);
+}
+
+/**
+ * toi_wait_for_keypress - Wait for keypress via userui.
+ *
+ * @timeout: Maximum time to wait.
+ *
+ * Wait for a keypress from userui.
+ *
+ * FIXME: Implement timeout?
+ */
+static char userui_wait_for_keypress(int timeout)
+{
+ char key = '\0';
+
+ if (ui_helper_data.pid != -1) {
+ wait_for_key_via_userui();
+ key = ' ';
+ }
+
+ return key;
+}
+
+/**
+ * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
+ *
+ * @result_code: Reason why we're aborting (1 << bit).
+ * @fmt: Message to display if telling the user what's going on.
+ *
+ * Abort a cycle. If this wasn't at the user's request (and we're displaying
+ * output), tell the user why and wait for them to acknowledge the message.
+ */
+static void userui_abort_hibernate(int result_code, const char *fmt, ...)
+{
+ va_list args;
+ int printed_len = 0;
+
+ set_result_state(result_code);
+
+ if (test_result_state(TOI_ABORTED))
+ return;
+
+ set_result_state(TOI_ABORTED);
+
+ if (test_result_state(TOI_ABORT_REQUESTED))
+ return;
+
+ va_start(args, fmt);
+ printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf),
+ fmt, args);
+ va_end(args);
+ if (ui_helper_data.pid != -1)
+ printed_len = sprintf(local_printf_buf + printed_len,
+ " (Press SPACE to continue)");
+
+ toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
+
+ if (ui_helper_data.pid != -1)
+ userui_wait_for_keypress(0);
+}
+
+/**
+ * request_abort_hibernate - Abort hibernating or resuming at user request.
+ *
+ * Handle the user requesting the cancellation of a hibernation or resume by
+ * pressing escape.
+ */
+static void request_abort_hibernate(void)
+{
+ if (test_result_state(TOI_ABORT_REQUESTED) ||
+ !test_action_state(TOI_CAN_CANCEL))
+ return;
+
+ if (test_toi_state(TOI_NOW_RESUMING)) {
+ toi_prepare_status(CLEAR_BAR, "Escape pressed. "
+ "Powering down again.");
+ set_toi_state(TOI_STOP_RESUME);
+ while (!test_toi_state(TOI_IO_STOPPED))
+ schedule();
+ if (toiActiveAllocator->mark_resume_attempted)
+ toiActiveAllocator->mark_resume_attempted(0);
+ toi_power_down();
+ }
+
+ toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :"
+ " ABORTING HIBERNATION ---");
+ set_abort_result(TOI_ABORT_REQUESTED);
+ toi_stop_waiting_for_userui_key();
+}
+
+/**
+ * userui_user_rcv_msg - Receive a netlink message from userui.
+ *
+ * @skb: skb received.
+ * @nlh: Netlink header received.
+ */
+static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int type;
+ int *data;
+
+ type = nlh->nlmsg_type;
+
+ /* A control message: ignore them */
+ if (type < NETLINK_MSG_BASE)
+ return 0;
+
+ /* Unknown message: reply with EINVAL */
+ if (type >= USERUI_MSG_MAX)
+ return -EINVAL;
+
+ /* All operations require privileges, even GET */
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* Only allow one task to receive NOFREEZE privileges */
+ if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
+ printk(KERN_INFO "Got NOFREEZE_ME request when "
+ "ui_helper_data.pid is %d.\n", ui_helper_data.pid);
+ return -EBUSY;
+ }
+
+ data = (int *) NLMSG_DATA(nlh);
+
+ switch (type) {
+ case USERUI_MSG_ABORT:
+ request_abort_hibernate();
+ return 0;
+ case USERUI_MSG_GET_STATE:
+ toi_send_netlink_message(&ui_helper_data,
+ USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
+ sizeof(toi_bkd.toi_action));
+ return 0;
+ case USERUI_MSG_GET_DEBUG_STATE:
+ toi_send_netlink_message(&ui_helper_data,
+ USERUI_MSG_GET_DEBUG_STATE,
+ &toi_bkd.toi_debug_state,
+ sizeof(toi_bkd.toi_debug_state));
+ return 0;
+ case USERUI_MSG_SET_STATE:
+ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+ return -EINVAL;
+ ui_nl_set_state(*data);
+ return 0;
+ case USERUI_MSG_SET_DEBUG_STATE:
+ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+ return -EINVAL;
+ toi_bkd.toi_debug_state = (*data);
+ return 0;
+ case USERUI_MSG_SPACE:
+ toi_stop_waiting_for_userui_key();
+ return 0;
+ case USERUI_MSG_GET_POWERDOWN_METHOD:
+ toi_send_netlink_message(&ui_helper_data,
+ USERUI_MSG_GET_POWERDOWN_METHOD,
+ &toi_poweroff_method,
+ sizeof(toi_poweroff_method));
+ return 0;
+ case USERUI_MSG_SET_POWERDOWN_METHOD:
+ if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char)))
+ return -EINVAL;
+ toi_poweroff_method = (unsigned long)(*data);
+ return 0;
+ case USERUI_MSG_GET_LOGLEVEL:
+ toi_send_netlink_message(&ui_helper_data,
+ USERUI_MSG_GET_LOGLEVEL,
+ &toi_bkd.toi_default_console_level,
+ sizeof(toi_bkd.toi_default_console_level));
+ return 0;
+ case USERUI_MSG_SET_LOGLEVEL:
+ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+ return -EINVAL;
+ toi_bkd.toi_default_console_level = (*data);
+ return 0;
+ case USERUI_MSG_PRINTK:
+ printk(KERN_INFO "%s", (char *) data);
+ return 0;
+ }
+
+ /* Unhandled here */
+ return 1;
+}
+
+/**
+ * userui_cond_pause - Possibly pause at user request.
+ *
+ * @pause: Whether to pause or just display the message.
+ * @message: Message to display at the start of pausing.
+ *
+ * Potentially pause and wait for the user to tell us to continue. We normally
+ * only pause when @pause is set. While paused, the user can do things like
+ * changing the loglevel, toggling the display of debugging sections and such
+ * like.
+ */
+static void userui_cond_pause(int pause, char *message)
+{
+ int displayed_message = 0, last_key = 0;
+
+ while (last_key != 32 &&
+ ui_helper_data.pid != -1 &&
+ ((test_action_state(TOI_PAUSE) && pause) ||
+ (test_action_state(TOI_SINGLESTEP)))) {
+ if (!displayed_message) {
+ toi_prepare_status(DONT_CLEAR_BAR,
+ "%s Press SPACE to continue.%s",
+ message ? message : "",
+ (test_action_state(TOI_SINGLESTEP)) ?
+ " Single step on." : "");
+ displayed_message = 1;
+ }
+ last_key = userui_wait_for_keypress(0);
+ }
+ schedule();
+}
+
+/**
+ * userui_prepare_console - Prepare the console for use.
+ *
+ * Prepare a console for use, saving current kmsg settings and attempting to
+ * start userui. Console loglevel changes are handled by userui.
+ */
+static void userui_prepare_console(void)
+{
+ orig_kmsg = vt_kmsg_redirect(fg_console + 1);
+
+ ui_helper_data.pid = -1;
+
+ if (!userui_ops.enabled) {
+ printk(KERN_INFO "TuxOnIce: Userui disabled.\n");
+ return;
+ }
+
+ if (*ui_helper_data.program)
+ toi_netlink_setup(&ui_helper_data);
+ else
+ printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
+}
+
+/**
+ * userui_cleanup_console - Cleanup after a cycle.
+ *
+ * Tell userui to cleanup, and restore kmsg_redirect to its original value.
+ */
+
+static void userui_cleanup_console(void)
+{
+ if (ui_helper_data.pid > -1)
+ toi_netlink_close(&ui_helper_data);
+
+ vt_kmsg_redirect(orig_kmsg);
+}
+
+/*
+ * User interface specific /sys/power/tuxonice entries.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
+ SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_CAN_CANCEL, 0),
+ SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_PAUSE, 0),
+ SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL),
+ SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
+ 2048, 0, NULL),
+ SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0,
+ set_ui_program_set),
+ SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL)
+#endif
+};
+
+static struct toi_module_ops userui_ops = {
+ .type = MISC_MODULE,
+ .name = "userui",
+ .shared_directory = "user_interface",
+ .module = THIS_MODULE,
+ .storage_needed = userui_storage_needed,
+ .save_config_info = userui_save_config_info,
+ .load_config_info = userui_load_config_info,
+ .memory_needed = userui_memory_needed,
+ .post_atomic_restore = userui_post_atomic_restore,
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+static struct ui_ops my_ui_ops = {
+ .update_status = userui_update_status,
+ .message = userui_message,
+ .prepare_status = userui_prepare_status,
+ .abort = userui_abort_hibernate,
+ .cond_pause = userui_cond_pause,
+ .prepare = userui_prepare_console,
+ .cleanup = userui_cleanup_console,
+ .wait_for_key = userui_wait_for_keypress,
+};
+
+/**
+ * toi_user_ui_init - Boot time initialisation for user interface.
+ *
+ * Invoked from the core init routine.
+ */
+static __init int toi_user_ui_init(void)
+{
+ int result;
+
+ ui_helper_data.nl = NULL;
+ strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255);
+ ui_helper_data.pid = -1;
+ ui_helper_data.skb_size = sizeof(struct userui_msg_params);
+ ui_helper_data.pool_limit = 6;
+ ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
+ ui_helper_data.name = "userspace ui";
+ ui_helper_data.rcv_msg = userui_user_rcv_msg;
+ ui_helper_data.interface_version = 8;
+ ui_helper_data.must_init = 0;
+ ui_helper_data.not_ready = userui_cleanup_console;
+ init_completion(&ui_helper_data.wait_for_process);
+ result = toi_register_module(&userui_ops);
+ if (!result) {
+ result = toi_register_ui_ops(&my_ui_ops);
+ if (result)
+ toi_unregister_module(&userui_ops);
+ }
+
+ return result;
+}
+
+late_initcall(toi_user_ui_init);
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 000000000..526e89114
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,478 @@
+/*
+ * linux/kernel/power/user.c
+ *
+ * This file provides the user space interface for software suspend/resume.
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/fs.h>
+#include <linux/compat.h>
+#include <linux/console.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+
+#include <asm/uaccess.h>
+
+#include "power.h"
+
+
+#define SNAPSHOT_MINOR 231
+
+static struct snapshot_data {
+ struct snapshot_handle handle;
+ int swap;
+ int mode;
+ bool frozen;
+ bool ready;
+ bool platform_support;
+ bool free_bitmaps;
+} snapshot_state;
+
+atomic_t snapshot_device_available = ATOMIC_INIT(1);
+
+static int snapshot_open(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+ int error;
+
+ if (!hibernation_available())
+ return -EPERM;
+
+ lock_system_sleep();
+
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ goto Unlock;
+ }
+
+ if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
+ atomic_inc(&snapshot_device_available);
+ error = -ENOSYS;
+ goto Unlock;
+ }
+ nonseekable_open(inode, filp);
+ data = &snapshot_state;
+ filp->private_data = data;
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+ /* Hibernating. The image device should be accessible. */
+ data->swap = swsusp_resume_device ?
+ swap_type_of(swsusp_resume_device, 0, NULL) : -1;
+ data->mode = O_RDONLY;
+ data->free_bitmaps = false;
+ error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ if (error)
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+ } else {
+ /*
+ * Resuming. We may need to wait for the image device to
+ * appear.
+ */
+ wait_for_device_probe();
+
+ data->swap = -1;
+ data->mode = O_WRONLY;
+ error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ if (!error) {
+ error = create_basic_memory_bitmaps();
+ data->free_bitmaps = !error;
+ }
+ if (error)
+ pm_notifier_call_chain(PM_POST_RESTORE);
+ }
+ if (error)
+ atomic_inc(&snapshot_device_available);
+
+ data->frozen = false;
+ data->ready = false;
+ data->platform_support = false;
+
+ Unlock:
+ unlock_system_sleep();
+
+ return error;
+}
+
+static int snapshot_release(struct inode *inode, struct file *filp)
+{
+ struct snapshot_data *data;
+
+ lock_system_sleep();
+
+ swsusp_free();
+ data = filp->private_data;
+ free_all_swap_pages(data->swap);
+ if (data->frozen) {
+ pm_restore_gfp_mask();
+ free_basic_memory_bitmaps();
+ thaw_processes();
+ } else if (data->free_bitmaps) {
+ free_basic_memory_bitmaps();
+ }
+ pm_notifier_call_chain(data->mode == O_RDONLY ?
+ PM_POST_HIBERNATION : PM_POST_RESTORE);
+ atomic_inc(&snapshot_device_available);
+
+ unlock_system_sleep();
+
+ return 0;
+}
+
+static ssize_t snapshot_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+ loff_t pg_offp = *offp & ~PAGE_MASK;
+
+ lock_system_sleep();
+
+ data = filp->private_data;
+ if (!data->ready) {
+ res = -ENODATA;
+ goto Unlock;
+ }
+ if (!pg_offp) { /* on page boundary? */
+ res = snapshot_read_next(&data->handle);
+ if (res <= 0)
+ goto Unlock;
+ } else {
+ res = PAGE_SIZE - pg_offp;
+ }
+
+ res = simple_read_from_buffer(buf, count, &pg_offp,
+ data_of(data->handle), res);
+ if (res > 0)
+ *offp += res;
+
+ Unlock:
+ unlock_system_sleep();
+
+ return res;
+}
+
+static ssize_t snapshot_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *offp)
+{
+ struct snapshot_data *data;
+ ssize_t res;
+ loff_t pg_offp = *offp & ~PAGE_MASK;
+
+ lock_system_sleep();
+
+ data = filp->private_data;
+
+ if (!pg_offp) {
+ res = snapshot_write_next(&data->handle);
+ if (res <= 0)
+ goto unlock;
+ } else {
+ res = PAGE_SIZE - pg_offp;
+ }
+
+ res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
+ buf, count);
+ if (res > 0)
+ *offp += res;
+unlock:
+ unlock_system_sleep();
+
+ return res;
+}
+
+static long snapshot_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ int error = 0;
+ struct snapshot_data *data;
+ loff_t size;
+ sector_t offset;
+
+ if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
+ return -ENOTTY;
+ if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
+ return -ENOTTY;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!mutex_trylock(&pm_mutex))
+ return -EBUSY;
+
+ lock_device_hotplug();
+ data = filp->private_data;
+
+ switch (cmd) {
+
+ case SNAPSHOT_FREEZE:
+ if (data->frozen)
+ break;
+
+ printk("Syncing filesystems ... ");
+ sys_sync();
+ printk("done.\n");
+
+ error = freeze_processes();
+ if (error)
+ break;
+
+ error = create_basic_memory_bitmaps();
+ if (error)
+ thaw_processes();
+ else
+ data->frozen = true;
+
+ break;
+
+ case SNAPSHOT_UNFREEZE:
+ if (!data->frozen || data->ready)
+ break;
+ pm_restore_gfp_mask();
+ free_basic_memory_bitmaps();
+ data->free_bitmaps = false;
+ thaw_processes();
+ data->frozen = false;
+ break;
+
+ case SNAPSHOT_CREATE_IMAGE:
+ if (data->mode != O_RDONLY || !data->frozen || data->ready) {
+ error = -EPERM;
+ break;
+ }
+ pm_restore_gfp_mask();
+ error = hibernation_snapshot(data->platform_support);
+ if (!error) {
+ error = put_user(in_suspend, (int __user *)arg);
+ data->ready = !freezer_test_done && !error;
+ freezer_test_done = false;
+ }
+ break;
+
+ case SNAPSHOT_ATOMIC_RESTORE:
+ snapshot_write_finalize(&data->handle);
+ if (data->mode != O_WRONLY || !data->frozen ||
+ !snapshot_image_loaded(&data->handle)) {
+ error = -EPERM;
+ break;
+ }
+ error = hibernation_restore(data->platform_support);
+ break;
+
+ case SNAPSHOT_FREE:
+ swsusp_free();
+ memset(&data->handle, 0, sizeof(struct snapshot_handle));
+ data->ready = false;
+ /*
+ * It is necessary to thaw kernel threads here, because
+ * SNAPSHOT_CREATE_IMAGE may be invoked directly after
+ * SNAPSHOT_FREE. In that case, if kernel threads were not
+ * thawed, the preallocation of memory carried out by
+ * hibernation_snapshot() might run into problems (i.e. it
+ * might fail or even deadlock).
+ */
+ thaw_kernel_threads();
+ break;
+
+ case SNAPSHOT_PREF_IMAGE_SIZE:
+ image_size = arg;
+ break;
+
+ case SNAPSHOT_GET_IMAGE_SIZE:
+ if (!data->ready) {
+ error = -ENODATA;
+ break;
+ }
+ size = snapshot_get_image_size();
+ size <<= PAGE_SHIFT;
+ error = put_user(size, (loff_t __user *)arg);
+ break;
+
+ case SNAPSHOT_AVAIL_SWAP_SIZE:
+ size = count_swap_pages(data->swap, 1);
+ size <<= PAGE_SHIFT;
+ error = put_user(size, (loff_t __user *)arg);
+ break;
+
+ case SNAPSHOT_ALLOC_SWAP_PAGE:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ offset = alloc_swapdev_block(data->swap);
+ if (offset) {
+ offset <<= PAGE_SHIFT;
+ error = put_user(offset, (loff_t __user *)arg);
+ } else {
+ error = -ENOSPC;
+ }
+ break;
+
+ case SNAPSHOT_FREE_SWAP_PAGES:
+ if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+ error = -ENODEV;
+ break;
+ }
+ free_all_swap_pages(data->swap);
+ break;
+
+ case SNAPSHOT_S2RAM:
+ if (!data->frozen) {
+ error = -EPERM;
+ break;
+ }
+ /*
+ * Tasks are frozen and the notifiers have been called with
+ * PM_HIBERNATION_PREPARE
+ */
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ data->ready = false;
+ break;
+
+ case SNAPSHOT_PLATFORM_SUPPORT:
+ data->platform_support = !!arg;
+ break;
+
+ case SNAPSHOT_POWER_OFF:
+ if (data->platform_support)
+ error = hibernation_platform_enter();
+ break;
+
+ case SNAPSHOT_SET_SWAP_AREA:
+ if (swsusp_swap_in_use()) {
+ error = -EPERM;
+ } else {
+ struct resume_swap_area swap_area;
+ dev_t swdev;
+
+ error = copy_from_user(&swap_area, (void __user *)arg,
+ sizeof(struct resume_swap_area));
+ if (error) {
+ error = -EFAULT;
+ break;
+ }
+
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ swdev = new_decode_dev(swap_area.dev);
+ if (swdev) {
+ offset = swap_area.offset;
+ data->swap = swap_type_of(swdev, offset, NULL);
+ if (data->swap < 0)
+ error = -ENODEV;
+ } else {
+ data->swap = -1;
+ error = -EINVAL;
+ }
+ }
+ break;
+
+ default:
+ error = -ENOTTY;
+
+ }
+
+ unlock_device_hotplug();
+ mutex_unlock(&pm_mutex);
+
+ return error;
+}
+
+#ifdef CONFIG_COMPAT
+
+struct compat_resume_swap_area {
+ compat_loff_t offset;
+ u32 dev;
+} __packed;
+
+static long
+snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
+
+ switch (cmd) {
+ case SNAPSHOT_GET_IMAGE_SIZE:
+ case SNAPSHOT_AVAIL_SWAP_SIZE:
+ case SNAPSHOT_ALLOC_SWAP_PAGE: {
+ compat_loff_t __user *uoffset = compat_ptr(arg);
+ loff_t offset;
+ mm_segment_t old_fs;
+ int err;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
+ set_fs(old_fs);
+ if (!err && put_user(offset, uoffset))
+ err = -EFAULT;
+ return err;
+ }
+
+ case SNAPSHOT_CREATE_IMAGE:
+ return snapshot_ioctl(file, cmd,
+ (unsigned long) compat_ptr(arg));
+
+ case SNAPSHOT_SET_SWAP_AREA: {
+ struct compat_resume_swap_area __user *u_swap_area =
+ compat_ptr(arg);
+ struct resume_swap_area swap_area;
+ mm_segment_t old_fs;
+ int err;
+
+ err = get_user(swap_area.offset, &u_swap_area->offset);
+ err |= get_user(swap_area.dev, &u_swap_area->dev);
+ if (err)
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
+ (unsigned long) &swap_area);
+ set_fs(old_fs);
+ return err;
+ }
+
+ default:
+ return snapshot_ioctl(file, cmd, arg);
+ }
+}
+
+#endif /* CONFIG_COMPAT */
+
+static const struct file_operations snapshot_fops = {
+ .open = snapshot_open,
+ .release = snapshot_release,
+ .read = snapshot_read,
+ .write = snapshot_write,
+ .llseek = no_llseek,
+ .unlocked_ioctl = snapshot_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = snapshot_compat_ioctl,
+#endif
+};
+
+static struct miscdevice snapshot_device = {
+ .minor = SNAPSHOT_MINOR,
+ .name = "snapshot",
+ .fops = &snapshot_fops,
+};
+
+static int __init snapshot_device_init(void)
+{
+ return misc_register(&snapshot_device);
+};
+
+device_initcall(snapshot_device_init);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 000000000..019069c84
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,268 @@
+/*
+ * kernel/power/wakelock.c
+ *
+ * User space wakeup sources support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This code is based on the analogous interface allowing user space to
+ * manipulate wakelocks on Android.
+ */
+
+#include <linux/capability.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+#include "power.h"
+
+static DEFINE_MUTEX(wakelocks_lock);
+
+struct wakelock {
+ char *name;
+ struct rb_node node;
+ struct wakeup_source ws;
+#ifdef CONFIG_PM_WAKELOCKS_GC
+ struct list_head lru;
+#endif
+};
+
+static struct rb_root wakelocks_tree = RB_ROOT;
+
+ssize_t pm_show_wakelocks(char *buf, bool show_active)
+{
+ struct rb_node *node;
+ struct wakelock *wl;
+ char *str = buf;
+ char *end = buf + PAGE_SIZE;
+
+ mutex_lock(&wakelocks_lock);
+
+ for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
+ wl = rb_entry(node, struct wakelock, node);
+ if (wl->ws.active == show_active)
+ str += scnprintf(str, end - str, "%s ", wl->name);
+ }
+ if (str > buf)
+ str--;
+
+ str += scnprintf(str, end - str, "\n");
+
+ mutex_unlock(&wakelocks_lock);
+ return (str - buf);
+}
+
+#if CONFIG_PM_WAKELOCKS_LIMIT > 0
+static unsigned int number_of_wakelocks;
+
+static inline bool wakelocks_limit_exceeded(void)
+{
+ return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT;
+}
+
+static inline void increment_wakelocks_number(void)
+{
+ number_of_wakelocks++;
+}
+
+static inline void decrement_wakelocks_number(void)
+{
+ number_of_wakelocks--;
+}
+#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
+static inline bool wakelocks_limit_exceeded(void) { return false; }
+static inline void increment_wakelocks_number(void) {}
+static inline void decrement_wakelocks_number(void) {}
+#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
+
+#ifdef CONFIG_PM_WAKELOCKS_GC
+#define WL_GC_COUNT_MAX 100
+#define WL_GC_TIME_SEC 300
+
+static LIST_HEAD(wakelocks_lru_list);
+static unsigned int wakelocks_gc_count;
+
+static inline void wakelocks_lru_add(struct wakelock *wl)
+{
+ list_add(&wl->lru, &wakelocks_lru_list);
+}
+
+static inline void wakelocks_lru_most_recent(struct wakelock *wl)
+{
+ list_move(&wl->lru, &wakelocks_lru_list);
+}
+
+static void wakelocks_gc(void)
+{
+ struct wakelock *wl, *aux;
+ ktime_t now;
+
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ now = ktime_get();
+ list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
+ u64 idle_time_ns;
+ bool active;
+
+ spin_lock_irq(&wl->ws.lock);
+ idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
+ active = wl->ws.active;
+ spin_unlock_irq(&wl->ws.lock);
+
+ if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
+ break;
+
+ if (!active) {
+ wakeup_source_remove(&wl->ws);
+ rb_erase(&wl->node, &wakelocks_tree);
+ list_del(&wl->lru);
+ kfree(wl->name);
+ kfree(wl);
+ decrement_wakelocks_number();
+ }
+ }
+ wakelocks_gc_count = 0;
+}
+#else /* !CONFIG_PM_WAKELOCKS_GC */
+static inline void wakelocks_lru_add(struct wakelock *wl) {}
+static inline void wakelocks_lru_most_recent(struct wakelock *wl) {}
+static inline void wakelocks_gc(void) {}
+#endif /* !CONFIG_PM_WAKELOCKS_GC */
+
+static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
+ bool add_if_not_found)
+{
+ struct rb_node **node = &wakelocks_tree.rb_node;
+ struct rb_node *parent = *node;
+ struct wakelock *wl;
+
+ while (*node) {
+ int diff;
+
+ parent = *node;
+ wl = rb_entry(*node, struct wakelock, node);
+ diff = strncmp(name, wl->name, len);
+ if (diff == 0) {
+ if (wl->name[len])
+ diff = -1;
+ else
+ return wl;
+ }
+ if (diff < 0)
+ node = &(*node)->rb_left;
+ else
+ node = &(*node)->rb_right;
+ }
+ if (!add_if_not_found)
+ return ERR_PTR(-EINVAL);
+
+ if (wakelocks_limit_exceeded())
+ return ERR_PTR(-ENOSPC);
+
+ /* Not found, we have to add a new one. */
+ wl = kzalloc(sizeof(*wl), GFP_KERNEL);
+ if (!wl)
+ return ERR_PTR(-ENOMEM);
+
+ wl->name = kstrndup(name, len, GFP_KERNEL);
+ if (!wl->name) {
+ kfree(wl);
+ return ERR_PTR(-ENOMEM);
+ }
+ wl->ws.name = wl->name;
+ wakeup_source_add(&wl->ws);
+ rb_link_node(&wl->node, parent, node);
+ rb_insert_color(&wl->node, &wakelocks_tree);
+ wakelocks_lru_add(wl);
+ increment_wakelocks_number();
+ return wl;
+}
+
+int pm_wake_lock(const char *buf)
+{
+ const char *str = buf;
+ struct wakelock *wl;
+ u64 timeout_ns = 0;
+ size_t len;
+ int ret = 0;
+
+ if (!capable(CAP_BLOCK_SUSPEND))
+ return -EPERM;
+
+ while (*str && !isspace(*str))
+ str++;
+
+ len = str - buf;
+ if (!len)
+ return -EINVAL;
+
+ if (*str && *str != '\n') {
+ /* Find out if there's a valid timeout string appended. */
+ ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
+ if (ret)
+ return -EINVAL;
+ }
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, true);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ if (timeout_ns) {
+ u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
+
+ do_div(timeout_ms, NSEC_PER_MSEC);
+ __pm_wakeup_event(&wl->ws, timeout_ms);
+ } else {
+ __pm_stay_awake(&wl->ws);
+ }
+
+ wakelocks_lru_most_recent(wl);
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
+
+int pm_wake_unlock(const char *buf)
+{
+ struct wakelock *wl;
+ size_t len;
+ int ret = 0;
+
+ if (!capable(CAP_BLOCK_SUSPEND))
+ return -EPERM;
+
+ len = strlen(buf);
+ if (!len)
+ return -EINVAL;
+
+ if (buf[len-1] == '\n')
+ len--;
+
+ if (!len)
+ return -EINVAL;
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, false);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ __pm_relax(&wl->ws);
+
+ wakelocks_lru_most_recent(wl);
+ wakelocks_gc();
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
new file mode 100644
index 000000000..85405bdcf
--- /dev/null
+++ b/kernel/printk/Makefile
@@ -0,0 +1,2 @@
+obj-y = printk.o
+obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
new file mode 100644
index 000000000..276762f3a
--- /dev/null
+++ b/kernel/printk/braille.c
@@ -0,0 +1,49 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include <linux/string.h>
+
+#include "console_cmdline.h"
+#include "braille.h"
+
+char *_braille_console_setup(char **str, char **brl_options)
+{
+ if (!memcmp(*str, "brl,", 4)) {
+ *brl_options = "";
+ *str += 4;
+ } else if (!memcmp(str, "brl=", 4)) {
+ *brl_options = *str + 4;
+ *str = strchr(*brl_options, ',');
+ if (!*str)
+ pr_err("need port name after brl=\n");
+ else
+ *((*str)++) = 0;
+ } else
+ return NULL;
+
+ return *str;
+}
+
+int
+_braille_register_console(struct console *console, struct console_cmdline *c)
+{
+ int rtn = 0;
+
+ if (c->brl_options) {
+ console->flags |= CON_BRL;
+ rtn = braille_register_console(console, c->index, c->options,
+ c->brl_options);
+ }
+
+ return rtn;
+}
+
+int
+_braille_unregister_console(struct console *console)
+{
+ if (console->flags & CON_BRL)
+ return braille_unregister_console(console);
+
+ return 0;
+}
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
new file mode 100644
index 000000000..769d77114
--- /dev/null
+++ b/kernel/printk/braille.h
@@ -0,0 +1,48 @@
+#ifndef _PRINTK_BRAILLE_H
+#define _PRINTK_BRAILLE_H
+
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+
+static inline void
+braille_set_options(struct console_cmdline *c, char *brl_options)
+{
+ c->brl_options = brl_options;
+}
+
+char *
+_braille_console_setup(char **str, char **brl_options);
+
+int
+_braille_register_console(struct console *console, struct console_cmdline *c);
+
+int
+_braille_unregister_console(struct console *console);
+
+#else
+
+static inline void
+braille_set_options(struct console_cmdline *c, char *brl_options)
+{
+}
+
+static inline char *
+_braille_console_setup(char **str, char **brl_options)
+{
+ return NULL;
+}
+
+static inline int
+_braille_register_console(struct console *console, struct console_cmdline *c)
+{
+ return 0;
+}
+
+static inline int
+_braille_unregister_console(struct console *console)
+{
+ return 0;
+}
+
+#endif
+
+#endif
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
new file mode 100644
index 000000000..2ca4a8b5f
--- /dev/null
+++ b/kernel/printk/console_cmdline.h
@@ -0,0 +1,14 @@
+#ifndef _CONSOLE_CMDLINE_H
+#define _CONSOLE_CMDLINE_H
+
+struct console_cmdline
+{
+ char name[16]; /* Name of the driver */
+ int index; /* Minor dev. to use */
+ char *options; /* Options for the driver */
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+ char *brl_options; /* Options for braille driver */
+#endif
+};
+
+#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
new file mode 100644
index 000000000..83cf08088
--- /dev/null
+++ b/kernel/printk/printk.c
@@ -0,0 +1,3080 @@
+/*
+ * linux/kernel/printk.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Modified to make sys_syslog() more flexible: added commands to
+ * return the last 4k of kernel messages, regardless of whether
+ * they've been read or not. Added option to suppress kernel printk's
+ * to the console. Added hook for sending the console messages
+ * elsewhere, in preparation for a serial line console (someday).
+ * Ted Ts'o, 2/11/93.
+ * Modified for sysctl support, 1/8/97, Chris Horn.
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
+ * manfred@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ * 01Mar01 Andrew Morton
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/interrupt.h> /* For in_interrupt() */
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/security.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/syscalls.h>
+#include <linux/suspend.h>
+#include <linux/kexec.h>
+#include <linux/kdb.h>
+#include <linux/ratelimit.h>
+#include <linux/kmsg_dump.h>
+#include <linux/syslog.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/rculist.h>
+#include <linux/poll.h>
+#include <linux/irq_work.h>
+#include <linux/utsname.h>
+#include <linux/ctype.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/printk.h>
+
+#include "console_cmdline.h"
+#include "braille.h"
+
+int console_printk[4] = {
+ CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
+ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
+ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
+ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
+};
+
+/*
+ * Low level drivers may need that to know if they can schedule in
+ * their unblank() callback or not. So let's export it.
+ */
+int oops_in_progress;
+EXPORT_SYMBOL(oops_in_progress);
+
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DEFINE_SEMAPHORE(console_sem);
+struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_lock_dep_map = {
+ .name = "console_lock"
+};
+#endif
+
+/*
+ * Helper macros to handle lockdep when locking/unlocking console_sem. We use
+ * macros instead of functions so that _RET_IP_ contains useful information.
+ */
+#define down_console_sem() do { \
+ down(&console_sem);\
+ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
+} while (0)
+
+static int __down_trylock_console_sem(unsigned long ip)
+{
+ if (down_trylock(&console_sem))
+ return 1;
+ mutex_acquire(&console_lock_dep_map, 0, 1, ip);
+ return 0;
+}
+#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
+
+#define up_console_sem() do { \
+ mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
+ up(&console_sem);\
+} while (0)
+
+/*
+ * This is used for debugging the mess that is the VT code by
+ * keeping track if we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it and are racing, but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console sempahore held).
+ */
+static int console_locked, console_suspended;
+
+/*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
+ * Array of consoles built from command line options (console=)
+ */
+
+#define MAX_CMDLINECONSOLES 8
+
+static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
+
+static int selected_console = -1;
+static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
+
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
+/*
+ * The printk log buffer consists of a chain of concatenated variable
+ * length records. Every record starts with a record header, containing
+ * the overall length of the record.
+ *
+ * The heads to the first and last entry in the buffer, as well as the
+ * sequence numbers of these entries are maintained when messages are
+ * stored.
+ *
+ * If the heads indicate available messages, the length in the header
+ * tells the start next message. A length == 0 for the next message
+ * indicates a wrap-around to the beginning of the buffer.
+ *
+ * Every record carries the monotonic timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual
+ * kernel messages use LOG_KERN; userspace-injected messages always carry
+ * a matching syslog facility, by default LOG_USER. The origin of every
+ * message can be reliably determined that way.
+ *
+ * The human readable log message directly follows the message header. The
+ * length of the message text is stored in the header, the stored message
+ * is not terminated.
+ *
+ * Optionally, a message can carry a dictionary of properties (key/value pairs),
+ * to provide userspace with a machine-readable message context.
+ *
+ * Examples for well-defined, commonly used property names are:
+ * DEVICE=b12:8 device identifier
+ * b12:8 block dev_t
+ * c127:3 char dev_t
+ * n8 netdev ifindex
+ * +sound:card0 subsystem:devname
+ * SUBSYSTEM=pci driver-core subsystem name
+ *
+ * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
+ * follows directly after a '=' character. Every property is terminated by
+ * a '\0' character. The last property is not terminated.
+ *
+ * Example of a message structure:
+ * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
+ * 0008 34 00 record is 52 bytes long
+ * 000a 0b 00 text is 11 bytes long
+ * 000c 1f 00 dictionary is 23 bytes long
+ * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
+ * 0010 69 74 27 73 20 61 20 6c "it's a l"
+ * 69 6e 65 "ine"
+ * 001b 44 45 56 49 43 "DEVIC"
+ * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
+ * 52 49 56 45 52 3d 62 75 "RIVER=bu"
+ * 67 "g"
+ * 0032 00 00 00 padding to next message header
+ *
+ * The 'struct printk_log' buffer header must never be directly exported to
+ * userspace, it is a kernel-private implementation detail that might
+ * need to be changed in the future, when the requirements change.
+ *
+ * /dev/kmsg exports the structured data in the following line format:
+ * "level,sequnum,timestamp;<message text>\n"
+ *
+ * The optional key/value pairs are attached as continuation lines starting
+ * with a space character and terminated by a newline. All possible
+ * non-prinatable characters are escaped in the "\xff" notation.
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
+ */
+
+enum log_flags {
+ LOG_NOCONS = 1, /* already flushed, do not print to console */
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_PREFIX = 4, /* text started with a prefix */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
+
+struct printk_log {
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 len; /* length of entire record */
+ u16 text_len; /* length of text buffer */
+ u16 dict_len; /* length of dictionary buffer */
+ u8 facility; /* syslog facility */
+ u8 flags:5; /* internal record flags */
+ u8 level:3; /* syslog level */
+};
+
+/*
+ * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
+ * within the scheduler's rq lock. It must be released before calling
+ * console_unlock() or anything else that might wake up a process.
+ */
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
+
+#ifdef CONFIG_PRINTK
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* the next printk record to read by syslog(READ) or /proc/kmsg */
+static u64 syslog_seq;
+static u32 syslog_idx;
+static enum log_flags syslog_prev;
+static size_t syslog_partial;
+
+/* index and sequence number of the first record stored in the buffer */
+static u64 log_first_seq;
+static u32 log_first_idx;
+
+/* index and sequence number of the next record to store in the buffer */
+static u64 log_next_seq;
+static u32 log_next_idx;
+
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags console_prev;
+
+/* the next printk record to read after the last 'clear' command */
+static u64 clear_seq;
+static u32 clear_idx;
+
+#define PREFIX_MAX 32
+#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+
+/* record buffer */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define LOG_ALIGN 4
+#else
+#define LOG_ALIGN __alignof__(struct printk_log)
+#endif
+#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
+static char *log_buf = __log_buf;
+static u32 log_buf_len = __LOG_BUF_LEN;
+
+#ifdef CONFIG_TOI_INCREMENTAL
+void toi_set_logbuf_untracked(void)
+{
+ int i;
+ struct page *log_buf_start_page = virt_to_page(__log_buf);
+
+ printk("Not protecting kernel printk log buffer (%p-%p).\n",
+ __log_buf, __log_buf + __LOG_BUF_LEN);
+
+ for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++)
+ SetPageTOI_Untracked(log_buf_start_page + i);
+}
+#endif
+
+/* Return log buffer address */
+char *log_buf_addr_get(void)
+{
+ return log_buf;
+}
+
+/* Return log buffer size */
+u32 log_buf_len_get(void)
+{
+ return log_buf_len;
+}
+
+/* human readable text of the record */
+static char *log_text(const struct printk_log *msg)
+{
+ return (char *)msg + sizeof(struct printk_log);
+}
+
+/* optional key/value pair dictionary attached to the record */
+static char *log_dict(const struct printk_log *msg)
+{
+ return (char *)msg + sizeof(struct printk_log) + msg->text_len;
+}
+
+/* get record by index; idx must point to valid msg */
+static struct printk_log *log_from_idx(u32 idx)
+{
+ struct printk_log *msg = (struct printk_log *)(log_buf + idx);
+
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer.
+ */
+ if (!msg->len)
+ return (struct printk_log *)log_buf;
+ return msg;
+}
+
+/* get next record; idx must point to valid msg */
+static u32 log_next(u32 idx)
+{
+ struct printk_log *msg = (struct printk_log *)(log_buf + idx);
+
+ /* length == 0 indicates the end of the buffer; wrap */
+ /*
+ * A length == 0 record is the end of buffer marker. Wrap around and
+ * read the message at the start of the buffer as *this* one, and
+ * return the one after that.
+ */
+ if (!msg->len) {
+ msg = (struct printk_log *)log_buf;
+ return msg->len;
+ }
+ return idx + msg->len;
+}
+
+/*
+ * Check whether there is enough free space for the given message.
+ *
+ * The same values of first_idx and next_idx mean that the buffer
+ * is either empty or full.
+ *
+ * If the buffer is empty, we must respect the position of the indexes.
+ * They cannot be reset to the beginning of the buffer.
+ */
+static int logbuf_has_space(u32 msg_size, bool empty)
+{
+ u32 free;
+
+ if (log_next_idx > log_first_idx || empty)
+ free = max(log_buf_len - log_next_idx, log_first_idx);
+ else
+ free = log_first_idx - log_next_idx;
+
+ /*
+ * We need space also for an empty header that signalizes wrapping
+ * of the buffer.
+ */
+ return free >= msg_size + sizeof(struct printk_log);
+}
+
+static int log_make_free_space(u32 msg_size)
+{
+ while (log_first_seq < log_next_seq) {
+ if (logbuf_has_space(msg_size, false))
+ return 0;
+ /* drop old messages until we have enough contiguous space */
+ log_first_idx = log_next(log_first_idx);
+ log_first_seq++;
+ }
+
+ /* sequence numbers are equal, so the log buffer is empty */
+ if (logbuf_has_space(msg_size, true))
+ return 0;
+
+ return -ENOMEM;
+}
+
+/* compute the message size including the padding bytes */
+static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
+{
+ u32 size;
+
+ size = sizeof(struct printk_log) + text_len + dict_len;
+ *pad_len = (-size) & (LOG_ALIGN - 1);
+ size += *pad_len;
+
+ return size;
+}
+
+/*
+ * Define how much of the log buffer we could take at maximum. The value
+ * must be greater than two. Note that only half of the buffer is available
+ * when the index points to the middle.
+ */
+#define MAX_LOG_TAKE_PART 4
+static const char trunc_msg[] = "<truncated>";
+
+static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
+ u16 *dict_len, u32 *pad_len)
+{
+ /*
+ * The message should not take the whole buffer. Otherwise, it might
+ * get removed too soon.
+ */
+ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+ if (*text_len > max_text_len)
+ *text_len = max_text_len;
+ /* enable the warning message */
+ *trunc_msg_len = strlen(trunc_msg);
+ /* disable the "dict" completely */
+ *dict_len = 0;
+ /* compute the size again, count also the warning message */
+ return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
+}
+
+/* insert record into the buffer, discard old ones, update heads */
+static int log_store(int facility, int level,
+ enum log_flags flags, u64 ts_nsec,
+ const char *dict, u16 dict_len,
+ const char *text, u16 text_len)
+{
+ struct printk_log *msg;
+ u32 size, pad_len;
+ u16 trunc_msg_len = 0;
+
+ /* number of '\0' padding bytes to next message */
+ size = msg_used_size(text_len, dict_len, &pad_len);
+
+ if (log_make_free_space(size)) {
+ /* truncate the message if it is too long for empty buffer */
+ size = truncate_msg(&text_len, &trunc_msg_len,
+ &dict_len, &pad_len);
+ /* survive when the log buffer is too small for trunc_msg */
+ if (log_make_free_space(size))
+ return 0;
+ }
+
+ if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
+ /*
+ * This message + an additional empty header does not fit
+ * at the end of the buffer. Add an empty header with len == 0
+ * to signify a wrap around.
+ */
+ memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
+ log_next_idx = 0;
+ }
+
+ /* fill message */
+ msg = (struct printk_log *)(log_buf + log_next_idx);
+ memcpy(log_text(msg), text, text_len);
+ msg->text_len = text_len;
+ if (trunc_msg_len) {
+ memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
+ msg->text_len += trunc_msg_len;
+ }
+ memcpy(log_dict(msg), dict, dict_len);
+ msg->dict_len = dict_len;
+ msg->facility = facility;
+ msg->level = level & 7;
+ msg->flags = flags & 0x1f;
+ if (ts_nsec > 0)
+ msg->ts_nsec = ts_nsec;
+ else
+ msg->ts_nsec = local_clock();
+ memset(log_dict(msg) + dict_len, 0, pad_len);
+ msg->len = size;
+
+ /* insert message */
+ log_next_idx += msg->len;
+ log_next_seq++;
+
+ return msg->text_len;
+}
+
+int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
+
+static int syslog_action_restricted(int type)
+{
+ if (dmesg_restrict)
+ return 1;
+ /*
+ * Unless restricted, we allow "read all" and "get buffer size"
+ * for everybody.
+ */
+ return type != SYSLOG_ACTION_READ_ALL &&
+ type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+int check_syslog_permissions(int type, bool from_file)
+{
+ /*
+ * If this is from /proc/kmsg and we've already opened it, then we've
+ * already done the capabilities checks at open time.
+ */
+ if (from_file && type != SYSLOG_ACTION_OPEN)
+ goto ok;
+
+ if (syslog_action_restricted(type)) {
+ if (capable(CAP_SYSLOG))
+ goto ok;
+ /*
+ * For historical reasons, accept CAP_SYS_ADMIN too, with
+ * a warning.
+ */
+ if (capable(CAP_SYS_ADMIN)) {
+ pr_warn_once("%s (%d): Attempt to access syslog with "
+ "CAP_SYS_ADMIN but no CAP_SYSLOG "
+ "(deprecated).\n",
+ current->comm, task_pid_nr(current));
+ goto ok;
+ }
+ return -EPERM;
+ }
+ok:
+ return security_syslog(type);
+}
+
+
+/* /dev/kmsg - userspace message inject/listen interface */
+struct devkmsg_user {
+ u64 seq;
+ u32 idx;
+ enum log_flags prev;
+ struct mutex lock;
+ char buf[8192];
+};
+
+static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ char *buf, *line;
+ int i;
+ int level = default_message_loglevel;
+ int facility = 1; /* LOG_USER */
+ size_t len = iov_iter_count(from);
+ ssize_t ret = len;
+
+ if (len > LOG_LINE_MAX)
+ return -EINVAL;
+ buf = kmalloc(len+1, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ buf[len] = '\0';
+ if (copy_from_iter(buf, len, from) != len) {
+ kfree(buf);
+ return -EFAULT;
+ }
+
+ /*
+ * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
+ * the decimal value represents 32bit, the lower 3 bit are the log
+ * level, the rest are the log facility.
+ *
+ * If no prefix or no userspace facility is specified, we
+ * enforce LOG_USER, to be able to reliably distinguish
+ * kernel-generated messages from userspace-injected ones.
+ */
+ line = buf;
+ if (line[0] == '<') {
+ char *endp = NULL;
+
+ i = simple_strtoul(line+1, &endp, 10);
+ if (endp && endp[0] == '>') {
+ level = i & 7;
+ if (i >> 3)
+ facility = i >> 3;
+ endp++;
+ len -= endp - line;
+ line = endp;
+ }
+ }
+
+ printk_emit(facility, level, NULL, 0, "%s", line);
+ kfree(buf);
+ return ret;
+}
+
+static ssize_t devkmsg_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct devkmsg_user *user = file->private_data;
+ struct printk_log *msg;
+ u64 ts_usec;
+ size_t i;
+ char cont = '-';
+ size_t len;
+ ssize_t ret;
+
+ if (!user)
+ return -EBADF;
+
+ ret = mutex_lock_interruptible(&user->lock);
+ if (ret)
+ return ret;
+ raw_spin_lock_irq(&logbuf_lock);
+ while (user->seq == log_next_seq) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ raw_spin_unlock_irq(&logbuf_lock);
+ goto out;
+ }
+
+ raw_spin_unlock_irq(&logbuf_lock);
+ ret = wait_event_interruptible(log_wait,
+ user->seq != log_next_seq);
+ if (ret)
+ goto out;
+ raw_spin_lock_irq(&logbuf_lock);
+ }
+
+ if (user->seq < log_first_seq) {
+ /* our last seen message is gone, return error and reset */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ ret = -EPIPE;
+ raw_spin_unlock_irq(&logbuf_lock);
+ goto out;
+ }
+
+ msg = log_from_idx(user->idx);
+ ts_usec = msg->ts_nsec;
+ do_div(ts_usec, 1000);
+
+ /*
+ * If we couldn't merge continuation line fragments during the print,
+ * export the stored flags to allow an optional external merge of the
+ * records. Merging the records isn't always neccessarily correct, like
+ * when we hit a race during printing. In most cases though, it produces
+ * better readable output. 'c' in the record flags mark the first
+ * fragment of a line, '+' the following.
+ */
+ if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
+ cont = 'c';
+ else if ((msg->flags & LOG_CONT) ||
+ ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+ cont = '+';
+
+ len = sprintf(user->buf, "%u,%llu,%llu,%c;",
+ (msg->facility << 3) | msg->level,
+ user->seq, ts_usec, cont);
+ user->prev = msg->flags;
+
+ /* escape non-printable characters */
+ for (i = 0; i < msg->text_len; i++) {
+ unsigned char c = log_text(msg)[i];
+
+ if (c < ' ' || c >= 127 || c == '\\')
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ else
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+
+ if (msg->dict_len) {
+ bool line = true;
+
+ for (i = 0; i < msg->dict_len; i++) {
+ unsigned char c = log_dict(msg)[i];
+
+ if (line) {
+ user->buf[len++] = ' ';
+ line = false;
+ }
+
+ if (c == '\0') {
+ user->buf[len++] = '\n';
+ line = true;
+ continue;
+ }
+
+ if (c < ' ' || c >= 127 || c == '\\') {
+ len += sprintf(user->buf + len, "\\x%02x", c);
+ continue;
+ }
+
+ user->buf[len++] = c;
+ }
+ user->buf[len++] = '\n';
+ }
+
+ user->idx = log_next(user->idx);
+ user->seq++;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ if (len > count) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(buf, user->buf, len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = len;
+out:
+ mutex_unlock(&user->lock);
+ return ret;
+}
+
+static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct devkmsg_user *user = file->private_data;
+ loff_t ret = 0;
+
+ if (!user)
+ return -EBADF;
+ if (offset)
+ return -ESPIPE;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ switch (whence) {
+ case SEEK_SET:
+ /* the first record */
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ break;
+ case SEEK_DATA:
+ /*
+ * The first record after the last SYSLOG_ACTION_CLEAR,
+ * like issued by 'dmesg -c'. Reading /dev/kmsg itself
+ * changes no global state, and does not clear anything.
+ */
+ user->idx = clear_idx;
+ user->seq = clear_seq;
+ break;
+ case SEEK_END:
+ /* after the last record */
+ user->idx = log_next_idx;
+ user->seq = log_next_seq;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+ return ret;
+}
+
+static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+{
+ struct devkmsg_user *user = file->private_data;
+ int ret = 0;
+
+ if (!user)
+ return POLLERR|POLLNVAL;
+
+ poll_wait(file, &log_wait, wait);
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (user->seq < log_next_seq) {
+ /* return error when data has vanished underneath us */
+ if (user->seq < log_first_seq)
+ ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+ else
+ ret = POLLIN|POLLRDNORM;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ return ret;
+}
+
+static int devkmsg_open(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user;
+ int err;
+
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ return 0;
+
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+
+ user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ if (!user)
+ return -ENOMEM;
+
+ mutex_init(&user->lock);
+
+ raw_spin_lock_irq(&logbuf_lock);
+ user->idx = log_first_idx;
+ user->seq = log_first_seq;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ file->private_data = user;
+ return 0;
+}
+
+static int devkmsg_release(struct inode *inode, struct file *file)
+{
+ struct devkmsg_user *user = file->private_data;
+
+ if (!user)
+ return 0;
+
+ mutex_destroy(&user->lock);
+ kfree(user);
+ return 0;
+}
+
+const struct file_operations kmsg_fops = {
+ .open = devkmsg_open,
+ .read = devkmsg_read,
+ .write_iter = devkmsg_write,
+ .llseek = devkmsg_llseek,
+ .poll = devkmsg_poll,
+ .release = devkmsg_release,
+};
+
+#ifdef CONFIG_KEXEC
+/*
+ * This appends the listed symbols to /proc/vmcore
+ *
+ * /proc/vmcore is used by various utilities, like crash and makedumpfile to
+ * obtain access to symbols that are otherwise very difficult to locate. These
+ * symbols are specifically used so that utilities can access and extract the
+ * dmesg log from a vmcore file after a crash.
+ */
+void log_buf_kexec_setup(void)
+{
+ VMCOREINFO_SYMBOL(log_buf);
+ VMCOREINFO_SYMBOL(log_buf_len);
+ VMCOREINFO_SYMBOL(log_first_idx);
+ VMCOREINFO_SYMBOL(log_next_idx);
+ /*
+ * Export struct printk_log size and field offsets. User space tools can
+ * parse it and detect any changes to structure down the line.
+ */
+ VMCOREINFO_STRUCT_SIZE(printk_log);
+ VMCOREINFO_OFFSET(printk_log, ts_nsec);
+ VMCOREINFO_OFFSET(printk_log, len);
+ VMCOREINFO_OFFSET(printk_log, text_len);
+ VMCOREINFO_OFFSET(printk_log, dict_len);
+}
+#endif
+
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/* we practice scaling the ring buffer by powers of 2 */
+static void __init log_buf_len_update(unsigned size)
+{
+ if (size)
+ size = roundup_pow_of_two(size);
+ if (size > log_buf_len)
+ new_log_buf_len = size;
+}
+
+/* save requested log_buf_len since it's too early to process it */
+static int __init log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+
+ log_buf_len_update(size);
+
+ return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
+
+#ifdef CONFIG_SMP
+#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
+
+static void __init log_buf_add_cpu(void)
+{
+ unsigned int cpu_extra;
+
+ /*
+ * archs should set up cpu_possible_bits properly with
+ * set_cpu_possible() after setup_arch() but just in
+ * case lets ensure this is valid.
+ */
+ if (num_possible_cpus() == 1)
+ return;
+
+ cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
+
+ /* by default this will only continue through for large > 64 CPUs */
+ if (cpu_extra <= __LOG_BUF_LEN / 2)
+ return;
+
+ pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
+ __LOG_CPU_MAX_BUF_LEN);
+ pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
+ cpu_extra);
+ pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
+
+ log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
+}
+#else /* !CONFIG_SMP */
+static inline void log_buf_add_cpu(void) {}
+#endif /* CONFIG_SMP */
+
+void __init setup_log_buf(int early)
+{
+ unsigned long flags;
+ char *new_log_buf;
+ int free;
+
+ if (log_buf != __log_buf)
+ return;
+
+ if (!early && !new_log_buf_len)
+ log_buf_add_cpu();
+
+ if (!new_log_buf_len)
+ return;
+
+ if (early) {
+ new_log_buf =
+ memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
+ } else {
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
+ LOG_ALIGN);
+ }
+
+ if (unlikely(!new_log_buf)) {
+ pr_err("log_buf_len: %ld bytes not available\n",
+ new_log_buf_len);
+ return;
+ }
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ log_buf_len = new_log_buf_len;
+ log_buf = new_log_buf;
+ new_log_buf_len = 0;
+ free = __LOG_BUF_LEN - log_next_idx;
+ memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ pr_info("log_buf_len: %d bytes\n", log_buf_len);
+ pr_info("early log buf free: %d(%d%%)\n",
+ free, (free * 100) / __LOG_BUF_LEN);
+}
+
+static bool __read_mostly ignore_loglevel;
+
+static int __init ignore_loglevel_setup(char *str)
+{
+ ignore_loglevel = true;
+ pr_info("debug: ignoring loglevel setting.\n");
+
+ return 0;
+}
+
+early_param("ignore_loglevel", ignore_loglevel_setup);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ignore_loglevel,
+ "ignore loglevel setting (prints all kernel messages to the console)");
+
+#ifdef CONFIG_BOOT_PRINTK_DELAY
+
+static int boot_delay; /* msecs delay after each printk during bootup */
+static unsigned long long loops_per_msec; /* based on boot_delay */
+
+static int __init boot_delay_setup(char *str)
+{
+ unsigned long lpj;
+
+ lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
+ loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
+
+ get_option(&str, &boot_delay);
+ if (boot_delay > 10 * 1000)
+ boot_delay = 0;
+
+ pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
+ "HZ: %d, loops_per_msec: %llu\n",
+ boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
+ return 0;
+}
+early_param("boot_delay", boot_delay_setup);
+
+static void boot_delay_msec(int level)
+{
+ unsigned long long k;
+ unsigned long timeout;
+
+ if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ || (level >= console_loglevel && !ignore_loglevel)) {
+ return;
+ }
+
+ k = (unsigned long long)loops_per_msec * boot_delay;
+
+ timeout = jiffies + msecs_to_jiffies(boot_delay);
+ while (k) {
+ k--;
+ cpu_relax();
+ /*
+ * use (volatile) jiffies to prevent
+ * compiler reduction; loop termination via jiffies
+ * is secondary and may or may not happen.
+ */
+ if (time_after(jiffies, timeout))
+ break;
+ touch_nmi_watchdog();
+ }
+}
+#else
+static inline void boot_delay_msec(int level)
+{
+}
+#endif
+
+static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
+module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+
+static size_t print_time(u64 ts, char *buf)
+{
+ unsigned long rem_nsec;
+
+ if (!printk_time)
+ return 0;
+
+ rem_nsec = do_div(ts, 1000000000);
+
+ if (!buf)
+ return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
+
+ return sprintf(buf, "[%5lu.%06lu] ",
+ (unsigned long)ts, rem_nsec / 1000);
+}
+
+static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
+{
+ size_t len = 0;
+ unsigned int prefix = (msg->facility << 3) | msg->level;
+
+ if (syslog) {
+ if (buf) {
+ len += sprintf(buf, "<%u>", prefix);
+ } else {
+ len += 3;
+ if (prefix > 999)
+ len += 3;
+ else if (prefix > 99)
+ len += 2;
+ else if (prefix > 9)
+ len++;
+ }
+ }
+
+ len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
+ return len;
+}
+
+static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+ bool syslog, char *buf, size_t size)
+{
+ const char *text = log_text(msg);
+ size_t text_size = msg->text_len;
+ bool prefix = true;
+ bool newline = true;
+ size_t len = 0;
+
+ if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
+ prefix = false;
+
+ if (msg->flags & LOG_CONT) {
+ if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
+ prefix = false;
+
+ if (!(msg->flags & LOG_NEWLINE))
+ newline = false;
+ }
+
+ do {
+ const char *next = memchr(text, '\n', text_size);
+ size_t text_len;
+
+ if (next) {
+ text_len = next - text;
+ next++;
+ text_size -= next - text;
+ } else {
+ text_len = text_size;
+ }
+
+ if (buf) {
+ if (print_prefix(msg, syslog, NULL) +
+ text_len + 1 >= size - len)
+ break;
+
+ if (prefix)
+ len += print_prefix(msg, syslog, buf + len);
+ memcpy(buf + len, text, text_len);
+ len += text_len;
+ if (next || newline)
+ buf[len++] = '\n';
+ } else {
+ /* SYSLOG_ACTION_* buffer size only calculation */
+ if (prefix)
+ len += print_prefix(msg, syslog, NULL);
+ len += text_len;
+ if (next || newline)
+ len++;
+ }
+
+ prefix = true;
+ text = next;
+ } while (text);
+
+ return len;
+}
+
+static int syslog_print(char __user *buf, int size)
+{
+ char *text;
+ struct printk_log *msg;
+ int len = 0;
+
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ while (size > 0) {
+ size_t n;
+ size_t skip;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ syslog_prev = 0;
+ syslog_partial = 0;
+ }
+ if (syslog_seq == log_next_seq) {
+ raw_spin_unlock_irq(&logbuf_lock);
+ break;
+ }
+
+ skip = syslog_partial;
+ msg = log_from_idx(syslog_idx);
+ n = msg_print_text(msg, syslog_prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
+ if (n - syslog_partial <= size) {
+ /* message fits into buffer, move forward */
+ syslog_idx = log_next(syslog_idx);
+ syslog_seq++;
+ syslog_prev = msg->flags;
+ n -= syslog_partial;
+ syslog_partial = 0;
+ } else if (!len){
+ /* partial read(), remember position */
+ n = size;
+ syslog_partial += n;
+ } else
+ n = 0;
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ if (!n)
+ break;
+
+ if (copy_to_user(buf, text + skip, n)) {
+ if (!len)
+ len = -EFAULT;
+ break;
+ }
+
+ len += n;
+ size -= n;
+ buf += n;
+ }
+
+ kfree(text);
+ return len;
+}
+
+static int syslog_print_all(char __user *buf, int size, bool clear)
+{
+ char *text;
+ int len = 0;
+
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ if (!text)
+ return -ENOMEM;
+
+ raw_spin_lock_irq(&logbuf_lock);
+ if (buf) {
+ u64 next_seq;
+ u64 seq;
+ u32 idx;
+ enum log_flags prev;
+
+ if (clear_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ }
+
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump.
+ */
+ seq = clear_seq;
+ idx = clear_idx;
+ prev = 0;
+ while (seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ len += msg_print_text(msg, prev, true, NULL, 0);
+ prev = msg->flags;
+ idx = log_next(idx);
+ seq++;
+ }
+
+ /* move first record forward until length fits into the buffer */
+ seq = clear_seq;
+ idx = clear_idx;
+ prev = 0;
+ while (len > size && seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ len -= msg_print_text(msg, prev, true, NULL, 0);
+ prev = msg->flags;
+ idx = log_next(idx);
+ seq++;
+ }
+
+ /* last message fitting into this dump */
+ next_seq = log_next_seq;
+
+ len = 0;
+ while (len >= 0 && seq < next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+ int textlen;
+
+ textlen = msg_print_text(msg, prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
+ if (textlen < 0) {
+ len = textlen;
+ break;
+ }
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+
+ raw_spin_unlock_irq(&logbuf_lock);
+ if (copy_to_user(buf + len, text, textlen))
+ len = -EFAULT;
+ else
+ len += textlen;
+ raw_spin_lock_irq(&logbuf_lock);
+
+ if (seq < log_first_seq) {
+ /* messages are gone, move to next one */
+ seq = log_first_seq;
+ idx = log_first_idx;
+ prev = 0;
+ }
+ }
+ }
+
+ if (clear) {
+ clear_seq = log_next_seq;
+ clear_idx = log_next_idx;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ kfree(text);
+ return len;
+}
+
+int do_syslog(int type, char __user *buf, int len, bool from_file)
+{
+ bool clear = false;
+ static int saved_console_loglevel = LOGLEVEL_DEFAULT;
+ int error;
+
+ error = check_syslog_permissions(type, from_file);
+ if (error)
+ goto out;
+
+ switch (type) {
+ case SYSLOG_ACTION_CLOSE: /* Close log */
+ break;
+ case SYSLOG_ACTION_OPEN: /* Open log */
+ break;
+ case SYSLOG_ACTION_READ: /* Read from log */
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ if (!access_ok(VERIFY_WRITE, buf, len)) {
+ error = -EFAULT;
+ goto out;
+ }
+ error = wait_event_interruptible(log_wait,
+ syslog_seq != log_next_seq);
+ if (error)
+ goto out;
+ error = syslog_print(buf, len);
+ break;
+ /* Read/clear last kernel messages */
+ case SYSLOG_ACTION_READ_CLEAR:
+ clear = true;
+ /* FALL THRU */
+ /* Read last kernel messages */
+ case SYSLOG_ACTION_READ_ALL:
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ if (!access_ok(VERIFY_WRITE, buf, len)) {
+ error = -EFAULT;
+ goto out;
+ }
+ error = syslog_print_all(buf, len, clear);
+ break;
+ /* Clear ring buffer */
+ case SYSLOG_ACTION_CLEAR:
+ syslog_print_all(NULL, 0, true);
+ break;
+ /* Disable logging to console */
+ case SYSLOG_ACTION_CONSOLE_OFF:
+ if (saved_console_loglevel == LOGLEVEL_DEFAULT)
+ saved_console_loglevel = console_loglevel;
+ console_loglevel = minimum_console_loglevel;
+ break;
+ /* Enable logging to console */
+ case SYSLOG_ACTION_CONSOLE_ON:
+ if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
+ console_loglevel = saved_console_loglevel;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
+ }
+ break;
+ /* Set level of messages printed to console */
+ case SYSLOG_ACTION_CONSOLE_LEVEL:
+ error = -EINVAL;
+ if (len < 1 || len > 8)
+ goto out;
+ if (len < minimum_console_loglevel)
+ len = minimum_console_loglevel;
+ console_loglevel = len;
+ /* Implicitly re-enable logging to console */
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
+ error = 0;
+ break;
+ /* Number of chars in the log buffer */
+ case SYSLOG_ACTION_SIZE_UNREAD:
+ raw_spin_lock_irq(&logbuf_lock);
+ if (syslog_seq < log_first_seq) {
+ /* messages are gone, move to first one */
+ syslog_seq = log_first_seq;
+ syslog_idx = log_first_idx;
+ syslog_prev = 0;
+ syslog_partial = 0;
+ }
+ if (from_file) {
+ /*
+ * Short-cut for poll(/"proc/kmsg") which simply checks
+ * for pending data, not the size; return the count of
+ * records, not the length.
+ */
+ error = log_next_seq - syslog_seq;
+ } else {
+ u64 seq = syslog_seq;
+ u32 idx = syslog_idx;
+ enum log_flags prev = syslog_prev;
+
+ error = 0;
+ while (seq < log_next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ error += msg_print_text(msg, prev, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+ error -= syslog_partial;
+ }
+ raw_spin_unlock_irq(&logbuf_lock);
+ break;
+ /* Size of the log buffer */
+ case SYSLOG_ACTION_SIZE_BUFFER:
+ error = log_buf_len;
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
+{
+ return do_syslog(type, buf, len, SYSLOG_FROM_READER);
+}
+
+/*
+ * Call the console drivers, asking them to write out
+ * log_buf[start] to log_buf[end - 1].
+ * The console_lock must be held.
+ */
+static void call_console_drivers(int level, const char *text, size_t len)
+{
+ struct console *con;
+
+ trace_console(text, len);
+
+ if (level >= console_loglevel && !ignore_loglevel)
+ return;
+ if (!console_drivers)
+ return;
+
+ for_each_console(con) {
+ if (exclusive_console && con != exclusive_console)
+ continue;
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (!con->write)
+ continue;
+ if (!cpu_online(smp_processor_id()) &&
+ !(con->flags & CON_ANYTIME))
+ continue;
+ con->write(con, text, len, level);
+ }
+}
+
+/*
+ * Zap console related locks when oopsing.
+ * To leave time for slow consoles to print a full oops,
+ * only zap at most once every 30 seconds.
+ */
+static void zap_locks(void)
+{
+ static unsigned long oops_timestamp;
+
+ if (time_after_eq(jiffies, oops_timestamp) &&
+ !time_after(jiffies, oops_timestamp + 30 * HZ))
+ return;
+
+ oops_timestamp = jiffies;
+
+ debug_locks_off();
+ /* If a crash is occurring, make sure we can't deadlock */
+ raw_spin_lock_init(&logbuf_lock);
+ /* And make sure that we print immediately */
+ sema_init(&console_sem, 1);
+}
+
+/*
+ * Check if we have any console that is capable of printing while cpu is
+ * booting or shutting down. Requires console_sem.
+ */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for_each_console(con)
+ if (con->flags & CON_ANYTIME)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
+ */
+static inline int can_use_console(unsigned int cpu)
+{
+ return cpu_online(cpu) || have_callable_console();
+}
+
+/*
+ * Try to get console ownership to actually show the kernel
+ * messages from a 'printk'. Return true (and with the
+ * console_lock held, and 'console_locked' set) if it
+ * is successful, false otherwise.
+ */
+static int console_trylock_for_printk(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ if (!console_trylock())
+ return 0;
+ /*
+ * If we can't use the console, we need to release the console
+ * semaphore by hand to avoid flushing the buffer. We need to hold the
+ * console semaphore in order to do this test safely.
+ */
+ if (!can_use_console(cpu)) {
+ console_locked = 0;
+ up_console_sem();
+ return 0;
+ }
+ return 1;
+}
+
+int printk_delay_msec __read_mostly;
+
+static inline void printk_delay(void)
+{
+ if (unlikely(printk_delay_msec)) {
+ int m = printk_delay_msec;
+
+ while (m--) {
+ mdelay(1);
+ touch_nmi_watchdog();
+ }
+ }
+}
+
+/*
+ * Continuation lines are buffered, and not committed to the record buffer
+ * until the line is complete, or a race forces it. The line fragments
+ * though, are printed immediately to the consoles to ensure everything has
+ * reached the console in case of a kernel crash.
+ */
+static struct cont {
+ char buf[LOG_LINE_MAX];
+ size_t len; /* length == 0 means unused buffer */
+ size_t cons; /* bytes written to console */
+ struct task_struct *owner; /* task of first print*/
+ u64 ts_nsec; /* time of first print */
+ u8 level; /* log level of first message */
+ u8 facility; /* log facility of first message */
+ enum log_flags flags; /* prefix, newline flags */
+ bool flushed:1; /* buffer sealed and committed */
+} cont;
+
+static void cont_flush(enum log_flags flags)
+{
+ if (cont.flushed)
+ return;
+ if (cont.len == 0)
+ return;
+
+ if (cont.cons) {
+ /*
+ * If a fragment of this line was directly flushed to the
+ * console; wait for the console to pick up the rest of the
+ * line. LOG_NOCONS suppresses a duplicated output.
+ */
+ log_store(cont.facility, cont.level, flags | LOG_NOCONS,
+ cont.ts_nsec, NULL, 0, cont.buf, cont.len);
+ cont.flags = flags;
+ cont.flushed = true;
+ } else {
+ /*
+ * If no fragment of this line ever reached the console,
+ * just submit it to the store and free the buffer.
+ */
+ log_store(cont.facility, cont.level, flags, 0,
+ NULL, 0, cont.buf, cont.len);
+ cont.len = 0;
+ }
+}
+
+static bool cont_add(int facility, int level, const char *text, size_t len)
+{
+ if (cont.len && cont.flushed)
+ return false;
+
+ if (cont.len + len > sizeof(cont.buf)) {
+ /* the line gets too long, split it up in separate records */
+ cont_flush(LOG_CONT);
+ return false;
+ }
+
+ if (!cont.len) {
+ cont.facility = facility;
+ cont.level = level;
+ cont.owner = current;
+ cont.ts_nsec = local_clock();
+ cont.flags = 0;
+ cont.cons = 0;
+ cont.flushed = false;
+ }
+
+ memcpy(cont.buf + cont.len, text, len);
+ cont.len += len;
+
+ if (cont.len > (sizeof(cont.buf) * 80) / 100)
+ cont_flush(LOG_CONT);
+
+ return true;
+}
+
+static size_t cont_print_text(char *text, size_t size)
+{
+ size_t textlen = 0;
+ size_t len;
+
+ if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
+ textlen += print_time(cont.ts_nsec, text);
+ size -= textlen;
+ }
+
+ len = cont.len - cont.cons;
+ if (len > 0) {
+ if (len+1 > size)
+ len = size-1;
+ memcpy(text + textlen, cont.buf + cont.cons, len);
+ textlen += len;
+ cont.cons = cont.len;
+ }
+
+ if (cont.flushed) {
+ if (cont.flags & LOG_NEWLINE)
+ text[textlen++] = '\n';
+ /* got everything, release buffer */
+ cont.len = 0;
+ }
+ return textlen;
+}
+
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
+{
+ static int recursion_bug;
+ static char textbuf[LOG_LINE_MAX];
+ char *text = textbuf;
+ size_t text_len = 0;
+ enum log_flags lflags = 0;
+ unsigned long flags;
+ int this_cpu;
+ int printed_len = 0;
+ bool in_sched = false;
+ /* cpu currently holding logbuf_lock in this function */
+ static unsigned int logbuf_cpu = UINT_MAX;
+
+ if (level == LOGLEVEL_SCHED) {
+ level = LOGLEVEL_DEFAULT;
+ in_sched = true;
+ }
+
+ boot_delay_msec(level);
+ printk_delay();
+
+ /* This stops the holder of console_sem just where we want him */
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+
+ /*
+ * Ouch, printk recursed into itself!
+ */
+ if (unlikely(logbuf_cpu == this_cpu)) {
+ /*
+ * If a crash is occurring during printk() on this CPU,
+ * then try to get the crash message out but make sure
+ * we can't deadlock. Otherwise just return to avoid the
+ * recursion and return - but flag the recursion so that
+ * it can be printed at the next appropriate moment:
+ */
+ if (!oops_in_progress && !lockdep_recursing(current)) {
+ recursion_bug = 1;
+ local_irq_restore(flags);
+ return 0;
+ }
+ zap_locks();
+ }
+
+ lockdep_off();
+ raw_spin_lock(&logbuf_lock);
+ logbuf_cpu = this_cpu;
+
+ if (unlikely(recursion_bug)) {
+ static const char recursion_msg[] =
+ "BUG: recent printk recursion!";
+
+ recursion_bug = 0;
+ /* emit KERN_CRIT message */
+ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+ NULL, 0, recursion_msg,
+ strlen(recursion_msg));
+ }
+
+ /*
+ * The printf needs to come first; we need the syslog
+ * prefix which might be passed-in as a parameter.
+ */
+ text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+
+ /* mark and strip a trailing newline */
+ if (text_len && text[text_len-1] == '\n') {
+ text_len--;
+ lflags |= LOG_NEWLINE;
+ }
+
+ /* strip kernel syslog prefix and extract log level or control flags */
+ if (facility == 0) {
+ int kern_level = printk_get_level(text);
+
+ if (kern_level) {
+ const char *end_of_header = printk_skip_level(text);
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level == LOGLEVEL_DEFAULT)
+ level = kern_level - '0';
+ /* fallthrough */
+ case 'd': /* KERN_DEFAULT */
+ lflags |= LOG_PREFIX;
+ }
+ /*
+ * No need to check length here because vscnprintf
+ * put '\0' at the end of the string. Only valid and
+ * newly printed level is detected.
+ */
+ text_len -= end_of_header - text;
+ text = (char *)end_of_header;
+ }
+ }
+
+ if (level == LOGLEVEL_DEFAULT)
+ level = default_message_loglevel;
+
+ if (dict)
+ lflags |= LOG_PREFIX|LOG_NEWLINE;
+
+ if (!(lflags & LOG_NEWLINE)) {
+ /*
+ * Flush the conflicting buffer. An earlier newline was missing,
+ * or another task also prints continuation lines.
+ */
+ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
+ cont_flush(LOG_NEWLINE);
+
+ /* buffer line if possible, otherwise store it right away */
+ if (cont_add(facility, level, text, text_len))
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level,
+ lflags | LOG_CONT, 0,
+ dict, dictlen, text, text_len);
+ } else {
+ bool stored = false;
+
+ /*
+ * If an earlier newline was missing and it was the same task,
+ * either merge it with the current buffer and flush, or if
+ * there was a race with interrupts (prefix == true) then just
+ * flush it out and store this line separately.
+ * If the preceding printk was from a different task and missed
+ * a newline, flush and append the newline.
+ */
+ if (cont.len) {
+ if (cont.owner == current && !(lflags & LOG_PREFIX))
+ stored = cont_add(facility, level, text,
+ text_len);
+ cont_flush(LOG_NEWLINE);
+ }
+
+ if (stored)
+ printed_len += text_len;
+ else
+ printed_len += log_store(facility, level, lflags, 0,
+ dict, dictlen, text, text_len);
+ }
+
+ logbuf_cpu = UINT_MAX;
+ raw_spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
+
+ /* If called from the scheduler, we can not call up(). */
+ if (!in_sched) {
+ lockdep_off();
+ /*
+ * Disable preemption to avoid being preempted while holding
+ * console_sem which would prevent anyone from printing to
+ * console
+ */
+ preempt_disable();
+
+ /*
+ * Try to acquire and then immediately release the console
+ * semaphore. The release will print out buffers and wake up
+ * /dev/kmsg and syslog() users.
+ */
+ if (console_trylock_for_printk())
+ console_unlock();
+ preempt_enable();
+ lockdep_on();
+ }
+
+ return printed_len;
+}
+EXPORT_SYMBOL(vprintk_emit);
+
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+}
+EXPORT_SYMBOL(vprintk);
+
+asmlinkage int printk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
+ va_end(args);
+
+ return r;
+}
+EXPORT_SYMBOL(printk_emit);
+
+int vprintk_default(const char *fmt, va_list args)
+{
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+ return r;
+ }
+#endif
+ r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(vprintk_default);
+
+/*
+ * This allows printk to be diverted to another function per cpu.
+ * This is useful for calling printk functions from within NMI
+ * without worrying about race conditions that can lock up the
+ * box.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
+/**
+ * printk - print a kernel message
+ * @fmt: format string
+ *
+ * This is printk(). It can be called from any context. We want it to work.
+ *
+ * We try to grab the console_lock. If we succeed, it's easy - we log the
+ * output and call the console drivers. If we fail to get the semaphore, we
+ * place the output into the log buffer and return. The current holder of
+ * the console_sem will notice the new output in console_unlock(); and will
+ * send it to the consoles before releasing the lock.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ *
+ * See also:
+ * printf(3)
+ *
+ * See the vsnprintf() documentation for format string extensions over C99.
+ */
+asmlinkage __visible int printk(const char *fmt, ...)
+{
+ printk_func_t vprintk_func;
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+
+ /*
+ * If a caller overrides the per_cpu printk_func, then it needs
+ * to disable preemption when calling printk(). Otherwise
+ * the printk_func should be set to the default. No need to
+ * disable preemption here.
+ */
+ vprintk_func = this_cpu_read(printk_func);
+ r = vprintk_func(fmt, args);
+
+ va_end(args);
+
+ return r;
+}
+EXPORT_SYMBOL(printk);
+
+#else /* CONFIG_PRINTK */
+
+#define LOG_LINE_MAX 0
+#define PREFIX_MAX 0
+
+static u64 syslog_seq;
+static u32 syslog_idx;
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags syslog_prev;
+static u64 log_first_seq;
+static u32 log_first_idx;
+static u64 log_next_seq;
+static enum log_flags console_prev;
+static struct cont {
+ size_t len;
+ size_t cons;
+ u8 level;
+ bool flushed:1;
+} cont;
+static struct printk_log *log_from_idx(u32 idx) { return NULL; }
+static u32 log_next(u32 idx) { return 0; }
+static void call_console_drivers(int level, const char *text, size_t len) {}
+static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+ bool syslog, char *buf, size_t size) { return 0; }
+static size_t cont_print_text(char *text, size_t size) { return 0; }
+
+/* Still needs to be defined for users */
+DEFINE_PER_CPU(printk_func_t, printk_func);
+
+#endif /* CONFIG_PRINTK */
+
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+asmlinkage __visible void early_printk(const char *fmt, ...)
+{
+ va_list ap;
+ char buf[512];
+ int n;
+
+ if (!early_console)
+ return;
+
+ va_start(ap, fmt);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ early_console->write(early_console, buf, n, 0);
+}
+#endif
+
+static int __add_preferred_console(char *name, int idx, char *options,
+ char *brl_options)
+{
+ struct console_cmdline *c;
+ int i;
+
+ /*
+ * See if this tty is not yet registered, and
+ * if we have a slot free.
+ */
+ for (i = 0, c = console_cmdline;
+ i < MAX_CMDLINECONSOLES && c->name[0];
+ i++, c++) {
+ if (strcmp(c->name, name) == 0 && c->index == idx) {
+ if (!brl_options)
+ selected_console = i;
+ return 0;
+ }
+ }
+ if (i == MAX_CMDLINECONSOLES)
+ return -E2BIG;
+ if (!brl_options)
+ selected_console = i;
+ strlcpy(c->name, name, sizeof(c->name));
+ c->options = options;
+ braille_set_options(c, brl_options);
+
+ c->index = idx;
+ return 0;
+}
+/*
+ * Set up a console. Called via do_early_param() in init/main.c
+ * for each "console=" parameter in the boot command line.
+ */
+static int __init console_setup(char *str)
+{
+ char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
+ char *s, *options, *brl_options = NULL;
+ int idx;
+
+ if (_braille_console_setup(&str, &brl_options))
+ return 1;
+
+ /*
+ * Decode str into name, index, options.
+ */
+ if (str[0] >= '0' && str[0] <= '9') {
+ strcpy(buf, "ttyS");
+ strncpy(buf + 4, str, sizeof(buf) - 5);
+ } else {
+ strncpy(buf, str, sizeof(buf) - 1);
+ }
+ buf[sizeof(buf) - 1] = 0;
+ options = strchr(str, ',');
+ if (options)
+ *(options++) = 0;
+#ifdef __sparc__
+ if (!strcmp(str, "ttya"))
+ strcpy(buf, "ttyS0");
+ if (!strcmp(str, "ttyb"))
+ strcpy(buf, "ttyS1");
+#endif
+ for (s = buf; *s; s++)
+ if (isdigit(*s) || *s == ',')
+ break;
+ idx = simple_strtoul(s, NULL, 10);
+ *s = 0;
+
+ __add_preferred_console(buf, idx, options, brl_options);
+ console_set_on_cmdline = 1;
+ return 1;
+}
+__setup("console=", console_setup);
+
+/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ * @name: device name
+ * @idx: device index
+ * @options: options for this console
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init. Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int add_preferred_console(char *name, int idx, char *options)
+{
+ return __add_preferred_console(name, idx, options, NULL);
+}
+
+bool console_suspend_enabled = true;
+EXPORT_SYMBOL(console_suspend_enabled);
+
+static int __init console_suspend_disable(char *str)
+{
+ console_suspend_enabled = false;
+ return 1;
+}
+__setup("no_console_suspend", console_suspend_disable);
+module_param_named(console_suspend, console_suspend_enabled,
+ bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
+ " and hibernate operations");
+
+/**
+ * suspend_console - suspend the console subsystem
+ *
+ * This disables printk() while we go into suspend states
+ */
+void suspend_console(void)
+{
+ if (!console_suspend_enabled)
+ return;
+ printk("Suspending console(s) (use no_console_suspend to debug)\n");
+ console_lock();
+ console_suspended = 1;
+ up_console_sem();
+}
+
+void resume_console(void)
+{
+ if (!console_suspend_enabled)
+ return;
+ down_console_sem();
+ console_suspended = 0;
+ console_unlock();
+}
+
+/**
+ * console_cpu_notify - print deferred console messages after CPU hotplug
+ * @self: notifier struct
+ * @action: CPU hotplug event
+ * @hcpu: unused
+ *
+ * If printk() is called from a CPU that is not online yet, the messages
+ * will be spooled but will not show up on the console. This function is
+ * called when a new CPU comes online (or fails to come up), and ensures
+ * that any such output gets printed.
+ */
+static int console_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ case CPU_DOWN_FAILED:
+ case CPU_UP_CANCELED:
+ console_lock();
+ console_unlock();
+ }
+ return NOTIFY_OK;
+}
+
+/**
+ * console_lock - lock the console system for exclusive use.
+ *
+ * Acquires a lock which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * Can sleep, returns nothing.
+ */
+void console_lock(void)
+{
+ might_sleep();
+
+ down_console_sem();
+ if (console_suspended)
+ return;
+ console_locked = 1;
+ console_may_schedule = 1;
+}
+EXPORT_SYMBOL(console_lock);
+
+/**
+ * console_trylock - try to lock the console system for exclusive use.
+ *
+ * Try to acquire a lock which guarantees that the caller has exclusive
+ * access to the console system and the console_drivers list.
+ *
+ * returns 1 on success, and 0 on failure to acquire the lock.
+ */
+int console_trylock(void)
+{
+ if (down_trylock_console_sem())
+ return 0;
+ if (console_suspended) {
+ up_console_sem();
+ return 0;
+ }
+ console_locked = 1;
+ console_may_schedule = 0;
+ return 1;
+}
+EXPORT_SYMBOL(console_trylock);
+
+int is_console_locked(void)
+{
+ return console_locked;
+}
+
+static void console_cont_flush(char *text, size_t size)
+{
+ unsigned long flags;
+ size_t len;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+
+ if (!cont.len)
+ goto out;
+
+ /*
+ * We still queue earlier records, likely because the console was
+ * busy. The earlier ones need to be printed before this one, we
+ * did not flush any fragment so far, so just let it queue up.
+ */
+ if (console_seq < log_next_seq && !cont.cons)
+ goto out;
+
+ len = cont_print_text(text, size);
+ raw_spin_unlock(&logbuf_lock);
+ stop_critical_timings();
+ call_console_drivers(cont.level, text, len);
+ start_critical_timings();
+ local_irq_restore(flags);
+ return;
+out:
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
+
+/**
+ * console_unlock - unlock the console system
+ *
+ * Releases the console_lock which the caller holds on the console system
+ * and the console driver list.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ static char text[LOG_LINE_MAX + PREFIX_MAX];
+ static u64 seen_seq;
+ unsigned long flags;
+ bool wake_klogd = false;
+ bool retry;
+
+ if (console_suspended) {
+ up_console_sem();
+ return;
+ }
+
+ console_may_schedule = 0;
+
+ /* flush buffered message fragment immediately to console */
+ console_cont_flush(text, sizeof(text));
+again:
+ for (;;) {
+ struct printk_log *msg;
+ size_t len;
+ int level;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ if (seen_seq != log_next_seq) {
+ wake_klogd = true;
+ seen_seq = log_next_seq;
+ }
+
+ if (console_seq < log_first_seq) {
+ len = sprintf(text, "** %u printk messages dropped ** ",
+ (unsigned)(log_first_seq - console_seq));
+
+ /* messages are gone, move to first one */
+ console_seq = log_first_seq;
+ console_idx = log_first_idx;
+ console_prev = 0;
+ } else {
+ len = 0;
+ }
+skip:
+ if (console_seq == log_next_seq)
+ break;
+
+ msg = log_from_idx(console_idx);
+ if (msg->flags & LOG_NOCONS) {
+ /*
+ * Skip record we have buffered and already printed
+ * directly to the console when we received it.
+ */
+ console_idx = log_next(console_idx);
+ console_seq++;
+ /*
+ * We will get here again when we register a new
+ * CON_PRINTBUFFER console. Clear the flag so we
+ * will properly dump everything later.
+ */
+ msg->flags &= ~LOG_NOCONS;
+ console_prev = msg->flags;
+ goto skip;
+ }
+
+ level = msg->level;
+ len += msg_print_text(msg, console_prev, false,
+ text + len, sizeof(text) - len);
+ console_idx = log_next(console_idx);
+ console_seq++;
+ console_prev = msg->flags;
+ raw_spin_unlock(&logbuf_lock);
+
+ stop_critical_timings(); /* don't trace print latency */
+ call_console_drivers(level, text, len);
+ start_critical_timings();
+ local_irq_restore(flags);
+ }
+ console_locked = 0;
+
+ /* Release the exclusive_console once it is used */
+ if (unlikely(exclusive_console))
+ exclusive_console = NULL;
+
+ raw_spin_unlock(&logbuf_lock);
+
+ up_console_sem();
+
+ /*
+ * Someone could have filled up the buffer again, so re-check if there's
+ * something to flush. In case we cannot trylock the console_sem again,
+ * there's a new owner and the console_unlock() from them will do the
+ * flush, no worries.
+ */
+ raw_spin_lock(&logbuf_lock);
+ retry = console_seq != log_next_seq;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ if (retry && console_trylock())
+ goto again;
+
+ if (wake_klogd)
+ wake_up_klogd();
+}
+EXPORT_SYMBOL(console_unlock);
+
+/**
+ * console_conditional_schedule - yield the CPU if required
+ *
+ * If the console code is currently allowed to sleep, and
+ * if this CPU should yield the CPU to another task, do
+ * so here.
+ *
+ * Must be called within console_lock();.
+ */
+void __sched console_conditional_schedule(void)
+{
+ if (console_may_schedule)
+ cond_resched();
+}
+EXPORT_SYMBOL(console_conditional_schedule);
+
+void console_unblank(void)
+{
+ struct console *c;
+
+ /*
+ * console_unblank can no longer be called in interrupt context unless
+ * oops_in_progress is set to 1..
+ */
+ if (oops_in_progress) {
+ if (down_trylock_console_sem() != 0)
+ return;
+ } else
+ console_lock();
+
+ console_locked = 1;
+ console_may_schedule = 0;
+ for_each_console(c)
+ if ((c->flags & CON_ENABLED) && c->unblank)
+ c->unblank();
+ console_unlock();
+}
+
+/*
+ * Return the console tty driver structure and its associated index
+ */
+struct tty_driver *console_device(int *index)
+{
+ struct console *c;
+ struct tty_driver *driver = NULL;
+
+ console_lock();
+ for_each_console(c) {
+ if (!c->device)
+ continue;
+ driver = c->device(c, index);
+ if (driver)
+ break;
+ }
+ console_unlock();
+ return driver;
+}
+
+/*
+ * Prevent further output on the passed console device so that (for example)
+ * serial drivers can disable console output before suspending a port, and can
+ * re-enable output afterwards.
+ */
+void console_stop(struct console *console)
+{
+ console_lock();
+ console->flags &= ~CON_ENABLED;
+ console_unlock();
+}
+EXPORT_SYMBOL(console_stop);
+
+void console_start(struct console *console)
+{
+ console_lock();
+ console->flags |= CON_ENABLED;
+ console_unlock();
+}
+EXPORT_SYMBOL(console_start);
+
+static int __read_mostly keep_bootcon;
+
+static int __init keep_bootcon_setup(char *str)
+{
+ keep_bootcon = 1;
+ pr_info("debug: skip boot console de-registration.\n");
+
+ return 0;
+}
+
+early_param("keep_bootcon", keep_bootcon_setup);
+
+/*
+ * The console driver calls this routine during kernel initialization
+ * to register the console printing procedure with printk() and to
+ * print any messages that were printed by the kernel before the
+ * console driver was initialized.
+ *
+ * This can happen pretty early during the boot process (because of
+ * early_printk) - sometimes before setup_arch() completes - be careful
+ * of what kernel features are used - they may not be initialised yet.
+ *
+ * There are two types of consoles - bootconsoles (early_printk) and
+ * "real" consoles (everything which is not a bootconsole) which are
+ * handled differently.
+ * - Any number of bootconsoles can be registered at any time.
+ * - As soon as a "real" console is registered, all bootconsoles
+ * will be unregistered automatically.
+ * - Once a "real" console is registered, any attempt to register a
+ * bootconsoles will be rejected
+ */
+void register_console(struct console *newcon)
+{
+ int i;
+ unsigned long flags;
+ struct console *bcon = NULL;
+ struct console_cmdline *c;
+
+ if (console_drivers)
+ for_each_console(bcon)
+ if (WARN(bcon == newcon,
+ "console '%s%d' already registered\n",
+ bcon->name, bcon->index))
+ return;
+
+ /*
+ * before we register a new CON_BOOT console, make sure we don't
+ * already have a valid console
+ */
+ if (console_drivers && newcon->flags & CON_BOOT) {
+ /* find the last or real console */
+ for_each_console(bcon) {
+ if (!(bcon->flags & CON_BOOT)) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ return;
+ }
+ }
+ }
+
+ if (console_drivers && console_drivers->flags & CON_BOOT)
+ bcon = console_drivers;
+
+ if (preferred_console < 0 || bcon || !console_drivers)
+ preferred_console = selected_console;
+
+ /*
+ * See if we want to use this console driver. If we
+ * didn't select a console we take the first one
+ * that registers here.
+ */
+ if (preferred_console < 0) {
+ if (newcon->index < 0)
+ newcon->index = 0;
+ if (newcon->setup == NULL ||
+ newcon->setup(newcon, NULL) == 0) {
+ newcon->flags |= CON_ENABLED;
+ if (newcon->device) {
+ newcon->flags |= CON_CONSDEV;
+ preferred_console = 0;
+ }
+ }
+ }
+
+ /*
+ * See if this console matches one we selected on
+ * the command line.
+ */
+ for (i = 0, c = console_cmdline;
+ i < MAX_CMDLINECONSOLES && c->name[0];
+ i++, c++) {
+ if (!newcon->match ||
+ newcon->match(newcon, c->name, c->index, c->options) != 0) {
+ /* default matching */
+ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
+ if (strcmp(c->name, newcon->name) != 0)
+ continue;
+ if (newcon->index >= 0 &&
+ newcon->index != c->index)
+ continue;
+ if (newcon->index < 0)
+ newcon->index = c->index;
+
+ if (_braille_register_console(newcon, c))
+ return;
+
+ if (newcon->setup &&
+ newcon->setup(newcon, c->options) != 0)
+ break;
+ }
+
+ newcon->flags |= CON_ENABLED;
+ if (i == selected_console) {
+ newcon->flags |= CON_CONSDEV;
+ preferred_console = selected_console;
+ }
+ break;
+ }
+
+ if (!(newcon->flags & CON_ENABLED))
+ return;
+
+ /*
+ * If we have a bootconsole, and are switching to a real console,
+ * don't print everything out again, since when the boot console, and
+ * the real console are the same physical device, it's annoying to
+ * see the beginning boot messages twice
+ */
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ newcon->flags &= ~CON_PRINTBUFFER;
+
+ /*
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
+ */
+ console_lock();
+ if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
+ newcon->next = console_drivers;
+ console_drivers = newcon;
+ if (newcon->next)
+ newcon->next->flags &= ~CON_CONSDEV;
+ } else {
+ newcon->next = console_drivers->next;
+ console_drivers->next = newcon;
+ }
+ if (newcon->flags & CON_PRINTBUFFER) {
+ /*
+ * console_unlock(); will print out the buffered messages
+ * for us.
+ */
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ console_seq = syslog_seq;
+ console_idx = syslog_idx;
+ console_prev = syslog_prev;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ /*
+ * We're about to replay the log buffer. Only do this to the
+ * just-registered console to avoid excessive message spam to
+ * the already-registered consoles.
+ */
+ exclusive_console = newcon;
+ }
+ console_unlock();
+ console_sysfs_notify();
+
+ /*
+ * By unregistering the bootconsoles after we enable the real console
+ * we get the "console xxx enabled" message on all the consoles -
+ * boot consoles, real consoles, etc - this is to ensure that end
+ * users know there might be something in the kernel's log buffer that
+ * went to the bootconsole (that they do not see on the real console)
+ */
+ pr_info("%sconsole [%s%d] enabled\n",
+ (newcon->flags & CON_BOOT) ? "boot" : "" ,
+ newcon->name, newcon->index);
+ if (bcon &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
+ !keep_bootcon) {
+ /* We need to iterate through all boot consoles, to make
+ * sure we print everything out, before we unregister them.
+ */
+ for_each_console(bcon)
+ if (bcon->flags & CON_BOOT)
+ unregister_console(bcon);
+ }
+}
+EXPORT_SYMBOL(register_console);
+
+int unregister_console(struct console *console)
+{
+ struct console *a, *b;
+ int res;
+
+ pr_info("%sconsole [%s%d] disabled\n",
+ (console->flags & CON_BOOT) ? "boot" : "" ,
+ console->name, console->index);
+
+ res = _braille_unregister_console(console);
+ if (res)
+ return res;
+
+ res = 1;
+ console_lock();
+ if (console_drivers == console) {
+ console_drivers=console->next;
+ res = 0;
+ } else if (console_drivers) {
+ for (a=console_drivers->next, b=console_drivers ;
+ a; b=a, a=b->next) {
+ if (a == console) {
+ b->next = a->next;
+ res = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If this isn't the last console and it has CON_CONSDEV set, we
+ * need to set it on the next preferred console.
+ */
+ if (console_drivers != NULL && console->flags & CON_CONSDEV)
+ console_drivers->flags |= CON_CONSDEV;
+
+ console->flags &= ~CON_ENABLED;
+ console_unlock();
+ console_sysfs_notify();
+ return res;
+}
+EXPORT_SYMBOL(unregister_console);
+
+static int __init printk_late_init(void)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ if (!keep_bootcon && con->flags & CON_BOOT) {
+ unregister_console(con);
+ }
+ }
+ hotcpu_notifier(console_cpu_notify, 0);
+ return 0;
+}
+late_initcall(printk_late_init);
+
+#if defined CONFIG_PRINTK
+/*
+ * Delayed printk version, for scheduler-internal messages:
+ */
+#define PRINTK_PENDING_WAKEUP 0x01
+#define PRINTK_PENDING_OUTPUT 0x02
+
+static DEFINE_PER_CPU(int, printk_pending);
+
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
+{
+ int pending = __this_cpu_xchg(printk_pending, 0);
+
+ if (pending & PRINTK_PENDING_OUTPUT) {
+ /* If trylock fails, someone else is doing the printing */
+ if (console_trylock())
+ console_unlock();
+ }
+
+ if (pending & PRINTK_PENDING_WAKEUP)
+ wake_up_interruptible(&log_wait);
+}
+
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+ .func = wake_up_klogd_work_func,
+ .flags = IRQ_WORK_LAZY,
+};
+
+void wake_up_klogd(void)
+{
+ preempt_disable();
+ if (waitqueue_active(&log_wait)) {
+ this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
+ }
+ preempt_enable();
+}
+
+int printk_deferred(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ preempt_disable();
+ va_start(args, fmt);
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+ va_end(args);
+
+ __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
+ irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
+ preempt_enable();
+
+ return r;
+}
+
+/*
+ * printk rate limiting, lifted from the networking subsystem.
+ *
+ * This enforces a rate limit: not more than 10 kernel messages
+ * every 5s to make a denial-of-service attack impossible.
+ */
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
+
+int __printk_ratelimit(const char *func)
+{
+ return ___ratelimit(&printk_ratelimit_state, func);
+}
+EXPORT_SYMBOL(__printk_ratelimit);
+
+/**
+ * printk_timed_ratelimit - caller-controlled printk ratelimiting
+ * @caller_jiffies: pointer to caller's state
+ * @interval_msecs: minimum interval between prints
+ *
+ * printk_timed_ratelimit() returns true if more than @interval_msecs
+ * milliseconds have elapsed since the last time printk_timed_ratelimit()
+ * returned true.
+ */
+bool printk_timed_ratelimit(unsigned long *caller_jiffies,
+ unsigned int interval_msecs)
+{
+ unsigned long elapsed = jiffies - *caller_jiffies;
+
+ if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
+ return false;
+
+ *caller_jiffies = jiffies;
+ return true;
+}
+EXPORT_SYMBOL(printk_timed_ratelimit);
+
+static DEFINE_SPINLOCK(dump_list_lock);
+static LIST_HEAD(dump_list);
+
+/**
+ * kmsg_dump_register - register a kernel log dumper.
+ * @dumper: pointer to the kmsg_dumper structure
+ *
+ * Adds a kernel log dumper to the system. The dump callback in the
+ * structure will be called when the kernel oopses or panics and must be
+ * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
+ */
+int kmsg_dump_register(struct kmsg_dumper *dumper)
+{
+ unsigned long flags;
+ int err = -EBUSY;
+
+ /* The dump callback needs to be set */
+ if (!dumper->dump)
+ return -EINVAL;
+
+ spin_lock_irqsave(&dump_list_lock, flags);
+ /* Don't allow registering multiple times */
+ if (!dumper->registered) {
+ dumper->registered = 1;
+ list_add_tail_rcu(&dumper->list, &dump_list);
+ err = 0;
+ }
+ spin_unlock_irqrestore(&dump_list_lock, flags);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_register);
+
+/**
+ * kmsg_dump_unregister - unregister a kmsg dumper.
+ * @dumper: pointer to the kmsg_dumper structure
+ *
+ * Removes a dump device from the system. Returns zero on success and
+ * %-EINVAL otherwise.
+ */
+int kmsg_dump_unregister(struct kmsg_dumper *dumper)
+{
+ unsigned long flags;
+ int err = -EINVAL;
+
+ spin_lock_irqsave(&dump_list_lock, flags);
+ if (dumper->registered) {
+ dumper->registered = 0;
+ list_del_rcu(&dumper->list);
+ err = 0;
+ }
+ spin_unlock_irqrestore(&dump_list_lock, flags);
+ synchronize_rcu();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
+
+static bool always_kmsg_dump;
+module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
+
+/**
+ * kmsg_dump - dump kernel log to kernel message dumpers.
+ * @reason: the reason (oops, panic etc) for dumping
+ *
+ * Call each of the registered dumper's dump() callback, which can
+ * retrieve the kmsg records with kmsg_dump_get_line() or
+ * kmsg_dump_get_buffer().
+ */
+void kmsg_dump(enum kmsg_dump_reason reason)
+{
+ struct kmsg_dumper *dumper;
+ unsigned long flags;
+
+ if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(dumper, &dump_list, list) {
+ if (dumper->max_reason && reason > dumper->max_reason)
+ continue;
+
+ /* initialize iterator with data about the stored records */
+ dumper->active = true;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ dumper->cur_seq = clear_seq;
+ dumper->cur_idx = clear_idx;
+ dumper->next_seq = log_next_seq;
+ dumper->next_idx = log_next_idx;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ /* invoke dumper which will iterate over records */
+ dumper->dump(dumper, reason);
+
+ /* reset iterator */
+ dumper->active = false;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ *
+ * The function is similar to kmsg_dump_get_line(), but grabs no locks.
+ */
+bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
+ char *line, size_t size, size_t *len)
+{
+ struct printk_log *msg;
+ size_t l = 0;
+ bool ret = false;
+
+ if (!dumper->active)
+ goto out;
+
+ if (dumper->cur_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ dumper->cur_seq = log_first_seq;
+ dumper->cur_idx = log_first_idx;
+ }
+
+ /* last entry */
+ if (dumper->cur_seq >= log_next_seq)
+ goto out;
+
+ msg = log_from_idx(dumper->cur_idx);
+ l = msg_print_text(msg, 0, syslog, line, size);
+
+ dumper->cur_idx = log_next(dumper->cur_idx);
+ dumper->cur_seq++;
+ ret = true;
+out:
+ if (len)
+ *len = l;
+ return ret;
+}
+
+/**
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+ char *line, size_t size, size_t *len)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
+
+/**
+ * kmsg_dump_get_buffer - copy kmsg log lines
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @buf: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the end of the kmsg buffer and fill the provided buffer
+ * with as many of the the *youngest* kmsg records that fit into it.
+ * If the buffer is large enough, all available kmsg records will be
+ * copied with a single call.
+ *
+ * Consecutive calls will fill the buffer with the next block of
+ * available older records, not including the earlier retrieved ones.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+ char *buf, size_t size, size_t *len)
+{
+ unsigned long flags;
+ u64 seq;
+ u32 idx;
+ u64 next_seq;
+ u32 next_idx;
+ enum log_flags prev;
+ size_t l = 0;
+ bool ret = false;
+
+ if (!dumper->active)
+ goto out;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ if (dumper->cur_seq < log_first_seq) {
+ /* messages are gone, move to first available one */
+ dumper->cur_seq = log_first_seq;
+ dumper->cur_idx = log_first_idx;
+ }
+
+ /* last entry */
+ if (dumper->cur_seq >= dumper->next_seq) {
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ goto out;
+ }
+
+ /* calculate length of entire buffer */
+ seq = dumper->cur_seq;
+ idx = dumper->cur_idx;
+ prev = 0;
+ while (seq < dumper->next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ l += msg_print_text(msg, prev, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+
+ /* move first record forward until length fits into the buffer */
+ seq = dumper->cur_seq;
+ idx = dumper->cur_idx;
+ prev = 0;
+ while (l > size && seq < dumper->next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ l -= msg_print_text(msg, prev, true, NULL, 0);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+
+ /* last message in next interation */
+ next_seq = seq;
+ next_idx = idx;
+
+ l = 0;
+ while (seq < dumper->next_seq) {
+ struct printk_log *msg = log_from_idx(idx);
+
+ l += msg_print_text(msg, prev, syslog, buf + l, size - l);
+ idx = log_next(idx);
+ seq++;
+ prev = msg->flags;
+ }
+
+ dumper->next_seq = next_seq;
+ dumper->next_idx = next_idx;
+ ret = true;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+ if (len)
+ *len = l;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
+
+/**
+ * kmsg_dump_rewind_nolock - reset the interator (unlocked version)
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ *
+ * The function is similar to kmsg_dump_rewind(), but grabs no locks.
+ */
+void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+{
+ dumper->cur_seq = clear_seq;
+ dumper->cur_idx = clear_idx;
+ dumper->next_seq = log_next_seq;
+ dumper->next_idx = log_next_idx;
+}
+
+/**
+ * kmsg_dump_rewind - reset the interator
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ */
+void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ kmsg_dump_rewind_nolock(dumper);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+
+static char dump_stack_arch_desc_str[128];
+
+/**
+ * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
+ * @fmt: printf-style format string
+ * @...: arguments for the format string
+ *
+ * The configured string will be printed right after utsname during task
+ * dumps. Usually used to add arch-specific system identifiers. If an
+ * arch wants to make use of such an ID string, it should initialize this
+ * as soon as possible during boot.
+ */
+void __init dump_stack_set_arch_desc(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
+ fmt, args);
+ va_end(args);
+}
+
+/**
+ * dump_stack_print_info - print generic debug info for dump_stack()
+ * @log_lvl: log level
+ *
+ * Arch-specific dump_stack() implementations can use this function to
+ * print out the same debug information as the generic dump_stack().
+ */
+void dump_stack_print_info(const char *log_lvl)
+{
+ printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
+ log_lvl, raw_smp_processor_id(), current->pid, current->comm,
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ if (dump_stack_arch_desc_str[0] != '\0')
+ printk("%sHardware name: %s\n",
+ log_lvl, dump_stack_arch_desc_str);
+
+ print_worker_info(log_lvl, current);
+}
+
+/**
+ * show_regs_print_info - print generic debug info for show_regs()
+ * @log_lvl: log level
+ *
+ * show_regs() implementations can use this function to print out generic
+ * debug information.
+ */
+void show_regs_print_info(const char *log_lvl)
+{
+ dump_stack_print_info(log_lvl);
+
+ printk("%stask: %p ti: %p task.ti: %p\n",
+ log_lvl, current, current_thread_info(),
+ task_thread_info(current));
+}
+
+#endif
diff --git a/kernel/profile.c b/kernel/profile.c
new file mode 100644
index 000000000..a7bcd28d6
--- /dev/null
+++ b/kernel/profile.c
@@ -0,0 +1,613 @@
+/*
+ * linux/kernel/profile.c
+ * Simple profiling. Manages a direct-mapped profile hit count buffer,
+ * with configurable resolution, support for restricting the cpus on
+ * which profiling is done, and switching between cpu time and
+ * schedule() calls via kernel command line parameters passed at boot.
+ *
+ * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
+ * Red Hat, July 2004
+ * Consolidation of architecture support code for profiling,
+ * Nadia Yvette Chambers, Oracle, July 2004
+ * Amortized hit count accounting via per-cpu open-addressed hashtables
+ * to resolve timer interrupt livelocks, Nadia Yvette Chambers,
+ * Oracle, 2004
+ */
+
+#include <linux/export.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/highmem.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/sections.h>
+#include <asm/irq_regs.h>
+#include <asm/ptrace.h>
+
+struct profile_hit {
+ u32 pc, hits;
+};
+#define PROFILE_GRPSHIFT 3
+#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
+#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
+#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
+
+static atomic_t *prof_buffer;
+static unsigned long prof_len, prof_shift;
+
+int prof_on __read_mostly;
+EXPORT_SYMBOL_GPL(prof_on);
+
+static cpumask_var_t prof_cpu_mask;
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
+static DEFINE_PER_CPU(int, cpu_profile_flip);
+static DEFINE_MUTEX(profile_flip_mutex);
+#endif /* CONFIG_SMP */
+
+int profile_setup(char *str)
+{
+ static const char schedstr[] = "schedule";
+ static const char sleepstr[] = "sleep";
+ static const char kvmstr[] = "kvm";
+ int par;
+
+ if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+#ifdef CONFIG_SCHEDSTATS
+ prof_on = SLEEP_PROFILING;
+ if (str[strlen(sleepstr)] == ',')
+ str += strlen(sleepstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ pr_info("kernel sleep profiling enabled (shift: %ld)\n",
+ prof_shift);
+#else
+ pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
+#endif /* CONFIG_SCHEDSTATS */
+ } else if (!strncmp(str, schedstr, strlen(schedstr))) {
+ prof_on = SCHED_PROFILING;
+ if (str[strlen(schedstr)] == ',')
+ str += strlen(schedstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ pr_info("kernel schedule profiling enabled (shift: %ld)\n",
+ prof_shift);
+ } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
+ prof_on = KVM_PROFILING;
+ if (str[strlen(kvmstr)] == ',')
+ str += strlen(kvmstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ pr_info("kernel KVM profiling enabled (shift: %ld)\n",
+ prof_shift);
+ } else if (get_option(&str, &par)) {
+ prof_shift = par;
+ prof_on = CPU_PROFILING;
+ pr_info("kernel profiling enabled (shift: %ld)\n",
+ prof_shift);
+ }
+ return 1;
+}
+__setup("profile=", profile_setup);
+
+
+int __ref profile_init(void)
+{
+ int buffer_bytes;
+ if (!prof_on)
+ return 0;
+
+ /* only text is profiled */
+ prof_len = (_etext - _stext) >> prof_shift;
+ buffer_bytes = prof_len*sizeof(atomic_t);
+
+ if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_copy(prof_cpu_mask, cpu_possible_mask);
+
+ prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
+ if (prof_buffer)
+ return 0;
+
+ prof_buffer = alloc_pages_exact(buffer_bytes,
+ GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
+ if (prof_buffer)
+ return 0;
+
+ prof_buffer = vzalloc(buffer_bytes);
+ if (prof_buffer)
+ return 0;
+
+ free_cpumask_var(prof_cpu_mask);
+ return -ENOMEM;
+}
+
+/* Profile event notifications */
+
+static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
+
+void profile_task_exit(struct task_struct *task)
+{
+ blocking_notifier_call_chain(&task_exit_notifier, 0, task);
+}
+
+int profile_handoff_task(struct task_struct *task)
+{
+ int ret;
+ ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
+ return (ret == NOTIFY_OK) ? 1 : 0;
+}
+
+void profile_munmap(unsigned long addr)
+{
+ blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
+}
+
+int task_handoff_register(struct notifier_block *n)
+{
+ return atomic_notifier_chain_register(&task_free_notifier, n);
+}
+EXPORT_SYMBOL_GPL(task_handoff_register);
+
+int task_handoff_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&task_free_notifier, n);
+}
+EXPORT_SYMBOL_GPL(task_handoff_unregister);
+
+int profile_event_register(enum profile_type type, struct notifier_block *n)
+{
+ int err = -EINVAL;
+
+ switch (type) {
+ case PROFILE_TASK_EXIT:
+ err = blocking_notifier_chain_register(
+ &task_exit_notifier, n);
+ break;
+ case PROFILE_MUNMAP:
+ err = blocking_notifier_chain_register(
+ &munmap_notifier, n);
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(profile_event_register);
+
+int profile_event_unregister(enum profile_type type, struct notifier_block *n)
+{
+ int err = -EINVAL;
+
+ switch (type) {
+ case PROFILE_TASK_EXIT:
+ err = blocking_notifier_chain_unregister(
+ &task_exit_notifier, n);
+ break;
+ case PROFILE_MUNMAP:
+ err = blocking_notifier_chain_unregister(
+ &munmap_notifier, n);
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(profile_event_unregister);
+
+#ifdef CONFIG_SMP
+/*
+ * Each cpu has a pair of open-addressed hashtables for pending
+ * profile hits. read_profile() IPI's all cpus to request them
+ * to flip buffers and flushes their contents to prof_buffer itself.
+ * Flip requests are serialized by the profile_flip_mutex. The sole
+ * use of having a second hashtable is for avoiding cacheline
+ * contention that would otherwise happen during flushes of pending
+ * profile hits required for the accuracy of reported profile hits
+ * and so resurrect the interrupt livelock issue.
+ *
+ * The open-addressed hashtables are indexed by profile buffer slot
+ * and hold the number of pending hits to that profile buffer slot on
+ * a cpu in an entry. When the hashtable overflows, all pending hits
+ * are accounted to their corresponding profile buffer slots with
+ * atomic_add() and the hashtable emptied. As numerous pending hits
+ * may be accounted to a profile buffer slot in a hashtable entry,
+ * this amortizes a number of atomic profile buffer increments likely
+ * to be far larger than the number of entries in the hashtable,
+ * particularly given that the number of distinct profile buffer
+ * positions to which hits are accounted during short intervals (e.g.
+ * several seconds) is usually very small. Exclusion from buffer
+ * flipping is provided by interrupt disablement (note that for
+ * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
+ * process context).
+ * The hash function is meant to be lightweight as opposed to strong,
+ * and was vaguely inspired by ppc64 firmware-supported inverted
+ * pagetable hash functions, but uses a full hashtable full of finite
+ * collision chains, not just pairs of them.
+ *
+ * -- nyc
+ */
+static void __profile_flip_buffers(void *unused)
+{
+ int cpu = smp_processor_id();
+
+ per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
+}
+
+static void profile_flip_buffers(void)
+{
+ int i, j, cpu;
+
+ mutex_lock(&profile_flip_mutex);
+ j = per_cpu(cpu_profile_flip, get_cpu());
+ put_cpu();
+ on_each_cpu(__profile_flip_buffers, NULL, 1);
+ for_each_online_cpu(cpu) {
+ struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
+ for (i = 0; i < NR_PROFILE_HIT; ++i) {
+ if (!hits[i].hits) {
+ if (hits[i].pc)
+ hits[i].pc = 0;
+ continue;
+ }
+ atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+ hits[i].hits = hits[i].pc = 0;
+ }
+ }
+ mutex_unlock(&profile_flip_mutex);
+}
+
+static void profile_discard_flip_buffers(void)
+{
+ int i, cpu;
+
+ mutex_lock(&profile_flip_mutex);
+ i = per_cpu(cpu_profile_flip, get_cpu());
+ put_cpu();
+ on_each_cpu(__profile_flip_buffers, NULL, 1);
+ for_each_online_cpu(cpu) {
+ struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
+ memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
+ }
+ mutex_unlock(&profile_flip_mutex);
+}
+
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
+ int i, j, cpu;
+ struct profile_hit *hits;
+
+ pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
+ i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+ secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
+ cpu = get_cpu();
+ hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
+ if (!hits) {
+ put_cpu();
+ return;
+ }
+ /*
+ * We buffer the global profiler buffer into a per-CPU
+ * queue and thus reduce the number of global (and possibly
+ * NUMA-alien) accesses. The write-queue is self-coalescing:
+ */
+ local_irq_save(flags);
+ do {
+ for (j = 0; j < PROFILE_GRPSZ; ++j) {
+ if (hits[i + j].pc == pc) {
+ hits[i + j].hits += nr_hits;
+ goto out;
+ } else if (!hits[i + j].hits) {
+ hits[i + j].pc = pc;
+ hits[i + j].hits = nr_hits;
+ goto out;
+ }
+ }
+ i = (i + secondary) & (NR_PROFILE_HIT - 1);
+ } while (i != primary);
+
+ /*
+ * Add the current hit(s) and flush the write-queue out
+ * to the global buffer:
+ */
+ atomic_add(nr_hits, &prof_buffer[pc]);
+ for (i = 0; i < NR_PROFILE_HIT; ++i) {
+ atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
+ hits[i].pc = hits[i].hits = 0;
+ }
+out:
+ local_irq_restore(flags);
+ put_cpu();
+}
+
+static int profile_cpu_callback(struct notifier_block *info,
+ unsigned long action, void *__cpu)
+{
+ int node, cpu = (unsigned long)__cpu;
+ struct page *page;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ node = cpu_to_mem(cpu);
+ per_cpu(cpu_profile_flip, cpu) = 0;
+ if (!per_cpu(cpu_profile_hits, cpu)[1]) {
+ page = alloc_pages_exact_node(node,
+ GFP_KERNEL | __GFP_ZERO,
+ 0);
+ if (!page)
+ return notifier_from_errno(-ENOMEM);
+ per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
+ }
+ if (!per_cpu(cpu_profile_hits, cpu)[0]) {
+ page = alloc_pages_exact_node(node,
+ GFP_KERNEL | __GFP_ZERO,
+ 0);
+ if (!page)
+ goto out_free;
+ per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
+ }
+ break;
+out_free:
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+ per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+ __free_page(page);
+ return notifier_from_errno(-ENOMEM);
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ if (prof_cpu_mask != NULL)
+ cpumask_set_cpu(cpu, prof_cpu_mask);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ if (prof_cpu_mask != NULL)
+ cpumask_clear_cpu(cpu, prof_cpu_mask);
+ if (per_cpu(cpu_profile_hits, cpu)[0]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+ per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+ __free_page(page);
+ }
+ if (per_cpu(cpu_profile_hits, cpu)[1]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+ per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+ __free_page(page);
+ }
+ break;
+ }
+ return NOTIFY_OK;
+}
+#else /* !CONFIG_SMP */
+#define profile_flip_buffers() do { } while (0)
+#define profile_discard_flip_buffers() do { } while (0)
+#define profile_cpu_callback NULL
+
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ unsigned long pc;
+ pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
+ atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
+}
+#endif /* !CONFIG_SMP */
+
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ if (prof_on != type || !prof_buffer)
+ return;
+ do_profile_hits(type, __pc, nr_hits);
+}
+EXPORT_SYMBOL_GPL(profile_hits);
+
+void profile_tick(int type)
+{
+ struct pt_regs *regs = get_irq_regs();
+
+ if (!user_mode(regs) && prof_cpu_mask != NULL &&
+ cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
+ profile_hit(type, (void *)profile_pc(regs));
+}
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+
+static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
+ return 0;
+}
+
+static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, prof_cpu_mask_proc_show, NULL);
+}
+
+static ssize_t prof_cpu_mask_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ cpumask_var_t new_value;
+ int err;
+
+ if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(buffer, count, new_value);
+ if (!err) {
+ cpumask_copy(prof_cpu_mask, new_value);
+ err = count;
+ }
+ free_cpumask_var(new_value);
+ return err;
+}
+
+static const struct file_operations prof_cpu_mask_proc_fops = {
+ .open = prof_cpu_mask_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = prof_cpu_mask_proc_write,
+};
+
+void create_prof_cpu_mask(void)
+{
+ /* create /proc/irq/prof_cpu_mask */
+ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
+}
+
+/*
+ * This function accesses profiling information. The returned data is
+ * binary: the sampling step and the actual contents of the profile
+ * buffer. Use of the program readprofile is recommended in order to
+ * get meaningful info out of these data.
+ */
+static ssize_t
+read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+ unsigned long p = *ppos;
+ ssize_t read;
+ char *pnt;
+ unsigned int sample_step = 1 << prof_shift;
+
+ profile_flip_buffers();
+ if (p >= (prof_len+1)*sizeof(unsigned int))
+ return 0;
+ if (count > (prof_len+1)*sizeof(unsigned int) - p)
+ count = (prof_len+1)*sizeof(unsigned int) - p;
+ read = 0;
+
+ while (p < sizeof(unsigned int) && count > 0) {
+ if (put_user(*((char *)(&sample_step)+p), buf))
+ return -EFAULT;
+ buf++; p++; count--; read++;
+ }
+ pnt = (char *)prof_buffer + p - sizeof(atomic_t);
+ if (copy_to_user(buf, (void *)pnt, count))
+ return -EFAULT;
+ read += count;
+ *ppos += read;
+ return read;
+}
+
+/*
+ * Writing to /proc/profile resets the counters
+ *
+ * Writing a 'profiling multiplier' value into it also re-sets the profiling
+ * interrupt frequency, on architectures that support this.
+ */
+static ssize_t write_profile(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+#ifdef CONFIG_SMP
+ extern int setup_profiling_timer(unsigned int multiplier);
+
+ if (count == sizeof(int)) {
+ unsigned int multiplier;
+
+ if (copy_from_user(&multiplier, buf, sizeof(int)))
+ return -EFAULT;
+
+ if (setup_profiling_timer(multiplier))
+ return -EINVAL;
+ }
+#endif
+ profile_discard_flip_buffers();
+ memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
+ return count;
+}
+
+static const struct file_operations proc_profile_operations = {
+ .read = read_profile,
+ .write = write_profile,
+ .llseek = default_llseek,
+};
+
+#ifdef CONFIG_SMP
+static void profile_nop(void *unused)
+{
+}
+
+static int create_hash_tables(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ int node = cpu_to_mem(cpu);
+ struct page *page;
+
+ page = alloc_pages_exact_node(node,
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+ 0);
+ if (!page)
+ goto out_cleanup;
+ per_cpu(cpu_profile_hits, cpu)[1]
+ = (struct profile_hit *)page_address(page);
+ page = alloc_pages_exact_node(node,
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
+ 0);
+ if (!page)
+ goto out_cleanup;
+ per_cpu(cpu_profile_hits, cpu)[0]
+ = (struct profile_hit *)page_address(page);
+ }
+ return 0;
+out_cleanup:
+ prof_on = 0;
+ smp_mb();
+ on_each_cpu(profile_nop, NULL, 1);
+ for_each_online_cpu(cpu) {
+ struct page *page;
+
+ if (per_cpu(cpu_profile_hits, cpu)[0]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
+ per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+ __free_page(page);
+ }
+ if (per_cpu(cpu_profile_hits, cpu)[1]) {
+ page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
+ per_cpu(cpu_profile_hits, cpu)[1] = NULL;
+ __free_page(page);
+ }
+ }
+ return -1;
+}
+#else
+#define create_hash_tables() ({ 0; })
+#endif
+
+int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
+{
+ struct proc_dir_entry *entry;
+ int err = 0;
+
+ if (!prof_on)
+ return 0;
+
+ cpu_notifier_register_begin();
+
+ if (create_hash_tables()) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ entry = proc_create("profile", S_IWUSR | S_IRUGO,
+ NULL, &proc_profile_operations);
+ if (!entry)
+ goto out;
+ proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
+ __hotcpu_notifier(profile_cpu_callback, 0);
+
+out:
+ cpu_notifier_register_done();
+ return err;
+}
+subsys_initcall(create_proc_profile);
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
new file mode 100644
index 000000000..c8e0e050a
--- /dev/null
+++ b/kernel/ptrace.c
@@ -0,0 +1,1219 @@
+/*
+ * linux/kernel/ptrace.c
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Common interfaces for "ptrace()" which we do not want
+ * to continually duplicate across every architecture.
+ */
+
+#include <linux/capability.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/signal.h>
+#include <linux/uio.h>
+#include <linux/audit.h>
+#include <linux/pid_namespace.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/regset.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/cn_proc.h>
+#include <linux/compat.h>
+
+
+/*
+ * ptrace a task: make the debugger its new parent and
+ * move it to the ptrace list.
+ *
+ * Must be called with the tasklist lock write-held.
+ */
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+}
+
+/**
+ * __ptrace_unlink - unlink ptracee and restore its execution state
+ * @child: ptracee to be unlinked
+ *
+ * Remove @child from the ptrace list, move it back to the original parent,
+ * and restore the execution state so that it conforms to the group stop
+ * state.
+ *
+ * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
+ * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
+ * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
+ * If the ptracer is exiting, the ptracee can be in any state.
+ *
+ * After detach, the ptracee should be in a state which conforms to the
+ * group stop. If the group is stopped or in the process of stopping, the
+ * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
+ * up from TASK_TRACED.
+ *
+ * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
+ * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
+ * to but in the opposite direction of what happens while attaching to a
+ * stopped task. However, in this direction, the intermediate RUNNING
+ * state is not hidden even from the current ptracer and if it immediately
+ * re-attaches and performs a WNOHANG wait(2), it may fail.
+ *
+ * CONTEXT:
+ * write_lock_irq(tasklist_lock)
+ */
+void __ptrace_unlink(struct task_struct *child)
+{
+ BUG_ON(!child->ptrace);
+
+ child->ptrace = 0;
+ child->parent = child->real_parent;
+ list_del_init(&child->ptrace_entry);
+
+ spin_lock(&child->sighand->siglock);
+
+ /*
+ * Clear all pending traps and TRAPPING. TRAPPING should be
+ * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
+ */
+ task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
+ task_clear_jobctl_trapping(child);
+
+ /*
+ * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
+ * @child isn't dead.
+ */
+ if (!(child->flags & PF_EXITING) &&
+ (child->signal->flags & SIGNAL_STOP_STOPPED ||
+ child->signal->group_stop_count)) {
+ child->jobctl |= JOBCTL_STOP_PENDING;
+
+ /*
+ * This is only possible if this thread was cloned by the
+ * traced task running in the stopped group, set the signal
+ * for the future reports.
+ * FIXME: we should change ptrace_init_task() to handle this
+ * case.
+ */
+ if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
+ child->jobctl |= SIGSTOP;
+ }
+
+ /*
+ * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
+ * @child in the butt. Note that @resume should be used iff @child
+ * is in TASK_TRACED; otherwise, we might unduly disrupt
+ * TASK_KILLABLE sleeps.
+ */
+ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
+ ptrace_signal_wake_up(child, true);
+
+ spin_unlock(&child->sighand->siglock);
+}
+
+/* Ensure that nothing can wake it up, even SIGKILL */
+static bool ptrace_freeze_traced(struct task_struct *task)
+{
+ bool ret = false;
+
+ /* Lockless, nobody but us can set this flag */
+ if (task->jobctl & JOBCTL_LISTENING)
+ return ret;
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ task->state = __TASK_TRACED;
+ ret = true;
+ }
+ spin_unlock_irq(&task->sighand->siglock);
+
+ return ret;
+}
+
+static void ptrace_unfreeze_traced(struct task_struct *task)
+{
+ if (task->state != __TASK_TRACED)
+ return;
+
+ WARN_ON(!task->ptrace || task->parent != current);
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ spin_unlock_irq(&task->sighand->siglock);
+}
+
+/**
+ * ptrace_check_attach - check whether ptracee is ready for ptrace operation
+ * @child: ptracee to check for
+ * @ignore_state: don't check whether @child is currently %TASK_TRACED
+ *
+ * Check whether @child is being ptraced by %current and ready for further
+ * ptrace operations. If @ignore_state is %false, @child also should be in
+ * %TASK_TRACED state and on return the child is guaranteed to be traced
+ * and not executing. If @ignore_state is %true, @child can be in any
+ * state.
+ *
+ * CONTEXT:
+ * Grabs and releases tasklist_lock and @child->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 on success, -ESRCH if %child is not ready.
+ */
+static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
+{
+ int ret = -ESRCH;
+
+ /*
+ * We take the read lock around doing both checks to close a
+ * possible race where someone else was tracing our child and
+ * detached between these two checks. After this locked check,
+ * we are sure that this is our traced child and that can only
+ * be changed by us so it's not changing right after this.
+ */
+ read_lock(&tasklist_lock);
+ if (child->ptrace && child->parent == current) {
+ WARN_ON(child->state == __TASK_TRACED);
+ /*
+ * child->sighand can't be NULL, release_task()
+ * does ptrace_unlink() before __exit_signal().
+ */
+ if (ignore_state || ptrace_freeze_traced(child))
+ ret = 0;
+ }
+ read_unlock(&tasklist_lock);
+
+ if (!ret && !ignore_state) {
+ if (!wait_task_inactive(child, __TASK_TRACED)) {
+ /*
+ * This can only happen if may_ptrace_stop() fails and
+ * ptrace_stop() changes ->state back to TASK_RUNNING,
+ * so we should not worry about leaking __TASK_TRACED.
+ */
+ WARN_ON(child->state == __TASK_TRACED);
+ ret = -ESRCH;
+ }
+ }
+
+ return ret;
+}
+
+static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+{
+ if (mode & PTRACE_MODE_NOAUDIT)
+ return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ else
+ return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+}
+
+/* Returns 0 on success, -errno on denial. */
+static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+{
+ const struct cred *cred = current_cred(), *tcred;
+
+ /* May we inspect the given task?
+ * This check is used both for attaching with ptrace
+ * and for allowing access to sensitive information in /proc.
+ *
+ * ptrace_attach denies several cases that /proc allows
+ * because setting up the necessary parent/child relationship
+ * or halting the specified task is impossible.
+ */
+ int dumpable = 0;
+ /* Don't let security modules deny introspection */
+ if (same_thread_group(task, current))
+ return 0;
+ rcu_read_lock();
+ tcred = __task_cred(task);
+ if (uid_eq(cred->uid, tcred->euid) &&
+ uid_eq(cred->uid, tcred->suid) &&
+ uid_eq(cred->uid, tcred->uid) &&
+ gid_eq(cred->gid, tcred->egid) &&
+ gid_eq(cred->gid, tcred->sgid) &&
+ gid_eq(cred->gid, tcred->gid))
+ goto ok;
+ if (ptrace_has_cap(tcred->user_ns, mode))
+ goto ok;
+ rcu_read_unlock();
+ return -EPERM;
+ok:
+ rcu_read_unlock();
+ smp_rmb();
+ if (task->mm)
+ dumpable = get_dumpable(task->mm);
+ rcu_read_lock();
+ if (dumpable != SUID_DUMP_USER &&
+ !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+ rcu_read_unlock();
+
+ return security_ptrace_access_check(task, mode);
+}
+
+bool ptrace_may_access(struct task_struct *task, unsigned int mode)
+{
+ int err;
+ task_lock(task);
+ err = __ptrace_may_access(task, mode);
+ task_unlock(task);
+ return !err;
+}
+
+static int ptrace_attach(struct task_struct *task, long request,
+ unsigned long addr,
+ unsigned long flags)
+{
+ bool seize = (request == PTRACE_SEIZE);
+ int retval;
+
+ retval = -EIO;
+ if (seize) {
+ if (addr != 0)
+ goto out;
+ if (flags & ~(unsigned long)PTRACE_O_MASK)
+ goto out;
+ flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
+ } else {
+ flags = PT_PTRACED;
+ }
+
+ audit_ptrace(task);
+
+ retval = -EPERM;
+ if (unlikely(task->flags & PF_KTHREAD))
+ goto out;
+ if (same_thread_group(task, current))
+ goto out;
+
+ /*
+ * Protect exec's credential calculations against our interference;
+ * SUID, SGID and LSM creds get determined differently
+ * under ptrace.
+ */
+ retval = -ERESTARTNOINTR;
+ if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
+ goto out;
+
+ task_lock(task);
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+ task_unlock(task);
+ if (retval)
+ goto unlock_creds;
+
+ write_lock_irq(&tasklist_lock);
+ retval = -EPERM;
+ if (unlikely(task->exit_state))
+ goto unlock_tasklist;
+ if (task->ptrace)
+ goto unlock_tasklist;
+
+ if (seize)
+ flags |= PT_SEIZED;
+ rcu_read_lock();
+ if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
+ flags |= PT_PTRACE_CAP;
+ rcu_read_unlock();
+ task->ptrace = flags;
+
+ __ptrace_link(task, current);
+
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+
+ spin_lock(&task->sighand->siglock);
+
+ /*
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
+ signal_wake_up_state(task, __TASK_STOPPED);
+
+ spin_unlock(&task->sighand->siglock);
+
+ retval = 0;
+unlock_tasklist:
+ write_unlock_irq(&tasklist_lock);
+unlock_creds:
+ mutex_unlock(&task->signal->cred_guard_mutex);
+out:
+ if (!retval) {
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
+ TASK_UNINTERRUPTIBLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
+ }
+
+ return retval;
+}
+
+/**
+ * ptrace_traceme -- helper for PTRACE_TRACEME
+ *
+ * Performs checks and sets PT_PTRACED.
+ * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */
+static int ptrace_traceme(void)
+{
+ int ret = -EPERM;
+
+ write_lock_irq(&tasklist_lock);
+ /* Are we already being traced? */
+ if (!current->ptrace) {
+ ret = security_ptrace_traceme(current->parent);
+ /*
+ * Check PF_EXITING to ensure ->real_parent has not passed
+ * exit_ptrace(). Otherwise we don't report the error but
+ * pretend ->real_parent untraces us right after return.
+ */
+ if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+ current->ptrace = PT_PTRACED;
+ __ptrace_link(current, current->real_parent);
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+
+ return ret;
+}
+
+/*
+ * Called with irqs disabled, returns true if childs should reap themselves.
+ */
+static int ignoring_children(struct sighand_struct *sigh)
+{
+ int ret;
+ spin_lock(&sigh->siglock);
+ ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
+ (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
+ spin_unlock(&sigh->siglock);
+ return ret;
+}
+
+/*
+ * Called with tasklist_lock held for writing.
+ * Unlink a traced task, and clean it up if it was a traced zombie.
+ * Return true if it needs to be reaped with release_task().
+ * (We can't call release_task() here because we already hold tasklist_lock.)
+ *
+ * If it's a zombie, our attachedness prevented normal parent notification
+ * or self-reaping. Do notification now if it would have happened earlier.
+ * If it should reap itself, return true.
+ *
+ * If it's our own child, there is no notification to do. But if our normal
+ * children self-reap, then this child was prevented by ptrace and we must
+ * reap it now, in that case we must also wake up sub-threads sleeping in
+ * do_wait().
+ */
+static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
+{
+ bool dead;
+
+ __ptrace_unlink(p);
+
+ if (p->exit_state != EXIT_ZOMBIE)
+ return false;
+
+ dead = !thread_group_leader(p);
+
+ if (!dead && thread_group_empty(p)) {
+ if (!same_thread_group(p->real_parent, tracer))
+ dead = do_notify_parent(p, p->exit_signal);
+ else if (ignoring_children(tracer->sighand)) {
+ __wake_up_parent(p, tracer);
+ dead = true;
+ }
+ }
+ /* Mark it as in the process of being reaped. */
+ if (dead)
+ p->exit_state = EXIT_DEAD;
+ return dead;
+}
+
+static int ptrace_detach(struct task_struct *child, unsigned int data)
+{
+ if (!valid_signal(data))
+ return -EIO;
+
+ /* Architecture-specific hardware disable .. */
+ ptrace_disable(child);
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
+ write_lock_irq(&tasklist_lock);
+ /*
+ * We rely on ptrace_freeze_traced(). It can't be killed and
+ * untraced by another thread, it can't be a zombie.
+ */
+ WARN_ON(!child->ptrace || child->exit_state);
+ /*
+ * tasklist_lock avoids the race with wait_task_stopped(), see
+ * the comment in ptrace_resume().
+ */
+ child->exit_code = data;
+ __ptrace_detach(current, child);
+ write_unlock_irq(&tasklist_lock);
+
+ proc_ptrace_connector(child, PTRACE_DETACH);
+
+ return 0;
+}
+
+/*
+ * Detach all tasks we were using ptrace on. Called with tasklist held
+ * for writing.
+ */
+void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
+ if (unlikely(p->ptrace & PT_EXITKILL))
+ send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
+
+ if (__ptrace_detach(tracer, p))
+ list_add(&p->ptrace_entry, dead);
+ }
+}
+
+int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
+{
+ int copied = 0;
+
+ while (len > 0) {
+ char buf[128];
+ int this_len, retval;
+
+ this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+ retval = access_process_vm(tsk, src, buf, this_len, 0);
+ if (!retval) {
+ if (copied)
+ break;
+ return -EIO;
+ }
+ if (copy_to_user(dst, buf, retval))
+ return -EFAULT;
+ copied += retval;
+ src += retval;
+ dst += retval;
+ len -= retval;
+ }
+ return copied;
+}
+
+int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
+{
+ int copied = 0;
+
+ while (len > 0) {
+ char buf[128];
+ int this_len, retval;
+
+ this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+ if (copy_from_user(buf, src, this_len))
+ return -EFAULT;
+ retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ if (!retval) {
+ if (copied)
+ break;
+ return -EIO;
+ }
+ copied += retval;
+ src += retval;
+ dst += retval;
+ len -= retval;
+ }
+ return copied;
+}
+
+static int ptrace_setoptions(struct task_struct *child, unsigned long data)
+{
+ unsigned flags;
+
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
+
+ /* Avoid intermediate state when all opts are cleared */
+ flags = child->ptrace;
+ flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
+ flags |= (data << PT_OPT_FLAG_SHIFT);
+ child->ptrace = flags;
+
+ return 0;
+}
+
+static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
+{
+ unsigned long flags;
+ int error = -ESRCH;
+
+ if (lock_task_sighand(child, &flags)) {
+ error = -EINVAL;
+ if (likely(child->last_siginfo != NULL)) {
+ *info = *child->last_siginfo;
+ error = 0;
+ }
+ unlock_task_sighand(child, &flags);
+ }
+ return error;
+}
+
+static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
+{
+ unsigned long flags;
+ int error = -ESRCH;
+
+ if (lock_task_sighand(child, &flags)) {
+ error = -EINVAL;
+ if (likely(child->last_siginfo != NULL)) {
+ *child->last_siginfo = *info;
+ error = 0;
+ }
+ unlock_task_sighand(child, &flags);
+ }
+ return error;
+}
+
+static int ptrace_peek_siginfo(struct task_struct *child,
+ unsigned long addr,
+ unsigned long data)
+{
+ struct ptrace_peeksiginfo_args arg;
+ struct sigpending *pending;
+ struct sigqueue *q;
+ int ret, i;
+
+ ret = copy_from_user(&arg, (void __user *) addr,
+ sizeof(struct ptrace_peeksiginfo_args));
+ if (ret)
+ return -EFAULT;
+
+ if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
+ return -EINVAL; /* unknown flags */
+
+ if (arg.nr < 0)
+ return -EINVAL;
+
+ if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
+ pending = &child->signal->shared_pending;
+ else
+ pending = &child->pending;
+
+ for (i = 0; i < arg.nr; ) {
+ siginfo_t info;
+ s32 off = arg.off + i;
+
+ spin_lock_irq(&child->sighand->siglock);
+ list_for_each_entry(q, &pending->list, list) {
+ if (!off--) {
+ copy_siginfo(&info, &q->info);
+ break;
+ }
+ }
+ spin_unlock_irq(&child->sighand->siglock);
+
+ if (off >= 0) /* beyond the end of the list */
+ break;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(is_compat_task())) {
+ compat_siginfo_t __user *uinfo = compat_ptr(data);
+
+ if (copy_siginfo_to_user32(uinfo, &info) ||
+ __put_user(info.si_code, &uinfo->si_code)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ } else
+#endif
+ {
+ siginfo_t __user *uinfo = (siginfo_t __user *) data;
+
+ if (copy_siginfo_to_user(uinfo, &info) ||
+ __put_user(info.si_code, &uinfo->si_code)) {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ data += sizeof(siginfo_t);
+ i++;
+
+ if (signal_pending(current))
+ break;
+
+ cond_resched();
+ }
+
+ if (i > 0)
+ return i;
+
+ return ret;
+}
+
+#ifdef PTRACE_SINGLESTEP
+#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
+#else
+#define is_singlestep(request) 0
+#endif
+
+#ifdef PTRACE_SINGLEBLOCK
+#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
+#else
+#define is_singleblock(request) 0
+#endif
+
+#ifdef PTRACE_SYSEMU
+#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
+#else
+#define is_sysemu_singlestep(request) 0
+#endif
+
+static int ptrace_resume(struct task_struct *child, long request,
+ unsigned long data)
+{
+ bool need_siglock;
+
+ if (!valid_signal(data))
+ return -EIO;
+
+ if (request == PTRACE_SYSCALL)
+ set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ else
+ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+
+#ifdef TIF_SYSCALL_EMU
+ if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
+ set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ else
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
+
+ if (is_singleblock(request)) {
+ if (unlikely(!arch_has_block_step()))
+ return -EIO;
+ user_enable_block_step(child);
+ } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
+ if (unlikely(!arch_has_single_step()))
+ return -EIO;
+ user_enable_single_step(child);
+ } else {
+ user_disable_single_step(child);
+ }
+
+ /*
+ * Change ->exit_code and ->state under siglock to avoid the race
+ * with wait_task_stopped() in between; a non-zero ->exit_code will
+ * wrongly look like another report from tracee.
+ *
+ * Note that we need siglock even if ->exit_code == data and/or this
+ * status was not reported yet, the new status must not be cleared by
+ * wait_task_stopped() after resume.
+ *
+ * If data == 0 we do not care if wait_task_stopped() reports the old
+ * status and clears the code too; this can't race with the tracee, it
+ * takes siglock after resume.
+ */
+ need_siglock = data && !thread_group_empty(current);
+ if (need_siglock)
+ spin_lock_irq(&child->sighand->siglock);
+ child->exit_code = data;
+ wake_up_state(child, __TASK_TRACED);
+ if (need_siglock)
+ spin_unlock_irq(&child->sighand->siglock);
+
+ return 0;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+
+static const struct user_regset *
+find_regset(const struct user_regset_view *view, unsigned int type)
+{
+ const struct user_regset *regset;
+ int n;
+
+ for (n = 0; n < view->n; ++n) {
+ regset = view->regsets + n;
+ if (regset->core_note_type == type)
+ return regset;
+ }
+
+ return NULL;
+}
+
+static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
+ struct iovec *kiov)
+{
+ const struct user_regset_view *view = task_user_regset_view(task);
+ const struct user_regset *regset = find_regset(view, type);
+ int regset_no;
+
+ if (!regset || (kiov->iov_len % regset->size) != 0)
+ return -EINVAL;
+
+ regset_no = regset - view->regsets;
+ kiov->iov_len = min(kiov->iov_len,
+ (__kernel_size_t) (regset->n * regset->size));
+
+ if (req == PTRACE_GETREGSET)
+ return copy_regset_to_user(task, view, regset_no, 0,
+ kiov->iov_len, kiov->iov_base);
+ else
+ return copy_regset_from_user(task, view, regset_no, 0,
+ kiov->iov_len, kiov->iov_base);
+}
+
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code. We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
+#endif
+
+int ptrace_request(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
+{
+ bool seized = child->ptrace & PT_SEIZED;
+ int ret = -EIO;
+ siginfo_t siginfo, *si;
+ void __user *datavp = (void __user *) data;
+ unsigned long __user *datalp = datavp;
+ unsigned long flags;
+
+ switch (request) {
+ case PTRACE_PEEKTEXT:
+ case PTRACE_PEEKDATA:
+ return generic_ptrace_peekdata(child, addr, data);
+ case PTRACE_POKETEXT:
+ case PTRACE_POKEDATA:
+ return generic_ptrace_pokedata(child, addr, data);
+
+#ifdef PTRACE_OLDSETOPTIONS
+ case PTRACE_OLDSETOPTIONS:
+#endif
+ case PTRACE_SETOPTIONS:
+ ret = ptrace_setoptions(child, data);
+ break;
+ case PTRACE_GETEVENTMSG:
+ ret = put_user(child->ptrace_message, datalp);
+ break;
+
+ case PTRACE_PEEKSIGINFO:
+ ret = ptrace_peek_siginfo(child, addr, data);
+ break;
+
+ case PTRACE_GETSIGINFO:
+ ret = ptrace_getsiginfo(child, &siginfo);
+ if (!ret)
+ ret = copy_siginfo_to_user(datavp, &siginfo);
+ break;
+
+ case PTRACE_SETSIGINFO:
+ if (copy_from_user(&siginfo, datavp, sizeof siginfo))
+ ret = -EFAULT;
+ else
+ ret = ptrace_setsiginfo(child, &siginfo);
+ break;
+
+ case PTRACE_GETSIGMASK:
+ if (addr != sizeof(sigset_t)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ ret = -EFAULT;
+ else
+ ret = 0;
+
+ break;
+
+ case PTRACE_SETSIGMASK: {
+ sigset_t new_set;
+
+ if (addr != sizeof(sigset_t)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ /*
+ * Every thread does recalc_sigpending() after resume, so
+ * retarget_shared_pending() and recalc_sigpending() are not
+ * called here.
+ */
+ spin_lock_irq(&child->sighand->siglock);
+ child->blocked = new_set;
+ spin_unlock_irq(&child->sighand->siglock);
+
+ ret = 0;
+ break;
+ }
+
+ case PTRACE_INTERRUPT:
+ /*
+ * Stop tracee without any side-effect on signal or job
+ * control. At least one trap is guaranteed to happen
+ * after this request. If @child is already trapped, the
+ * current trap is not disturbed and another trap will
+ * happen after the current trap is ended with PTRACE_CONT.
+ *
+ * The actual trap might not be PTRACE_EVENT_STOP trap but
+ * the pending condition is cleared regardless.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ /*
+ * INTERRUPT doesn't disturb existing trap sans one
+ * exception. If ptracer issued LISTEN for the current
+ * STOP, this INTERRUPT should clear LISTEN and re-trap
+ * tracee into STOP.
+ */
+ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
+ ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+
+ unlock_task_sighand(child, &flags);
+ ret = 0;
+ break;
+
+ case PTRACE_LISTEN:
+ /*
+ * Listen for events. Tracee must be in STOP. It's not
+ * resumed per-se but is not considered to be in TRACED by
+ * wait(2) or ptrace(2). If an async event (e.g. group
+ * stop state change) happens, tracee will enter STOP trap
+ * again. Alternatively, ptracer can issue INTERRUPT to
+ * finish listening and re-trap tracee into STOP.
+ */
+ if (unlikely(!seized || !lock_task_sighand(child, &flags)))
+ break;
+
+ si = child->last_siginfo;
+ if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
+ child->jobctl |= JOBCTL_LISTENING;
+ /*
+ * If NOTIFY is set, it means event happened between
+ * start of this trap and now. Trigger re-trap.
+ */
+ if (child->jobctl & JOBCTL_TRAP_NOTIFY)
+ ptrace_signal_wake_up(child, true);
+ ret = 0;
+ }
+ unlock_task_sighand(child, &flags);
+ break;
+
+ case PTRACE_DETACH: /* detach a process that was attached. */
+ ret = ptrace_detach(child, data);
+ break;
+
+#ifdef CONFIG_BINFMT_ELF_FDPIC
+ case PTRACE_GETFDPIC: {
+ struct mm_struct *mm = get_task_mm(child);
+ unsigned long tmp = 0;
+
+ ret = -ESRCH;
+ if (!mm)
+ break;
+
+ switch (addr) {
+ case PTRACE_GETFDPIC_EXEC:
+ tmp = mm->context.exec_fdpic_loadmap;
+ break;
+ case PTRACE_GETFDPIC_INTERP:
+ tmp = mm->context.interp_fdpic_loadmap;
+ break;
+ default:
+ break;
+ }
+ mmput(mm);
+
+ ret = put_user(tmp, datalp);
+ break;
+ }
+#endif
+
+#ifdef PTRACE_SINGLESTEP
+ case PTRACE_SINGLESTEP:
+#endif
+#ifdef PTRACE_SINGLEBLOCK
+ case PTRACE_SINGLEBLOCK:
+#endif
+#ifdef PTRACE_SYSEMU
+ case PTRACE_SYSEMU:
+ case PTRACE_SYSEMU_SINGLESTEP:
+#endif
+ case PTRACE_SYSCALL:
+ case PTRACE_CONT:
+ return ptrace_resume(child, request, data);
+
+ case PTRACE_KILL:
+ if (child->exit_state) /* already dead */
+ return 0;
+ return ptrace_resume(child, request, SIGKILL);
+
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ case PTRACE_GETREGSET:
+ case PTRACE_SETREGSET: {
+ struct iovec kiov;
+ struct iovec __user *uiov = datavp;
+
+ if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+ return -EFAULT;
+
+ if (__get_user(kiov.iov_base, &uiov->iov_base) ||
+ __get_user(kiov.iov_len, &uiov->iov_len))
+ return -EFAULT;
+
+ ret = ptrace_regset(child, request, addr, &kiov);
+ if (!ret)
+ ret = __put_user(kiov.iov_len, &uiov->iov_len);
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static struct task_struct *ptrace_get_task_struct(pid_t pid)
+{
+ struct task_struct *child;
+
+ rcu_read_lock();
+ child = find_task_by_vpid(pid);
+ if (child)
+ get_task_struct(child);
+ rcu_read_unlock();
+
+ if (!child)
+ return ERR_PTR(-ESRCH);
+ return child;
+}
+
+#ifndef arch_ptrace_attach
+#define arch_ptrace_attach(child) do { } while (0)
+#endif
+
+SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
+ unsigned long, data)
+{
+ struct task_struct *child;
+ long ret;
+
+ if (request == PTRACE_TRACEME) {
+ ret = ptrace_traceme();
+ if (!ret)
+ arch_ptrace_attach(current);
+ goto out;
+ }
+
+ child = ptrace_get_task_struct(pid);
+ if (IS_ERR(child)) {
+ ret = PTR_ERR(child);
+ goto out;
+ }
+
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, addr, data);
+ /*
+ * Some architectures need to do book-keeping after
+ * a ptrace attach.
+ */
+ if (!ret)
+ arch_ptrace_attach(child);
+ goto out_put_task_struct;
+ }
+
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
+ if (ret < 0)
+ goto out_put_task_struct;
+
+ ret = arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
+
+ out_put_task_struct:
+ put_task_struct(child);
+ out:
+ return ret;
+}
+
+int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
+ unsigned long data)
+{
+ unsigned long tmp;
+ int copied;
+
+ copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+ if (copied != sizeof(tmp))
+ return -EIO;
+ return put_user(tmp, (unsigned long __user *)data);
+}
+
+int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
+ unsigned long data)
+{
+ int copied;
+
+ copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+ return (copied == sizeof(data)) ? 0 : -EIO;
+}
+
+#if defined CONFIG_COMPAT
+
+int compat_ptrace_request(struct task_struct *child, compat_long_t request,
+ compat_ulong_t addr, compat_ulong_t data)
+{
+ compat_ulong_t __user *datap = compat_ptr(data);
+ compat_ulong_t word;
+ siginfo_t siginfo;
+ int ret;
+
+ switch (request) {
+ case PTRACE_PEEKTEXT:
+ case PTRACE_PEEKDATA:
+ ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+ if (ret != sizeof(word))
+ ret = -EIO;
+ else
+ ret = put_user(word, datap);
+ break;
+
+ case PTRACE_POKETEXT:
+ case PTRACE_POKEDATA:
+ ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+ ret = (ret != sizeof(data) ? -EIO : 0);
+ break;
+
+ case PTRACE_GETEVENTMSG:
+ ret = put_user((compat_ulong_t) child->ptrace_message, datap);
+ break;
+
+ case PTRACE_GETSIGINFO:
+ ret = ptrace_getsiginfo(child, &siginfo);
+ if (!ret)
+ ret = copy_siginfo_to_user32(
+ (struct compat_siginfo __user *) datap,
+ &siginfo);
+ break;
+
+ case PTRACE_SETSIGINFO:
+ memset(&siginfo, 0, sizeof siginfo);
+ if (copy_siginfo_from_user32(
+ &siginfo, (struct compat_siginfo __user *) datap))
+ ret = -EFAULT;
+ else
+ ret = ptrace_setsiginfo(child, &siginfo);
+ break;
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ case PTRACE_GETREGSET:
+ case PTRACE_SETREGSET:
+ {
+ struct iovec kiov;
+ struct compat_iovec __user *uiov =
+ (struct compat_iovec __user *) datap;
+ compat_uptr_t ptr;
+ compat_size_t len;
+
+ if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+ return -EFAULT;
+
+ if (__get_user(ptr, &uiov->iov_base) ||
+ __get_user(len, &uiov->iov_len))
+ return -EFAULT;
+
+ kiov.iov_base = compat_ptr(ptr);
+ kiov.iov_len = len;
+
+ ret = ptrace_regset(child, request, addr, &kiov);
+ if (!ret)
+ ret = __put_user(kiov.iov_len, &uiov->iov_len);
+ break;
+ }
+#endif
+
+ default:
+ ret = ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
+ compat_long_t, addr, compat_long_t, data)
+{
+ struct task_struct *child;
+ long ret;
+
+ if (request == PTRACE_TRACEME) {
+ ret = ptrace_traceme();
+ goto out;
+ }
+
+ child = ptrace_get_task_struct(pid);
+ if (IS_ERR(child)) {
+ ret = PTR_ERR(child);
+ goto out;
+ }
+
+ if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
+ ret = ptrace_attach(child, request, addr, data);
+ /*
+ * Some architectures need to do book-keeping after
+ * a ptrace attach.
+ */
+ if (!ret)
+ arch_ptrace_attach(child);
+ goto out_put_task_struct;
+ }
+
+ ret = ptrace_check_attach(child, request == PTRACE_KILL ||
+ request == PTRACE_INTERRUPT);
+ if (!ret) {
+ ret = compat_arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
+ }
+
+ out_put_task_struct:
+ put_task_struct(child);
+ out:
+ return ret;
+}
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000..82cfc285b
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
+/*
+ * Range add and subtract
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sort.h>
+#include <linux/string.h>
+#include <linux/range.h>
+
+int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
+{
+ if (start >= end)
+ return nr_range;
+
+ /* Out of slots: */
+ if (nr_range >= az)
+ return nr_range;
+
+ range[nr_range].start = start;
+ range[nr_range].end = end;
+
+ nr_range++;
+
+ return nr_range;
+}
+
+int add_range_with_merge(struct range *range, int az, int nr_range,
+ u64 start, u64 end)
+{
+ int i;
+
+ if (start >= end)
+ return nr_range;
+
+ /* get new start/end: */
+ for (i = 0; i < nr_range; i++) {
+ u64 common_start, common_end;
+
+ if (!range[i].end)
+ continue;
+
+ common_start = max(range[i].start, start);
+ common_end = min(range[i].end, end);
+ if (common_start > common_end)
+ continue;
+
+ /* new start/end, will add it back at last */
+ start = min(range[i].start, start);
+ end = max(range[i].end, end);
+
+ memmove(&range[i], &range[i + 1],
+ (nr_range - (i + 1)) * sizeof(range[i]));
+ range[nr_range - 1].start = 0;
+ range[nr_range - 1].end = 0;
+ nr_range--;
+ i--;
+ }
+
+ /* Need to add it: */
+ return add_range(range, az, nr_range, start, end);
+}
+
+void subtract_range(struct range *range, int az, u64 start, u64 end)
+{
+ int i, j;
+
+ if (start >= end)
+ return;
+
+ for (j = 0; j < az; j++) {
+ if (!range[j].end)
+ continue;
+
+ if (start <= range[j].start && end >= range[j].end) {
+ range[j].start = 0;
+ range[j].end = 0;
+ continue;
+ }
+
+ if (start <= range[j].start && end < range[j].end &&
+ range[j].start < end) {
+ range[j].start = end;
+ continue;
+ }
+
+
+ if (start > range[j].start && end >= range[j].end &&
+ range[j].end > start) {
+ range[j].end = start;
+ continue;
+ }
+
+ if (start > range[j].start && end < range[j].end) {
+ /* Find the new spare: */
+ for (i = 0; i < az; i++) {
+ if (range[i].end == 0)
+ break;
+ }
+ if (i < az) {
+ range[i].end = range[j].end;
+ range[i].start = end;
+ } else {
+ pr_err("%s: run out of slot in ranges\n",
+ __func__);
+ }
+ range[j].end = start;
+ continue;
+ }
+ }
+}
+
+static int cmp_range(const void *x1, const void *x2)
+{
+ const struct range *r1 = x1;
+ const struct range *r2 = x2;
+
+ if (r1->start < r2->start)
+ return -1;
+ if (r1->start > r2->start)
+ return 1;
+ return 0;
+}
+
+int clean_sort_range(struct range *range, int az)
+{
+ int i, j, k = az - 1, nr_range = az;
+
+ for (i = 0; i < k; i++) {
+ if (range[i].end)
+ continue;
+ for (j = k; j > i; j--) {
+ if (range[j].end) {
+ k = j;
+ break;
+ }
+ }
+ if (j == i)
+ break;
+ range[i].start = range[k].start;
+ range[i].end = range[k].end;
+ range[k].start = 0;
+ range[k].end = 0;
+ k--;
+ }
+ /* count it */
+ for (i = 0; i < az; i++) {
+ if (!range[i].end) {
+ nr_range = i;
+ break;
+ }
+ }
+
+ /* sort them */
+ sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
+
+ return nr_range;
+}
+
+void sort_range(struct range *range, int nr_range)
+{
+ /* sort them */
+ sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
+}
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000..50a808424
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,7 @@
+obj-y += update.o
+obj-$(CONFIG_SRCU) += srcu.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_TREE_RCU) += tree.o
+obj-$(CONFIG_PREEMPT_RCU) += tree.o
+obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
+obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
new file mode 100644
index 000000000..80adef7d4
--- /dev/null
+++ b/kernel/rcu/rcu.h
@@ -0,0 +1,146 @@
+/*
+ * Read-Copy Update definitions shared among RCU implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2011
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#ifndef __LINUX_RCU_H
+#define __LINUX_RCU_H
+
+#include <trace/events/rcu.h>
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Process-level increment to ->dynticks_nesting field. This allows for
+ * architectures that use half-interrupts and half-exceptions from
+ * process context.
+ *
+ * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
+ * that counts the number of process-based reasons why RCU cannot
+ * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
+ * is the value used to increment or decrement this field.
+ *
+ * The rest of the bits could in principle be used to count interrupts,
+ * but this would mean that a negative-one value in the interrupt
+ * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
+ * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
+ * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
+ * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
+ * initial exit from idle.
+ */
+#define DYNTICK_TASK_NEST_WIDTH 7
+#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
+#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
+#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
+#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
+#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
+ DYNTICK_TASK_FLAG)
+
+/*
+ * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
+ * by call_rcu() and rcu callback execution, and are therefore not part of the
+ * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
+ */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+# define STATE_RCU_HEAD_READY 0
+# define STATE_RCU_HEAD_QUEUED 1
+
+extern struct debug_obj_descr rcuhead_debug_descr;
+
+static inline int debug_rcu_head_queue(struct rcu_head *head)
+{
+ int r1;
+
+ r1 = debug_object_activate(head, &rcuhead_debug_descr);
+ debug_object_active_state(head, &rcuhead_debug_descr,
+ STATE_RCU_HEAD_READY,
+ STATE_RCU_HEAD_QUEUED);
+ return r1;
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+ debug_object_active_state(head, &rcuhead_debug_descr,
+ STATE_RCU_HEAD_QUEUED,
+ STATE_RCU_HEAD_READY);
+ debug_object_deactivate(head, &rcuhead_debug_descr);
+}
+#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline int debug_rcu_head_queue(struct rcu_head *head)
+{
+ return 0;
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+}
+#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+void kfree(const void *);
+
+/*
+ * Reclaim the specified callback, either by invoking it (non-lazy case)
+ * or freeing it directly (lazy case). Return true if lazy, false otherwise.
+ */
+static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
+{
+ unsigned long offset = (unsigned long)head->func;
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (__is_kfree_rcu_offset(offset)) {
+ RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
+ kfree((void *)head - offset);
+ rcu_lock_release(&rcu_callback_map);
+ return true;
+ } else {
+ RCU_TRACE(trace_rcu_invoke_callback(rn, head));
+ head->func(head);
+ rcu_lock_release(&rcu_callback_map);
+ return false;
+ }
+}
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+extern int rcu_cpu_stall_suppress;
+int rcu_jiffies_till_stall_check(void);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+/*
+ * Strings used in tracepoints need to be exported via the
+ * tracing system such that tools like perf and trace-cmd can
+ * translate the string address pointers to actual text.
+ */
+#define TPS(x) tracepoint_string(x)
+
+void rcu_early_boot_tests(void);
+
+/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
+#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
new file mode 100644
index 000000000..8dbe27611
--- /dev/null
+++ b/kernel/rcu/rcutorture.c
@@ -0,0 +1,1861 @@
+/*
+ * Read-Copy Update module-based torture test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2005, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Josh Triplett <josh@joshtriplett.org>
+ *
+ * See also: Documentation/RCU/torture.txt
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+#include <linux/vmalloc.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+
+
+torture_param(int, cbflood_inter_holdoff, HZ,
+ "Holdoff between floods (jiffies)");
+torture_param(int, cbflood_intra_holdoff, 1,
+ "Holdoff between bursts (jiffies)");
+torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
+torture_param(int, cbflood_n_per_burst, 20000,
+ "# callbacks per burst in flood");
+torture_param(int, fqs_duration, 0,
+ "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
+torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
+torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
+torture_param(bool, gp_normal, false,
+ "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
+torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
+torture_param(int, n_barrier_cbs, 0,
+ "# of callbacks/kthreads for barrier testing");
+torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, object_debug, 0,
+ "Enable debug-object double call_rcu() testing");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
+torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
+torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
+torture_param(int, stall_cpu_holdoff, 10,
+ "Time to wait before starting stall (s).");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of seconds to run/halt test");
+torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+torture_param(int, test_boost_duration, 4,
+ "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7,
+ "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true,
+ "Test support for tickless idle CPUs");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "rcu";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
+
+static int nrealreaders;
+static int ncbflooders;
+static struct task_struct *writer_task;
+static struct task_struct **fakewriter_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *stats_task;
+static struct task_struct **cbflood_task;
+static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
+static struct task_struct *stall_task;
+static struct task_struct **barrier_cbs_tasks;
+static struct task_struct *barrier_task;
+
+#define RCU_TORTURE_PIPE_LEN 10
+
+struct rcu_torture {
+ struct rcu_head rtort_rcu;
+ int rtort_pipe_count;
+ struct list_head rtort_free;
+ int rtort_mbtest;
+};
+
+static LIST_HEAD(rcu_torture_freelist);
+static struct rcu_torture __rcu *rcu_torture_current;
+static unsigned long rcu_torture_current_version;
+static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
+static DEFINE_SPINLOCK(rcu_torture_lock);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_batch) = { 0 };
+static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static atomic_t n_rcu_torture_alloc;
+static atomic_t n_rcu_torture_alloc_fail;
+static atomic_t n_rcu_torture_free;
+static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_barrier_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
+static long n_rcu_torture_timers;
+static long n_barrier_attempts;
+static long n_barrier_successes;
+static atomic_long_t n_cbfloods;
+static struct list_head rcu_torture_removed;
+
+static int rcu_torture_writer_state;
+#define RTWS_FIXED_DELAY 0
+#define RTWS_DELAY 1
+#define RTWS_REPLACE 2
+#define RTWS_DEF_FREE 3
+#define RTWS_EXP_SYNC 4
+#define RTWS_COND_GET 5
+#define RTWS_COND_SYNC 6
+#define RTWS_SYNC 7
+#define RTWS_STUTTER 8
+#define RTWS_STOPPING 9
+
+#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
+#define RCUTORTURE_RUNNABLE_INIT 1
+#else
+#define RCUTORTURE_RUNNABLE_INIT 0
+#endif
+static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
+module_param(torture_runnable, int, 0444);
+MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
+
+#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
+#define rcu_can_boost() 1
+#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+#define rcu_can_boost() 0
+#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+
+#ifdef CONFIG_RCU_TRACE
+static u64 notrace rcu_trace_clock_local(void)
+{
+ u64 ts = trace_clock_local();
+ unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+ return ts;
+}
+#else /* #ifdef CONFIG_RCU_TRACE */
+static u64 notrace rcu_trace_clock_local(void)
+{
+ return 0ULL;
+}
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+static unsigned long boost_starttime; /* jiffies of next boost test start. */
+static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+ /* and boost task create/destroy. */
+static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
+static bool barrier_phase; /* Test phase. */
+static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
+static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
+static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
+
+/*
+ * Allocate an element from the rcu_tortures pool.
+ */
+static struct rcu_torture *
+rcu_torture_alloc(void)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&rcu_torture_lock);
+ if (list_empty(&rcu_torture_freelist)) {
+ atomic_inc(&n_rcu_torture_alloc_fail);
+ spin_unlock_bh(&rcu_torture_lock);
+ return NULL;
+ }
+ atomic_inc(&n_rcu_torture_alloc);
+ p = rcu_torture_freelist.next;
+ list_del_init(p);
+ spin_unlock_bh(&rcu_torture_lock);
+ return container_of(p, struct rcu_torture, rtort_free);
+}
+
+/*
+ * Free an element to the rcu_tortures pool.
+ */
+static void
+rcu_torture_free(struct rcu_torture *p)
+{
+ atomic_inc(&n_rcu_torture_free);
+ spin_lock_bh(&rcu_torture_lock);
+ list_add_tail(&p->rtort_free, &rcu_torture_freelist);
+ spin_unlock_bh(&rcu_torture_lock);
+}
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+ int ttype;
+ void (*init)(void);
+ int (*readlock)(void);
+ void (*read_delay)(struct torture_random_state *rrsp);
+ void (*readunlock)(int idx);
+ unsigned long (*started)(void);
+ unsigned long (*completed)(void);
+ void (*deferred_free)(struct rcu_torture *p);
+ void (*sync)(void);
+ void (*exp_sync)(void);
+ unsigned long (*get_state)(void);
+ void (*cond_sync)(unsigned long oldstate);
+ void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+ void (*cb_barrier)(void);
+ void (*fqs)(void);
+ void (*stats)(void);
+ int irq_capable;
+ int can_boost;
+ const char *name;
+};
+
+static struct rcu_torture_ops *cur_ops;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_read_delay(struct torture_random_state *rrsp)
+{
+ const unsigned long shortdelay_us = 200;
+ const unsigned long longdelay_ms = 50;
+
+ /* We want a short delay sometimes to make a reader delay the grace
+ * period, and we want a long delay occasionally to trigger
+ * force_quiescent_state. */
+
+ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!preempt_count() &&
+ !(torture_random(rrsp) % (nrealreaders * 20000)))
+ preempt_schedule(); /* No QS if preempt_disable() in effect */
+#endif
+}
+
+static void rcu_torture_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+/*
+ * Update callback in the pipe. This should be invoked after a grace period.
+ */
+static bool
+rcu_torture_pipe_update_one(struct rcu_torture *rp)
+{
+ int i;
+
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update all callbacks in the pipe. Suitable for synchronous grace-period
+ * primitives.
+ */
+static void
+rcu_torture_pipe_update(struct rcu_torture *old_rp)
+{
+ struct rcu_torture *rp;
+ struct rcu_torture *rp1;
+
+ if (old_rp)
+ list_add(&old_rp->rtort_free, &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+ if (rcu_torture_pipe_update_one(rp)) {
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+ struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+ if (torture_must_stop_irq()) {
+ /* Test is ending, just drop callbacks on the floor. */
+ /* The next initialization will pick up the pieces. */
+ return;
+ }
+ if (rcu_torture_pipe_update_one(rp))
+ rcu_torture_free(rp);
+ else
+ cur_ops->deferred_free(rp);
+}
+
+static unsigned long rcu_no_completed(void)
+{
+ return 0;
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static void rcu_sync_torture_init(void)
+{
+ INIT_LIST_HEAD(&rcu_torture_removed);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .started = rcu_batches_started,
+ .completed = rcu_batches_completed,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .get_state = get_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
+ .call = call_rcu,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .can_boost = rcu_can_boost(),
+ .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
+{
+ rcu_read_unlock_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+ .ttype = RCU_BH_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .started = rcu_batches_started_bh,
+ .completed = rcu_batches_completed_bh,
+ .deferred_free = rcu_bh_torture_deferred_free,
+ .sync = synchronize_rcu_bh,
+ .exp_sync = synchronize_rcu_bh_expedited,
+ .call = call_rcu_bh,
+ .cb_barrier = rcu_barrier_bh,
+ .fqs = rcu_bh_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh"
+};
+
+/*
+ * Don't even think about trying any of these in real life!!!
+ * The names includes "busted", and they really means it!
+ * The only purpose of these functions is to provide a buggy RCU
+ * implementation to make sure that rcutorture correctly emits
+ * buggy-RCU error messages.
+ */
+static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
+{
+ /* This is a deliberate bug for testing purposes only! */
+ rcu_torture_cb(&p->rtort_rcu);
+}
+
+static void synchronize_rcu_busted(void)
+{
+ /* This is a deliberate bug for testing purposes only! */
+}
+
+static void
+call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ /* This is a deliberate bug for testing purposes only! */
+ func(head);
+}
+
+static struct rcu_torture_ops rcu_busted_ops = {
+ .ttype = INVALID_RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock,
+ .started = rcu_no_completed,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_busted_torture_deferred_free,
+ .sync = synchronize_rcu_busted,
+ .exp_sync = synchronize_rcu_busted,
+ .call = call_rcu_busted,
+ .cb_barrier = NULL,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_busted"
+};
+
+/*
+ * Definitions for srcu torture testing.
+ */
+
+DEFINE_STATIC_SRCU(srcu_ctl);
+
+static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+{
+ return srcu_read_lock(&srcu_ctl);
+}
+
+static void srcu_read_delay(struct torture_random_state *rrsp)
+{
+ long delay;
+ const long uspertick = 1000000 / HZ;
+ const long longdelay = 10;
+
+ /* We want there to be long-running readers, but not all the time. */
+
+ delay = torture_random(rrsp) %
+ (nrealreaders * 2 * longdelay * uspertick);
+ if (!delay)
+ schedule_timeout_interruptible(longdelay);
+ else
+ rcu_read_delay(rrsp);
+}
+
+static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+{
+ srcu_read_unlock(&srcu_ctl, idx);
+}
+
+static unsigned long srcu_torture_completed(void)
+{
+ return srcu_batches_completed(&srcu_ctl);
+}
+
+static void srcu_torture_deferred_free(struct rcu_torture *rp)
+{
+ call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+}
+
+static void srcu_torture_synchronize(void)
+{
+ synchronize_srcu(&srcu_ctl);
+}
+
+static void srcu_torture_call(struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ call_srcu(&srcu_ctl, head, func);
+}
+
+static void srcu_torture_barrier(void)
+{
+ srcu_barrier(&srcu_ctl);
+}
+
+static void srcu_torture_stats(void)
+{
+ int cpu;
+ int idx = srcu_ctl.completed & 0x1;
+
+ pr_alert("%s%s per-CPU(idx=%d):",
+ torture_type, TORTURE_FLAG, idx);
+ for_each_possible_cpu(cpu) {
+ long c0, c1;
+
+ c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+ }
+ pr_cont("\n");
+}
+
+static void srcu_torture_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(&srcu_ctl);
+}
+
+static struct rcu_torture_ops srcu_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .started = NULL,
+ .completed = srcu_torture_completed,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .name = "srcu"
+};
+
+/*
+ * Definitions for sched torture testing.
+ */
+
+static int sched_torture_read_lock(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void sched_torture_read_unlock(int idx)
+{
+ preempt_enable();
+}
+
+static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops sched_ops = {
+ .ttype = RCU_SCHED_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = sched_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = sched_torture_read_unlock,
+ .started = rcu_batches_started_sched,
+ .completed = rcu_batches_completed_sched,
+ .deferred_free = rcu_sched_torture_deferred_free,
+ .sync = synchronize_sched,
+ .exp_sync = synchronize_sched_expedited,
+ .call = call_rcu_sched,
+ .cb_barrier = rcu_barrier_sched,
+ .fqs = rcu_sched_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "sched"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks torture testing.
+ */
+
+static int tasks_torture_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_torture_read_unlock(int idx)
+{
+}
+
+static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops tasks_ops = {
+ .ttype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = tasks_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = tasks_torture_read_unlock,
+ .started = rcu_no_completed,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_tasks_torture_deferred_free,
+ .sync = synchronize_rcu_tasks,
+ .exp_sync = synchronize_rcu_tasks,
+ .call = call_rcu_tasks,
+ .cb_barrier = rcu_barrier_tasks,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "tasks"
+};
+
+#define RCUTORTURE_TASKS_OPS &tasks_ops,
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+
+#define RCUTORTURE_TASKS_OPS
+
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
+/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked. If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+ struct rcu_head rcu;
+ int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+ struct rcu_boost_inflight *rbip =
+ container_of(head, struct rcu_boost_inflight, rcu);
+
+ smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+ rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+ unsigned long call_rcu_time;
+ unsigned long endtime;
+ unsigned long oldstarttime;
+ struct rcu_boost_inflight rbi = { .inflight = 0 };
+ struct sched_param sp;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+
+ /* Set real-time priority. */
+ sp.sched_priority = 1;
+ if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
+ n_rcu_torture_boost_rterror++;
+ }
+
+ init_rcu_head_on_stack(&rbi.rcu);
+ /* Each pass through the following loop does one boost-test cycle. */
+ do {
+ /* Wait for the next test interval. */
+ oldstarttime = boost_starttime;
+ while (ULONG_CMP_LT(jiffies, oldstarttime)) {
+ schedule_timeout_interruptible(oldstarttime - jiffies);
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
+ goto checkwait;
+ }
+
+ /* Do one boost-test interval. */
+ endtime = oldstarttime + test_boost_duration * HZ;
+ call_rcu_time = jiffies;
+ while (ULONG_CMP_LT(jiffies, endtime)) {
+ /* If we don't have a callback in flight, post one. */
+ if (!rbi.inflight) {
+ smp_mb(); /* RCU core before ->inflight = 1. */
+ rbi.inflight = 1;
+ call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+ if (jiffies - call_rcu_time >
+ test_boost_duration * HZ - HZ / 2) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ }
+ call_rcu_time = jiffies;
+ }
+ cond_resched_rcu_qs();
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
+ goto checkwait;
+ }
+
+ /*
+ * Set the start time of the next test interval.
+ * Yes, this is vulnerable to long delays, but such
+ * delays simply cause a false negative for the next
+ * interval. Besides, we are running at RT priority,
+ * so delays should be relatively rare.
+ */
+ while (oldstarttime == boost_starttime &&
+ !kthread_should_stop()) {
+ if (mutex_trylock(&boost_mutex)) {
+ boost_starttime = jiffies +
+ test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ mutex_unlock(&boost_mutex);
+ break;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /* Go do the stutter. */
+checkwait: stutter_wait("rcu_torture_boost");
+ } while (!torture_must_stop());
+
+ /* Clean up and exit. */
+ while (!kthread_should_stop() || rbi.inflight) {
+ torture_shutdown_absorb("rcu_torture_boost");
+ schedule_timeout_uninterruptible(1);
+ }
+ smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ destroy_rcu_head_on_stack(&rbi.rcu);
+ torture_kthread_stopping("rcu_torture_boost");
+ return 0;
+}
+
+static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
+{
+}
+
+/*
+ * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
+ * to call_rcu() or analogous, increasing the probability of occurrence
+ * of callback-overflow corner cases.
+ */
+static int
+rcu_torture_cbflood(void *arg)
+{
+ int err = 1;
+ int i;
+ int j;
+ struct rcu_head *rhp;
+
+ if (cbflood_n_per_burst > 0 &&
+ cbflood_inter_holdoff > 0 &&
+ cbflood_intra_holdoff > 0 &&
+ cur_ops->call &&
+ cur_ops->cb_barrier) {
+ rhp = vmalloc(sizeof(*rhp) *
+ cbflood_n_burst * cbflood_n_per_burst);
+ err = !rhp;
+ }
+ if (err) {
+ VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
+ while (!torture_must_stop())
+ schedule_timeout_interruptible(HZ);
+ return 0;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
+ do {
+ schedule_timeout_interruptible(cbflood_inter_holdoff);
+ atomic_long_inc(&n_cbfloods);
+ WARN_ON(signal_pending(current));
+ for (i = 0; i < cbflood_n_burst; i++) {
+ for (j = 0; j < cbflood_n_per_burst; j++) {
+ cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
+ rcu_torture_cbflood_cb);
+ }
+ schedule_timeout_interruptible(cbflood_intra_holdoff);
+ WARN_ON(signal_pending(current));
+ }
+ cur_ops->cb_barrier();
+ stutter_wait("rcu_torture_cbflood");
+ } while (!torture_must_stop());
+ vfree(rhp);
+ torture_kthread_stopping("rcu_torture_cbflood");
+ return 0;
+}
+
+/*
+ * RCU torture force-quiescent-state kthread. Repeatedly induces
+ * bursts of calls to force_quiescent_state(), increasing the probability
+ * of occurrence of some important types of race conditions.
+ */
+static int
+rcu_torture_fqs(void *arg)
+{
+ unsigned long fqs_resume_time;
+ int fqs_burst_remaining;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
+ do {
+ fqs_resume_time = jiffies + fqs_stutter * HZ;
+ while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
+ !kthread_should_stop()) {
+ schedule_timeout_interruptible(1);
+ }
+ fqs_burst_remaining = fqs_duration;
+ while (fqs_burst_remaining > 0 &&
+ !kthread_should_stop()) {
+ cur_ops->fqs();
+ udelay(fqs_holdoff);
+ fqs_burst_remaining -= fqs_holdoff;
+ }
+ stutter_wait("rcu_torture_fqs");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_fqs");
+ return 0;
+}
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ bool can_expedite = !rcu_gp_is_expedited();
+ int expediting = 0;
+ unsigned long gp_snap;
+ bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
+ bool gp_sync1 = gp_sync;
+ int i;
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_TORTURE_RANDOM(rand);
+ int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
+ RTWS_COND_GET, RTWS_SYNC };
+ int nsynctypes = 0;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ pr_alert("%s" TORTURE_FLAG
+ " Grace periods expedited from boot/sysfs for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " Testing of dynamic grace-period expediting diabled.\n",
+ torture_type);
+
+ /* Initialize synctype[] array. If none set, take default. */
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
+ gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
+ if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+ synctype[nsynctypes++] = RTWS_COND_GET;
+ else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
+ pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
+ if (gp_exp1 && cur_ops->exp_sync)
+ synctype[nsynctypes++] = RTWS_EXP_SYNC;
+ else if (gp_exp && !cur_ops->exp_sync)
+ pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
+ if (gp_normal1 && cur_ops->deferred_free)
+ synctype[nsynctypes++] = RTWS_DEF_FREE;
+ else if (gp_normal && !cur_ops->deferred_free)
+ pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
+ if (gp_sync1 && cur_ops->sync)
+ synctype[nsynctypes++] = RTWS_SYNC;
+ else if (gp_sync && !cur_ops->sync)
+ pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ if (WARN_ONCE(nsynctypes == 0,
+ "rcu_torture_writer: No update-side primitives.\n")) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ }
+
+ do {
+ rcu_torture_writer_state = RTWS_FIXED_DELAY;
+ schedule_timeout_uninterruptible(1);
+ rp = rcu_torture_alloc();
+ if (rp == NULL)
+ continue;
+ rp->rtort_pipe_count = 0;
+ rcu_torture_writer_state = RTWS_DELAY;
+ udelay(torture_random(&rand) & 0x3ff);
+ rcu_torture_writer_state = RTWS_REPLACE;
+ old_rp = rcu_dereference_check(rcu_torture_current,
+ current == writer_task);
+ rp->rtort_mbtest = 1;
+ rcu_assign_pointer(rcu_torture_current, rp);
+ smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
+ if (old_rp) {
+ i = old_rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ old_rp->rtort_pipe_count++;
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ rcu_torture_writer_state = RTWS_DEF_FREE;
+ cur_ops->deferred_free(old_rp);
+ break;
+ case RTWS_EXP_SYNC:
+ rcu_torture_writer_state = RTWS_EXP_SYNC;
+ cur_ops->exp_sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET:
+ rcu_torture_writer_state = RTWS_COND_GET;
+ gp_snap = cur_ops->get_state();
+ i = torture_random(&rand) % 16;
+ if (i != 0)
+ schedule_timeout_interruptible(i);
+ udelay(torture_random(&rand) % 1000);
+ rcu_torture_writer_state = RTWS_COND_SYNC;
+ cur_ops->cond_sync(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_SYNC:
+ rcu_torture_writer_state = RTWS_SYNC;
+ cur_ops->sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ }
+ rcutorture_record_progress(++rcu_torture_current_version);
+ /* Cycle through nesting levels of rcu_expedite_gp() calls. */
+ if (can_expedite &&
+ !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
+ WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
+ if (expediting >= 0)
+ rcu_expedite_gp();
+ else
+ rcu_unexpedite_gp();
+ if (++expediting > 3)
+ expediting = -expediting;
+ }
+ rcu_torture_writer_state = RTWS_STUTTER;
+ stutter_wait("rcu_torture_writer");
+ } while (!torture_must_stop());
+ /* Reset expediting back to unexpedited. */
+ if (expediting > 0)
+ expediting = -expediting;
+ while (can_expedite && expediting++ < 0)
+ rcu_unexpedite_gp();
+ WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ return 0;
+}
+
+/*
+ * RCU torture fake writer kthread. Repeatedly calls sync, with a random
+ * delay between calls.
+ */
+static int
+rcu_torture_fakewriter(void *arg)
+{
+ DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
+ set_user_nice(current, MAX_NICE);
+
+ do {
+ schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
+ udelay(torture_random(&rand) & 0x3ff);
+ if (cur_ops->cb_barrier != NULL &&
+ torture_random(&rand) % (nfakewriters * 8) == 0) {
+ cur_ops->cb_barrier();
+ } else if (gp_normal == gp_exp) {
+ if (torture_random(&rand) & 0x80)
+ cur_ops->sync();
+ else
+ cur_ops->exp_sync();
+ } else if (gp_normal) {
+ cur_ops->sync();
+ } else {
+ cur_ops->exp_sync();
+ }
+ stutter_wait("rcu_torture_fakewriter");
+ } while (!torture_must_stop());
+
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+}
+
+static void rcutorture_trace_dump(void)
+{
+ static atomic_t beenhere = ATOMIC_INIT(0);
+
+ if (atomic_read(&beenhere))
+ return;
+ if (atomic_xchg(&beenhere, 1) != 0)
+ return;
+ ftrace_dump(DUMP_ALL);
+}
+
+/*
+ * RCU torture reader from timer handler. Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static void rcu_torture_timer(unsigned long unused)
+{
+ int idx;
+ unsigned long started;
+ unsigned long completed;
+ static DEFINE_TORTURE_RANDOM(rand);
+ static DEFINE_SPINLOCK(rand_lock);
+ struct rcu_torture *p;
+ int pipe_count;
+ unsigned long long ts;
+
+ idx = cur_ops->readlock();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
+ ts = rcu_trace_clock_local();
+ p = rcu_dereference_check(rcu_torture_current,
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_sched_held() ||
+ srcu_read_lock_held(&srcu_ctl));
+ if (p == NULL) {
+ /* Leave because rcu_torture_writer is not yet underway */
+ cur_ops->readunlock(idx);
+ return;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ spin_lock(&rand_lock);
+ cur_ops->read_delay(&rand);
+ n_rcu_torture_timers++;
+ spin_unlock(&rand_lock);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ completed = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+ started, completed);
+ rcutorture_trace_dump();
+ }
+ __this_cpu_inc(rcu_torture_count[pipe_count]);
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ __this_cpu_inc(rcu_torture_batch[completed]);
+ preempt_enable();
+ cur_ops->readunlock(idx);
+}
+
+/*
+ * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+ unsigned long started;
+ unsigned long completed;
+ int idx;
+ DEFINE_TORTURE_RANDOM(rand);
+ struct rcu_torture *p;
+ int pipe_count;
+ struct timer_list t;
+ unsigned long long ts;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
+ if (irqreader && cur_ops->irq_capable)
+ setup_timer_on_stack(&t, rcu_torture_timer, 0);
+
+ do {
+ if (irqreader && cur_ops->irq_capable) {
+ if (!timer_pending(&t))
+ mod_timer(&t, jiffies + 1);
+ }
+ idx = cur_ops->readlock();
+ if (cur_ops->started)
+ started = cur_ops->started();
+ else
+ started = cur_ops->completed();
+ ts = rcu_trace_clock_local();
+ p = rcu_dereference_check(rcu_torture_current,
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_sched_held() ||
+ srcu_read_lock_held(&srcu_ctl));
+ if (p == NULL) {
+ /* Wait for rcu_torture_writer to get underway */
+ cur_ops->readunlock(idx);
+ schedule_timeout_interruptible(HZ);
+ continue;
+ }
+ if (p->rtort_mbtest == 0)
+ atomic_inc(&n_rcu_torture_mberror);
+ cur_ops->read_delay(&rand);
+ preempt_disable();
+ pipe_count = p->rtort_pipe_count;
+ if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ pipe_count = RCU_TORTURE_PIPE_LEN;
+ }
+ completed = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+ ts, started, completed);
+ rcutorture_trace_dump();
+ }
+ __this_cpu_inc(rcu_torture_count[pipe_count]);
+ completed = completed - started;
+ if (cur_ops->started)
+ completed++;
+ if (completed > RCU_TORTURE_PIPE_LEN) {
+ /* Should not happen, but... */
+ completed = RCU_TORTURE_PIPE_LEN;
+ }
+ __this_cpu_inc(rcu_torture_batch[completed]);
+ preempt_enable();
+ cur_ops->readunlock(idx);
+ cond_resched_rcu_qs();
+ stutter_wait("rcu_torture_reader");
+ } while (!torture_must_stop());
+ if (irqreader && cur_ops->irq_capable) {
+ del_timer_sync(&t);
+ destroy_timer_on_stack(&t);
+ }
+ torture_kthread_stopping("rcu_torture_reader");
+ return 0;
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only
+ * one call to this function at a given time!!! This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+ int cpu;
+ int i;
+ long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ static unsigned long rtcv_snap = ULONG_MAX;
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+ batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+ }
+ }
+ for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ if (pipesummary[i] != 0)
+ break;
+ }
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ rcu_torture_current,
+ rcu_torture_current_version,
+ list_empty(&rcu_torture_freelist),
+ atomic_read(&n_rcu_torture_alloc),
+ atomic_read(&n_rcu_torture_alloc_fail),
+ atomic_read(&n_rcu_torture_free));
+ pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
+ atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror);
+ pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
+ n_rcu_torture_timers);
+ torture_onoff_stats();
+ pr_cont("barrier: %ld/%ld:%ld ",
+ n_barrier_successes,
+ n_barrier_attempts,
+ n_rcu_torture_barrier_error);
+ pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_barrier_error != 0 ||
+ n_rcu_torture_boost_ktrerror != 0 ||
+ n_rcu_torture_boost_rterror != 0 ||
+ n_rcu_torture_boost_failure != 0 ||
+ i > 1) {
+ pr_cont("%s", "!!! ");
+ atomic_inc(&n_rcu_torture_error);
+ WARN_ON_ONCE(1);
+ }
+ pr_cont("Reader Pipe: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ pr_cont(" %ld", pipesummary[i]);
+ pr_cont("\n");
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("Reader Batch: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ pr_cont(" %ld", batchsummary[i]);
+ pr_cont("\n");
+
+ pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ pr_cont("Free-Block Circulation: ");
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ pr_cont(" %d", atomic_read(&rcu_torture_wcount[i]));
+ }
+ pr_cont("\n");
+
+ if (cur_ops->stats)
+ cur_ops->stats();
+ if (rtcv_snap == rcu_torture_current_version &&
+ rcu_torture_current != NULL) {
+ int __maybe_unused flags;
+ unsigned long __maybe_unused gpnum;
+ unsigned long __maybe_unused completed;
+
+ rcutorture_get_gp_data(cur_ops->ttype,
+ &flags, &gpnum, &completed);
+ pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
+ rcu_torture_writer_state,
+ gpnum, completed, flags);
+ show_rcu_gp_kthreads();
+ rcutorture_trace_dump();
+ }
+ rtcv_snap = rcu_torture_current_version;
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ rcu_torture_stats_print();
+ torture_shutdown_absorb("rcu_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_stats");
+ return 0;
+}
+
+static inline void
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s: nreaders=%d nfakewriters=%d "
+ "stat_interval=%d verbose=%d test_no_idle_hz=%d "
+ "shuffle_interval=%d stutter=%d irqreader=%d "
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d shutdown_secs=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d "
+ "n_barrier_cbs=%d "
+ "onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, nrealreaders, nfakewriters,
+ stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration, shutdown_secs,
+ stall_cpu, stall_cpu_holdoff,
+ n_barrier_cbs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void rcutorture_booster_cleanup(int cpu)
+{
+ struct task_struct *t;
+
+ if (boost_tasks[cpu] == NULL)
+ return;
+ mutex_lock(&boost_mutex);
+ t = boost_tasks[cpu];
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+
+ /* This must be outside of the mutex, otherwise deadlock! */
+ torture_stop_kthread(rcu_torture_boost, t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+ int retval;
+
+ if (boost_tasks[cpu] != NULL)
+ return 0; /* Already created, nothing more to do. */
+
+ /* Don't allow time recalculation while creating a new task. */
+ mutex_lock(&boost_mutex);
+ VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
+ boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
+ cpu_to_node(cpu),
+ "rcu_torture_boost");
+ if (IS_ERR(boost_tasks[cpu])) {
+ retval = PTR_ERR(boost_tasks[cpu]);
+ VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
+ n_rcu_torture_boost_ktrerror++;
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+ return retval;
+ }
+ kthread_bind(boost_tasks[cpu], cpu);
+ wake_up_process(boost_tasks[cpu]);
+ mutex_unlock(&boost_mutex);
+ return 0;
+}
+
+/*
+ * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
+ * induces a CPU stall for the time specified by stall_cpu.
+ */
+static int rcu_torture_stall(void *args)
+{
+ unsigned long stop_at;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (stall_cpu_holdoff > 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
+ schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
+ VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
+ }
+ if (!kthread_should_stop()) {
+ stop_at = get_seconds() + stall_cpu;
+ /* RCU CPU stall is expected behavior in following code. */
+ pr_alert("rcu_torture_stall start.\n");
+ rcu_read_lock();
+ preempt_disable();
+ while (ULONG_CMP_LT(get_seconds(), stop_at))
+ continue; /* Induce RCU CPU stall warning. */
+ preempt_enable();
+ rcu_read_unlock();
+ pr_alert("rcu_torture_stall end.\n");
+ }
+ torture_shutdown_absorb("rcu_torture_stall");
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(10 * HZ);
+ return 0;
+}
+
+/* Spawn CPU-stall kthread, if stall_cpu specified. */
+static int __init rcu_torture_stall_init(void)
+{
+ if (stall_cpu <= 0)
+ return 0;
+ return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
+}
+
+/* Callback function for RCU barrier testing. */
+static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+ atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+ long myid = (long)arg;
+ bool lastphase = 0;
+ bool newphase;
+ struct rcu_head rcu;
+
+ init_rcu_head_on_stack(&rcu);
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
+ set_user_nice(current, MAX_NICE);
+ do {
+ wait_event(barrier_cbs_wq[myid],
+ (newphase =
+ ACCESS_ONCE(barrier_phase)) != lastphase ||
+ torture_must_stop());
+ lastphase = newphase;
+ smp_mb(); /* ensure barrier_phase load before ->call(). */
+ if (torture_must_stop())
+ break;
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ if (atomic_dec_and_test(&barrier_cbs_count))
+ wake_up(&barrier_wq);
+ } while (!torture_must_stop());
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+ destroy_rcu_head_on_stack(&rcu);
+ torture_kthread_stopping("rcu_torture_barrier_cbs");
+ return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+ int i;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
+ do {
+ atomic_set(&barrier_cbs_invoked, 0);
+ atomic_set(&barrier_cbs_count, n_barrier_cbs);
+ smp_mb(); /* Ensure barrier_phase after prior assignments. */
+ barrier_phase = !barrier_phase;
+ for (i = 0; i < n_barrier_cbs; i++)
+ wake_up(&barrier_cbs_wq[i]);
+ wait_event(barrier_wq,
+ atomic_read(&barrier_cbs_count) == 0 ||
+ torture_must_stop());
+ if (torture_must_stop())
+ break;
+ n_barrier_attempts++;
+ cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
+ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
+ n_rcu_torture_barrier_error++;
+ pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
+ atomic_read(&barrier_cbs_invoked),
+ n_barrier_cbs);
+ WARN_ON_ONCE(1);
+ }
+ n_barrier_successes++;
+ schedule_timeout_interruptible(HZ / 10);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_barrier");
+ return 0;
+}
+
+/* Initialize RCU barrier testing. */
+static int rcu_torture_barrier_init(void)
+{
+ int i;
+ int ret;
+
+ if (n_barrier_cbs == 0)
+ return 0;
+ if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
+ pr_alert("%s" TORTURE_FLAG
+ " Call or barrier ops missing for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " RCU barrier testing omitted from run.\n",
+ torture_type);
+ return 0;
+ }
+ atomic_set(&barrier_cbs_count, 0);
+ atomic_set(&barrier_cbs_invoked, 0);
+ barrier_cbs_tasks =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ GFP_KERNEL);
+ barrier_cbs_wq =
+ kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
+ GFP_KERNEL);
+ if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
+ return -ENOMEM;
+ for (i = 0; i < n_barrier_cbs; i++) {
+ init_waitqueue_head(&barrier_cbs_wq[i]);
+ ret = torture_create_kthread(rcu_torture_barrier_cbs,
+ (void *)(long)i,
+ barrier_cbs_tasks[i]);
+ if (ret)
+ return ret;
+ }
+ return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
+}
+
+/* Clean up after RCU barrier testing. */
+static void rcu_torture_barrier_cleanup(void)
+{
+ int i;
+
+ torture_stop_kthread(rcu_torture_barrier, barrier_task);
+ if (barrier_cbs_tasks != NULL) {
+ for (i = 0; i < n_barrier_cbs; i++)
+ torture_stop_kthread(rcu_torture_barrier_cbs,
+ barrier_cbs_tasks[i]);
+ kfree(barrier_cbs_tasks);
+ barrier_cbs_tasks = NULL;
+ }
+ if (barrier_cbs_wq != NULL) {
+ kfree(barrier_cbs_wq);
+ barrier_cbs_wq = NULL;
+ }
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ (void)rcutorture_booster_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcutorture_booster_cleanup(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+ .notifier_call = rcutorture_cpu_notify,
+};
+
+static void
+rcu_torture_cleanup(void)
+{
+ int i;
+
+ rcutorture_record_test_transition();
+ if (torture_cleanup_begin()) {
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+ return;
+ }
+
+ rcu_torture_barrier_cleanup();
+ torture_stop_kthread(rcu_torture_stall, stall_task);
+ torture_stop_kthread(rcu_torture_writer, writer_task);
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_torture_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+ rcu_torture_current = NULL;
+
+ if (fakewriter_tasks) {
+ for (i = 0; i < nfakewriters; i++) {
+ torture_stop_kthread(rcu_torture_fakewriter,
+ fakewriter_tasks[i]);
+ }
+ kfree(fakewriter_tasks);
+ fakewriter_tasks = NULL;
+ }
+
+ torture_stop_kthread(rcu_torture_stats, stats_task);
+ torture_stop_kthread(rcu_torture_fqs, fqs_task);
+ for (i = 0; i < ncbflooders; i++)
+ torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ unregister_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i)
+ rcutorture_booster_cleanup(i);
+ }
+
+ /* Wait for all RCU callbacks to fire. */
+
+ if (cur_ops->cb_barrier != NULL)
+ cur_ops->cb_barrier();
+
+ rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
+ rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ rcu_torture_print_module_parms(cur_ops,
+ "End of test: RCU_HOTPLUG");
+ else
+ rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
+ torture_cleanup_end();
+}
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+static void rcu_torture_leak_cb(struct rcu_head *rhp)
+{
+}
+
+static void rcu_torture_err_cb(struct rcu_head *rhp)
+{
+ /*
+ * This -might- happen due to race conditions, but is unlikely.
+ * The scenario that leads to this happening is that the
+ * first of the pair of duplicate callbacks is queued,
+ * someone else starts a grace period that includes that
+ * callback, then the second of the pair must wait for the
+ * next grace period. Unlikely, but can happen. If it
+ * does happen, the debug-objects subsystem won't have splatted.
+ */
+ pr_alert("rcutorture: duplicated callback was invoked.\n");
+}
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+/*
+ * Verify that double-free causes debug-objects to complain, but only
+ * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
+ * cannot be carried out.
+ */
+static void rcu_test_debug_objects(void)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ struct rcu_head rh1;
+ struct rcu_head rh2;
+
+ init_rcu_head_on_stack(&rh1);
+ init_rcu_head_on_stack(&rh2);
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+
+ /* Try to queue the rh2 pair of callbacks for the same grace period. */
+ preempt_disable(); /* Prevent preemption from interrupting test. */
+ rcu_read_lock(); /* Make it impossible to finish a grace period. */
+ call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ local_irq_disable(); /* Make it harder to start a new grace period. */
+ call_rcu(&rh2, rcu_torture_leak_cb);
+ call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ local_irq_enable();
+ rcu_read_unlock();
+ preempt_enable();
+
+ /* Wait for them all to get done so we can safely return. */
+ rcu_barrier();
+ pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+ destroy_rcu_head_on_stack(&rh1);
+ destroy_rcu_head_on_stack(&rh2);
+#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+ pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
+#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+}
+
+static int __init
+rcu_torture_init(void)
+{
+ int i;
+ int cpu;
+ int firsterr = 0;
+ static struct rcu_torture_ops *torture_ops[] = {
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
+ RCUTORTURE_TASKS_OPS
+ };
+
+ if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ return -EBUSY;
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("rcu-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("rcu-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ torture_init_end();
+ return -EINVAL;
+ }
+ if (cur_ops->fqs == NULL && fqs_duration != 0) {
+ pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
+ fqs_duration = 0;
+ }
+ if (cur_ops->init)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nreaders >= 0) {
+ nrealreaders = nreaders;
+ } else {
+ nrealreaders = num_online_cpus() - 1;
+ if (nrealreaders <= 0)
+ nrealreaders = 1;
+ }
+ rcu_torture_print_module_parms(cur_ops, "Start of test");
+
+ /* Set up the freelist. */
+
+ INIT_LIST_HEAD(&rcu_torture_freelist);
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
+ rcu_tortures[i].rtort_mbtest = 0;
+ list_add_tail(&rcu_tortures[i].rtort_free,
+ &rcu_torture_freelist);
+ }
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ rcu_torture_current = NULL;
+ rcu_torture_current_version = 0;
+ atomic_set(&n_rcu_torture_alloc, 0);
+ atomic_set(&n_rcu_torture_alloc_fail, 0);
+ atomic_set(&n_rcu_torture_free, 0);
+ atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_barrier_error = 0;
+ n_rcu_torture_boost_ktrerror = 0;
+ n_rcu_torture_boost_rterror = 0;
+ n_rcu_torture_boost_failure = 0;
+ n_rcu_torture_boosts = 0;
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+ atomic_set(&rcu_torture_wcount[i], 0);
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+ per_cpu(rcu_torture_count, cpu)[i] = 0;
+ per_cpu(rcu_torture_batch, cpu)[i] = 0;
+ }
+ }
+
+ /* Start up the kthreads. */
+
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (firsterr)
+ goto unwind;
+ fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nfakewriters; i++) {
+ firsterr = torture_create_kthread(rcu_torture_fakewriter,
+ NULL, fakewriter_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealreaders; i++) {
+ firsterr = torture_create_kthread(rcu_torture_reader, NULL,
+ reader_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(rcu_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ if (test_no_idle_hz) {
+ firsterr = torture_shuffle_init(shuffle_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter < 0)
+ stutter = 0;
+ if (stutter) {
+ firsterr = torture_stutter_init(stutter * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (fqs_duration < 0)
+ fqs_duration = 0;
+ if (fqs_duration) {
+ /* Create the fqs thread */
+ firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
+ fqs_task);
+ if (firsterr)
+ goto unwind;
+ }
+ if (test_boost_interval < 1)
+ test_boost_interval = 1;
+ if (test_boost_duration < 2)
+ test_boost_duration = 2;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ register_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i) {
+ if (cpu_is_offline(i))
+ continue; /* Heuristic: CPU can go offline. */
+ firsterr = rcutorture_booster_init(i);
+ if (firsterr)
+ goto unwind;
+ }
+ }
+ firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_stall_init();
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_barrier_init();
+ if (firsterr)
+ goto unwind;
+ if (object_debug)
+ rcu_test_debug_objects();
+ if (cbflood_n_burst > 0) {
+ /* Create the cbflood threads */
+ ncbflooders = (num_online_cpus() + 3) / 4;
+ cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
+ GFP_KERNEL);
+ if (!cbflood_task) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < ncbflooders; i++) {
+ firsterr = torture_create_kthread(rcu_torture_cbflood,
+ NULL,
+ cbflood_task[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ }
+ rcutorture_record_test_transition();
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ rcu_torture_cleanup();
+ return firsterr;
+}
+
+module_init(rcu_torture_init);
+module_exit(rcu_torture_cleanup);
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
new file mode 100644
index 000000000..cad76e76b
--- /dev/null
+++ b/kernel/rcu/srcu.c
@@ -0,0 +1,676 @@
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
+ *
+ * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/srcu.h>
+
+#include "rcu.h"
+
+/*
+ * Initialize an rcu_batch structure to empty.
+ */
+static inline void rcu_batch_init(struct rcu_batch *b)
+{
+ b->head = NULL;
+ b->tail = &b->head;
+}
+
+/*
+ * Enqueue a callback onto the tail of the specified rcu_batch structure.
+ */
+static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
+{
+ *b->tail = head;
+ b->tail = &head->next;
+}
+
+/*
+ * Is the specified rcu_batch structure empty?
+ */
+static inline bool rcu_batch_empty(struct rcu_batch *b)
+{
+ return b->tail == &b->head;
+}
+
+/*
+ * Remove the callback at the head of the specified rcu_batch structure
+ * and return a pointer to it, or return NULL if the structure is empty.
+ */
+static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
+{
+ struct rcu_head *head;
+
+ if (rcu_batch_empty(b))
+ return NULL;
+
+ head = b->head;
+ b->head = head->next;
+ if (b->tail == &head->next)
+ rcu_batch_init(b);
+
+ return head;
+}
+
+/*
+ * Move all callbacks from the rcu_batch structure specified by "from" to
+ * the structure specified by "to".
+ */
+static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
+{
+ if (!rcu_batch_empty(from)) {
+ *to->tail = from->head;
+ to->tail = from->tail;
+ rcu_batch_init(from);
+ }
+}
+
+static int init_srcu_struct_fields(struct srcu_struct *sp)
+{
+ sp->completed = 0;
+ spin_lock_init(&sp->queue_lock);
+ sp->running = false;
+ rcu_batch_init(&sp->batch_queue);
+ rcu_batch_init(&sp->batch_check0);
+ rcu_batch_init(&sp->batch_check1);
+ rcu_batch_init(&sp->batch_done);
+ INIT_DELAYED_WORK(&sp->work, process_srcu);
+ sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+ return sp->per_cpu_ref ? 0 : -ENOMEM;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)sp, sizeof(*sp));
+ lockdep_init_map(&sp->dep_map, name, key, 0);
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @sp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *sp)
+{
+ return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
+
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+ sum += t;
+ }
+ return sum;
+}
+
+/*
+ * Returns approximate number of readers active on the specified rank
+ * of the per-CPU ->c[] counters.
+ */
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
+
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ sum += t;
+ }
+ return sum;
+}
+
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero. An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement. This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
+{
+ unsigned long seq;
+
+ seq = srcu_readers_seq_idx(sp, idx);
+
+ /*
+ * The following smp_mb() A pairs with the smp_mb() B located in
+ * __srcu_read_lock(). This pairing ensures that if an
+ * __srcu_read_lock() increments its counter after the summation
+ * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+ * critical section will see any changes made prior to the start
+ * of the current SRCU grace period.
+ *
+ * Also, if the above call to srcu_readers_seq_idx() saw the
+ * increment of ->seq[], then the call to srcu_readers_active_idx()
+ * must see the increment of ->c[].
+ */
+ smp_mb(); /* A */
+
+ /*
+ * Note that srcu_readers_active_idx() can incorrectly return
+ * zero even though there is a pre-existing reader throughout.
+ * To see this, suppose that task A is in a very long SRCU
+ * read-side critical section that started on CPU 0, and that
+ * no other reader exists, so that the sum of the counters
+ * is equal to one. Then suppose that task B starts executing
+ * srcu_readers_active_idx(), summing up to CPU 1, and then that
+ * task C starts reading on CPU 0, so that its increment is not
+ * summed, but finishes reading on CPU 2, so that its decrement
+ * -is- summed. Then when task B completes its sum, it will
+ * incorrectly get zero, despite the fact that task A has been
+ * in its SRCU read-side critical section the whole time.
+ *
+ * We therefore do a validation step should srcu_readers_active_idx()
+ * return zero.
+ */
+ if (srcu_readers_active_idx(sp, idx) != 0)
+ return false;
+
+ /*
+ * The remainder of this function is the validation step.
+ * The following smp_mb() D pairs with the smp_mb() C in
+ * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
+ * by srcu_readers_active_idx() above, then any destructive
+ * operation performed after the grace period will happen after
+ * the corresponding SRCU read-side critical section.
+ *
+ * Note that there can be at most NR_CPUS worth of readers using
+ * the old index, which is not enough to overflow even a 32-bit
+ * integer. (Yes, this does mean that systems having more than
+ * a billion or so CPUs need to be 64-bit systems.) Therefore,
+ * the sum of the ->seq[] counters cannot possibly overflow.
+ * Therefore, the only way that the return values of the two
+ * calls to srcu_readers_seq_idx() can be equal is if there were
+ * no increments of the corresponding rank of ->seq[] counts
+ * in the interim. But the missed-increment scenario laid out
+ * above includes an increment of the ->seq[] counter by
+ * the corresponding __srcu_read_lock(). Therefore, if this
+ * scenario occurs, the return values from the two calls to
+ * srcu_readers_seq_idx() will differ, and thus the validation
+ * step below suffices.
+ */
+ smp_mb(); /* D */
+
+ return srcu_readers_seq_idx(sp, idx) == seq;
+}
+
+/**
+ * srcu_readers_active - returns approximate number of readers.
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+static int srcu_readers_active(struct srcu_struct *sp)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+ sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+ }
+ return sum;
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ if (WARN_ON(srcu_readers_active(sp)))
+ return; /* Leakage unless caller handles error. */
+ free_percpu(sp->per_cpu_ref);
+ sp->per_cpu_ref = NULL;
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct. Must be called from process context.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock(struct srcu_struct *sp)
+{
+ int idx;
+
+ idx = ACCESS_ONCE(sp->completed) & 0x1;
+ preempt_disable();
+ __this_cpu_inc(sp->per_cpu_ref->c[idx]);
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
+ preempt_enable();
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ * Must be called from process context.
+ */
+void __srcu_read_unlock(struct srcu_struct *sp, int idx)
+{
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ this_cpu_dec(sp->per_cpu_ref->c[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections. If there are still some readers after 10 microseconds,
+ * we repeatedly block for 1-millisecond time periods. This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SRCU_RETRY_CHECK_DELAY 5
+#define SYNCHRONIZE_SRCU_TRYCOUNT 2
+#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
+
+/*
+ * @@@ Wait until all pre-existing readers complete. Such readers
+ * will have used the index specified by "idx".
+ * the caller should ensures the ->completed is not changed while checking
+ * and idx = (->completed & 1) ^ 1
+ */
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+{
+ for (;;) {
+ if (srcu_readers_active_idx_check(sp, idx))
+ return true;
+ if (--trycount <= 0)
+ return false;
+ udelay(SRCU_RETRY_CHECK_DELAY);
+ }
+}
+
+/*
+ * Increment the ->completed counter so that future SRCU readers will
+ * use the other rank of the ->c[] and ->seq[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+ sp->completed++;
+}
+
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing SRCU read-side critical section. On systems with
+ * more than one CPU, this means that when "func()" is invoked, each CPU
+ * is guaranteed to have executed a full memory barrier since the end of
+ * its last corresponding SRCU read-side critical section whose beginning
+ * preceded the call to call_rcu(). It also means that each CPU executing
+ * an SRCU read-side critical section that continues beyond the start of
+ * "func()" must have executed a memory barrier after the call_rcu()
+ * but before the beginning of that SRCU read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting SRCU callback function "func()", then both CPU A and CPU
+ * B are guaranteed to execute a full memory barrier during the time
+ * interval between the call to call_rcu() and the invocation of "func()".
+ * This guarantee applies even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * Of course, these guarantees apply only for invocations of call_srcu(),
+ * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
+ * srcu_struct structure.
+ */
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+ void (*func)(struct rcu_head *head))
+{
+ unsigned long flags;
+
+ head->next = NULL;
+ head->func = func;
+ spin_lock_irqsave(&sp->queue_lock, flags);
+ rcu_batch_queue(&sp->batch_queue, head);
+ if (!sp->running) {
+ sp->running = true;
+ queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
+ }
+ spin_unlock_irqrestore(&sp->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
+static void srcu_reschedule(struct srcu_struct *sp);
+
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
+{
+ struct rcu_synchronize rcu;
+ struct rcu_head *head = &rcu.head;
+ bool done = false;
+
+ rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
+ !lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+
+ might_sleep();
+ init_completion(&rcu.completion);
+
+ head->next = NULL;
+ head->func = wakeme_after_rcu;
+ spin_lock_irq(&sp->queue_lock);
+ if (!sp->running) {
+ /* steal the processing owner */
+ sp->running = true;
+ rcu_batch_queue(&sp->batch_check0, head);
+ spin_unlock_irq(&sp->queue_lock);
+
+ srcu_advance_batches(sp, trycount);
+ if (!rcu_batch_empty(&sp->batch_done)) {
+ BUG_ON(sp->batch_done.head != head);
+ rcu_batch_dequeue(&sp->batch_done);
+ done = true;
+ }
+ /* give the processing owner to work_struct */
+ srcu_reschedule(sp);
+ } else {
+ rcu_batch_queue(&sp->batch_queue, head);
+ spin_unlock_irq(&sp->queue_lock);
+ }
+
+ if (!done)
+ wait_for_completion(&rcu.completion);
+}
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for the count to drain to zero of both indexes. To avoid the
+ * possible starvation of synchronize_srcu(), it waits for the count of
+ * the index=((->completed & 1) ^ 1) to drain to zero at first,
+ * and then flip the completed and wait for the count of the other index.
+ *
+ * Can block; must be called from process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section,
+ * as long as the resulting graph of srcu_structs is acyclic.
+ *
+ * There are memory-ordering constraints implied by synchronize_srcu().
+ * On systems with more than one CPU, when synchronize_srcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last corresponding SRCU-sched read-side critical section
+ * whose beginning preceded the call to synchronize_srcu(). In addition,
+ * each CPU having an SRCU read-side critical section that extends beyond
+ * the return from synchronize_srcu() is guaranteed to have executed a
+ * full memory barrier after the beginning of synchronize_srcu() and before
+ * the beginning of that SRCU read-side critical section. Note that these
+ * guarantees include CPUs that are offline, idle, or executing in user mode,
+ * as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_srcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
+ * are the same CPU, but again only if the system has more than one CPU.
+ *
+ * Of course, these memory-ordering guarantees apply only when
+ * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
+ * passed the same srcu_struct structure.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ __synchronize_srcu(sp, rcu_gp_is_expedited()
+ ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
+ : SYNCHRONIZE_SRCU_TRYCOUNT);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/**
+ * synchronize_srcu_expedited - Brute-force SRCU grace period
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
+ *
+ * Note that synchronize_srcu_expedited() has the same deadlock and
+ * memory-ordering properties as does synchronize_srcu().
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+ __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
+/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ * @sp: srcu_struct on which to wait for in-flight callbacks.
+ */
+void srcu_barrier(struct srcu_struct *sp)
+{
+ synchronize_srcu(sp);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+ return sp->completed;
+}
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+#define SRCU_CALLBACK_BATCH 10
+#define SRCU_INTERVAL 1
+
+/*
+ * Move any new SRCU callbacks to the first stage of the SRCU grace
+ * period pipeline.
+ */
+static void srcu_collect_new(struct srcu_struct *sp)
+{
+ if (!rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
+ spin_unlock_irq(&sp->queue_lock);
+ }
+}
+
+/*
+ * Core SRCU state machine. Advance callbacks from ->batch_check0 to
+ * ->batch_check1 and then to ->batch_done as readers drain.
+ */
+static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
+{
+ int idx = 1 ^ (sp->completed & 1);
+
+ /*
+ * Because readers might be delayed for an extended period after
+ * fetching ->completed for their index, at any point in time there
+ * might well be readers using both idx=0 and idx=1. We therefore
+ * need to wait for readers to clear from both index values before
+ * invoking a callback.
+ */
+
+ if (rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_check1))
+ return; /* no callbacks need to be advanced */
+
+ if (!try_check_zero(sp, idx, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have already done with their
+ * first zero check and flip back when they were enqueued on
+ * ->batch_check0 in a previous invocation of srcu_advance_batches().
+ * (Presumably try_check_zero() returned false during that
+ * invocation, leaving the callbacks stranded on ->batch_check1.)
+ * They are therefore ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+
+ if (rcu_batch_empty(&sp->batch_check0))
+ return; /* no callbacks need to be advanced */
+ srcu_flip(sp);
+
+ /*
+ * The callbacks in ->batch_check0 just finished their
+ * first check zero and flip, so move them to ->batch_check1
+ * for future checking on the other idx.
+ */
+ rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
+
+ /*
+ * SRCU read-side critical sections are normally short, so check
+ * at least twice in quick succession after a flip.
+ */
+ trycount = trycount < 2 ? 2 : trycount;
+ if (!try_check_zero(sp, idx^1, trycount))
+ return; /* failed to advance, will try after SRCU_INTERVAL */
+
+ /*
+ * The callbacks in ->batch_check1 have now waited for all
+ * pre-existing readers using both idx values. They are therefore
+ * ready to invoke, so move them to ->batch_done.
+ */
+ rcu_batch_move(&sp->batch_done, &sp->batch_check1);
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue.
+ */
+static void srcu_invoke_callbacks(struct srcu_struct *sp)
+{
+ int i;
+ struct rcu_head *head;
+
+ for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
+ head = rcu_batch_dequeue(&sp->batch_done);
+ if (!head)
+ break;
+ local_bh_disable();
+ head->func(head);
+ local_bh_enable();
+ }
+}
+
+/*
+ * Finished one round of SRCU grace period. Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *sp)
+{
+ bool pending = true;
+
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ spin_lock_irq(&sp->queue_lock);
+ if (rcu_batch_empty(&sp->batch_done) &&
+ rcu_batch_empty(&sp->batch_check1) &&
+ rcu_batch_empty(&sp->batch_check0) &&
+ rcu_batch_empty(&sp->batch_queue)) {
+ sp->running = false;
+ pending = false;
+ }
+ spin_unlock_irq(&sp->queue_lock);
+ }
+
+ if (pending)
+ queue_delayed_work(system_power_efficient_wq,
+ &sp->work, SRCU_INTERVAL);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+void process_srcu(struct work_struct *work)
+{
+ struct srcu_struct *sp;
+
+ sp = container_of(work, struct srcu_struct, work.work);
+
+ srcu_collect_new(sp);
+ srcu_advance_batches(sp, 1);
+ srcu_invoke_callbacks(sp);
+ srcu_reschedule(sp);
+}
+EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
new file mode 100644
index 000000000..ec3086879
--- /dev/null
+++ b/kernel/rcu/tiny.c
@@ -0,0 +1,288 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU
+ */
+#include <linux/completion.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/cpu.h>
+#include <linux/prefetch.h>
+#include <linux/ftrace_event.h>
+
+#include "rcu.h"
+
+/* Forward declarations for tiny_plugin.h. */
+struct rcu_ctrlblk;
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void rcu_process_callbacks(struct softirq_action *unused);
+static void __call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu),
+ struct rcu_ctrlblk *rcp);
+
+#include "tiny_plugin.h"
+
+/*
+ * Enter idle, which is an extended quiescent state if we have fully
+ * entered that mode.
+ */
+void rcu_idle_enter(void)
+{
+}
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
+
+/*
+ * Exit an interrupt handler towards idle.
+ */
+void rcu_irq_exit(void)
+{
+}
+EXPORT_SYMBOL_GPL(rcu_irq_exit);
+
+/*
+ * Exit idle, so that we are no longer in an extended quiescent state.
+ */
+void rcu_idle_exit(void)
+{
+}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
+
+/*
+ * Enter an interrupt handler, moving away from idle.
+ */
+void rcu_irq_enter(void)
+{
+}
+EXPORT_SYMBOL_GPL(rcu_irq_enter);
+
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
+
+/*
+ * Test whether RCU thinks that the current CPU is idle.
+ */
+bool notrace __rcu_is_watching(void)
+{
+ return true;
+}
+EXPORT_SYMBOL(__rcu_is_watching);
+
+#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
+
+/*
+ * Helper function for rcu_sched_qs() and rcu_bh_qs().
+ * Also irqs are disabled to avoid confusion due to interrupt handlers
+ * invoking call_rcu().
+ */
+static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
+{
+ RCU_TRACE(reset_cpu_stall_ticks(rcp));
+ if (rcp->donetail != rcp->curtail) {
+ rcp->donetail = rcp->curtail;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Record an rcu quiescent state. And an rcu_bh quiescent state while we
+ * are at it, given that any rcu quiescent state is also an rcu_bh
+ * quiescent state. Use "+" instead of "||" to defeat short circuiting.
+ */
+void rcu_sched_qs(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
+ rcu_qsctr_help(&rcu_bh_ctrlblk))
+ raise_softirq(RCU_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+/*
+ * Record an rcu_bh quiescent state.
+ */
+void rcu_bh_qs(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (rcu_qsctr_help(&rcu_bh_ctrlblk))
+ raise_softirq(RCU_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+/*
+ * Check to see if the scheduling-clock interrupt came from an extended
+ * quiescent state, and, if so, tell RCU about it. This function must
+ * be called from hardirq context. It is normally called from the
+ * scheduling-clock interrupt.
+ */
+void rcu_check_callbacks(int user)
+{
+ RCU_TRACE(check_cpu_stalls());
+ if (user)
+ rcu_sched_qs();
+ else if (!in_softirq())
+ rcu_bh_qs();
+ if (user)
+ rcu_note_voluntary_context_switch(current);
+}
+
+/*
+ * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
+ * whose grace period has elapsed.
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+{
+ const char *rn = NULL;
+ struct rcu_head *next, *list;
+ unsigned long flags;
+ RCU_TRACE(int cb_count = 0);
+
+ /* Move the ready-to-invoke callbacks to a local list. */
+ local_irq_save(flags);
+ if (rcp->donetail == &rcp->rcucblist) {
+ /* No callbacks ready, so just leave. */
+ local_irq_restore(flags);
+ return;
+ }
+ RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
+ list = rcp->rcucblist;
+ rcp->rcucblist = *rcp->donetail;
+ *rcp->donetail = NULL;
+ if (rcp->curtail == rcp->donetail)
+ rcp->curtail = &rcp->rcucblist;
+ rcp->donetail = &rcp->rcucblist;
+ local_irq_restore(flags);
+
+ /* Invoke the callbacks on the local list. */
+ RCU_TRACE(rn = rcp->name);
+ while (list) {
+ next = list->next;
+ prefetch(next);
+ debug_rcu_head_unqueue(list);
+ local_bh_disable();
+ __rcu_reclaim(rn, list);
+ local_bh_enable();
+ list = next;
+ RCU_TRACE(cb_count++);
+ }
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
+ RCU_TRACE(trace_rcu_batch_end(rcp->name,
+ cb_count, 0, need_resched(),
+ is_idle_task(current),
+ false));
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+ __rcu_process_callbacks(&rcu_sched_ctrlblk);
+ __rcu_process_callbacks(&rcu_bh_ctrlblk);
+}
+
+/*
+ * Wait for a grace period to elapse. But it is illegal to invoke
+ * synchronize_sched() from within an RCU read-side critical section.
+ * Therefore, any legal call to synchronize_sched() is a quiescent
+ * state, and so on a UP system, synchronize_sched() need do nothing.
+ * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
+ * benefits of doing might_sleep() to reduce latency.)
+ *
+ * Cool, huh? (Due to Josh Triplett.)
+ *
+ * But we want to make this a static inline later. The cond_resched()
+ * currently makes this problematic.
+ */
+void synchronize_sched(void)
+{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU read-side critical section");
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
+/*
+ * Helper function for call_rcu() and call_rcu_bh().
+ */
+static void __call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu),
+ struct rcu_ctrlblk *rcp)
+{
+ unsigned long flags;
+
+ debug_rcu_head_queue(head);
+ head->func = func;
+ head->next = NULL;
+
+ local_irq_save(flags);
+ *rcp->curtail = head;
+ rcp->curtail = &head->next;
+ RCU_TRACE(rcp->qlen++);
+ local_irq_restore(flags);
+
+ if (unlikely(is_idle_task(current))) {
+ /* force scheduling for rcu_sched_qs() */
+ resched_cpu(0);
+ }
+}
+
+/*
+ * Post an RCU callback to be invoked after the end of an RCU-sched grace
+ * period. But since we have but one CPU, that would be after any
+ * quiescent state.
+ */
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_sched_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
+/*
+ * Post an RCU bottom-half callback to be invoked after any subsequent
+ * quiescent state.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_bh_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+void __init rcu_init(void)
+{
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
+ RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
+
+ rcu_early_boot_tests();
+}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
new file mode 100644
index 000000000..f94e209a1
--- /dev/null
+++ b/kernel/rcu/tiny_plugin.h
@@ -0,0 +1,173 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (c) 2010 Linaro
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ RCU_TRACE(long qlen); /* Number of pending CBs. */
+ RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
+ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
+ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
+ RCU_TRACE(const char *name); /* Name of RCU type. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+ RCU_TRACE(.name = "rcu_sched")
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+ RCU_TRACE(.name = "rcu_bh")
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#include <linux/kernel_stat.h>
+
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+
+/*
+ * During boot, we forgive RCU lockdep issues. After this function is
+ * invoked, we start taking RCU lockdep issues seriously.
+ */
+void __init rcu_scheduler_starting(void)
+{
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
+
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_TRACE
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcp->qlen -= n;
+ local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+ seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+ seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+ return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = show_tiny_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+ struct dentry *retval;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto free_out;
+ retval = debugfs_create_file("rcudata", 0444, rcudir,
+ NULL, &show_tiny_stats_fops);
+ if (!retval)
+ goto free_out;
+ return 0;
+free_out:
+ debugfs_remove_recursive(rcudir);
+ return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+ debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long j;
+ unsigned long js;
+
+ if (rcu_cpu_stall_suppress)
+ return;
+ rcp->ticks_this_gp++;
+ j = jiffies;
+ js = ACCESS_ONCE(rcp->jiffies_stall);
+ if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
+ pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
+ rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
+ jiffies - rcp->gp_start, rcp->qlen);
+ dump_stack();
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
+ 3 * rcu_jiffies_till_stall_check() + 3;
+ } else if (ULONG_CMP_GE(j, js)) {
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+ }
+}
+
+static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
+{
+ rcp->ticks_this_gp = 0;
+ rcp->gp_start = jiffies;
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+}
+
+static void check_cpu_stalls(void)
+{
+ RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
+ RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
new file mode 100644
index 000000000..8cf7304b2
--- /dev/null
+++ b/kernel/rcu/tree.c
@@ -0,0 +1,4136 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ * Manfred Spraul <manfred@colorfullife.com>
+ * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/nmi.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+#include <linux/kernel_stat.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
+#include <linux/random.h>
+#include <linux/ftrace_event.h>
+#include <linux/suspend.h>
+
+#include "tree.h"
+#include "rcu.h"
+
+MODULE_ALIAS("rcutree");
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "rcutree."
+
+/* Data structures. */
+
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+
+/*
+ * In order to export the rcu_state name to the tracing tools, it
+ * needs to be added in the __tracepoint_string section.
+ * This requires defining a separate variable tp_<sname>_varname
+ * that points to the string being used, and this will allow
+ * the tracing userspace tools to be able to decipher the string
+ * address to the matching string.
+ */
+#ifdef CONFIG_TRACING
+# define DEFINE_RCU_TPS(sname) \
+static char sname##_varname[] = #sname; \
+static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
+# define RCU_STATE_NAME(sname) sname##_varname
+#else
+# define DEFINE_RCU_TPS(sname)
+# define RCU_STATE_NAME(sname) __stringify(sname)
+#endif
+
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
+DEFINE_RCU_TPS(sname) \
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
+struct rcu_state sname##_state = { \
+ .level = { &sname##_state.node[0] }, \
+ .rda = &sname##_data, \
+ .call = cr, \
+ .fqs_state = RCU_GP_IDLE, \
+ .gpnum = 0UL - 300UL, \
+ .completed = 0UL - 300UL, \
+ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
+ .orphan_nxttail = &sname##_state.orphan_nxtlist, \
+ .orphan_donetail = &sname##_state.orphan_donelist, \
+ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .name = RCU_STATE_NAME(sname), \
+ .abbr = sabbr, \
+}
+
+RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
+RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
+
+static struct rcu_state *rcu_state_p;
+LIST_HEAD(rcu_struct_flavors);
+
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0444);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
+ NUM_RCU_LVL_0,
+ NUM_RCU_LVL_1,
+ NUM_RCU_LVL_2,
+ NUM_RCU_LVL_3,
+ NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
+
+/*
+ * The rcu_scheduler_active variable transitions from zero to one just
+ * before the first task is spawned. So when this variable is zero, RCU
+ * can assume that there is but one task, allowing RCU to (for example)
+ * optimize synchronize_sched() to a simple barrier(). When this variable
+ * is one, RCU must actually do all the hard work required to detect real
+ * grace periods. This variable is also used to suppress boot-time false
+ * positives from lockdep-RCU error checking.
+ */
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+
+/*
+ * The rcu_scheduler_fully_active variable transitions from zero to one
+ * during the early_initcall() processing, which is after the scheduler
+ * is capable of creating new tasks. So RCU processing (for example,
+ * creating tasks for RCU priority boosting) must be delayed until after
+ * rcu_scheduler_fully_active transitions from zero to one. We also
+ * currently delay invocation of any RCU callbacks until after this point.
+ *
+ * It might later prove better for people registering RCU callbacks during
+ * early boot to take responsibility for these callbacks, but one step at
+ * a time.
+ */
+static int rcu_scheduler_fully_active __read_mostly;
+
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void invoke_rcu_core(void);
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
+/* Delay in jiffies for grace-period initialization delays, debug only. */
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
+static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
+module_param(gp_init_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
+static const int gp_init_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
+#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */
+
+/*
+ * Track the rcutorture test sequence number and the update version
+ * number within a given test. The rcutorture_testseq is incremented
+ * on every rcutorture module load and unload, so has an odd value
+ * when a test is running. The rcutorture_vernum is set to zero
+ * when rcutorture starts and is incremented on each rcutorture update.
+ * These variables enable correlating rcutorture output with the
+ * RCU tracing information.
+ */
+unsigned long rcutorture_testseq;
+unsigned long rcutorture_vernum;
+
+/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return ACCESS_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
+ * permit this function to be invoked without holding the root rcu_node
+ * structure's ->lock, but of course results can be subject to change.
+ */
+static int rcu_gp_in_progress(struct rcu_state *rsp)
+{
+ return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Note a quiescent state. Because we do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period, this just sets a flag.
+ * The caller must have disabled preemption.
+ */
+void rcu_sched_qs(void)
+{
+ if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
+ }
+}
+
+void rcu_bh_qs(void)
+{
+ if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_bh"),
+ __this_cpu_read(rcu_bh_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
+ }
+}
+
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+ .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+ .dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+ .dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+
+DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state. This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp;
+ int resched_mask;
+ struct rcu_state *rsp;
+
+ local_irq_save(flags);
+
+ /*
+ * Yes, we can lose flag-setting operations. This is OK, because
+ * the flag will be set again after some delay.
+ */
+ resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+ raw_cpu_write(rcu_sched_qs_mask, 0);
+
+ /* Find the flavor that needs a quiescent state. */
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (!(resched_mask & rsp->flavor_mask))
+ continue;
+ smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+ if (ACCESS_ONCE(rdp->mynode->completed) !=
+ ACCESS_ONCE(rdp->cond_resched_completed))
+ continue;
+
+ /*
+ * Pretend to be momentarily idle for the quiescent state.
+ * This allows the grace-period kthread to record the
+ * quiescent state, with no need for this CPU to do anything
+ * further.
+ */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ smp_mb__before_atomic(); /* Earlier stuff before QS. */
+ atomic_add(2, &rdtp->dynticks); /* QS. */
+ smp_mb__after_atomic(); /* Later stuff after QS. */
+ break;
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Note a context switch. This is a quiescent state for RCU-sched,
+ * and requires special handling for preemptible RCU.
+ * The caller must have disabled preemption.
+ */
+void rcu_note_context_switch(void)
+{
+ trace_rcu_utilization(TPS("Start context switch"));
+ rcu_sched_qs();
+ rcu_preempt_note_context_switch();
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
+ trace_rcu_utilization(TPS("End context switch"));
+}
+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
+
+/*
+ * Register a quiescent state for all RCU flavors. If there is an
+ * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * dyntick-idle quiescent state visible to other CPUs (but only for those
+ * RCU flavors in desperate need of a quiescent state, which will normally
+ * be none of them). Either way, do a lightweight quiescent state for
+ * all RCU flavors.
+ */
+void rcu_all_qs(void)
+{
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
+ this_cpu_inc(rcu_qs_ctr);
+}
+EXPORT_SYMBOL_GPL(rcu_all_qs);
+
+static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
+static long qhimark = 10000; /* If this many pending, ignore blimit. */
+static long qlowmark = 100; /* Once only this many pending, use blimit. */
+
+module_param(blimit, long, 0444);
+module_param(qhimark, long, 0444);
+module_param(qlowmark, long, 0444);
+
+static ulong jiffies_till_first_fqs = ULONG_MAX;
+static ulong jiffies_till_next_fqs = ULONG_MAX;
+
+module_param(jiffies_till_first_fqs, ulong, 0644);
+module_param(jiffies_till_next_fqs, ulong, 0644);
+
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
+
+static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp);
+static void force_qs_rnp(struct rcu_state *rsp,
+ int (*f)(struct rcu_data *rsp, bool *isidle,
+ unsigned long *maxj),
+ bool *isidle, unsigned long *maxj);
+static void force_quiescent_state(struct rcu_state *rsp);
+static int rcu_pending(void);
+
+/*
+ * Return the number of RCU batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started(void)
+{
+ return rcu_state_p->gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started);
+
+/*
+ * Return the number of RCU-sched batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started_sched(void)
+{
+ return rcu_sched_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
+
+/*
+ * Return the number of RCU BH batches started thus far for debug & stats.
+ */
+unsigned long rcu_batches_started_bh(void)
+{
+ return rcu_bh_state.gpnum;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+
+/*
+ * Return the number of RCU batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed(void)
+{
+ return rcu_state_p->completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU-sched batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed_sched(void)
+{
+ return rcu_sched_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
+
+/*
+ * Return the number of RCU BH batches completed thus far for debug & stats.
+ */
+unsigned long rcu_batches_completed_bh(void)
+{
+ return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Force a quiescent state.
+ */
+void rcu_force_quiescent_state(void)
+{
+ force_quiescent_state(rcu_state_p);
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+
+/*
+ * Force a quiescent state for RCU BH.
+ */
+void rcu_bh_force_quiescent_state(void)
+{
+ force_quiescent_state(&rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
+
+/*
+ * Force a quiescent state for RCU-sched.
+ */
+void rcu_sched_force_quiescent_state(void)
+{
+ force_quiescent_state(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+
+/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ pr_info("%s: wait state: %d ->state: %#lx\n",
+ rsp->name, rsp->gp_state, rsp->gp_kthread->state);
+ /* sched_show_task(rsp->gp_kthread); */
+ }
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
+ * Record the number of times rcutorture tests have been initiated and
+ * terminated. This information allows the debugfs tracing stats to be
+ * correlated to the rcutorture messages, even when the rcutorture module
+ * is being repeatedly loaded and unloaded. In other words, we cannot
+ * store this state in rcutorture itself.
+ */
+void rcutorture_record_test_transition(void)
+{
+ rcutorture_testseq++;
+ rcutorture_vernum = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
+
+/*
+ * Send along grace-period-related data for rcutorture diagnostics.
+ */
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+ unsigned long *gpnum, unsigned long *completed)
+{
+ struct rcu_state *rsp = NULL;
+
+ switch (test_type) {
+ case RCU_FLAVOR:
+ rsp = rcu_state_p;
+ break;
+ case RCU_BH_FLAVOR:
+ rsp = &rcu_bh_state;
+ break;
+ case RCU_SCHED_FLAVOR:
+ rsp = &rcu_sched_state;
+ break;
+ default:
+ break;
+ }
+ if (rsp != NULL) {
+ *flags = ACCESS_ONCE(rsp->gp_flags);
+ *gpnum = ACCESS_ONCE(rsp->gpnum);
+ *completed = ACCESS_ONCE(rsp->completed);
+ return;
+ }
+ *flags = 0;
+ *gpnum = 0;
+ *completed = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+
+/*
+ * Record the number of writer passes through the current rcutorture test.
+ * This is also used to correlate debugfs tracing stats with the rcutorture
+ * messages.
+ */
+void rcutorture_record_progress(unsigned long vernum)
+{
+ rcutorture_vernum++;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_progress);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+ return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
+ rdp->nxttail[RCU_DONE_TAIL] != NULL;
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+ return &rsp->node[0];
+}
+
+/*
+ * Is there any need for future grace periods?
+ * Interrupts must be disabled. If the caller does not hold the root
+ * rnp_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_future_needs_gp(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int *fp = &rnp->need_future_gp[idx];
+
+ return ACCESS_ONCE(*fp);
+}
+
+/*
+ * Does the current CPU require a not-yet-started grace period?
+ * The caller must have disabled interrupts to prevent races with
+ * normal callback registry.
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ int i;
+
+ if (rcu_gp_in_progress(rsp))
+ return 0; /* No, a grace period is already in progress. */
+ if (rcu_future_needs_gp(rsp))
+ return 1; /* Yes, a no-CBs CPU needs one. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL])
+ return 0; /* No, this is a no-CBs (or offline) CPU. */
+ if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+ return 1; /* Yes, this CPU has newly registered callbacks. */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
+ ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ rdp->nxtcompleted[i]))
+ return 1; /* Yes, CBs for future grace period. */
+ return 0; /* No grace period needed. */
+}
+
+/*
+ * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
+ *
+ * If the new value of the ->dynticks_nesting counter now is zero,
+ * we really have entered idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_eqs_enter_common(long long oldval, bool user)
+{
+ struct rcu_state *rsp;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
+ if (!user && !is_idle_task(current)) {
+ struct task_struct *idle __maybe_unused =
+ idle_task(smp_processor_id());
+
+ trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
+ ftrace_dump(DUMP_ORIG);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ do_nocb_deferred_wakeup(rdp);
+ }
+ rcu_prepare_for_idle();
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic(); /* Force ordering with next sojourn. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ rcu_dynticks_task_enter();
+
+ /*
+ * It is illegal to enter an extended quiescent state while
+ * in an RCU read-side critical section.
+ */
+ rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
+ "Illegal idle entry in RCU read-side critical section.");
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
+ "Illegal idle entry in RCU-bh read-side critical section.");
+ rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
+ "Illegal idle entry in RCU-sched read-side critical section.");
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ */
+static void rcu_eqs_enter(bool user)
+{
+ long long oldval;
+ struct rcu_dynticks *rdtp;
+
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+ if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
+ rdtp->dynticks_nesting = 0;
+ rcu_eqs_enter_common(oldval, user);
+ } else {
+ rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
+ }
+}
+
+/**
+ * rcu_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * We crowbar the ->dynticks_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+void rcu_idle_enter(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_eqs_enter(false);
+ rcu_sysidle_enter(0);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
+
+#ifdef CONFIG_RCU_USER_QS
+/**
+ * rcu_user_enter - inform RCU that we are resuming userspace.
+ *
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_user_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+void rcu_user_enter(void)
+{
+ rcu_eqs_enter(1);
+}
+#endif /* CONFIG_RCU_USER_QS */
+
+/**
+ * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture violates this assumption, RCU will give you what you
+ * deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_exit(void)
+{
+ unsigned long flags;
+ long long oldval;
+ struct rcu_dynticks *rdtp;
+
+ local_irq_save(flags);
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ rdtp->dynticks_nesting--;
+ WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+ if (rdtp->dynticks_nesting)
+ trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
+ else
+ rcu_eqs_enter_common(oldval, true);
+ rcu_sysidle_enter(1);
+ local_irq_restore(flags);
+}
+
+/*
+ * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
+ *
+ * If the new value of the ->dynticks_nesting counter was previously zero,
+ * we really have exited idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_eqs_exit_common(long long oldval, int user)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ rcu_dynticks_task_exit();
+ smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ rcu_cleanup_after_idle();
+ trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
+ if (!user && !is_idle_task(current)) {
+ struct task_struct *idle __maybe_unused =
+ idle_task(smp_processor_id());
+
+ trace_rcu_dyntick(TPS("Error on exit: not idle task"),
+ oldval, rdtp->dynticks_nesting);
+ ftrace_dump(DUMP_ORIG);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ */
+static void rcu_eqs_exit(bool user)
+{
+ struct rcu_dynticks *rdtp;
+ long long oldval;
+
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ WARN_ON_ONCE(oldval < 0);
+ if (oldval & DYNTICK_TASK_NEST_MASK) {
+ rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
+ } else {
+ rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rcu_eqs_exit_common(oldval, user);
+ }
+}
+
+/**
+ * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
+ * allow for the possibility of usermode upcalls messing up our count
+ * of interrupt nesting level during the busy period that is just
+ * now starting.
+ */
+void rcu_idle_exit(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_eqs_exit(false);
+ rcu_sysidle_exit(0);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
+
+#ifdef CONFIG_RCU_USER_QS
+/**
+ * rcu_user_exit - inform RCU that we are exiting userspace.
+ *
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
+ */
+void rcu_user_exit(void)
+{
+ rcu_eqs_exit(1);
+}
+#endif /* CONFIG_RCU_USER_QS */
+
+/**
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to
+ * user mode! This code assumes that the idle loop never does upcalls to
+ * user mode. If your architecture does do upcalls from the idle loop (or
+ * does anything else that results in unbalanced calls to the irq_enter()
+ * and irq_exit() functions), RCU will give you what you deserve, good
+ * and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_enter(void)
+{
+ unsigned long flags;
+ struct rcu_dynticks *rdtp;
+ long long oldval;
+
+ local_irq_save(flags);
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ rdtp->dynticks_nesting++;
+ WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+ if (oldval)
+ trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
+ else
+ rcu_eqs_exit_common(oldval, true);
+ rcu_sysidle_exit(1);
+ local_irq_restore(flags);
+}
+
+/**
+ * rcu_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
+ * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
+ */
+void rcu_nmi_enter(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ int incby = 2;
+
+ /* Complain about underflow. */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment ->dynticks
+ * to mark non-idle and increment ->dynticks_nmi_nesting by one.
+ * Otherwise, increment ->dynticks_nmi_nesting by two. This means
+ * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
+ smp_mb__before_atomic(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* atomic_inc() before later RCU read-side crit sects */
+ smp_mb__after_atomic(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ incby = 1;
+ }
+ rdtp->dynticks_nmi_nesting += incby;
+ barrier();
+}
+
+/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ */
+void rcu_nmi_exit(void)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /*
+ * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
+ */
+ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+
+ /*
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
+ */
+ if (rdtp->dynticks_nmi_nesting != 1) {
+ rdtp->dynticks_nmi_nesting -= 2;
+ return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ rdtp->dynticks_nmi_nesting = 0;
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic(); /* Force delay to next write. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+}
+
+/**
+ * __rcu_is_watching - are RCU read-side critical sections safe?
+ *
+ * Return true if RCU is watching the running CPU, which means that
+ * this CPU can safely enter RCU read-side critical sections. Unlike
+ * rcu_is_watching(), the caller of __rcu_is_watching() must have at
+ * least disabled preemption.
+ */
+bool notrace __rcu_is_watching(void)
+{
+ return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
+}
+
+/**
+ * rcu_is_watching - see if RCU thinks that the current CPU is idle
+ *
+ * If the current CPU is in its idle loop and is neither in an interrupt
+ * or NMI handler, return true.
+ */
+bool notrace rcu_is_watching(void)
+{
+ bool ret;
+
+ preempt_disable();
+ ret = __rcu_is_watching();
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_is_watching);
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online? Disable preemption to avoid false positives
+ * that could otherwise happen due to the current CPU number being sampled,
+ * this task being preempted, its old CPU being taken offline, resuming
+ * on some other CPU, then determining that its old CPU is now offline.
+ * It is OK to use RCU on an offline processor during initial boot, hence
+ * the check for rcu_scheduler_fully_active. Note also that it is OK
+ * for a CPU coming online to use RCU for one jiffy prior to marking itself
+ * online in the cpu_online_mask. Similarly, it is OK for a CPU going
+ * offline to continue to use RCU for one jiffy after marking itself
+ * offline in the cpu_online_mask. This leniency is necessary given the
+ * non-atomic nature of the online and offline processing, for example,
+ * the fact that a CPU enters the scheduler after completing the CPU_DYING
+ * notifiers.
+ *
+ * This is also why RCU internally marks CPUs online during the
+ * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ *
+ * Disable checking if in an NMI handler because we cannot safely report
+ * errors from NMI handlers anyway.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ bool ret;
+
+ if (in_nmi())
+ return true;
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_sched_data);
+ rnp = rdp->mynode;
+ ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
+ !rcu_scheduler_fully_active;
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+/**
+ * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
+ *
+ * If the current CPU is idle or running at a first-level (not nested)
+ * interrupt from idle, return true. The caller must have at least
+ * disabled preemption.
+ */
+static int rcu_is_cpu_rrupt_from_idle(void)
+{
+ return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state. Return 1 if this CPU
+ * is in dynticks idle mode, which is an extended quiescent state.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp,
+ bool *isidle, unsigned long *maxj)
+{
+ rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ rcu_sysidle_check_cpu(rdp, isidle, maxj);
+ if ((rdp->dynticks_snap & 0x1) == 0) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
+ return 1;
+ } else {
+ if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+ rdp->mynode->gpnum))
+ ACCESS_ONCE(rdp->gpwrap) = true;
+ return 0;
+ }
+}
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through an dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU, or by virtue of having been offline.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
+ bool *isidle, unsigned long *maxj)
+{
+ unsigned int curr;
+ int *rcrmp;
+ unsigned int snap;
+
+ curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
+ snap = (unsigned int)rdp->dynticks_snap;
+
+ /*
+ * If the CPU passed through or entered a dynticks idle phase with
+ * no active irq/NMI handlers, then we can safely pretend that the CPU
+ * already acknowledged the request to pass through a quiescent
+ * state. Either way, that CPU cannot possibly be in an RCU
+ * read-side critical section that started before the beginning
+ * of the current RCU grace period.
+ */
+ if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
+ rdp->dynticks_fqs++;
+ return 1;
+ }
+
+ /*
+ * Check for the CPU being offline, but only if the grace period
+ * is old enough. We don't need to worry about the CPU changing
+ * state: If we see it offline even once, it has been through a
+ * quiescent state.
+ *
+ * The reason for insisting that the grace period be at least
+ * one jiffy old is that CPUs that are not quite online and that
+ * have just gone offline can still execute RCU read-side critical
+ * sections.
+ */
+ if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
+ return 0; /* Grace period is not old enough. */
+ barrier();
+ if (cpu_is_offline(rdp->cpu)) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
+ rdp->offline_fqs++;
+ return 1;
+ }
+
+ /*
+ * A CPU running for an extended time within the kernel can
+ * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
+ * even context-switching back and forth between a pair of
+ * in-kernel CPU-bound tasks cannot advance grace periods.
+ * So if the grace period is old enough, make the CPU pay attention.
+ * Note that the unsynchronized assignments to the per-CPU
+ * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * bits can be lost, but they will be set again on the next
+ * force-quiescent-state pass. So lost bit sets do not result
+ * in incorrect behavior, merely in a grace period lasting
+ * a few jiffies longer than it might otherwise. Because
+ * there are at most four threads involved, and because the
+ * updates are only once every few jiffies, the probability of
+ * lossage (and thus of slight grace-period extension) is
+ * quite low.
+ *
+ * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+ * is set too high, we override with half of the RCU CPU stall
+ * warning delay.
+ */
+ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+ if (ULONG_CMP_GE(jiffies,
+ rdp->rsp->gp_start + jiffies_till_sched_qs) ||
+ ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ ACCESS_ONCE(rdp->cond_resched_completed) =
+ ACCESS_ONCE(rdp->mynode->completed);
+ smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+ ACCESS_ONCE(*rcrmp) =
+ ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+ } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ /* Time to beat on that CPU again! */
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ }
+ }
+
+ return 0;
+}
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+ unsigned long j = jiffies;
+ unsigned long j1;
+
+ rsp->gp_start = j;
+ smp_wmb(); /* Record start time before stall time. */
+ j1 = rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+ rsp->jiffies_resched = j + j1 / 2;
+ rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+}
+
+/*
+ * Complain about starvation of grace-period kthread.
+ */
+static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
+{
+ unsigned long gpa;
+ unsigned long j;
+
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ if (j - gpa > 2 * HZ)
+ pr_err("%s kthread starved for %ld jiffies!\n",
+ rsp->name, j - gpa);
+}
+
+/*
+ * Dump stacks of all tasks running on stalled CPUs.
+ */
+static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (rnp->qsmask != 0) {
+ for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+ if (rnp->qsmask & (1UL << cpu))
+ dump_cpu_task(rnp->grplo + cpu);
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
+{
+ int cpu;
+ long delta;
+ unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
+ int ndetected = 0;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ long totqlen = 0;
+
+ /* Only let one CPU complain about others per time interval. */
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+ if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s detected stalls on CPUs/tasks:",
+ rsp->name);
+ print_cpu_stall_info_begin();
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ ndetected += rcu_print_task_stall(rnp);
+ if (rnp->qsmask != 0) {
+ for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+ if (rnp->qsmask & (1UL << cpu)) {
+ print_cpu_stall_info(rsp,
+ rnp->grplo + cpu);
+ ndetected++;
+ }
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+
+ print_cpu_stall_info_end();
+ for_each_possible_cpu(cpu)
+ totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
+ smp_processor_id(), (long)(jiffies - rsp->gp_start),
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
+ if (ndetected) {
+ rcu_dump_cpu_stacks(rsp);
+ } else {
+ if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
+ ACCESS_ONCE(rsp->completed) == gpnum) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = ACCESS_ONCE(rsp->gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
+ rsp->name, j - gpa, j, gpa,
+ jiffies_till_next_fqs,
+ rcu_get_root(rsp)->qsmask);
+ /* In this case, the current CPU might be at fault. */
+ sched_show_task(current);
+ }
+ }
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_print_detail_task_stall(rsp);
+
+ rcu_check_gp_kthread_starvation(rsp);
+
+ force_quiescent_state(rsp); /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ long totqlen = 0;
+
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s self-detected stall on CPU", rsp->name);
+ print_cpu_stall_info_begin();
+ print_cpu_stall_info(rsp, smp_processor_id());
+ print_cpu_stall_info_end();
+ for_each_possible_cpu(cpu)
+ totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
+ pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
+ jiffies - rsp->gp_start,
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
+
+ rcu_check_gp_kthread_starvation(rsp);
+
+ rcu_dump_cpu_stacks(rsp);
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
+ 3 * rcu_jiffies_till_stall_check() + 3;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ /*
+ * Attempt to revive the RCU machinery by forcing a context switch.
+ *
+ * A context switch would normally allow the RCU state machine to make
+ * progress and it could be we're stuck in kernel space without context
+ * switches for an entirely unreasonable amount of time.
+ */
+ resched_cpu(smp_processor_id());
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long completed;
+ unsigned long gpnum;
+ unsigned long gps;
+ unsigned long j;
+ unsigned long js;
+ struct rcu_node *rnp;
+
+ if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
+ return;
+ j = jiffies;
+
+ /*
+ * Lots of memory barriers to reject false positives.
+ *
+ * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
+ * then rsp->gp_start, and finally rsp->completed. These values
+ * are updated in the opposite order with memory barriers (or
+ * equivalent) during grace-period initialization and cleanup.
+ * Now, a false positive can occur if we get an new value of
+ * rsp->gp_start and a old value of rsp->jiffies_stall. But given
+ * the memory barriers, the only way that this can happen is if one
+ * grace period ends and another starts between these two fetches.
+ * Detect this by comparing rsp->completed with the previous fetch
+ * from rsp->gpnum.
+ *
+ * Given this check, comparisons of jiffies, rsp->jiffies_stall,
+ * and rsp->gp_start suffice to forestall false positives.
+ */
+ gpnum = ACCESS_ONCE(rsp->gpnum);
+ smp_rmb(); /* Pick up ->gpnum first... */
+ js = ACCESS_ONCE(rsp->jiffies_stall);
+ smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+ gps = ACCESS_ONCE(rsp->gp_start);
+ smp_rmb(); /* ...and finally ->gp_start before ->completed. */
+ completed = ACCESS_ONCE(rsp->completed);
+ if (ULONG_CMP_GE(completed, gpnum) ||
+ ULONG_CMP_LT(j, js) ||
+ ULONG_CMP_GE(gps, js))
+ return; /* No stall or GP completed since entering function. */
+ rnp = rdp->mynode;
+ if (rcu_gp_in_progress(rsp) &&
+ (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(rsp);
+
+ } else if (rcu_gp_in_progress(rsp) &&
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
+
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(rsp, gpnum);
+ }
+}
+
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+}
+
+/*
+ * Initialize the specified rcu_data structure's default callback list
+ * to empty. The default callback list is the one that is not used by
+ * no-callbacks CPUs.
+ */
+static void init_default_callback_list(struct rcu_data *rdp)
+{
+ int i;
+
+ rdp->nxtlist = NULL;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+ if (init_nocb_callback_list(rdp))
+ return;
+ init_default_callback_list(rdp);
+}
+
+/*
+ * Determine the value that ->completed will have at the end of the
+ * next subsequent grace period. This is used to tag callbacks so that
+ * a CPU can invoke callbacks in a timely fashion even if that CPU has
+ * been dyntick-idle for an extended period with callbacks under the
+ * influence of RCU_FAST_NO_HZ.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ /*
+ * If RCU is idle, we just wait for the next grace period.
+ * But we can only be sure that RCU is idle if we are looking
+ * at the root rcu_node structure -- otherwise, a new grace
+ * period might have started, but just not yet gotten around
+ * to initializing the current non-root rcu_node structure.
+ */
+ if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
+ return rnp->completed + 1;
+
+ /*
+ * Otherwise, wait for a possible partial grace period and
+ * then the subsequent full grace period.
+ */
+ return rnp->completed + 2;
+}
+
+/*
+ * Trace-event helper function for rcu_start_future_gp() and
+ * rcu_nocb_wait_gp().
+ */
+static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, const char *s)
+{
+ trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
+ rnp->completed, c, rnp->level,
+ rnp->grplo, rnp->grphi, s);
+}
+
+/*
+ * Start some future grace period, as needed to handle newly arrived
+ * callbacks. The required future grace periods are recorded in each
+ * rcu_node structure's ->need_future_gp field. Returns true if there
+ * is reason to awaken the grace-period kthread.
+ *
+ * The caller must hold the specified rcu_node structure's ->lock.
+ */
+static bool __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long *c_out)
+{
+ unsigned long c;
+ int i;
+ bool ret = false;
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ /*
+ * Pick up grace-period number for new callbacks. If this
+ * grace period is already marked as needed, return to the caller.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
+ if (rnp->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
+ goto out;
+ }
+
+ /*
+ * If either this rcu_node structure or the root rcu_node structure
+ * believe that a grace period is in progress, then we must wait
+ * for the one following, which is in "c". Because our request
+ * will be noticed at the end of the current grace period, we don't
+ * need to explicitly start one. We only do the lockless check
+ * of rnp_root's fields if the current rcu_node structure thinks
+ * there is no grace period in flight, and because we hold rnp->lock,
+ * the only possible change is when rnp_root's two fields are
+ * equal, in which case rnp_root->gpnum might be concurrently
+ * incremented. But that is OK, as it will just result in our
+ * doing some extra useless work.
+ */
+ if (rnp->gpnum != rnp->completed ||
+ ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+ rnp->need_future_gp[c & 0x1]++;
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
+ goto out;
+ }
+
+ /*
+ * There might be no grace period in progress. If we don't already
+ * hold it, acquire the root rcu_node structure's lock in order to
+ * start one (if needed).
+ */
+ if (rnp != rnp_root) {
+ raw_spin_lock(&rnp_root->lock);
+ smp_mb__after_unlock_lock();
+ }
+
+ /*
+ * Get a new grace-period number. If there really is no grace
+ * period in progress, it will be smaller than the one we obtained
+ * earlier. Adjust callbacks as needed. Note that even no-CBs
+ * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp_root);
+ for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
+ rdp->nxtcompleted[i] = c;
+
+ /*
+ * If the needed for the required grace period is already
+ * recorded, trace and leave.
+ */
+ if (rnp_root->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
+ goto unlock_out;
+ }
+
+ /* Record the need for the future grace period. */
+ rnp_root->need_future_gp[c & 0x1]++;
+
+ /* If a grace period is not already in progress, start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
+ } else {
+ trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
+ ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ }
+unlock_out:
+ if (rnp != rnp_root)
+ raw_spin_unlock(&rnp_root->lock);
+out:
+ if (c_out != NULL)
+ *c_out = c;
+ return ret;
+}
+
+/*
+ * Clean up any old requests for the just-ended grace period. Also return
+ * whether any additional grace periods have been requested. Also invoke
+ * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
+ * waiting for this grace period to complete.
+ */
+static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ int c = rnp->completed;
+ int needmore;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+
+ rcu_nocb_gp_cleanup(rsp, rnp);
+ rnp->need_future_gp[c & 0x1] = 0;
+ needmore = rnp->need_future_gp[(c + 1) & 0x1];
+ trace_rcu_future_gp(rnp, rdp, c,
+ needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+ return needmore;
+}
+
+/*
+ * Awaken the grace-period kthread for the specified flavor of RCU.
+ * Don't do a self-awaken, and don't bother awakening when there is
+ * nothing for the grace-period kthread to do (as in several CPUs
+ * raced to awaken, and we lost), and finally don't try to awaken
+ * a kthread that has not yet been created.
+ */
+static void rcu_gp_kthread_wake(struct rcu_state *rsp)
+{
+ if (current == rsp->gp_kthread ||
+ !ACCESS_ONCE(rsp->gp_flags) ||
+ !rsp->gp_kthread)
+ return;
+ wake_up(&rsp->gp_wq);
+}
+
+/*
+ * If there is room, assign a ->completed number to any callbacks on
+ * this CPU that have not already been assigned. Also accelerate any
+ * callbacks that were previously assigned a ->completed number that has
+ * since proven to be too conservative, which can happen if callbacks get
+ * assigned a ->completed number while RCU is idle, but with reference to
+ * a non-root rcu_node structure. This function is idempotent, so it does
+ * not hurt to call it repeatedly. Returns an flag saying that we should
+ * awaken the RCU grace-period kthread.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+ bool ret;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return false;
+
+ /*
+ * Starting from the sublist containing the callbacks most
+ * recently assigned a ->completed number and working down, find the
+ * first sublist that is not assignable to an upcoming grace period.
+ * Such a sublist has something in it (first two tests) and has
+ * a ->completed number assigned that will complete sooner than
+ * the ->completed number for newly arrived callbacks (last test).
+ *
+ * The key point is that any later sublist can be assigned the
+ * same ->completed number as the newly arrived callbacks, which
+ * means that the callbacks in any of these later sublist can be
+ * grouped into a single sublist, whether or not they have already
+ * been assigned a ->completed number.
+ */
+ c = rcu_cbs_completed(rsp, rnp);
+ for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
+ !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
+ break;
+
+ /*
+ * If there are no sublist for unassigned callbacks, leave.
+ * At the same time, advance "i" one sublist, so that "i" will
+ * index into the sublist where all the remaining callbacks should
+ * be grouped into.
+ */
+ if (++i >= RCU_NEXT_TAIL)
+ return false;
+
+ /*
+ * Assign all subsequent callbacks' ->completed number to the next
+ * full grace period and group them all in the sublist initially
+ * indexed by "i".
+ */
+ for (; i <= RCU_NEXT_TAIL; i++) {
+ rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
+ rdp->nxtcompleted[i] = c;
+ }
+ /* Record any needed additional grace periods. */
+ ret = rcu_start_future_gp(rnp, rdp, NULL);
+
+ /* Trace depending on how much we were able to accelerate. */
+ if (!*rdp->nxttail[RCU_WAIT_TAIL])
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
+ else
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
+ return ret;
+}
+
+/*
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist. This function is idempotent, so it does not hurt to
+ * invoke it repeatedly. As long as it is not invoked -too- often...
+ * Returns true if the RCU grace-period kthread needs to be awakened.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ int i, j;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return false;
+
+ /*
+ * Find all callbacks whose ->completed numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+ break;
+ rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
+ }
+ /* Clean up any sublist tail pointers that were misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
+
+ /* Copy down callbacks to fill in empty sublists. */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+ break;
+ rdp->nxttail[j] = rdp->nxttail[i];
+ rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
+ }
+
+ /* Classify any remaining callbacks. */
+ return rcu_accelerate_cbs(rsp, rnp, rdp);
+}
+
+/*
+ * Update CPU-local rcu_data state to record the beginnings and ends of
+ * grace periods. The caller must hold the ->lock of the leaf rcu_node
+ * structure corresponding to the current CPU, and must have irqs disabled.
+ * Returns true if the grace-period kthread needs to be awakened.
+ */
+static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ bool ret;
+
+ /* Handle the ends of any preceding grace periods first. */
+ if (rdp->completed == rnp->completed &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+
+ /* No grace period end, so just accelerate recent callbacks. */
+ ret = rcu_accelerate_cbs(rsp, rnp, rdp);
+
+ } else {
+
+ /* Advance callbacks. */
+ ret = rcu_advance_cbs(rsp, rnp, rdp);
+
+ /* Remember that we saw this grace-period completion. */
+ rdp->completed = rnp->completed;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
+ }
+
+ if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+ /*
+ * If the current grace period is waiting for this CPU,
+ * set up to detect a quiescent state, otherwise don't
+ * go looking for one.
+ */
+ rdp->gpnum = rnp->gpnum;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
+ rdp->passed_quiesce = 0;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
+ zero_cpu_stall_ticks(rdp);
+ ACCESS_ONCE(rdp->gpwrap) = false;
+ }
+ return ret;
+}
+
+static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+ bool needwake;
+ struct rcu_node *rnp;
+
+ local_irq_save(flags);
+ rnp = rdp->mynode;
+ if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
+ rdp->completed == ACCESS_ONCE(rnp->completed) &&
+ !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+ !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+ local_irq_restore(flags);
+ return;
+ }
+ smp_mb__after_unlock_lock();
+ needwake = __note_gp_changes(rsp, rnp, rdp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+}
+
+/*
+ * Initialize a new grace period. Return 0 if no grace period required.
+ */
+static int rcu_gp_init(struct rcu_state *rsp)
+{
+ unsigned long oldmask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ if (!ACCESS_ONCE(rsp->gp_flags)) {
+ /* Spurious wakeup, tell caller to go back to sleep. */
+ raw_spin_unlock_irq(&rnp->lock);
+ return 0;
+ }
+ ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+
+ if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
+ /*
+ * Grace period already in progress, don't start another.
+ * Not supposed to be able to happen.
+ */
+ raw_spin_unlock_irq(&rnp->lock);
+ return 0;
+ }
+
+ /* Advance to a new grace period and initialize state. */
+ record_gp_stall_check_time(rsp);
+ /* Record GP times before starting GP, hence smp_store_release(). */
+ smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
+ trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
+ raw_spin_unlock_irq(&rnp->lock);
+
+ /*
+ * Apply per-leaf buffered online and offline operations to the
+ * rcu_node tree. Note that this new grace period need not wait
+ * for subsequent online CPUs, and that quiescent-state forcing
+ * will handle subsequent offline CPUs.
+ */
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
+ !rnp->wait_blkd_tasks) {
+ /* Nothing to do on this leaf rcu_node structure. */
+ raw_spin_unlock_irq(&rnp->lock);
+ continue;
+ }
+
+ /* Record old state, apply changes to ->qsmaskinit field. */
+ oldmask = rnp->qsmaskinit;
+ rnp->qsmaskinit = rnp->qsmaskinitnext;
+
+ /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
+ if (!oldmask != !rnp->qsmaskinit) {
+ if (!oldmask) /* First online CPU for this rcu_node. */
+ rcu_init_new_rnp(rnp);
+ else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
+ rnp->wait_blkd_tasks = true;
+ else /* Last offline CPU and can propagate. */
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ /*
+ * If all waited-on tasks from prior grace period are
+ * done, and if all this rcu_node structure's CPUs are
+ * still offline, propagate up the rcu_node tree and
+ * clear ->wait_blkd_tasks. Otherwise, if one of this
+ * rcu_node structure's CPUs has since come back online,
+ * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
+ * checks for this, so just call it unconditionally).
+ */
+ if (rnp->wait_blkd_tasks &&
+ (!rcu_preempt_has_tasks(rnp) ||
+ rnp->qsmaskinit)) {
+ rnp->wait_blkd_tasks = false;
+ rcu_cleanup_dead_rnp(rnp);
+ }
+
+ raw_spin_unlock_irq(&rnp->lock);
+ }
+
+ /*
+ * Set the quiescent-state-needed bits in all the rcu_node
+ * structures for all currently online CPUs in breadth-first order,
+ * starting from the root rcu_node structure, relying on the layout
+ * of the tree within the rsp->node[] array. Note that other CPUs
+ * will access only the leaves of the hierarchy, thus seeing that no
+ * grace period is in progress, at least until the corresponding
+ * leaf node has been initialized. In addition, we have excluded
+ * CPU-hotplug operations.
+ *
+ * The grace period cannot complete until the initialization
+ * process finishes, because this kthread handles both.
+ */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ rdp = this_cpu_ptr(rsp->rda);
+ rcu_preempt_check_blocked_tasks(rnp);
+ rnp->qsmask = rnp->qsmaskinit;
+ ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+ if (WARN_ON_ONCE(rnp->completed != rsp->completed))
+ ACCESS_ONCE(rnp->completed) = rsp->completed;
+ if (rnp == rdp->mynode)
+ (void)__note_gp_changes(rsp, rnp, rdp);
+ rcu_preempt_boost_start_gp(rnp);
+ trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+ rnp->level, rnp->grplo,
+ rnp->grphi, rnp->qsmask);
+ raw_spin_unlock_irq(&rnp->lock);
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ if (gp_init_delay > 0 &&
+ !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
+ schedule_timeout_uninterruptible(gp_init_delay);
+ }
+
+ return 1;
+}
+
+/*
+ * Do one round of quiescent-state forcing.
+ */
+static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
+{
+ int fqs_state = fqs_state_in;
+ bool isidle = false;
+ unsigned long maxj;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ rsp->n_force_qs++;
+ if (fqs_state == RCU_SAVE_DYNTICK) {
+ /* Collect dyntick-idle snapshots. */
+ if (is_sysidle_rcu_state(rsp)) {
+ isidle = true;
+ maxj = jiffies - ULONG_MAX / 4;
+ }
+ force_qs_rnp(rsp, dyntick_save_progress_counter,
+ &isidle, &maxj);
+ rcu_sysidle_report_gp(rsp, isidle, maxj);
+ fqs_state = RCU_FORCE_QS;
+ } else {
+ /* Handle dyntick-idle and offline CPUs. */
+ isidle = true;
+ force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+ }
+ /* Clear flag to prevent immediate re-entry. */
+ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ ACCESS_ONCE(rsp->gp_flags) =
+ ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+ raw_spin_unlock_irq(&rnp->lock);
+ }
+ return fqs_state;
+}
+
+/*
+ * Clean up after the old grace period.
+ */
+static void rcu_gp_cleanup(struct rcu_state *rsp)
+{
+ unsigned long gp_duration;
+ bool needgp = false;
+ int nocb = 0;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ gp_duration = jiffies - rsp->gp_start;
+ if (gp_duration > rsp->gp_max)
+ rsp->gp_max = gp_duration;
+
+ /*
+ * We know the grace period is complete, but to everyone else
+ * it appears to still be ongoing. But it is also the case
+ * that to everyone else it looks like there is nothing that
+ * they can do to advance the grace period. It is therefore
+ * safe for us to drop the lock in order to mark the grace
+ * period as completed in all of the rcu_node structures.
+ */
+ raw_spin_unlock_irq(&rnp->lock);
+
+ /*
+ * Propagate new ->completed value to rcu_node structures so
+ * that other CPUs don't have to wait until the start of the next
+ * grace period to process their callbacks. This also avoids
+ * some nasty RCU grace-period initialization races by forcing
+ * the end of the current grace period to be completely recorded in
+ * all of the rcu_node structures before the beginning of the next
+ * grace period is recorded in any of the rcu_node structures.
+ */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ WARN_ON_ONCE(rnp->qsmask);
+ ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+ rdp = this_cpu_ptr(rsp->rda);
+ if (rnp == rdp->mynode)
+ needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
+ /* smp_mb() provided by prior unlock-lock pair. */
+ nocb += rcu_future_gp_cleanup(rsp, rnp);
+ raw_spin_unlock_irq(&rnp->lock);
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ }
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock_irq(&rnp->lock);
+ smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+ rcu_nocb_gp_set(rnp, nocb);
+
+ /* Declare grace period done. */
+ ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+ trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
+ rsp->fqs_state = RCU_GP_IDLE;
+ rdp = this_cpu_ptr(rsp->rda);
+ /* Advance CBs to reduce false positives below. */
+ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
+ if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("newreq"));
+ }
+ raw_spin_unlock_irq(&rnp->lock);
+}
+
+/*
+ * Body of kthread that handles grace periods.
+ */
+static int __noreturn rcu_gp_kthread(void *arg)
+{
+ int fqs_state;
+ int gf;
+ unsigned long j;
+ int ret;
+ struct rcu_state *rsp = arg;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ rcu_bind_gp_kthread();
+ for (;;) {
+
+ /* Handle grace-period start. */
+ for (;;) {
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("reqwait"));
+ rsp->gp_state = RCU_GP_WAIT_GPS;
+ wait_event_interruptible(rsp->gp_wq,
+ ACCESS_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
+ /* Locking provides needed memory barrier. */
+ if (rcu_gp_init(rsp))
+ break;
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WARN_ON(signal_pending(current));
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("reqwaitsig"));
+ }
+
+ /* Handle quiescent-state forcing. */
+ fqs_state = RCU_SAVE_DYNTICK;
+ j = jiffies_till_first_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_first_fqs = HZ;
+ }
+ ret = 0;
+ for (;;) {
+ if (!ret)
+ rsp->jiffies_force_qs = jiffies + j;
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqswait"));
+ rsp->gp_state = RCU_GP_WAIT_FQS;
+ ret = wait_event_interruptible_timeout(rsp->gp_wq,
+ ((gf = ACCESS_ONCE(rsp->gp_flags)) &
+ RCU_GP_FLAG_FQS) ||
+ (!ACCESS_ONCE(rnp->qsmask) &&
+ !rcu_preempt_blocked_readers_cgp(rnp)),
+ j);
+ /* Locking provides needed memory barriers. */
+ /* If grace period done, leave loop. */
+ if (!ACCESS_ONCE(rnp->qsmask) &&
+ !rcu_preempt_blocked_readers_cgp(rnp))
+ break;
+ /* If time for quiescent-state forcing, do it. */
+ if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
+ (gf & RCU_GP_FLAG_FQS)) {
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqsstart"));
+ fqs_state = rcu_gp_fqs(rsp, fqs_state);
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqsend"));
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ } else {
+ /* Deal with stray signal. */
+ cond_resched_rcu_qs();
+ ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ WARN_ON(signal_pending(current));
+ trace_rcu_grace_period(rsp->name,
+ ACCESS_ONCE(rsp->gpnum),
+ TPS("fqswaitsig"));
+ }
+ j = jiffies_till_next_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_next_fqs = HZ;
+ } else if (j < 1) {
+ j = 1;
+ jiffies_till_next_fqs = 1;
+ }
+ }
+
+ /* Handle grace-period end. */
+ rcu_gp_cleanup(rsp);
+ }
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period. The caller must hold
+ * the root node's ->lock and hard irqs must be disabled.
+ *
+ * Note that it is legal for a dying CPU (which is marked as offline) to
+ * invoke this function. This can happen when the dying CPU reports its
+ * quiescent state.
+ *
+ * Returns true if the grace-period kthread must be awakened.
+ */
+static bool
+rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
+ /*
+ * Either we have not yet spawned the grace-period
+ * task, this CPU does not need another grace period,
+ * or a grace period is already in progress.
+ * Either way, don't start a new grace period.
+ */
+ return false;
+ }
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+ trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+ TPS("newreq"));
+
+ /*
+ * We can't do wakeups while holding the rnp->lock, as that
+ * could cause possible deadlocks with the rq->lock. Defer
+ * the wakeup to our caller.
+ */
+ return true;
+}
+
+/*
+ * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
+ * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
+ * is invoked indirectly from rcu_advance_cbs(), which would result in
+ * endless recursion -- or would do so if it wasn't for the self-deadlock
+ * that is encountered beforehand.
+ *
+ * Returns true if the grace-period kthread needs to be awakened.
+ */
+static bool rcu_start_gp(struct rcu_state *rsp)
+{
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ bool ret = false;
+
+ /*
+ * If there is no grace period in progress right now, any
+ * callbacks we have up to this point will be satisfied by the
+ * next grace period. Also, advancing the callbacks reduces the
+ * probability of false positives from cpu_needs_another_gp()
+ * resulting in pointless grace periods. So, advance callbacks
+ * then start the grace period!
+ */
+ ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
+ ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
+ return ret;
+}
+
+/*
+ * Report a full set of quiescent states to the specified rcu_state
+ * data structure. This involves cleaning up after the prior grace
+ * period and letting rcu_start_gp() start up the next grace period
+ * if one is needed. Note that the caller must hold rnp->lock, which
+ * is released before return.
+ */
+static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
+ __releases(rcu_get_root(rsp)->lock)
+{
+ WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ rcu_gp_kthread_wake(rsp);
+}
+
+/*
+ * Similar to rcu_report_qs_rdp(), for which it is a helper function.
+ * Allows quiescent states for a group of CPUs to be reported at one go
+ * to the specified rcu_node structure, though all the CPUs in the group
+ * must be represented by the same rcu_node structure (which need not be a
+ * leaf rcu_node structure, though it often will be). The gps parameter
+ * is the grace-period snapshot, which means that the quiescent states
+ * are valid only if rnp->gpnum is equal to gps. That structure's lock
+ * must be held upon entry, and it is released before return.
+ */
+static void
+rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long gps, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long oldmask = 0;
+ struct rcu_node *rnp_c;
+
+ /* Walk up the rcu_node hierarchy. */
+ for (;;) {
+ if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
+
+ /*
+ * Our bit has already been cleared, or the
+ * relevant grace period is already over, so done.
+ */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
+ rnp->qsmask &= ~mask;
+ trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
+ mask, rnp->qsmask, rnp->level,
+ rnp->grplo, rnp->grphi,
+ !!rnp->gp_tasks);
+ if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+
+ /* Other bits still set at this level, so done. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ mask = rnp->grpmask;
+ if (rnp->parent == NULL) {
+
+ /* No more levels. Exit loop holding root lock. */
+
+ break;
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rnp_c = rnp;
+ rnp = rnp->parent;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ oldmask = rnp_c->qsmask;
+ }
+
+ /*
+ * Get here if we are the last CPU to pass through a quiescent
+ * state for this grace period. Invoke rcu_report_qs_rsp()
+ * to clean up and start the next grace period if one is needed.
+ */
+ rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period. The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
+ struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long gps;
+ unsigned long mask;
+ struct rcu_node *rnp_p;
+
+ if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
+ rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return; /* Still need more quiescent states! */
+ }
+
+ rnp_p = rnp->parent;
+ if (rnp_p == NULL) {
+ /*
+ * Only one rcu_node structure in the tree, so don't
+ * try to report up to its nonexistent parent!
+ */
+ rcu_report_qs_rsp(rsp, flags);
+ return;
+ }
+
+ /* Report up the rest of the hierarchy, tracking current ->gpnum. */
+ gps = rnp->gpnum;
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
+}
+
+/*
+ * Record a quiescent state for the specified CPU to that CPU's rcu_data
+ * structure. This must be either called from the specified CPU, or
+ * called when the specified CPU is known to be offline (and when it is
+ * also known that no other CPU is concurrently trying to help the offline
+ * CPU). The lastcomp argument is used to make sure we are still in the
+ * grace period of interest. We don't want to end the current grace period
+ * based on quiescent states detected in an earlier grace period!
+ */
+static void
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ bool needwake;
+ struct rcu_node *rnp;
+
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if ((rdp->passed_quiesce == 0 &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
+ rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
+ rdp->gpwrap) {
+
+ /*
+ * The grace period in which this quiescent state was
+ * recorded has ended, so don't report it upwards.
+ * We will instead need a new quiescent state that lies
+ * within the current grace period.
+ */
+ rdp->passed_quiesce = 0; /* need qs for new gp. */
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ mask = rdp->grpmask;
+ if ((rnp->qsmask & mask) == 0) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else {
+ rdp->qs_pending = 0;
+
+ /*
+ * This GP can't end until cpu checks in, so all of our
+ * callbacks can be processed during the next GP.
+ */
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ /* ^^^ Released rnp->lock */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ }
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ /* Check for grace-period ends and beginnings. */
+ note_gp_changes(rsp, rdp);
+
+ /*
+ * Does this CPU still need to do its part for current grace period?
+ * If no, return and let the other CPUs do their part as well.
+ */
+ if (!rdp->qs_pending)
+ return;
+
+ /*
+ * Was there a quiescent state since the beginning of the grace
+ * period? If no, then exit and wait for the next call.
+ */
+ if (!rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
+ return;
+
+ /*
+ * Tell RCU we are done (but rcu_report_qs_rdp() will be the
+ * judge of that).
+ */
+ rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Send the specified CPU's RCU callbacks to the orphanage. The
+ * specified CPU must be offline, and the caller must hold the
+ * ->orphan_lock.
+ */
+static void
+rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
+ struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ /* No-CBs CPUs do not have orphanable callbacks. */
+ if (rcu_is_nocb_cpu(rdp->cpu))
+ return;
+
+ /*
+ * Orphan the callbacks. First adjust the counts. This is safe
+ * because _rcu_barrier() excludes CPU-hotplug operations, so it
+ * cannot be running now. Thus no memory barrier is required.
+ */
+ if (rdp->nxtlist != NULL) {
+ rsp->qlen_lazy += rdp->qlen_lazy;
+ rsp->qlen += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
+ rdp->qlen_lazy = 0;
+ ACCESS_ONCE(rdp->qlen) = 0;
+ }
+
+ /*
+ * Next, move those callbacks still needing a grace period to
+ * the orphanage, where some other CPU will pick them up.
+ * Some of the callbacks might have gone partway through a grace
+ * period, but that is too bad. They get to start over because we
+ * cannot assume that grace periods are synchronized across CPUs.
+ * We don't bother updating the ->nxttail[] array yet, instead
+ * we just reset the whole thing later on.
+ */
+ if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
+ *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
+ rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = NULL;
+ }
+
+ /*
+ * Then move the ready-to-invoke callbacks to the orphanage,
+ * where some other CPU will pick them up. These will not be
+ * required to pass though another grace period: They are done.
+ */
+ if (rdp->nxtlist != NULL) {
+ *rsp->orphan_donetail = rdp->nxtlist;
+ rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
+ }
+
+ /*
+ * Finally, initialize the rcu_data structure's list to empty and
+ * disallow further callbacks on this CPU.
+ */
+ init_callback_list(rdp);
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+}
+
+/*
+ * Adopt the RCU callbacks from the specified rcu_state structure's
+ * orphanage. The caller must hold the ->orphan_lock.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
+{
+ int i;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+
+ /* No-CBs CPUs are handled specially. */
+ if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+ return;
+
+ /* Do the accounting first. */
+ rdp->qlen_lazy += rsp->qlen_lazy;
+ rdp->qlen += rsp->qlen;
+ rdp->n_cbs_adopted += rsp->qlen;
+ if (rsp->qlen_lazy != rsp->qlen)
+ rcu_idle_count_callbacks_posted();
+ rsp->qlen_lazy = 0;
+ rsp->qlen = 0;
+
+ /*
+ * We do not need a memory barrier here because the only way we
+ * can get here if there is an rcu_barrier() in flight is if
+ * we are the task doing the rcu_barrier().
+ */
+
+ /* First adopt the ready-to-invoke callbacks. */
+ if (rsp->orphan_donelist != NULL) {
+ *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
+ for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = rsp->orphan_donetail;
+ rsp->orphan_donelist = NULL;
+ rsp->orphan_donetail = &rsp->orphan_donelist;
+ }
+
+ /* And then adopt the callbacks that still need a grace period. */
+ if (rsp->orphan_nxtlist != NULL) {
+ *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
+ rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
+ rsp->orphan_nxtlist = NULL;
+ rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ }
+}
+
+/*
+ * Trace the fact that this CPU is going offline.
+ */
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+ RCU_TRACE(unsigned long mask);
+ RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
+ RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
+
+ RCU_TRACE(mask = rdp->grpmask);
+ trace_rcu_grace_period(rsp->name,
+ rnp->gpnum + 1 - !!(rnp->qsmask & mask),
+ TPS("cpuofl"));
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock(); /* GP memory ordering. */
+ rnp->qsmaskinit &= ~mask;
+ rnp->qsmask &= ~mask;
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+}
+
+/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function. We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ mask = rdp->grpmask;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ rnp->qsmaskinitnext &= ~mask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup,
+ * including orphaning the outgoing CPU's RCU callbacks, and also
+ * adopting them. There can only be one CPU hotplug operation at a time,
+ * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ */
+static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+
+ /* Adjust any no-longer-needed kthreads. */
+ rcu_boost_kthread_setaffinity(rnp, -1);
+
+ /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+ raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
+ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
+ rcu_adopt_orphan_cbs(rsp, flags);
+ raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
+
+ WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
+ cpu, rdp->qlen, rdp->nxtlist);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
+{
+}
+
+static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+}
+
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+}
+
+static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period. Thottle as specified by rdp->blimit.
+ */
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_head *next, *list, **tail;
+ long bl, count, count_lazy;
+ int i;
+
+ /* If no callbacks are ready, just return. */
+ if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
+ trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
+ trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+ need_resched(), is_idle_task(current),
+ rcu_is_callbacks_kthread());
+ return;
+ }
+
+ /*
+ * Extract the list of ready callbacks, disabling to prevent
+ * races with call_rcu() from interrupt handlers.
+ */
+ local_irq_save(flags);
+ WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+ bl = rdp->blimit;
+ trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
+ list = rdp->nxtlist;
+ rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+ *rdp->nxttail[RCU_DONE_TAIL] = NULL;
+ tail = rdp->nxttail[RCU_DONE_TAIL];
+ for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
+ if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+ rdp->nxttail[i] = &rdp->nxtlist;
+ local_irq_restore(flags);
+
+ /* Invoke callbacks. */
+ count = count_lazy = 0;
+ while (list) {
+ next = list->next;
+ prefetch(next);
+ debug_rcu_head_unqueue(list);
+ if (__rcu_reclaim(rsp->name, list))
+ count_lazy++;
+ list = next;
+ /* Stop only if limit reached and CPU has something to do. */
+ if (++count >= bl &&
+ (need_resched() ||
+ (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
+ break;
+ }
+
+ local_irq_save(flags);
+ trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
+ is_idle_task(current),
+ rcu_is_callbacks_kthread());
+
+ /* Update count, and requeue any remaining callbacks. */
+ if (list != NULL) {
+ *tail = rdp->nxtlist;
+ rdp->nxtlist = list;
+ for (i = 0; i < RCU_NEXT_SIZE; i++)
+ if (&rdp->nxtlist == rdp->nxttail[i])
+ rdp->nxttail[i] = tail;
+ else
+ break;
+ }
+ smp_mb(); /* List handling before counting for rcu_barrier(). */
+ rdp->qlen_lazy -= count_lazy;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+ rdp->n_cbs_invoked += count;
+
+ /* Reinstate batch limit if we have worked down the excess. */
+ if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+ rdp->blimit = blimit;
+
+ /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+ if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+ rdp->qlen_last_fqs_check = 0;
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
+
+ local_irq_restore(flags);
+
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_core();
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule RCU core processing.
+ *
+ * This function must be called from hardirq context. It is normally
+ * invoked from the scheduling-clock interrupt. If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ */
+void rcu_check_callbacks(int user)
+{
+ trace_rcu_utilization(TPS("Start scheduler-tick"));
+ increment_cpu_stall_ticks();
+ if (user || rcu_is_cpu_rrupt_from_idle()) {
+
+ /*
+ * Get here if this CPU took its interrupt from user
+ * mode or from the idle loop, and if this is not a
+ * nested interrupt. In this case, the CPU is in
+ * a quiescent state, so note it.
+ *
+ * No memory barrier is required here because both
+ * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
+ * variables that other CPUs neither access nor modify,
+ * at least not while the corresponding CPU is online.
+ */
+
+ rcu_sched_qs();
+ rcu_bh_qs();
+
+ } else if (!in_softirq()) {
+
+ /*
+ * Get here if this CPU did not take its interrupt from
+ * softirq, in other words, if it is not interrupting
+ * a rcu_bh read-side critical section. This is an _bh
+ * critical section, so note it.
+ */
+
+ rcu_bh_qs();
+ }
+ rcu_preempt_check_callbacks();
+ if (rcu_pending())
+ invoke_rcu_core();
+ if (user)
+ rcu_note_voluntary_context_switch(current);
+ trace_rcu_utilization(TPS("End scheduler-tick"));
+}
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Also initiate boosting for any threads blocked on the root rcu_node.
+ *
+ * The caller must have suppressed start of new grace periods.
+ */
+static void force_qs_rnp(struct rcu_state *rsp,
+ int (*f)(struct rcu_data *rsp, bool *isidle,
+ unsigned long *maxj),
+ bool *isidle, unsigned long *maxj)
+{
+ unsigned long bit;
+ int cpu;
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rsp, rnp) {
+ cond_resched_rcu_qs();
+ mask = 0;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (!rcu_gp_in_progress(rsp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ if (rnp->qsmask == 0) {
+ if (rcu_state_p == &rcu_sched_state ||
+ rsp != rcu_state_p ||
+ rcu_preempt_blocked_readers_cgp(rnp)) {
+ /*
+ * No point in scanning bits because they
+ * are all zero. But we might need to
+ * priority-boost blocked readers.
+ */
+ rcu_initiate_boost(rnp, flags);
+ /* rcu_initiate_boost() releases rnp->lock */
+ continue;
+ }
+ if (rnp->parent &&
+ (rnp->parent->qsmask & rnp->grpmask)) {
+ /*
+ * Race between grace-period
+ * initialization and task exiting RCU
+ * read-side critical section: Report.
+ */
+ rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+ /* rcu_report_unblock_qs_rnp() rlses ->lock */
+ continue;
+ }
+ }
+ cpu = rnp->grplo;
+ bit = 1;
+ for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+ if ((rnp->qsmask & bit) != 0) {
+ if ((rnp->qsmaskinit & bit) == 0)
+ *isidle = false; /* Pending hotplug. */
+ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+ mask |= bit;
+ }
+ }
+ if (mask != 0) {
+ /* Idle/offline CPUs, report (releases rnp->lock. */
+ rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ } else {
+ /* Nothing to do here, so just drop the lock. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+ }
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ */
+static void force_quiescent_state(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ bool ret;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_old = NULL;
+
+ /* Funnel through hierarchy to reduce memory contention. */
+ rnp = __this_cpu_read(rsp->rda->mynode);
+ for (; rnp != NULL; rnp = rnp->parent) {
+ ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+ !raw_spin_trylock(&rnp->fqslock);
+ if (rnp_old != NULL)
+ raw_spin_unlock(&rnp_old->fqslock);
+ if (ret) {
+ rsp->n_force_qs_lh++;
+ return;
+ }
+ rnp_old = rnp;
+ }
+ /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
+
+ /* Reached the root of the rcu_node tree, acquire lock. */
+ raw_spin_lock_irqsave(&rnp_old->lock, flags);
+ smp_mb__after_unlock_lock();
+ raw_spin_unlock(&rnp_old->fqslock);
+ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ rsp->n_force_qs_lh++;
+ raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ return; /* Someone beat us to it. */
+ }
+ ACCESS_ONCE(rsp->gp_flags) =
+ ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+ raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ rcu_gp_kthread_wake(rsp);
+}
+
+/*
+ * This does the RCU core processing work for the specified rcu_state
+ * and rcu_data structures. This may be called only from the CPU to
+ * whom the rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ bool needwake;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+
+ WARN_ON_ONCE(rdp->beenonline == 0);
+
+ /* Update RCU state based on any recent quiescent states. */
+ rcu_check_quiescent_state(rsp, rdp);
+
+ /* Does this CPU require a not-yet-started grace period? */
+ local_irq_save(flags);
+ if (cpu_needs_another_gp(rsp, rdp)) {
+ raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
+ needwake = rcu_start_gp(rsp);
+ raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ } else {
+ local_irq_restore(flags);
+ }
+
+ /* If there are callbacks ready, invoke them. */
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_callbacks(rsp, rdp);
+
+ /* Do any needed deferred wakeups of rcuo kthreads. */
+ do_nocb_deferred_wakeup(rdp);
+}
+
+/*
+ * Do RCU core processing for the current CPU.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+ struct rcu_state *rsp;
+
+ if (cpu_is_offline(smp_processor_id()))
+ return;
+ trace_rcu_utilization(TPS("Start RCU core"));
+ for_each_rcu_flavor(rsp)
+ __rcu_process_callbacks(rsp);
+ trace_rcu_utilization(TPS("End RCU core"));
+}
+
+/*
+ * Schedule RCU callback invocation. If the specified type of RCU
+ * does not support RCU priority boosting, just do a direct call,
+ * otherwise wake up the per-CPU kernel kthread. Note that because we
+ * are running on the current CPU with softirqs disabled, the
+ * rcu_cpu_kthread_task cannot disappear out from under us.
+ */
+static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+ return;
+ if (likely(!rsp->boost)) {
+ rcu_do_batch(rsp, rdp);
+ return;
+ }
+ invoke_rcu_callbacks_kthread();
+}
+
+static void invoke_rcu_core(void)
+{
+ if (cpu_online(smp_processor_id()))
+ raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Handle any core-RCU processing required by a call_rcu() invocation.
+ */
+static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
+ struct rcu_head *head, unsigned long flags)
+{
+ bool needwake;
+
+ /*
+ * If called from an extended quiescent state, invoke the RCU
+ * core in order to force a re-evaluation of RCU's idleness.
+ */
+ if (!rcu_is_watching())
+ invoke_rcu_core();
+
+ /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
+ if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
+ return;
+
+ /*
+ * Force the grace period if too many callbacks or too long waiting.
+ * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * if some other CPU has recently done so. Also, don't bother
+ * invoking force_quiescent_state() if the newly enqueued callback
+ * is the only one waiting for a grace period to complete.
+ */
+ if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+
+ /* Are we ignoring a completed grace period? */
+ note_gp_changes(rsp, rdp);
+
+ /* Start a new grace period if one not already started. */
+ if (!rcu_gp_in_progress(rsp)) {
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ raw_spin_lock(&rnp_root->lock);
+ smp_mb__after_unlock_lock();
+ needwake = rcu_start_gp(rsp);
+ raw_spin_unlock(&rnp_root->lock);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ } else {
+ /* Give the grace period a kick. */
+ rdp->blimit = LONG_MAX;
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ }
+ }
+}
+
+/*
+ * RCU callback function to leak a callback.
+ */
+static void rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
+/*
+ * Helper function for call_rcu() and friends. The cpu argument will
+ * normally be -1, indicating "currently running CPU". It may specify
+ * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
+ * is expected to specify a CPU.
+ */
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+ struct rcu_state *rsp, int cpu, bool lazy)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+
+ WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
+ if (debug_rcu_head_queue(head)) {
+ /* Probable double call_rcu(), so leak the callback. */
+ ACCESS_ONCE(head->func) = rcu_leak_callback;
+ WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
+ return;
+ }
+ head->func = func;
+ head->next = NULL;
+
+ /*
+ * Opportunistically note grace-period endings and beginnings.
+ * Note that we might see a beginning right after we see an
+ * end, but never vice versa, since this CPU has to pass through
+ * a quiescent state betweentimes.
+ */
+ local_irq_save(flags);
+ rdp = this_cpu_ptr(rsp->rda);
+
+ /* Add the callback to our list. */
+ if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
+ int offline;
+
+ if (cpu != -1)
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (likely(rdp->mynode)) {
+ /* Post-boot, so this should be for a no-CBs CPU. */
+ offline = !__call_rcu_nocb(rdp, head, lazy, flags);
+ WARN_ON_ONCE(offline);
+ /* Offline CPU, _call_rcu() illegal, leak callback. */
+ local_irq_restore(flags);
+ return;
+ }
+ /*
+ * Very early boot, before rcu_init(). Initialize if needed
+ * and then drop through to queue the callback.
+ */
+ BUG_ON(cpu != -1);
+ WARN_ON_ONCE(!rcu_is_watching());
+ if (!likely(rdp->nxtlist))
+ init_default_callback_list(rdp);
+ }
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+ if (lazy)
+ rdp->qlen_lazy++;
+ else
+ rcu_idle_count_callbacks_posted();
+ smp_mb(); /* Count before adding callback for rcu_barrier(). */
+ *rdp->nxttail[RCU_NEXT_TAIL] = head;
+ rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+ if (__is_kfree_rcu_offset((unsigned long)func))
+ trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
+ rdp->qlen_lazy, rdp->qlen);
+ else
+ trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
+
+ /* Go handle any RCU core processing required. */
+ __call_rcu_core(rsp, rdp, head, flags);
+ local_irq_restore(flags);
+}
+
+/*
+ * Queue an RCU-sched callback for invocation after a grace period.
+ */
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_sched_state, -1, 0);
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_bh_state, -1, 0);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Queue an RCU callback for lazy invocation after a grace period.
+ * This will likely be later named something like "call_rcu_lazy()",
+ * but this change will require some way of tagging the lazy RCU
+ * callbacks in the list of pending callbacks. Until then, this
+ * function may only be called from __kfree_rcu().
+ */
+void kfree_call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, rcu_state_p, -1, 1);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu);
+
+/*
+ * Because a context switch is a grace period for RCU-sched and RCU-bh,
+ * any blocking grace-period wait automatically implies a grace period
+ * if there is only one CPU online at any point time during execution
+ * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds
+ * some overhead: RCU still operates correctly.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+ int ret;
+
+ might_sleep(); /* Check for RCU read-side critical section. */
+ preempt_disable();
+ ret = num_online_cpus() <= 1;
+ preempt_enable();
+ return ret;
+}
+
+/**
+ * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-sched
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-sched read-side critical sections have completed. These read-side
+ * critical sections are delimited by rcu_read_lock_sched() and
+ * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
+ * local_irq_disable(), and so on may be used in place of
+ * rcu_read_lock_sched().
+ *
+ * This means that all preempt_disable code sequences, including NMI and
+ * non-threaded hardware-interrupt handlers, in progress on entry will
+ * have completed before this primitive returns. However, this does not
+ * guarantee that softirq handlers will have completed, since in some
+ * kernels, these handlers can run in process context, and can block.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_sched() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since the
+ * end of its last RCU-sched read-side critical section whose beginning
+ * preceded the call to synchronize_sched(). In addition, each CPU having
+ * an RCU read-side critical section that extends beyond the return from
+ * synchronize_sched() is guaranteed to have executed a full memory barrier
+ * after the beginning of synchronize_sched() and before the beginning of
+ * that RCU read-side critical section. Note that these guarantees include
+ * CPUs that are offline, idle, or executing in user mode, as well as CPUs
+ * that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_sched(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * This primitive provides the guarantees made by the (now removed)
+ * synchronize_kernel() API. In contrast, synchronize_rcu() only
+ * guarantees that rcu_read_lock() sections will have completed.
+ * In "classic RCU", these two guarantees happen to be one and
+ * the same, but can differ in realtime RCU implementations.
+ */
+void synchronize_sched(void)
+{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU-sched read-side critical section");
+ if (rcu_blocking_is_gp())
+ return;
+ if (rcu_gp_is_expedited())
+ synchronize_sched_expedited();
+ else
+ wait_rcu_gp(call_rcu_sched);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed. RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ *
+ * See the description of synchronize_sched() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu_bh(void)
+{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+ if (rcu_blocking_is_gp())
+ return;
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_bh_expedited();
+ else
+ wait_rcu_gp(call_rcu_bh);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+
+/**
+ * get_state_synchronize_rcu - Snapshot current RCU state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_rcu()
+ * and cond_synchronize_rcu().
+ */
+ return smp_load_acquire(&rcu_state_p->gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call to
+ * get_state_synchronize_rcu(), just return. Otherwise, invoke
+ * synchronize_rcu() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_rcu(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
+ */
+ newstate = smp_load_acquire(&rcu_state_p->completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
+}
+
+/**
+ * synchronize_sched_expedited - Brute-force RCU-sched grace period
+ *
+ * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so is thus not recommended for any sort of common-case code. In fact,
+ * if you are using synchronize_sched_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_sched() instead.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+ cpumask_var_t cm;
+ bool cma = false;
+ int cpu;
+ long firstsnap, s, snap;
+ int trycount = 0;
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ /*
+ * If we are in danger of counter wrap, just do synchronize_sched().
+ * By allowing sync_sched_expedited_started to advance no more than
+ * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
+ * that more than 3.5 billion CPUs would be required to force a
+ * counter wrap on a 32-bit system. Quite a few more CPUs would of
+ * course be required on a 64-bit system.
+ */
+ if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
+ (ulong)atomic_long_read(&rsp->expedited_done) +
+ ULONG_MAX / 8)) {
+ synchronize_sched();
+ atomic_long_inc(&rsp->expedited_wrap);
+ return;
+ }
+
+ /*
+ * Take a ticket. Note that atomic_inc_return() implies a
+ * full memory barrier.
+ */
+ snap = atomic_long_inc_return(&rsp->expedited_start);
+ firstsnap = snap;
+ if (!try_get_online_cpus()) {
+ /* CPU hotplug operation in flight, fall back to normal GP. */
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ return;
+ }
+ WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
+ if (cma) {
+ cpumask_copy(cm, cpu_online_mask);
+ cpumask_clear_cpu(raw_smp_processor_id(), cm);
+ for_each_cpu(cpu, cm) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ cpumask_clear_cpu(cpu, cm);
+ }
+ if (cpumask_weight(cm) == 0)
+ goto all_cpus_idle;
+ }
+
+ /*
+ * Each pass through the following loop attempts to force a
+ * context switch on each CPU.
+ */
+ while (try_stop_cpus(cma ? cm : cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
+ put_online_cpus();
+ atomic_long_inc(&rsp->expedited_tryfail);
+
+ /* Check to see if someone else did our work for us. */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ free_cpumask_var(cm);
+ return;
+ }
+
+ /* No joy, try again later. Or just synchronize_sched(). */
+ if (trycount++ < 10) {
+ udelay(trycount * num_online_cpus());
+ } else {
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
+ return;
+ }
+
+ /* Recheck to see if someone else did our work for us. */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone2);
+ free_cpumask_var(cm);
+ return;
+ }
+
+ /*
+ * Refetching sync_sched_expedited_started allows later
+ * callers to piggyback on our grace period. We retry
+ * after they started, so our grace period works for them,
+ * and they started after our first try, so their grace
+ * period works for us.
+ */
+ if (!try_get_online_cpus()) {
+ /* CPU hotplug operation in flight, use normal GP. */
+ wait_rcu_gp(call_rcu_sched);
+ atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
+ return;
+ }
+ snap = atomic_long_read(&rsp->expedited_start);
+ smp_mb(); /* ensure read is before try_stop_cpus(). */
+ }
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
+
+all_cpus_idle:
+ free_cpumask_var(cm);
+
+ /*
+ * Everyone up to our most recent fetch is covered by our grace
+ * period. Update the counter, but only if our work is still
+ * relevant -- which it won't be if someone who started later
+ * than we did already did their update.
+ */
+ do {
+ atomic_long_inc(&rsp->expedited_done_tries);
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_done_lost);
+ break;
+ }
+ } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
+ atomic_long_inc(&rsp->expedited_done_exit);
+
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first. However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ rdp->n_rcu_pending++;
+
+ /* Check for CPU stalls, if enabled. */
+ check_cpu_stall(rsp, rdp);
+
+ /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
+ if (rcu_nohz_full_cpu(rsp))
+ return 0;
+
+ /* Is the RCU core waiting for a quiescent state from this CPU? */
+ if (rcu_scheduler_fully_active &&
+ rdp->qs_pending && !rdp->passed_quiesce &&
+ rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
+ rdp->n_rp_qs_pending++;
+ } else if (rdp->qs_pending &&
+ (rdp->passed_quiesce ||
+ rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
+ rdp->n_rp_report_qs++;
+ return 1;
+ }
+
+ /* Does this CPU have callbacks ready to invoke? */
+ if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+ rdp->n_rp_cb_ready++;
+ return 1;
+ }
+
+ /* Has RCU gone idle with this CPU needing another grace period? */
+ if (cpu_needs_another_gp(rsp, rdp)) {
+ rdp->n_rp_cpu_needs_gp++;
+ return 1;
+ }
+
+ /* Has another RCU grace period completed? */
+ if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+ rdp->n_rp_gp_completed++;
+ return 1;
+ }
+
+ /* Has a new RCU grace period started? */
+ if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+ rdp->n_rp_gp_started++;
+ return 1;
+ }
+
+ /* Does this CPU need a deferred NOCB wakeup? */
+ if (rcu_nocb_need_deferred_wakeup(rdp)) {
+ rdp->n_rp_nocb_defer_wakeup++;
+ return 1;
+ }
+
+ /* nothing to do */
+ rdp->n_rp_need_nothing++;
+ return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so. This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+static int rcu_pending(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
+ return 1;
+ return 0;
+}
+
+/*
+ * Return true if the specified CPU has any callback. If all_lazy is
+ * non-NULL, store an indication of whether all callbacks are lazy.
+ * (If there are no callbacks, all of them are deemed to be lazy.)
+ */
+static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+{
+ bool al = true;
+ bool hc = false;
+ struct rcu_data *rdp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ if (!rdp->nxtlist)
+ continue;
+ hc = true;
+ if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
+ al = false;
+ break;
+ }
+ }
+ if (all_lazy)
+ *all_lazy = al;
+ return hc;
+}
+
+/*
+ * Helper function for _rcu_barrier() tracing. If tracing is disabled,
+ * the compiler is expected to optimize this away.
+ */
+static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
+ int cpu, unsigned long done)
+{
+ trace_rcu_barrier(rsp->name, s, cpu,
+ atomic_read(&rsp->barrier_cpu_count), done);
+}
+
+/*
+ * RCU callback function for _rcu_barrier(). If we are last, wake
+ * up the task executing _rcu_barrier().
+ */
+static void rcu_barrier_callback(struct rcu_head *rhp)
+{
+ struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
+ struct rcu_state *rsp = rdp->rsp;
+
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ complete(&rsp->barrier_completion);
+ } else {
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ }
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_func(void *type)
+{
+ struct rcu_state *rsp = type;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ atomic_inc(&rsp->barrier_cpu_count);
+ rsp->call(&rdp->barrier_head, rcu_barrier_callback);
+}
+
+/*
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
+ */
+static void _rcu_barrier(struct rcu_state *rsp)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+ unsigned long snap_done;
+
+ _rcu_barrier_trace(rsp, "Begin", -1, snap);
+
+ /* Take mutex to serialize concurrent rcu_barrier() requests. */
+ mutex_lock(&rsp->barrier_mutex);
+
+ /*
+ * Ensure that all prior references, including to ->n_barrier_done,
+ * are ordered before the _rcu_barrier() machinery.
+ */
+ smp_mb(); /* See above block comment. */
+
+ /*
+ * Recheck ->n_barrier_done to see if others did our work for us.
+ * This means checking ->n_barrier_done for an even-to-odd-to-even
+ * transition. The "if" expression below therefore rounds the old
+ * value up to the next even number and adds two before comparing.
+ */
+ snap_done = rsp->n_barrier_done;
+ _rcu_barrier_trace(rsp, "Check", -1, snap_done);
+
+ /*
+ * If the value in snap is odd, we needed to wait for the current
+ * rcu_barrier() to complete, then wait for the next one, in other
+ * words, we need the value of snap_done to be three larger than
+ * the value of snap. On the other hand, if the value in snap is
+ * even, we only had to wait for the next rcu_barrier() to complete,
+ * in other words, we need the value of snap_done to be only two
+ * greater than the value of snap. The "(snap + 3) & ~0x1" computes
+ * this for us (thank you, Linus!).
+ */
+ if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ smp_mb(); /* caller's subsequent code after above check. */
+ mutex_unlock(&rsp->barrier_mutex);
+ return;
+ }
+
+ /*
+ * Increment ->n_barrier_done to avoid duplicate work. Use
+ * ACCESS_ONCE() to prevent the compiler from speculating
+ * the increment to precede the early-exit check.
+ */
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
+ smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+
+ /*
+ * Initialize the count to one rather than to zero in order to
+ * avoid a too-soon return to zero in case of a short grace period
+ * (or preemption of this task). Exclude CPU-hotplug operations
+ * to ensure that no offline CPU has callbacks queued.
+ */
+ init_completion(&rsp->barrier_completion);
+ atomic_set(&rsp->barrier_cpu_count, 1);
+ get_online_cpus();
+
+ /*
+ * Force each CPU with callbacks to register a new callback.
+ * When that callback is invoked, we will know that all of the
+ * corresponding CPU's preceding callbacks have been invoked.
+ */
+ for_each_possible_cpu(cpu) {
+ if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
+ continue;
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rcu_is_nocb_cpu(cpu)) {
+ if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
+ _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ rsp->n_barrier_done);
+ } else {
+ _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ rsp->n_barrier_done);
+ smp_mb__before_atomic();
+ atomic_inc(&rsp->barrier_cpu_count);
+ __call_rcu(&rdp->barrier_head,
+ rcu_barrier_callback, rsp, cpu, 0);
+ }
+ } else if (ACCESS_ONCE(rdp->qlen)) {
+ _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ rsp->n_barrier_done);
+ smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
+ } else {
+ _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ rsp->n_barrier_done);
+ }
+ }
+ put_online_cpus();
+
+ /*
+ * Now that we have an rcu_barrier_callback() callback on each
+ * CPU, and thus each counted, remove the initial count.
+ */
+ if (atomic_dec_and_test(&rsp->barrier_cpu_count))
+ complete(&rsp->barrier_completion);
+
+ /* Increment ->n_barrier_done to prevent duplicate work. */
+ smp_mb(); /* Keep increment after above mechanism. */
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+ WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
+ smp_mb(); /* Keep increment before caller's subsequent code. */
+
+ /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
+ wait_for_completion(&rsp->barrier_completion);
+
+ /* Other rcu_barrier() invocations can now safely proceed. */
+ mutex_unlock(&rsp->barrier_mutex);
+}
+
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+ _rcu_barrier(&rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+ _rcu_barrier(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
+/*
+ * Propagate ->qsinitmask bits up the rcu_node tree to account for the
+ * first CPU in a given leaf rcu_node structure coming online. The caller
+ * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * disabled.
+ */
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (rnp == NULL)
+ return;
+ raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ rnp->qsmaskinit |= mask;
+ raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+ }
+}
+
+/*
+ * Do boot-time initialization of a CPU's per-CPU RCU data.
+ */
+static void __init
+rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Set up local state, ensuring consistent view of global state. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+ rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+ WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
+ WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+ rdp->cpu = cpu;
+ rdp->rsp = rsp;
+ rcu_boot_init_nocb_percpu_data(rdp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data. Note that only one online or
+ * offline event can be happening at a given time. Note also that we
+ * can accept some slop in the rsp->completed access due to the fact
+ * that this CPU cannot possibly have any RCU callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Set up local state, ensuring consistent view of global state. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rdp->beenonline = 1; /* We have now been online. */
+ rdp->qlen_last_fqs_check = 0;
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->blimit = blimit;
+ if (!rdp->nxtlist)
+ init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
+ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rcu_sysidle_init_percpu_data(rdp->dynticks);
+ atomic_set(&rdp->dynticks->dynticks,
+ (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
+ /*
+ * Add CPU to leaf rcu_node pending-online bitmask. Any needed
+ * propagation up the rcu_node tree will happen at the beginning
+ * of the next grace period.
+ */
+ rnp = rdp->mynode;
+ mask = rdp->grpmask;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ rnp->qsmaskinitnext |= mask;
+ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
+ rdp->completed = rnp->completed;
+ rdp->passed_quiesce = false;
+ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+ rdp->qs_pending = false;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+static void rcu_prepare_cpu(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_init_percpu_data(cpu, rsp);
+}
+
+/*
+ * Handle CPU online/offline notification events.
+ */
+int rcu_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+ struct rcu_state *rsp;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ rcu_prepare_cpu(cpu);
+ rcu_prepare_kthreads(cpu);
+ rcu_spawn_all_nocb_kthreads(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ rcu_boost_kthread_setaffinity(rnp, -1);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcu_boost_kthread_setaffinity(rnp, cpu);
+ break;
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ for_each_rcu_flavor(rsp)
+ rcu_cleanup_dying_cpu(rsp);
+ break;
+ case CPU_DYING_IDLE:
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dying_idle_cpu(cpu, rsp);
+ }
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ for_each_rcu_flavor(rsp) {
+ rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static int rcu_pm_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+ rcu_expedite_gp();
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+ rcu_unexpedite_gp();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Spawn the kthreads that handle each RCU flavor's grace periods.
+ */
+static int __init rcu_spawn_gp_kthread(void)
+{
+ unsigned long flags;
+ int kthread_prio_in = kthread_prio;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ /* Force priority into range. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
+ kthread_prio, kthread_prio_in);
+
+ rcu_scheduler_fully_active = 1;
+ for_each_rcu_flavor(rsp) {
+ t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
+ BUG_ON(IS_ERR(t));
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rsp->gp_kthread = t;
+ if (kthread_prio) {
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ wake_up_process(t);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+ rcu_spawn_nocb_kthreads();
+ rcu_spawn_boost_kthreads();
+ return 0;
+}
+early_initcall(rcu_spawn_gp_kthread);
+
+/*
+ * This function is invoked towards the end of the scheduler's initialization
+ * process. Before this is called, the idle task might contain
+ * RCU read-side critical sections (during which time, this idle
+ * task is booting the system). After this function is called, the
+ * idle tasks are prohibited from containing RCU read-side critical
+ * sections. This function also enables RCU lockdep checking.
+ */
+void rcu_scheduler_starting(void)
+{
+ WARN_ON(num_online_cpus() != 1);
+ WARN_ON(nr_context_switches() > 0);
+ rcu_scheduler_active = 1;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+ int i;
+
+ if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+ rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ } else {
+ int ccur;
+ int cprv;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = rsp->levelcnt[i];
+ rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
+ }
+}
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp,
+ struct rcu_data __percpu *rda)
+{
+ static const char * const buf[] = {
+ "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const fqs[] = {
+ "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static u8 fl_mask = 0x1;
+ int cpustride = 1;
+ int i;
+ int j;
+ struct rcu_node *rnp;
+
+ BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+
+ /* Silence gcc 4.8 warning about array index out of range. */
+ if (rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls overflow");
+
+ /* Initialize the level-tracking arrays. */
+
+ for (i = 0; i < rcu_num_lvls; i++)
+ rsp->levelcnt[i] = num_rcu_lvl[i];
+ for (i = 1; i < rcu_num_lvls; i++)
+ rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+ rcu_init_levelspread(rsp);
+ rsp->flavor_mask = fl_mask;
+ fl_mask <<= 1;
+
+ /* Initialize the elements themselves, starting from the leaves. */
+
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ cpustride *= rsp->levelspread[i];
+ rnp = rsp->level[i];
+ for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+ raw_spin_lock_init(&rnp->lock);
+ lockdep_set_class_and_name(&rnp->lock,
+ &rcu_node_class[i], buf[i]);
+ raw_spin_lock_init(&rnp->fqslock);
+ lockdep_set_class_and_name(&rnp->fqslock,
+ &rcu_fqs_class[i], fqs[i]);
+ rnp->gpnum = rsp->gpnum;
+ rnp->completed = rsp->completed;
+ rnp->qsmask = 0;
+ rnp->qsmaskinit = 0;
+ rnp->grplo = j * cpustride;
+ rnp->grphi = (j + 1) * cpustride - 1;
+ if (rnp->grphi >= nr_cpu_ids)
+ rnp->grphi = nr_cpu_ids - 1;
+ if (i == 0) {
+ rnp->grpnum = 0;
+ rnp->grpmask = 0;
+ rnp->parent = NULL;
+ } else {
+ rnp->grpnum = j % rsp->levelspread[i - 1];
+ rnp->grpmask = 1UL << rnp->grpnum;
+ rnp->parent = rsp->level[i - 1] +
+ j / rsp->levelspread[i - 1];
+ }
+ rnp->level = i;
+ INIT_LIST_HEAD(&rnp->blkd_tasks);
+ rcu_init_one_nocb(rnp);
+ }
+ }
+
+ init_waitqueue_head(&rsp->gp_wq);
+ rnp = rsp->level[rcu_num_lvls - 1];
+ for_each_possible_cpu(i) {
+ while (i > rnp->grphi)
+ rnp++;
+ per_cpu_ptr(rsp->rda, i)->mynode = rnp;
+ rcu_boot_init_percpu_data(i, rsp);
+ }
+ list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+
+/*
+ * Compute the rcu_node tree geometry from kernel parameters. This cannot
+ * replace the definitions in tree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ */
+static void __init rcu_init_geometry(void)
+{
+ ulong d;
+ int i;
+ int j;
+ int n = nr_cpu_ids;
+ int rcu_capacity[MAX_RCU_LVLS + 1];
+
+ /*
+ * Initialize any unspecified boot parameters.
+ * The default values of jiffies_till_first_fqs and
+ * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
+ * value, which is a function of HZ, then adding one for each
+ * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
+ */
+ d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
+ if (jiffies_till_first_fqs == ULONG_MAX)
+ jiffies_till_first_fqs = d;
+ if (jiffies_till_next_fqs == ULONG_MAX)
+ jiffies_till_next_fqs = d;
+
+ /* If the compile-time values are accurate, just leave. */
+ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+ nr_cpu_ids == NR_CPUS)
+ return;
+ pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ rcu_fanout_leaf, nr_cpu_ids);
+
+ /*
+ * Compute number of nodes that can be handled an rcu_node tree
+ * with the given number of levels. Setting rcu_capacity[0] makes
+ * some of the arithmetic easier.
+ */
+ rcu_capacity[0] = 1;
+ rcu_capacity[1] = rcu_fanout_leaf;
+ for (i = 2; i <= MAX_RCU_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+
+ /*
+ * The boot-time rcu_fanout_leaf parameter is only permitted
+ * to increase the leaf-level fanout, not decrease it. Of course,
+ * the leaf-level fanout cannot exceed the number of bits in
+ * the rcu_node masks. Finally, the tree must be able to accommodate
+ * the configured number of CPUs. Complain and fall back to the
+ * compile-time values if these limits are exceeded.
+ */
+ if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+ rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+ n > rcu_capacity[MAX_RCU_LVLS]) {
+ WARN_ON(1);
+ return;
+ }
+
+ /* Calculate the number of rcu_nodes at each level of the tree. */
+ for (i = 1; i <= MAX_RCU_LVLS; i++)
+ if (n <= rcu_capacity[i]) {
+ for (j = 0; j <= i; j++)
+ num_rcu_lvl[j] =
+ DIV_ROUND_UP(n, rcu_capacity[i - j]);
+ rcu_num_lvls = i;
+ for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+ num_rcu_lvl[j] = 0;
+ break;
+ }
+
+ /* Calculate the total number of rcu_node structures. */
+ rcu_num_nodes = 0;
+ for (i = 0; i <= MAX_RCU_LVLS; i++)
+ rcu_num_nodes += num_rcu_lvl[i];
+ rcu_num_nodes -= n;
+}
+
+void __init rcu_init(void)
+{
+ int cpu;
+
+ rcu_early_boot_tests();
+
+ rcu_bootup_announce();
+ rcu_init_geometry();
+ rcu_init_one(&rcu_bh_state, &rcu_bh_data);
+ rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ __rcu_init_preempt();
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+
+ /*
+ * We don't need protection against CPU-hotplug here because
+ * this is called early in boot, before either interrupts
+ * or the scheduler are operational.
+ */
+ cpu_notifier(rcu_cpu_notify, 0);
+ pm_notifier(rcu_pm_notify, 0);
+ for_each_online_cpu(cpu)
+ rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+}
+
+#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
new file mode 100644
index 000000000..a69d3dab2
--- /dev/null
+++ b/kernel/rcu/tree.h
@@ -0,0 +1,620 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
+ * CONFIG_RCU_FANOUT_LEAF.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this did work well going from three levels to four.
+ * Of course, your mileage may vary.
+ */
+#define MAX_RCU_LVLS 4
+#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT_1
+# define RCU_NUM_LVLS 1
+# define NUM_RCU_LVL_0 1
+# define NUM_RCU_LVL_1 (NR_CPUS)
+# define NUM_RCU_LVL_2 0
+# define NUM_RCU_LVL_3 0
+# define NUM_RCU_LVL_4 0
+#elif NR_CPUS <= RCU_FANOUT_2
+# define RCU_NUM_LVLS 2
+# define NUM_RCU_LVL_0 1
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_2 (NR_CPUS)
+# define NUM_RCU_LVL_3 0
+# define NUM_RCU_LVL_4 0
+#elif NR_CPUS <= RCU_FANOUT_3
+# define RCU_NUM_LVLS 3
+# define NUM_RCU_LVL_0 1
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_3 (NR_CPUS)
+# define NUM_RCU_LVL_4 0
+#elif NR_CPUS <= RCU_FANOUT_4
+# define RCU_NUM_LVLS 4
+# define NUM_RCU_LVL_0 1
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_4 (NR_CPUS)
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+extern int rcu_num_lvls;
+extern int rcu_num_nodes;
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+ long long dynticks_nesting; /* Track irq/process nesting level. */
+ /* Process level is worth LLONG_MAX/2. */
+ int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ atomic_t dynticks; /* Even value for idle, else odd. */
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ long long dynticks_idle_nesting;
+ /* irq/process nesting level from idle. */
+ atomic_t dynticks_idle; /* Even value for idle, else odd. */
+ /* "Idle" excludes userspace execution. */
+ unsigned long dynticks_idle_jiffies;
+ /* End of last non-NMI non-idle period. */
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ bool all_lazy; /* Are all CPU's CBs lazy? */
+ unsigned long nonlazy_posted;
+ /* # times non-lazy CBs posted to CPU. */
+ unsigned long nonlazy_posted_snap;
+ /* idle-period nonlazy_posted snapshot. */
+ unsigned long last_accelerate;
+ /* Last jiffy CBs were accelerated. */
+ unsigned long last_advance_all;
+ /* Last jiffy CBs were all advanced. */
+ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+};
+
+/* RCU's kthread states for tracing. */
+#define RCU_KTHREAD_STOPPED 0
+#define RCU_KTHREAD_RUNNING 1
+#define RCU_KTHREAD_WAITING 2
+#define RCU_KTHREAD_OFFCPU 3
+#define RCU_KTHREAD_YIELDING 4
+#define RCU_KTHREAD_MAX 4
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+ raw_spinlock_t lock; /* Root rcu_node's lock protects some */
+ /* rcu_state fields as well as following. */
+ unsigned long gpnum; /* Current grace period for this node. */
+ /* This will either be equal to or one */
+ /* behind the root rcu_node's gpnum. */
+ unsigned long completed; /* Last GP completed for this node. */
+ /* This will either be equal to or one */
+ /* behind the root rcu_node's gpnum. */
+ unsigned long qsmask; /* CPUs or groups that need to switch in */
+ /* order for current grace period to proceed.*/
+ /* In leaf rcu_node, each bit corresponds to */
+ /* an rcu_data structure, otherwise, each */
+ /* bit corresponds to a child rcu_node */
+ /* structure. */
+ unsigned long expmask; /* Groups that have ->blkd_tasks */
+ /* elements that need to drain to allow the */
+ /* current expedited grace period to */
+ /* complete (only for PREEMPT_RCU). */
+ unsigned long qsmaskinit;
+ /* Per-GP initial value for qsmask & expmask. */
+ /* Initialized from ->qsmaskinitnext at the */
+ /* beginning of each grace period. */
+ unsigned long qsmaskinitnext;
+ /* Online CPUs for next grace period. */
+ unsigned long grpmask; /* Mask to apply to parent qsmask. */
+ /* Only one bit will be set in this mask. */
+ int grplo; /* lowest-numbered CPU or group here. */
+ int grphi; /* highest-numbered CPU or group here. */
+ u8 grpnum; /* CPU/group number for next level up. */
+ u8 level; /* root is at level 0. */
+ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
+ /* exit RCU read-side critical sections */
+ /* before propagating offline up the */
+ /* rcu_node tree? */
+ struct rcu_node *parent;
+ struct list_head blkd_tasks;
+ /* Tasks blocked in RCU read-side critical */
+ /* section. Tasks are placed at the head */
+ /* of this list and age towards the tail. */
+ struct list_head *gp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current grace period, or NULL if there */
+ /* is no such task. */
+ struct list_head *exp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current expedited grace period, or NULL */
+ /* if there is no such task. If there */
+ /* is no current expedited grace period, */
+ /* then there can cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority boosted, or NULL if no priority */
+ /* boosting is needed for this rcu_node */
+ /* structure. If there are no tasks */
+ /* queued on this rcu_node structure that */
+ /* are blocking the current grace period, */
+ /* there can be no such task. */
+ struct rt_mutex boost_mtx;
+ /* Used only for the priority-boosting */
+ /* side effect, not as a lock. */
+ unsigned long boost_time;
+ /* When to start boosting (jiffies). */
+ struct task_struct *boost_kthread_task;
+ /* kthread that takes care of priority */
+ /* boosting for this rcu_node structure. */
+ unsigned int boost_kthread_status;
+ /* State of boost_kthread_task for tracing. */
+ unsigned long n_tasks_boosted;
+ /* Total number of tasks boosted. */
+ unsigned long n_exp_boosts;
+ /* Number of tasks boosted for expedited GP. */
+ unsigned long n_normal_boosts;
+ /* Number of tasks boosted for normal GP. */
+ unsigned long n_balk_blkd_tasks;
+ /* Refused to boost: no blocked tasks. */
+ unsigned long n_balk_exp_gp_tasks;
+ /* Refused to boost: nothing blocking GP. */
+ unsigned long n_balk_boost_tasks;
+ /* Refused to boost: already boosting. */
+ unsigned long n_balk_notblocked;
+ /* Refused to boost: RCU RS CS still running. */
+ unsigned long n_balk_notyet;
+ /* Refused to boost: not yet time. */
+ unsigned long n_balk_nos;
+ /* Refused to boost: not sure why, though. */
+ /* This can happen due to race conditions. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_NOCB_CPU
+ wait_queue_head_t nocb_gp_wq[2];
+ /* Place for rcu_nocb_kthread() to wait GP. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ int need_future_gp[2];
+ /* Counts of upcoming no-CB GP requests. */
+ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+} ____cacheline_internodealigned_in_smp;
+
+/*
+ * Do a full breadth-first scan of the rcu_node structures for the
+ * specified rcu_state structure.
+ */
+#define rcu_for_each_node_breadth_first(rsp, rnp) \
+ for ((rnp) = &(rsp)->node[0]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+
+/*
+ * Do a breadth-first scan of the non-leaf rcu_node structures for the
+ * specified rcu_state structure. Note that if there is a singleton
+ * rcu_node tree with but one rcu_node structure, this loop is a no-op.
+ */
+#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
+ for ((rnp) = &(rsp)->node[0]; \
+ (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+
+/*
+ * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
+ * structure. Note that if there is a singleton rcu_node tree with but
+ * one rcu_node structure, this loop -will- visit the rcu_node structure.
+ * It is still a leaf node, even if it is also the root node.
+ */
+#define rcu_for_each_leaf_node(rsp, rnp) \
+ for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL 3
+#define RCU_NEXT_SIZE 4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+ /* 1) quiescent-state and grace-period handling : */
+ unsigned long completed; /* Track rsp->completed gp number */
+ /* in order to detect GP end. */
+ unsigned long gpnum; /* Highest gp number that this CPU */
+ /* is aware of having started. */
+ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
+ /* for rcu_all_qs() invocations. */
+ bool passed_quiesce; /* User-mode/idle loop etc. */
+ bool qs_pending; /* Core waits for quiesc state. */
+ bool beenonline; /* CPU online at least once. */
+ bool gpwrap; /* Possible gpnum/completed wrap. */
+ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
+ unsigned long grpmask; /* Mask to apply to leaf qsmask. */
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+ unsigned long ticks_this_gp; /* The number of scheduling-clock */
+ /* ticks this CPU has handled */
+ /* during and after the last grace */
+ /* period it is aware of. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+ /* 2) batch handling */
+ /*
+ * If nxtlist is not NULL, it is partitioned as follows.
+ * Any of the partitions might be empty, in which case the
+ * pointer to that partition will be equal to the pointer for
+ * the following partition. When the list is empty, all of
+ * the nxttail elements point to the ->nxtlist pointer itself,
+ * which in that case is NULL.
+ *
+ * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+ * Entries that batch # <= ->completed
+ * The grace period for these entries has completed, and
+ * the other grace-period-completed entries may be moved
+ * here temporarily in rcu_process_callbacks().
+ * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+ * Entries that batch # <= ->completed - 1: waiting for current GP
+ * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+ * Entries known to have arrived before current GP ended
+ * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
+ * Entries that might have arrived after current GP ended
+ * Note that the value of *nxttail[RCU_NEXT_TAIL] will
+ * always be NULL, as this is the end of the list.
+ */
+ struct rcu_head *nxtlist;
+ struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ /* grace periods for sublists. */
+ long qlen_lazy; /* # of lazy queued callbacks */
+ long qlen; /* # of queued callbacks, incl lazy */
+ long qlen_last_fqs_check;
+ /* qlen at last check for QS forcing */
+ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
+ unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
+ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
+ unsigned long n_force_qs_snap;
+ /* did other CPU force QS recently? */
+ long blimit; /* Upper limit on a processed batch */
+
+ /* 3) dynticks interface. */
+ struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
+ int dynticks_snap; /* Per-GP tracking for dynticks. */
+
+ /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
+ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
+ unsigned long offline_fqs; /* Kicked due to being offline. */
+ unsigned long cond_resched_completed;
+ /* Grace period that needs help */
+ /* from cond_resched(). */
+
+ /* 5) __rcu_pending() statistics. */
+ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
+ unsigned long n_rp_qs_pending;
+ unsigned long n_rp_report_qs;
+ unsigned long n_rp_cb_ready;
+ unsigned long n_rp_cpu_needs_gp;
+ unsigned long n_rp_gp_completed;
+ unsigned long n_rp_gp_started;
+ unsigned long n_rp_nocb_defer_wakeup;
+ unsigned long n_rp_need_nothing;
+
+ /* 6) _rcu_barrier() and OOM callbacks. */
+ struct rcu_head barrier_head;
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ struct rcu_head oom_head;
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+ /* 7) Callback offloading. */
+#ifdef CONFIG_RCU_NOCB_CPU
+ struct rcu_head *nocb_head; /* CBs waiting for kthread. */
+ struct rcu_head **nocb_tail;
+ atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
+ atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
+ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
+ struct rcu_head **nocb_follower_tail;
+ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
+ struct task_struct *nocb_kthread;
+ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+
+ /* The following fields are used by the leader, hence own cacheline. */
+ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
+ /* CBs waiting for GP. */
+ struct rcu_head **nocb_gp_tail;
+ bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
+ struct rcu_data *nocb_next_follower;
+ /* Next follower in wakeup chain. */
+
+ /* The following fields are used by the follower, hence new cachline. */
+ struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
+ /* Leader CPU takes GP-end wakeups. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+ /* 8) RCU CPU stall data. */
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+ unsigned int softirq_snap; /* Snapshot of softirq activity. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+ int cpu;
+ struct rcu_state *rsp;
+};
+
+/* Values for fqs_state field in struct rcu_state. */
+#define RCU_GP_IDLE 0 /* No grace period in progress. */
+#define RCU_GP_INIT 1 /* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
+#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
+#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
+
+/* Values for nocb_defer_wakeup field in struct rcu_data. */
+#define RCU_NOGP_WAKE_NOT 0
+#define RCU_NOGP_WAKE 1
+#define RCU_NOGP_WAKE_FORCE 2
+
+#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
+ /* For jiffies_till_first_fqs and */
+ /* and jiffies_till_next_fqs. */
+
+#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
+ /* delay between bouts of */
+ /* quiescent-state forcing. */
+
+#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
+ /* at least one scheduling clock */
+ /* irq before ratting on them. */
+
+#define rcu_wait(cond) \
+do { \
+ for (;;) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (cond) \
+ break; \
+ schedule(); \
+ } \
+ __set_current_state(TASK_RUNNING); \
+} while (0)
+
+/*
+ * RCU global state, including node hierarchy. This hierarchy is
+ * represented in "heap" form in a dense array. The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]). The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+ struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
+ struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
+ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
+ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ u8 flavor_mask; /* bit in flavor mask. */
+ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
+ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
+ void (*func)(struct rcu_head *head));
+
+ /* The following fields are guarded by the root rcu_node's lock. */
+
+ u8 fqs_state ____cacheline_internodealigned_in_smp;
+ /* Force QS state. */
+ u8 boost; /* Subject to priority boost. */
+ unsigned long gpnum; /* Current gp number. */
+ unsigned long completed; /* # of last completed gp. */
+ struct task_struct *gp_kthread; /* Task for grace periods. */
+ wait_queue_head_t gp_wq; /* Where GP task waits. */
+ short gp_flags; /* Commands for GP task. */
+ short gp_state; /* GP kthread sleep state. */
+
+ /* End of fields guarded by root rcu_node's lock. */
+
+ raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
+ /* Protect following fields. */
+ struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
+ /* need a grace period. */
+ struct rcu_head **orphan_nxttail; /* Tail of above. */
+ struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
+ /* are ready to invoke. */
+ struct rcu_head **orphan_donetail; /* Tail of above. */
+ long qlen_lazy; /* Number of lazy callbacks. */
+ long qlen; /* Total number of callbacks. */
+ /* End of fields guarded by orphan_lock. */
+
+ struct mutex barrier_mutex; /* Guards barrier fields. */
+ atomic_t barrier_cpu_count; /* # CPUs waiting on. */
+ struct completion barrier_completion; /* Wake at barrier end. */
+ unsigned long n_barrier_done; /* ++ at start and end of */
+ /* _rcu_barrier(). */
+ /* End of fields guarded by barrier_mutex. */
+
+ atomic_long_t expedited_start; /* Starting ticket. */
+ atomic_long_t expedited_done; /* Done ticket. */
+ atomic_long_t expedited_wrap; /* # near-wrap incidents. */
+ atomic_long_t expedited_tryfail; /* # acquisition failures. */
+ atomic_long_t expedited_workdone1; /* # done by others #1. */
+ atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_normal; /* # fallbacks to normal. */
+ atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
+ atomic_long_t expedited_done_tries; /* # tries to update _done. */
+ atomic_long_t expedited_done_lost; /* # times beaten to _done. */
+ atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+
+ unsigned long jiffies_force_qs; /* Time at which to invoke */
+ /* force_quiescent_state(). */
+ unsigned long n_force_qs; /* Number of calls to */
+ /* force_quiescent_state(). */
+ unsigned long n_force_qs_lh; /* ~Number of calls leaving */
+ /* due to lock unavailable. */
+ unsigned long n_force_qs_ngp; /* Number of calls leaving */
+ /* due to no GP active. */
+ unsigned long gp_start; /* Time at which GP started, */
+ /* but in jiffies. */
+ unsigned long gp_activity; /* Time of last GP kthread */
+ /* activity in jiffies. */
+ unsigned long jiffies_stall; /* Time at which to check */
+ /* for CPU stalls. */
+ unsigned long jiffies_resched; /* Time at which to resched */
+ /* a reluctant CPU. */
+ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
+ /* GP start. */
+ unsigned long gp_max; /* Maximum GP duration in */
+ /* jiffies. */
+ const char *name; /* Name of structure. */
+ char abbr; /* Abbreviated name. */
+ struct list_head flavors; /* List of RCU flavors. */
+};
+
+/* Values for rcu_state structure's gp_flags field. */
+#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
+#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
+
+/* Values for rcu_state structure's gp_flags field. */
+#define RCU_GP_WAIT_INIT 0 /* Initial state. */
+#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
+#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+
+extern struct list_head rcu_struct_flavors;
+
+/* Sequence through rcu_state structures for each RCU flavor. */
+#define for_each_rcu_flavor(rsp) \
+ list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
+
+/*
+ * RCU implementation internal declarations:
+ */
+extern struct rcu_state rcu_sched_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+#ifdef CONFIG_PREEMPT_RCU
+extern struct rcu_state rcu_preempt_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+
+#ifdef CONFIG_RCU_BOOST
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DECLARE_PER_CPU(char, rcu_cpu_has_work);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+#ifndef RCU_TREE_NONCORE
+
+/* Forward declarations for rcutree_plugin.h */
+static void rcu_bootup_announce(void);
+static void rcu_preempt_note_context_switch(void);
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
+#ifdef CONFIG_HOTPLUG_CPU
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp);
+static int rcu_print_task_stall(struct rcu_node *rnp);
+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
+static void rcu_preempt_check_callbacks(void);
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+static void __init __rcu_init_preempt(void);
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
+static void invoke_rcu_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(void);
+#ifdef CONFIG_RCU_BOOST
+static void rcu_preempt_do_callbacks(void);
+static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+static void __init rcu_spawn_boost_kthreads(void);
+static void rcu_prepare_kthreads(int cpu);
+static void rcu_cleanup_after_idle(void);
+static void rcu_prepare_for_idle(void);
+static void rcu_idle_count_callbacks_posted(void);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
+static void print_cpu_stall_info_begin(void);
+static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
+static void print_cpu_stall_info_end(void);
+static void zero_cpu_stall_ticks(struct rcu_data *rdp);
+static void increment_cpu_stall_ticks(void);
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool lazy, unsigned long flags);
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+ struct rcu_data *rdp,
+ unsigned long flags);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
+static void rcu_spawn_all_nocb_kthreads(int cpu);
+static void __init rcu_spawn_nocb_kthreads(void);
+#ifdef CONFIG_RCU_NOCB_CPU
+static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
+static bool init_nocb_callback_list(struct rcu_data *rdp);
+static void rcu_sysidle_enter(int irq);
+static void rcu_sysidle_exit(int irq);
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj);
+static bool is_sysidle_rcu_state(struct rcu_state *rsp);
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj);
+static void rcu_bind_gp_kthread(void);
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
+static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
+static void rcu_dynticks_task_enter(void);
+static void rcu_dynticks_task_exit(void);
+
+#endif /* #ifndef RCU_TREE_NONCORE */
+
+#ifdef CONFIG_RCU_TRACE
+/* Read out queue lengths for tracing. */
+static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ *ql = atomic_long_read(&rdp->nocb_q_count);
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+ *ql = 0;
+ *qll = 0;
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+}
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
new file mode 100644
index 000000000..8c0ec0f5a
--- /dev/null
+++ b/kernel/rcu/tree_plugin.h
@@ -0,0 +1,3090 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include "../time/tick-internal.h"
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "../locking/rtmutex_common.h"
+
+/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+/*
+ * Check the RCU kernel configuration parameters and print informative
+ * messages about anything out of the ordinary. If you like #ifdef, you
+ * will love this function.
+ */
+static void __init rcu_bootup_announce_oddness(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_TRACE))
+ pr_info("\tRCU debugfs-based tracing is enabled.\n");
+ if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
+ (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+ pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+ CONFIG_RCU_FANOUT);
+ if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+ pr_info("\tHierarchical RCU autobalancing is disabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
+ pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ pr_info("\tRCU lockdep checking is enabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
+ pr_info("\tRCU torture testing starts during boot.\n");
+ if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
+ pr_info("\tAdditional per-CPU info printed with stalls.\n");
+ if (NUM_RCU_LVL_4 != 0)
+ pr_info("\tFour-level hierarchy is enabled.\n");
+ if (CONFIG_RCU_FANOUT_LEAF != 16)
+ pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
+ CONFIG_RCU_FANOUT_LEAF);
+ if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+ pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
+ if (nr_cpu_ids != NR_CPUS)
+ pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+ if (IS_ENABLED(CONFIG_RCU_BOOST))
+ pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+
+RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
+static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+
+static int rcu_preempted_readers_exp(struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake);
+
+/*
+ * Tell them what RCU they are running.
+ */
+static void __init rcu_bootup_announce(void)
+{
+ pr_info("Preemptible hierarchical RCU implementation.\n");
+ rcu_bootup_announce_oddness();
+}
+
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU. Note
+ * that this just means that the task currently running on the CPU is
+ * not in a quiescent state. There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * As with the other rcu_*_qs() functions, callers to this function
+ * must disable preemption.
+ */
+static void rcu_preempt_qs(void)
+{
+ if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+ trace_rcu_grace_period(TPS("rcu_preempt"),
+ __this_cpu_read(rcu_preempt_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+ barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
+ current->rcu_read_unlock_special.b.need_qs = false;
+ }
+}
+
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from. If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section. Therefore, the current grace period
+ * cannot be permitted to complete until the blkd_tasks list entries
+ * predating the current grace period drain, in other words, until
+ * rnp->gp_tasks becomes NULL.
+ *
+ * Caller must disable preemption.
+ */
+static void rcu_preempt_note_context_switch(void)
+{
+ struct task_struct *t = current;
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ if (t->rcu_read_lock_nesting > 0 &&
+ !t->rcu_read_unlock_special.b.blocked) {
+
+ /* Possibly blocking in an RCU read-side critical section. */
+ rdp = this_cpu_ptr(rcu_preempt_state.rda);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ t->rcu_read_unlock_special.b.blocked = true;
+ t->rcu_blocked_node = rnp;
+
+ /*
+ * If this CPU has already checked in, then this task
+ * will hold up the next grace period rather than the
+ * current grace period. Queue the task accordingly.
+ * If the task is queued for the current grace period
+ * (i.e., this CPU has not yet passed through a quiescent
+ * state for the current grace period), then as long
+ * as that task remains queued, the current grace period
+ * cannot end. Note that there is some uncertainty as
+ * to exactly when the current grace period started.
+ * We take a conservative approach, which can result
+ * in unnecessarily waiting on tasks that started very
+ * slightly after the current grace period began. C'est
+ * la vie!!!
+ *
+ * But first, note that the current CPU must still be
+ * on line!
+ */
+ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+ WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
+ if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
+ list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
+ rnp->gp_tasks = &t->rcu_node_entry;
+#ifdef CONFIG_RCU_BOOST
+ if (rnp->boost_tasks != NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ } else {
+ list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
+ if (rnp->qsmask & rdp->grpmask)
+ rnp->gp_tasks = &t->rcu_node_entry;
+ }
+ trace_rcu_preempt_task(rdp->rsp->name,
+ t->pid,
+ (rnp->qsmask & rdp->grpmask)
+ ? rnp->gpnum
+ : rnp->gpnum + 1);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ } else if (t->rcu_read_lock_nesting < 0 &&
+ t->rcu_read_unlock_special.s) {
+
+ /*
+ * Complete exit from RCU read-side critical section on
+ * behalf of preempted instance of __rcu_read_unlock().
+ */
+ rcu_read_unlock_special(t);
+ }
+
+ /*
+ * Either we were not in an RCU read-side critical section to
+ * begin with, or we have now recorded that critical section
+ * globally. Either way, we can now note a quiescent state
+ * for this CPU. Again, if we were in an RCU read-side critical
+ * section, and if that critical section was blocking the current
+ * grace period, then the fact that the task has been enqueued
+ * means that we continue to block the current grace period.
+ */
+ rcu_preempt_qs();
+}
+
+/*
+ * Check for preempted RCU readers blocking the current grace period
+ * for the specified rcu_node structure. If the caller needs a reliable
+ * answer, it must hold the rcu_node's ->lock.
+ */
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+{
+ return rnp->gp_tasks != NULL;
+}
+
+/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t,
+ struct rcu_node *rnp)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rnp->blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+/*
+ * Return true if the specified rcu_node structure has tasks that were
+ * preempted within an RCU read-side critical section.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+ return !list_empty(&rnp->blkd_tasks);
+}
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+void rcu_read_unlock_special(struct task_struct *t)
+{
+ bool empty_exp;
+ bool empty_norm;
+ bool empty_exp_now;
+ unsigned long flags;
+ struct list_head *np;
+#ifdef CONFIG_RCU_BOOST
+ bool drop_boost_mutex = false;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ struct rcu_node *rnp;
+ union rcu_special special;
+
+ /* NMI handlers cannot block and cannot safely manipulate state. */
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * If RCU core is waiting for this CPU to exit critical section,
+ * let it know that we have done so. Because irqs are disabled,
+ * t->rcu_read_unlock_special cannot change.
+ */
+ special = t->rcu_read_unlock_special;
+ if (special.b.need_qs) {
+ rcu_preempt_qs();
+ t->rcu_read_unlock_special.b.need_qs = false;
+ if (!t->rcu_read_unlock_special.s) {
+ local_irq_restore(flags);
+ return;
+ }
+ }
+
+ /* Hardware IRQ handlers cannot block, complain if they get here. */
+ if (in_irq() || in_serving_softirq()) {
+ lockdep_rcu_suspicious(__FILE__, __LINE__,
+ "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
+ pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
+ t->rcu_read_unlock_special.s,
+ t->rcu_read_unlock_special.b.blocked,
+ t->rcu_read_unlock_special.b.need_qs);
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* Clean up if blocked during RCU read-side critical section. */
+ if (special.b.blocked) {
+ t->rcu_read_unlock_special.b.blocked = false;
+
+ /*
+ * Remove this task from the list it blocked on. The
+ * task can migrate while we acquire the lock, but at
+ * most one time. So at most two passes through loop.
+ */
+ for (;;) {
+ rnp = t->rcu_blocked_node;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ if (rnp == t->rcu_blocked_node)
+ break;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+ empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
+ empty_exp = !rcu_preempted_readers_exp(rnp);
+ smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
+ np = rcu_next_node_entry(t, rnp);
+ list_del_init(&t->rcu_node_entry);
+ t->rcu_blocked_node = NULL;
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
+ rnp->gpnum, t->pid);
+ if (&t->rcu_node_entry == rnp->gp_tasks)
+ rnp->gp_tasks = np;
+ if (&t->rcu_node_entry == rnp->exp_tasks)
+ rnp->exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
+ /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ /*
+ * If this was the last task on the current list, and if
+ * we aren't waiting on any CPUs, report the quiescent state.
+ * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
+ * so we must take a snapshot of the expedited state.
+ */
+ empty_exp_now = !rcu_preempted_readers_exp(rnp);
+ if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
+ trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
+ rnp->gpnum,
+ 0, rnp->qsmask,
+ rnp->level,
+ rnp->grplo,
+ rnp->grphi,
+ !!rnp->gp_tasks);
+ rcu_report_unblock_qs_rnp(&rcu_preempt_state,
+ rnp, flags);
+ } else {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost if we were boosted. */
+ if (drop_boost_mutex)
+ rt_mutex_unlock(&rnp->boost_mtx);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ /*
+ * If this was the last task on the expedited lists,
+ * then we need to report up the rcu_node hierarchy.
+ */
+ if (!empty_exp && empty_exp_now)
+ rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period on the specified rcu_node structure.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ sched_show_task(t);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period.
+ */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ rcu_print_detail_task_stall_rnp(rnp);
+ rcu_for_each_leaf_node(rsp, rnp)
+ rcu_print_detail_task_stall_rnp(rnp);
+}
+
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+
+static void rcu_print_task_stall_begin(struct rcu_node *rnp)
+{
+ pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
+ rnp->level, rnp->grplo, rnp->grphi);
+}
+
+static void rcu_print_task_stall_end(void)
+{
+ pr_cont("\n");
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+static void rcu_print_task_stall_begin(struct rcu_node *rnp)
+{
+}
+
+static void rcu_print_task_stall_end(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return 0;
+ rcu_print_task_stall_begin(rnp);
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ rcu_print_task_stall_end();
+ return ndetected;
+}
+
+/*
+ * Check that the list of blocked tasks for the newly completed grace
+ * period is in fact empty. It is a serious bug to complete a grace
+ * period that still has RCU readers blocked! This function must be
+ * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
+ * must be held by the caller.
+ *
+ * Also, if there are blocked tasks on the list, they automatically
+ * block the newly created grace period, so set up ->gp_tasks accordingly.
+ */
+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
+{
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ if (rcu_preempt_has_tasks(rnp))
+ rnp->gp_tasks = rnp->blkd_tasks.next;
+ WARN_ON_ONCE(rnp->qsmask);
+}
+
+/*
+ * Check for a quiescent state from the current CPU. When a task blocks,
+ * the task is recorded in the corresponding CPU's rcu_node structure,
+ * which is checked elsewhere.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting == 0) {
+ rcu_preempt_qs();
+ return;
+ }
+ if (t->rcu_read_lock_nesting > 0 &&
+ __this_cpu_read(rcu_preempt_data.qs_pending) &&
+ !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+ t->rcu_read_unlock_special.b.need_qs = true;
+}
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_preempt_do_callbacks(void)
+{
+ rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Queue a preemptible-RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, &rcu_preempt_state, -1, 0);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting. RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
+ *
+ * See the description of synchronize_sched() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu(void)
+{
+ rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+ if (!rcu_scheduler_active)
+ return;
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(struct rcu_node *rnp)
+{
+ return rnp->exp_tasks != NULL;
+}
+
+/*
+ * return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period. Works only for preemptible
+ * RCU -- other RCU implementation use other means.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+ return !rcu_preempted_readers_exp(rnp) &&
+ ACCESS_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period. This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree. (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake)
+{
+ unsigned long flags;
+ unsigned long mask;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ for (;;) {
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ break;
+ }
+ if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
+ wake_up(&sync_rcu_preempt_exp_wq);
+ }
+ break;
+ }
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+ rnp = rnp->parent;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled */
+ smp_mb__after_unlock_lock();
+ rnp->expmask &= ~mask;
+ }
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure, phase 1. If there
+ * are such tasks, set the ->expmask bits up the rcu_node tree and also
+ * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
+ * that work is needed here.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void
+sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_node *rnp_up;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rnp->expmask);
+ WARN_ON_ONCE(rnp->exp_tasks);
+ if (!rcu_preempt_has_tasks(rnp)) {
+ /* No blocked tasks, nothing to do. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ /* Call for Phase 2 and propagate ->expmask bits up the tree. */
+ rnp->expmask = 1;
+ rnp_up = rnp;
+ while (rnp_up->parent) {
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ if (rnp_up->expmask & mask)
+ break;
+ raw_spin_lock(&rnp_up->lock); /* irqs already off */
+ smp_mb__after_unlock_lock();
+ rnp_up->expmask |= mask;
+ raw_spin_unlock(&rnp_up->lock); /* irqs still off */
+ }
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure, phase 2. If the
+ * leaf rcu_node structure has its ->expmask field set, check for tasks.
+ * If there are some, clear ->expmask and set ->exp_tasks accordingly,
+ * then initiate RCU priority boosting. Otherwise, clear ->expmask and
+ * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
+ * enabling rcu_read_unlock_special() to do the bit-clearing.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void
+sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (!rnp->expmask) {
+ /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+
+ /* Phase 1 is over. */
+ rnp->expmask = 0;
+
+ /*
+ * If there are still blocked tasks, set up ->exp_tasks so that
+ * rcu_read_unlock_special() will wake us and then boost them.
+ */
+ if (rcu_preempt_has_tasks(rnp)) {
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
+ return;
+ }
+
+ /* No longer any blocked tasks, so undo bit setting. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_report_exp_rnp(rsp, rnp, false);
+}
+
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU-preempt grace period, but expedite it. The basic
+ * idea is to invoke synchronize_sched_expedited() to push all the tasks to
+ * the ->blkd_tasks lists and wait for this list to drain. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so is thus not recommended for any sort of common-case code.
+ * In fact, if you are using synchronize_rcu_expedited() in a loop,
+ * please restructure your code to batch your updates, and then Use a
+ * single synchronize_rcu() instead.
+ */
+void synchronize_rcu_expedited(void)
+{
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = &rcu_preempt_state;
+ unsigned long snap;
+ int trycount = 0;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+ smp_mb(); /* Above access cannot bleed into critical section. */
+
+ /*
+ * Block CPU-hotplug operations. This means that any CPU-hotplug
+ * operation that finds an rcu_node structure with tasks in the
+ * process of being boosted will know that all tasks blocking
+ * this expedited grace period will already be in the process of
+ * being boosted. This simplifies the process of moving tasks
+ * from leaf to root rcu_node structures.
+ */
+ if (!try_get_online_cpus()) {
+ /* CPU-hotplug operation in flight, fall back to normal GP. */
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
+ /*
+ * Acquire lock, falling back to synchronize_rcu() if too many
+ * lock-acquisition failures. Of course, if someone does the
+ * expedited grace period for us, just leave.
+ */
+ while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
+ if (ULONG_CMP_LT(snap,
+ ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ put_online_cpus();
+ goto mb_ret; /* Others did our work for us. */
+ }
+ if (trycount++ < 10) {
+ udelay(trycount * num_online_cpus());
+ } else {
+ put_online_cpus();
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+ }
+ if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ put_online_cpus();
+ goto unlock_mb_ret; /* Others did our work for us. */
+ }
+
+ /* force all RCU readers onto ->blkd_tasks lists. */
+ synchronize_sched_expedited();
+
+ /*
+ * Snapshot current state of ->blkd_tasks lists into ->expmask.
+ * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
+ * to start clearing them. Doing this in one phase leads to
+ * strange races between setting and clearing bits, so just say "no"!
+ */
+ rcu_for_each_leaf_node(rsp, rnp)
+ sync_rcu_preempt_exp_init1(rsp, rnp);
+ rcu_for_each_leaf_node(rsp, rnp)
+ sync_rcu_preempt_exp_init2(rsp, rnp);
+
+ put_online_cpus();
+
+ /* Wait for snapshotted ->blkd_tasks lists to drain. */
+ rnp = rcu_get_root(rsp);
+ wait_event(sync_rcu_preempt_exp_wq,
+ sync_rcu_preempt_exp_done(rnp));
+
+ /* Clean up and exit. */
+ smp_mb(); /* ensure expedited GP seen before counter increment. */
+ ACCESS_ONCE(sync_rcu_preempt_exp_count) =
+ sync_rcu_preempt_exp_count + 1;
+unlock_mb_ret:
+ mutex_unlock(&sync_rcu_preempt_exp_mutex);
+mb_ret:
+ smp_mb(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ *
+ * Note that this primitive does not necessarily wait for an RCU grace period
+ * to complete. For example, if there are no RCU callbacks queued anywhere
+ * in the system, then rcu_barrier() is within its rights to return
+ * immediately, without waiting for anything, much less an RCU grace period.
+ */
+void rcu_barrier(void)
+{
+ _rcu_barrier(&rcu_preempt_state);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * Initialize preemptible RCU's state structures.
+ */
+static void __init __rcu_init_preempt(void)
+{
+ rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+}
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so. No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+ struct task_struct *t = current;
+
+ if (likely(list_empty(&current->rcu_node_entry)))
+ return;
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ t->rcu_read_unlock_special.b.blocked = true;
+ __rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+static struct rcu_state *rcu_state_p = &rcu_sched_state;
+
+/*
+ * Tell them what RCU they are running.
+ */
+static void __init rcu_bootup_announce(void)
+{
+ pr_info("Hierarchical RCU implementation.\n");
+ rcu_bootup_announce_oddness();
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * CPUs being in quiescent states.
+ */
+static void rcu_preempt_note_context_switch(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, there are never any preempted
+ * RCU readers.
+ */
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
+{
+ return 0;
+}
+
+/*
+ * Because there is no preemptible RCU, there can be no readers blocked.
+ */
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
+{
+ return false;
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+
+/*
+ * Because there is no preemptible RCU, there can be no readers blocked,
+ * so there is no need to check for blocked tasks. So check only for
+ * bogus qsmask values.
+ */
+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
+{
+ WARN_ON_ONCE(rnp->qsmask);
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+}
+
+/*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptible RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+ synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Because preemptible RCU does not exist, rcu_barrier() is just
+ * another name for rcu_barrier_sched().
+ */
+void rcu_barrier(void)
+{
+ rcu_barrier_sched();
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * Because preemptible RCU does not exist, it need not be initialized.
+ */
+static void __init __rcu_init_preempt(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, tasks cannot possibly exit
+ * while in preemptible RCU read-side critical sections.
+ */
+void exit_rcu(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "../locking/rtmutex_common.h"
+
+#ifdef CONFIG_RCU_TRACE
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+ if (!rcu_preempt_has_tasks(rnp))
+ rnp->n_balk_blkd_tasks++;
+ else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
+ rnp->n_balk_exp_gp_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
+ rnp->n_balk_boost_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
+ rnp->n_balk_notblocked++;
+ else if (rnp->gp_tasks != NULL &&
+ ULONG_CMP_LT(jiffies, rnp->boost_time))
+ rnp->n_balk_notyet++;
+ else
+ rnp->n_balk_nos++;
+}
+
+#else /* #ifdef CONFIG_RCU_TRACE */
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+ wake_up_process(t);
+}
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->exp_tasks
+ * or ->boost_tasks, advancing the pointer to the next task in the
+ * ->blkd_tasks list.
+ *
+ * Note that irqs must be enabled: boosting the task can block.
+ * Returns 1 if there are more tasks needing to be boosted.
+ */
+static int rcu_boost(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+ struct list_head *tb;
+
+ if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
+ ACCESS_ONCE(rnp->boost_tasks) == NULL)
+ return 0; /* Nothing left to boost. */
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+
+ /*
+ * Recheck under the lock: all tasks in need of boosting
+ * might exit their RCU read-side critical sections on their own.
+ */
+ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return 0;
+ }
+
+ /*
+ * Preferentially boost tasks blocking expedited grace periods.
+ * This cannot starve the normal grace periods because a second
+ * expedited grace period must boost all blocked tasks, including
+ * those blocking the pre-existing normal grace period.
+ */
+ if (rnp->exp_tasks != NULL) {
+ tb = rnp->exp_tasks;
+ rnp->n_exp_boosts++;
+ } else {
+ tb = rnp->boost_tasks;
+ rnp->n_normal_boosts++;
+ }
+ rnp->n_tasks_boosted++;
+
+ /*
+ * We boost task t by manufacturing an rt_mutex that appears to
+ * be held by task t. We leave a pointer to that rt_mutex where
+ * task t can find it, and task t will release the mutex when it
+ * exits its outermost RCU read-side critical section. Then
+ * simply acquiring this artificial rt_mutex will boost task
+ * t's priority. (Thanks to tglx for suggesting this approach!)
+ *
+ * Note that task t must acquire rnp->lock to remove itself from
+ * the ->blkd_tasks list, which it will do from exit() if from
+ * nowhere else. We therefore are guaranteed that task t will
+ * stay around at least until we drop rnp->lock. Note that
+ * rnp->lock also resolves races between our priority boosting
+ * and task t's exiting its outermost RCU read-side critical
+ * section.
+ */
+ t = container_of(tb, struct task_struct, rcu_node_entry);
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ /* Lock only for side effect: boosts task t's priority. */
+ rt_mutex_lock(&rnp->boost_mtx);
+ rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+
+ return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
+ ACCESS_ONCE(rnp->boost_tasks) != NULL;
+}
+
+/*
+ * Priority-boosting kthread. One per leaf rcu_node and one for the
+ * root rcu_node.
+ */
+static int rcu_boost_kthread(void *arg)
+{
+ struct rcu_node *rnp = (struct rcu_node *)arg;
+ int spincnt = 0;
+ int more2boost;
+
+ trace_rcu_utilization(TPS("Start boost kthread@init"));
+ for (;;) {
+ rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+ trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
+ rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
+ trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
+ rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
+ more2boost = rcu_boost(rnp);
+ if (more2boost)
+ spincnt++;
+ else
+ spincnt = 0;
+ if (spincnt > 10) {
+ rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
+ spincnt = 0;
+ }
+ }
+ /* NOTREACHED */
+ trace_rcu_utilization(TPS("End boost kthread@notreached"));
+ return 0;
+}
+
+/*
+ * Check to see if it is time to start boosting RCU readers that are
+ * blocking the current grace period, and, if so, tell the per-rcu_node
+ * kthread to start boosting them. If there is an expedited grace
+ * period in progress, it is always time to boost.
+ *
+ * The caller must hold rnp->lock, which this function releases.
+ * The ->boost_kthread_task is immortal, so we don't need to worry
+ * about it going away.
+ */
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ struct task_struct *t;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ rnp->n_balk_exp_gp_tasks++;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ if (rnp->exp_tasks != NULL ||
+ (rnp->gp_tasks != NULL &&
+ rnp->boost_tasks == NULL &&
+ rnp->qsmask == 0 &&
+ ULONG_CMP_GE(jiffies, rnp->boost_time))) {
+ if (rnp->exp_tasks == NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ t = rnp->boost_kthread_task;
+ if (t)
+ rcu_wake_cond(t, rnp->boost_kthread_status);
+ } else {
+ rcu_initiate_boost_trace(rnp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+/*
+ * Wake up the per-CPU kthread to invoke RCU callbacks.
+ */
+static void invoke_rcu_callbacks_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_cpu_has_work, 1);
+ if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
+ current != __this_cpu_read(rcu_cpu_kthread_task)) {
+ rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+ __this_cpu_read(rcu_cpu_kthread_status));
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+ return __this_cpu_read(rcu_cpu_kthread_task) == current;
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+ rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+}
+
+/*
+ * Create an RCU-boost kthread for the specified node if one does not
+ * already exist. We only create this kthread for preemptible RCU.
+ * Returns zero if all is well, a negated errno otherwise.
+ */
+static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ int rnp_index = rnp - &rsp->node[0];
+ unsigned long flags;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (&rcu_preempt_state != rsp)
+ return 0;
+
+ if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
+ return 0;
+
+ rsp->boost = 1;
+ if (rnp->boost_kthread_task != NULL)
+ return 0;
+ t = kthread_create(rcu_boost_kthread, (void *)rnp,
+ "rcub/%d", rnp_index);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ rnp->boost_kthread_task = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+ return 0;
+}
+
+static void rcu_kthread_do_work(void)
+{
+ rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
+ rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
+ rcu_preempt_do_callbacks();
+}
+
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
+ struct sched_param sp;
+
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(rcu_cpu_has_work);
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
+ * RCU softirq used in flavors and configurations of RCU that do not
+ * support RCU priority boosting.
+ */
+static void rcu_cpu_kthread(unsigned int cpu)
+{
+ unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
+ int spincnt;
+
+ for (spincnt = 0; spincnt < 10; spincnt++) {
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
+ local_bh_disable();
+ *statusp = RCU_KTHREAD_RUNNING;
+ this_cpu_inc(rcu_cpu_kthread_loops);
+ local_irq_disable();
+ work = *workp;
+ *workp = 0;
+ local_irq_enable();
+ if (work)
+ rcu_kthread_do_work();
+ local_bh_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
+ }
+ }
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+ *statusp = RCU_KTHREAD_WAITING;
+}
+
+/*
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question. The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU. If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ */
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+ struct task_struct *t = rnp->boost_kthread_task;
+ unsigned long mask = rcu_rnp_online_cpus(rnp);
+ cpumask_var_t cm;
+ int cpu;
+
+ if (!t)
+ return;
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
+ if ((mask & 0x1) && cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ if (cpumask_weight(cm) == 0)
+ cpumask_setall(cm);
+ set_cpus_allowed_ptr(t, cm);
+ free_cpumask_var(cm);
+}
+
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn boost kthreads -- called as soon as the scheduler is running.
+ */
+static void __init rcu_spawn_boost_kthreads(void)
+{
+ struct rcu_node *rnp;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_cpu_has_work, cpu) = 0;
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
+ rcu_for_each_leaf_node(rcu_state_p, rnp)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
+}
+
+static void rcu_prepare_kthreads(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
+ if (rcu_scheduler_fully_active)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+static void invoke_rcu_callbacks_kthread(void)
+{
+ WARN_ON_ONCE(1);
+}
+
+static bool rcu_is_callbacks_kthread(void)
+{
+ return false;
+}
+
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+}
+
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+}
+
+static void __init rcu_spawn_boost_kthreads(void)
+{
+}
+
+static void rcu_prepare_kthreads(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#if !defined(CONFIG_RCU_FAST_NO_HZ)
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so. This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ *
+ * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
+ * any flavor of RCU.
+ */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+int rcu_needs_cpu(unsigned long *delta_jiffies)
+{
+ *delta_jiffies = ULONG_MAX;
+ return rcu_cpu_has_callbacks(NULL);
+}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+
+/*
+ * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
+ * after it.
+ */
+static void rcu_cleanup_after_idle(void)
+{
+}
+
+/*
+ * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
+ * is nothing.
+ */
+static void rcu_prepare_for_idle(void)
+{
+}
+
+/*
+ * Don't bother keeping a running count of the number of RCU callbacks
+ * posted because CONFIG_RCU_FAST_NO_HZ=n.
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+}
+
+#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
+
+/*
+ * This code is invoked when a CPU goes idle, at which point we want
+ * to have the CPU do everything required for RCU so that it can enter
+ * the energy-efficient dyntick-idle mode. This is handled by a
+ * state machine implemented by rcu_prepare_for_idle() below.
+ *
+ * The following three proprocessor symbols control this state machine:
+ *
+ * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
+ * to sleep in dyntick-idle mode with RCU callbacks pending. This
+ * is sized to be roughly one RCU grace period. Those energy-efficiency
+ * benchmarkers who might otherwise be tempted to set this to a large
+ * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
+ * system. And if you are -that- concerned about energy efficiency,
+ * just power the system down and be done with it!
+ * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
+ * permitted to sleep in dyntick-idle mode with only lazy RCU
+ * callbacks pending. Setting this too high can OOM your system.
+ *
+ * The values below work well in practice. If future workloads require
+ * adjustment, they can be converted into kernel config parameters, though
+ * making the state machine smarter might be a better option.
+ */
+#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
+#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
+
+static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
+module_param(rcu_idle_gp_delay, int, 0644);
+static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
+module_param(rcu_idle_lazy_gp_delay, int, 0644);
+
+extern int tick_nohz_active;
+
+/*
+ * Try to advance callbacks for all flavors of RCU on the current CPU, but
+ * only if it has been awhile since the last time we did so. Afterwards,
+ * if there are any callbacks ready for immediate invocation, return true.
+ */
+static bool __maybe_unused rcu_try_advance_all_cbs(void)
+{
+ bool cbs_ready = false;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ /* Exit early if we advanced recently. */
+ if (jiffies == rdtp->last_advance_all)
+ return false;
+ rdtp->last_advance_all = jiffies;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
+
+ /*
+ * Don't bother checking unless a grace period has
+ * completed since we last checked and there are
+ * callbacks not yet ready to invoke.
+ */
+ if ((rdp->completed != rnp->completed ||
+ unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+ rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+ note_gp_changes(rsp, rdp);
+
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ cbs_ready = true;
+ }
+ return cbs_ready;
+}
+
+/*
+ * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
+ * to invoke. If the CPU has callbacks, try to advance them. Tell the
+ * caller to set the timeout based on whether or not there are non-lazy
+ * callbacks.
+ *
+ * The caller must have disabled interrupts.
+ */
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+int rcu_needs_cpu(unsigned long *dj)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Snapshot to detect later posting of non-lazy callback. */
+ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+
+ /* If no callbacks, RCU doesn't need the CPU. */
+ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
+ *dj = ULONG_MAX;
+ return 0;
+ }
+
+ /* Attempt to advance callbacks. */
+ if (rcu_try_advance_all_cbs()) {
+ /* Some ready to invoke, so initiate later invocation. */
+ invoke_rcu_core();
+ return 1;
+ }
+ rdtp->last_accelerate = jiffies;
+
+ /* Request timer delay depending on laziness, and round. */
+ if (!rdtp->all_lazy) {
+ *dj = round_up(rcu_idle_gp_delay + jiffies,
+ rcu_idle_gp_delay) - jiffies;
+ } else {
+ *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+ }
+ return 0;
+}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+
+/*
+ * Prepare a CPU for idle from an RCU perspective. The first major task
+ * is to sense whether nohz mode has been enabled or disabled via sysfs.
+ * The second major task is to check to see if a non-lazy callback has
+ * arrived at a CPU that previously had only lazy callbacks. The third
+ * major task is to accelerate (that is, assign grace-period numbers to)
+ * any recently arrived callbacks.
+ *
+ * The caller must have disabled interrupts.
+ */
+static void rcu_prepare_for_idle(void)
+{
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ bool needwake;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+ int tne;
+
+ /* Handle nohz enablement switches conservatively. */
+ tne = ACCESS_ONCE(tick_nohz_active);
+ if (tne != rdtp->tick_nohz_enabled_snap) {
+ if (rcu_cpu_has_callbacks(NULL))
+ invoke_rcu_core(); /* force nohz to see update. */
+ rdtp->tick_nohz_enabled_snap = tne;
+ return;
+ }
+ if (!tne)
+ return;
+
+ /* If this is a no-CBs CPU, no callbacks, just return. */
+ if (rcu_is_nocb_cpu(smp_processor_id()))
+ return;
+
+ /*
+ * If a non-lazy callback arrived at a CPU having only lazy
+ * callbacks, invoke RCU core for the side-effect of recalculating
+ * idle duration on re-entry to idle.
+ */
+ if (rdtp->all_lazy &&
+ rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
+ rdtp->all_lazy = false;
+ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+ invoke_rcu_core();
+ return;
+ }
+
+ /*
+ * If we have not yet accelerated this jiffy, accelerate all
+ * callbacks on this CPU.
+ */
+ if (rdtp->last_accelerate == jiffies)
+ return;
+ rdtp->last_accelerate = jiffies;
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ if (!*rdp->nxttail[RCU_DONE_TAIL])
+ continue;
+ rnp = rdp->mynode;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ smp_mb__after_unlock_lock();
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
+ }
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+}
+
+/*
+ * Clean up for exit from idle. Attempt to advance callbacks based on
+ * any grace periods that elapsed while the CPU was idle, and if any
+ * callbacks are now ready to invoke, initiate invocation.
+ */
+static void rcu_cleanup_after_idle(void)
+{
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ if (rcu_is_nocb_cpu(smp_processor_id()))
+ return;
+ if (rcu_try_advance_all_cbs())
+ invoke_rcu_core();
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+}
+
+/*
+ * Keep a running count of the number of non-lazy callbacks posted
+ * on this CPU. This running counter (which is never decremented) allows
+ * rcu_prepare_for_idle() to detect when something out of the idle loop
+ * posts a callback, even if an equal number of callbacks are invoked.
+ * Of course, callbacks should only be posted from within a trace event
+ * designed to be called from idle or from within RCU_NONIDLE().
+ */
+static void rcu_idle_count_callbacks_posted(void)
+{
+ __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
+}
+
+/*
+ * Data for flushing lazy RCU callbacks at OOM time.
+ */
+static atomic_t oom_callback_count;
+static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
+
+/*
+ * RCU OOM callback -- decrement the outstanding count and deliver the
+ * wake-up if we are the last one.
+ */
+static void rcu_oom_callback(struct rcu_head *rhp)
+{
+ if (atomic_dec_and_test(&oom_callback_count))
+ wake_up(&oom_callback_wq);
+}
+
+/*
+ * Post an rcu_oom_notify callback on the current CPU if it has at
+ * least one lazy callback. This will unnecessarily post callbacks
+ * to CPUs that already have a non-lazy callback at the end of their
+ * callback list, but this is an infrequent operation, so accept some
+ * extra overhead to keep things simple.
+ */
+static void rcu_oom_notify_cpu(void *unused)
+{
+ struct rcu_state *rsp;
+ struct rcu_data *rdp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (rdp->qlen_lazy != 0) {
+ atomic_inc(&oom_callback_count);
+ rsp->call(&rdp->oom_head, rcu_oom_callback);
+ }
+ }
+}
+
+/*
+ * If low on memory, ensure that each CPU has a non-lazy callback.
+ * This will wake up CPUs that have only lazy callbacks, in turn
+ * ensuring that they free up the corresponding memory in a timely manner.
+ * Because an uncertain amount of memory will be freed in some uncertain
+ * timeframe, we do not claim to have freed anything.
+ */
+static int rcu_oom_notify(struct notifier_block *self,
+ unsigned long notused, void *nfreed)
+{
+ int cpu;
+
+ /* Wait for callbacks from earlier instance to complete. */
+ wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
+ smp_mb(); /* Ensure callback reuse happens after callback invocation. */
+
+ /*
+ * Prevent premature wakeup: ensure that all increments happen
+ * before there is a chance of the counter reaching zero.
+ */
+ atomic_set(&oom_callback_count, 1);
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
+ cond_resched_rcu_qs();
+ }
+ put_online_cpus();
+
+ /* Unconditionally decrement: no need to wake ourselves up. */
+ atomic_dec(&oom_callback_count);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_oom_nb = {
+ .notifier_call = rcu_oom_notify
+};
+
+static int __init rcu_register_oom_notifier(void)
+{
+ register_oom_notifier(&rcu_oom_nb);
+ return 0;
+}
+early_initcall(rcu_register_oom_notifier);
+
+#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
+
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+
+#ifdef CONFIG_RCU_FAST_NO_HZ
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
+
+ sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+ rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
+ ulong2long(nlpd),
+ rdtp->all_lazy ? 'L' : '.',
+ rdtp->tick_nohz_enabled_snap ? '.' : 'D');
+}
+
+#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ *cp = '\0';
+}
+
+#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+/* Initiate the stall-info list. */
+static void print_cpu_stall_info_begin(void)
+{
+ pr_cont("\n");
+}
+
+/*
+ * Print out diagnostic information for the specified stalled CPU.
+ *
+ * If the specified CPU is aware of the current RCU grace period
+ * (flavor specified by rsp), then print the number of scheduling
+ * clock interrupts the CPU has taken during the time that it has
+ * been aware. Otherwise, print the number of RCU grace periods
+ * that this CPU is ignorant of, for example, "1" if the CPU was
+ * aware of the previous grace period.
+ *
+ * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ */
+static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
+{
+ char fast_no_hz[72];
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = rdp->dynticks;
+ char *ticks_title;
+ unsigned long ticks_value;
+
+ if (rsp->gpnum == rdp->gpnum) {
+ ticks_title = "ticks this GP";
+ ticks_value = rdp->ticks_this_gp;
+ } else {
+ ticks_title = "GPs behind";
+ ticks_value = rsp->gpnum - rdp->gpnum;
+ }
+ print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
+ pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ cpu, ticks_value, ticks_title,
+ atomic_read(&rdtp->dynticks) & 0xfff,
+ rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+ fast_no_hz);
+}
+
+/* Terminate the stall-info list. */
+static void print_cpu_stall_info_end(void)
+{
+ pr_err("\t");
+}
+
+/* Zero ->ticks_this_gp for all flavors of RCU. */
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+ rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
+}
+
+/* Increment ->ticks_this_gp for all flavors of RCU. */
+static void increment_cpu_stall_ticks(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ raw_cpu_inc(rsp->rda->ticks_this_gp);
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+static void print_cpu_stall_info_begin(void)
+{
+ pr_cont(" {");
+}
+
+static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
+{
+ pr_cont(" %d", cpu);
+}
+
+static void print_cpu_stall_info_end(void)
+{
+ pr_cont("} ");
+}
+
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+}
+
+static void increment_cpu_stall_ticks(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For each CPU in the set, there is a
+ * kthread created that pulls the callbacks from the corresponding CPU,
+ * waits for a grace period to elapse, and invokes the callbacks.
+ * The no-CBs CPUs do a wake_up() on their kthread when they insert
+ * a callback into any empty list, unless the rcu_nocb_poll boot parameter
+ * has been specified, in which case each kthread actively polls its
+ * CPU. (Which isn't so great for energy efficiency, but which does
+ * reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callback processing could also in theory be used as
+ * an energy-efficiency measure because CPUs with no RCU callbacks
+ * queued are more aggressive about entering dyntick-idle mode.
+ */
+
+
+/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ have_rcu_nocb_mask = true;
+ cpulist_parse(str, rcu_nocb_mask);
+ return 1;
+}
+__setup("rcu_nocbs=", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = 1;
+ return 0;
+}
+early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+}
+
+/*
+ * Set the root rcu_node structure's ->need_future_gp field
+ * based on the sum of those of all rcu_node structures. This does
+ * double-count the root rcu_node structure's requests, but this
+ * is necessary to handle the possibility of a rcu_nocb_kthread()
+ * having awakened during the time that the rcu_node structures
+ * were being updated for the end of the previous grace period.
+ */
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+ rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_waitqueue_head(&rnp->nocb_gp_wq[0]);
+ init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+}
+
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
+/* Is the specified CPU a no-CBs CPU? */
+bool rcu_is_nocb_cpu(int cpu)
+{
+ if (have_rcu_nocb_mask)
+ return cpumask_test_cpu(cpu, rcu_nocb_mask);
+ return false;
+}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+
+/*
+ * Kick the leader kthread for this NOCB group.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ struct rcu_data *rdp_leader = rdp->nocb_leader;
+
+ if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ return;
+ if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ /* Prior smp_mb__after_atomic() orders against prior enqueue. */
+ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+ wake_up(&rdp_leader->nocb_wq);
+ }
+}
+
+/*
+ * Does the specified CPU need an RCU callback for the specified flavor
+ * of rcu_barrier()?
+ */
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ unsigned long ret;
+#ifdef CONFIG_PROVE_RCU
+ struct rcu_head *rhp;
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+ /*
+ * Check count of all no-CBs callbacks awaiting invocation.
+ * There needs to be a barrier before this function is called,
+ * but associated with a prior determination that no more
+ * callbacks would be posted. In the worst case, the first
+ * barrier in _rcu_barrier() suffices (but the caller cannot
+ * necessarily rely on this, not a substitute for the caller
+ * getting the concurrency design right!). There must also be
+ * a barrier between the following load an posting of a callback
+ * (if a callback is in fact needed). This is associated with an
+ * atomic_inc() in the caller.
+ */
+ ret = atomic_long_read(&rdp->nocb_q_count);
+
+#ifdef CONFIG_PROVE_RCU
+ rhp = ACCESS_ONCE(rdp->nocb_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+
+ /* Having no rcuo kthread but CBs after scheduler starts is bad! */
+ if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+ rcu_scheduler_fully_active) {
+ /* RCU callback enqueued before CPU first came online??? */
+ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
+ cpu, rhp->func);
+ WARN_ON_ONCE(1);
+ }
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+ return !!ret;
+}
+
+/*
+ * Enqueue the specified string of rcu_head structures onto the specified
+ * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
+ * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
+ * counts are supplied by rhcount and rhcount_lazy.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
+ struct rcu_head *rhp,
+ struct rcu_head **rhtp,
+ int rhcount, int rhcount_lazy,
+ unsigned long flags)
+{
+ int len;
+ struct rcu_head **old_rhpp;
+ struct task_struct *t;
+
+ /* Enqueue the callback on the nocb list and update counts. */
+ atomic_long_add(rhcount, &rdp->nocb_q_count);
+ /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
+ old_rhpp = xchg(&rdp->nocb_tail, rhtp);
+ ACCESS_ONCE(*old_rhpp) = rhp;
+ atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
+ smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
+
+ /* If we are not being polled and there is a kthread, awaken it ... */
+ t = ACCESS_ONCE(rdp->nocb_kthread);
+ if (rcu_nocb_poll || !t) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ len = atomic_long_read(&rdp->nocb_q_count);
+ if (old_rhpp == &rdp->nocb_head) {
+ if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ wake_nocb_leader(rdp, false);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ rdp->qlen_last_fqs_check = 0;
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ if (!irqs_disabled_flags(flags)) {
+ wake_nocb_leader(rdp, true);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeOvf"));
+ } else {
+ rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeOvfIsDeferred"));
+ }
+ rdp->qlen_last_fqs_check = LONG_MAX / 2;
+ } else {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
+ }
+ return;
+}
+
+/*
+ * This is a helper for __call_rcu(), which invokes this when the normal
+ * callback queue is inoperable. If this is not a no-CBs CPU, this
+ * function returns failure back to __call_rcu(), which can complain
+ * appropriately.
+ *
+ * Otherwise, this function queues the callback where the corresponding
+ * "rcuo" kthread can find it.
+ */
+static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool lazy, unsigned long flags)
+{
+
+ if (!rcu_is_nocb_cpu(rdp->cpu))
+ return false;
+ __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
+ if (__is_kfree_rcu_offset((unsigned long)rhp->func))
+ trace_rcu_kfree_callback(rdp->rsp->name, rhp,
+ (unsigned long)rhp->func,
+ -atomic_long_read(&rdp->nocb_q_count_lazy),
+ -atomic_long_read(&rdp->nocb_q_count));
+ else
+ trace_rcu_callback(rdp->rsp->name, rhp,
+ -atomic_long_read(&rdp->nocb_q_count_lazy),
+ -atomic_long_read(&rdp->nocb_q_count));
+
+ /*
+ * If called from an extended quiescent state with interrupts
+ * disabled, invoke the RCU core in order to allow the idle-entry
+ * deferred-wakeup check to function.
+ */
+ if (irqs_disabled_flags(flags) &&
+ !rcu_is_watching() &&
+ cpu_online(smp_processor_id()))
+ invoke_rcu_core();
+
+ return true;
+}
+
+/*
+ * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
+ * not a no-CBs CPU.
+ */
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+ struct rcu_data *rdp,
+ unsigned long flags)
+{
+ long ql = rsp->qlen;
+ long qll = rsp->qlen_lazy;
+
+ /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+ if (!rcu_is_nocb_cpu(smp_processor_id()))
+ return false;
+ rsp->qlen = 0;
+ rsp->qlen_lazy = 0;
+
+ /* First, enqueue the donelist, if any. This preserves CB ordering. */
+ if (rsp->orphan_donelist != NULL) {
+ __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
+ rsp->orphan_donetail, ql, qll, flags);
+ ql = qll = 0;
+ rsp->orphan_donelist = NULL;
+ rsp->orphan_donetail = &rsp->orphan_donelist;
+ }
+ if (rsp->orphan_nxtlist != NULL) {
+ __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
+ rsp->orphan_nxttail, ql, qll, flags);
+ ql = qll = 0;
+ rsp->orphan_nxtlist = NULL;
+ rsp->orphan_nxttail = &rsp->orphan_nxtlist;
+ }
+ return true;
+}
+
+/*
+ * If necessary, kick off a new grace period, and either way wait
+ * for a subsequent grace period to complete.
+ */
+static void rcu_nocb_wait_gp(struct rcu_data *rdp)
+{
+ unsigned long c;
+ bool d;
+ unsigned long flags;
+ bool needwake;
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ needwake = rcu_start_future_gp(rnp, rdp, &c);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rdp->rsp);
+
+ /*
+ * Wait for the grace period. Do so interruptibly to avoid messing
+ * up the load average.
+ */
+ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
+ for (;;) {
+ wait_event_interruptible(
+ rnp->nocb_gp_wq[c & 0x1],
+ (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+ if (likely(d))
+ break;
+ WARN_ON(signal_pending(current));
+ trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
+ }
+ trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
+ smp_mb(); /* Ensure that CB invocation happens after GP end. */
+}
+
+/*
+ * Leaders come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_leader_wait(struct rcu_data *my_rdp)
+{
+ bool firsttime = true;
+ bool gotcbs;
+ struct rcu_data *rdp;
+ struct rcu_head **tail;
+
+wait_again:
+
+ /* Wait for callbacks to appear. */
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ wait_event_interruptible(my_rdp->nocb_wq,
+ !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+ /* Memory barrier handled by smp_mb() calls below and repoll. */
+ } else if (firsttime) {
+ firsttime = false; /* Don't drown trace log with "Poll"! */
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ }
+
+ /*
+ * Each pass through the following loop checks a follower for CBs.
+ * We are our own first follower. Any CBs found are moved to
+ * nocb_gp_head, where they await a grace period.
+ */
+ gotcbs = false;
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs here, try next follower. */
+
+ /* Move callbacks to wait-for-GP list, which is empty. */
+ ACCESS_ONCE(rdp->nocb_head) = NULL;
+ rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
+ gotcbs = true;
+ }
+
+ /*
+ * If there were no callbacks, sleep a bit, rescan after a
+ * memory barrier, and go retry.
+ */
+ if (unlikely(!gotcbs)) {
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ "WokeEmpty");
+ WARN_ON(signal_pending(current));
+ schedule_timeout_interruptible(1);
+
+ /* Rescan in case we were a victim of memory ordering. */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
+ if (ACCESS_ONCE(rdp->nocb_head)) {
+ /* Found CB, so short-circuit next wait. */
+ my_rdp->nocb_leader_sleep = false;
+ break;
+ }
+ goto wait_again;
+ }
+
+ /* Wait for one grace period. */
+ rcu_nocb_wait_gp(my_rdp);
+
+ /*
+ * We left ->nocb_leader_sleep unset to reduce cache thrashing.
+ * We set it now, but recheck for new callbacks while
+ * traversing our follower list.
+ */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
+
+ /* Each pass through the following loop wakes a follower, if needed. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ if (ACCESS_ONCE(rdp->nocb_head))
+ my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs, so no need to wake follower. */
+
+ /* Append callbacks to follower's "done" list. */
+ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ *tail = rdp->nocb_gp_head;
+ smp_mb__after_atomic(); /* Store *tail before wakeup. */
+ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
+ /*
+ * List was empty, wake up the follower.
+ * Memory barriers supplied by atomic_long_add().
+ */
+ wake_up(&rdp->nocb_wq);
+ }
+ }
+
+ /* If we (the leader) don't have CBs, go wait some more. */
+ if (!my_rdp->nocb_follower_head)
+ goto wait_again;
+}
+
+/*
+ * Followers come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_follower_wait(struct rcu_data *rdp)
+{
+ bool firsttime = true;
+
+ for (;;) {
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "FollowerSleep");
+ wait_event_interruptible(rdp->nocb_wq,
+ ACCESS_ONCE(rdp->nocb_follower_head));
+ } else if (firsttime) {
+ /* Don't drown trace log with "Poll"! */
+ firsttime = false;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
+ }
+ if (smp_load_acquire(&rdp->nocb_follower_head)) {
+ /* ^^^ Ensure CB invocation follows _head test. */
+ return;
+ }
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "WokeEmpty");
+ WARN_ON(signal_pending(current));
+ schedule_timeout_interruptible(1);
+ }
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
+ * callbacks queued by the corresponding no-CBs CPU, however, there is
+ * an optional leader-follower relationship so that the grace-period
+ * kthreads don't have to do quite so many wakeups.
+ */
+static int rcu_nocb_kthread(void *arg)
+{
+ int c, cl;
+ struct rcu_head *list;
+ struct rcu_head *next;
+ struct rcu_head **tail;
+ struct rcu_data *rdp = arg;
+
+ /* Each pass through this loop invokes one batch of callbacks */
+ for (;;) {
+ /* Wait for callbacks. */
+ if (rdp->nocb_leader == rdp)
+ nocb_leader_wait(rdp);
+ else
+ nocb_follower_wait(rdp);
+
+ /* Pull the ready-to-invoke callbacks onto local list. */
+ list = ACCESS_ONCE(rdp->nocb_follower_head);
+ BUG_ON(!list);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
+ ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+
+ /* Each pass through the following loop invokes a callback. */
+ trace_rcu_batch_start(rdp->rsp->name,
+ atomic_long_read(&rdp->nocb_q_count_lazy),
+ atomic_long_read(&rdp->nocb_q_count), -1);
+ c = cl = 0;
+ while (list) {
+ next = list->next;
+ /* Wait for enqueuing to complete, if needed. */
+ while (next == NULL && &list->next != tail) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WaitQueue"));
+ schedule_timeout_interruptible(1);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WokeQueue"));
+ next = list->next;
+ }
+ debug_rcu_head_unqueue(list);
+ local_bh_disable();
+ if (__rcu_reclaim(rdp->rsp->name, list))
+ cl++;
+ c++;
+ local_bh_enable();
+ list = next;
+ }
+ trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
+ smp_mb__before_atomic(); /* _add after CB invocation. */
+ atomic_long_add(-c, &rdp->nocb_q_count);
+ atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
+ rdp->n_nocbs_invoked += c;
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+ return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ int ndw;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp))
+ return;
+ ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
+ ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+ wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
+}
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ bool need_rcu_nocb_mask = true;
+ struct rcu_state *rsp;
+
+#ifdef CONFIG_RCU_NOCB_CPU_NONE
+ need_rcu_nocb_mask = false;
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ need_rcu_nocb_mask = true;
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ have_rcu_nocb_mask = true;
+ }
+ if (!have_rcu_nocb_mask)
+ return;
+
+#ifdef CONFIG_RCU_NOCB_CPU_ZERO
+ pr_info("\tOffload RCU callbacks from CPU 0\n");
+ cpumask_set_cpu(0, rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+#ifdef CONFIG_RCU_NOCB_CPU_ALL
+ pr_info("\tOffload RCU callbacks from all CPUs\n");
+ cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_rcu_flavor(rsp) {
+ for_each_cpu(cpu, rcu_nocb_mask)
+ init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
+ rcu_organize_nocb_kthreads(rsp);
+ }
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ rdp->nocb_tail = &rdp->nocb_head;
+ init_waitqueue_head(&rdp->nocb_wq);
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
+ * brought online out of order, this can require re-organizing the
+ * leader-follower relationships.
+ */
+static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
+{
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_last;
+ struct rcu_data *rdp_old_leader;
+ struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
+ struct task_struct *t;
+
+ /*
+ * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
+ * then nothing to do.
+ */
+ if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
+ return;
+
+ /* If we didn't spawn the leader first, reorganize! */
+ rdp_old_leader = rdp_spawn->nocb_leader;
+ if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
+ rdp_last = NULL;
+ rdp = rdp_old_leader;
+ do {
+ rdp->nocb_leader = rdp_spawn;
+ if (rdp_last && rdp != rdp_spawn)
+ rdp_last->nocb_next_follower = rdp;
+ if (rdp == rdp_spawn) {
+ rdp = rdp->nocb_next_follower;
+ } else {
+ rdp_last = rdp;
+ rdp = rdp->nocb_next_follower;
+ rdp_last->nocb_next_follower = NULL;
+ }
+ } while (rdp);
+ rdp_spawn->nocb_next_follower = rdp_old_leader;
+ }
+
+ /* Spawn the kthread for this CPU and RCU flavor. */
+ t = kthread_run(rcu_nocb_kthread, rdp_spawn,
+ "rcuo%c/%d", rsp->abbr, cpu);
+ BUG_ON(IS_ERR(t));
+ ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthreads, spawn them.
+ */
+static void rcu_spawn_all_nocb_kthreads(int cpu)
+{
+ struct rcu_state *rsp;
+
+ if (rcu_scheduler_fully_active)
+ for_each_rcu_flavor(rsp)
+ rcu_spawn_one_nocb_kthread(rsp, cpu);
+}
+
+/*
+ * Once the scheduler is running, spawn rcuo kthreads for all online
+ * no-CBs CPUs. This assumes that the early_initcall()s happen before
+ * non-boot CPUs come online -- if this changes, we will need to add
+ * some mutual exclusion.
+ */
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ rcu_spawn_all_nocb_kthreads(cpu);
+}
+
+/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_leader_stride = -1;
+module_param(rcu_nocb_leader_stride, int, 0444);
+
+/*
+ * Initialize leader-follower relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
+{
+ int cpu;
+ int ls = rcu_nocb_leader_stride;
+ int nl = 0; /* Next leader. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
+
+ if (!have_rcu_nocb_mask)
+ return;
+ if (ls == -1) {
+ ls = int_sqrt(nr_cpu_ids);
+ rcu_nocb_leader_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure and
+ * spawns one rcu_nocb_kthread().
+ */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rdp->cpu >= nl) {
+ /* New leader, set up for followers & next leader. */
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_leader = rdp;
+ rdp_leader = rdp;
+ } else {
+ /* Another follower, link to previous leader. */
+ rdp->nocb_leader = rdp_leader;
+ rdp_prev->nocb_next_follower = rdp;
+ }
+ rdp_prev = rdp;
+ }
+}
+
+/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
+static bool init_nocb_callback_list(struct rcu_data *rdp)
+{
+ if (!rcu_is_nocb_cpu(rdp->cpu))
+ return false;
+
+ /* If there are early-boot callbacks, move them to nocb lists. */
+ if (rdp->nxtlist) {
+ rdp->nocb_head = rdp->nxtlist;
+ rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
+ atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
+ atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
+ rdp->nxtlist = NULL;
+ rdp->qlen = 0;
+ rdp->qlen_lazy = 0;
+ }
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ return true;
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ WARN_ON_ONCE(1); /* Should be dead code. */
+ return false;
+}
+
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+}
+
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool lazy, unsigned long flags)
+{
+ return false;
+}
+
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+ struct rcu_data *rdp,
+ unsigned long flags)
+{
+ return false;
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+}
+
+static void rcu_spawn_all_nocb_kthreads(int cpu)
+{
+}
+
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+}
+
+static bool init_nocb_callback_list(struct rcu_data *rdp)
+{
+ return false;
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
+/*
+ * An adaptive-ticks CPU can potentially execute in kernel mode for an
+ * arbitrarily long period of time with the scheduling-clock tick turned
+ * off. RCU will be paying attention to this CPU because it is in the
+ * kernel, but the CPU cannot be guaranteed to be executing the RCU state
+ * machine because the scheduling-clock tick has been disabled. Therefore,
+ * if an adaptive-ticks CPU is failing to respond to the current grace
+ * period and has not be idle from an RCU perspective, kick it.
+ */
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(cpu))
+ smp_send_reschedule(cpu);
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+}
+
+
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+
+static int full_sysidle_state; /* Current system-idle state. */
+#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
+#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
+#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
+#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
+#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
+
+/*
+ * Invoked to note exit from irq or task transition to idle. Note that
+ * usermode execution does -not- count as idle here! After all, we want
+ * to detect full-system idle states, not RCU quiescent states and grace
+ * periods. The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_enter(int irq)
+{
+ unsigned long j;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
+
+ /* Adjust nesting, check for fully idle. */
+ if (irq) {
+ rdtp->dynticks_idle_nesting--;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+ if (rdtp->dynticks_idle_nesting != 0)
+ return; /* Still not fully idle. */
+ } else {
+ if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
+ DYNTICK_TASK_NEST_VALUE) {
+ rdtp->dynticks_idle_nesting = 0;
+ } else {
+ rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
+ return; /* Still not fully idle. */
+ }
+ }
+
+ /* Record start of fully idle period. */
+ j = jiffies;
+ ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+ smp_mb__before_atomic();
+ atomic_inc(&rdtp->dynticks_idle);
+ smp_mb__after_atomic();
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
+}
+
+/*
+ * Unconditionally force exit from full system-idle state. This is
+ * invoked when a normal CPU exits idle, but must be called separately
+ * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
+ * is that the timekeeping CPU is permitted to take scheduling-clock
+ * interrupts while the system is in system-idle state, and of course
+ * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
+ * interrupt from any other type of interrupt.
+ */
+void rcu_sysidle_force_exit(void)
+{
+ int oldstate = ACCESS_ONCE(full_sysidle_state);
+ int newoldstate;
+
+ /*
+ * Each pass through the following loop attempts to exit full
+ * system-idle state. If contention proves to be a problem,
+ * a trylock-based contention tree could be used here.
+ */
+ while (oldstate > RCU_SYSIDLE_SHORT) {
+ newoldstate = cmpxchg(&full_sysidle_state,
+ oldstate, RCU_SYSIDLE_NOT);
+ if (oldstate == newoldstate &&
+ oldstate == RCU_SYSIDLE_FULL_NOTED) {
+ rcu_kick_nohz_cpu(tick_do_timer_cpu);
+ return; /* We cleared it, done! */
+ }
+ oldstate = newoldstate;
+ }
+ smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
+}
+
+/*
+ * Invoked to note entry to irq or task transition from idle. Note that
+ * usermode execution does -not- count as idle here! The caller must
+ * have disabled interrupts.
+ */
+static void rcu_sysidle_exit(int irq)
+{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
+
+ /* Adjust nesting, check for already non-idle. */
+ if (irq) {
+ rdtp->dynticks_idle_nesting++;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+ if (rdtp->dynticks_idle_nesting != 1)
+ return; /* Already non-idle. */
+ } else {
+ /*
+ * Allow for irq misnesting. Yes, it really is possible
+ * to enter an irq handler then never leave it, and maybe
+ * also vice versa. Handle both possibilities.
+ */
+ if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
+ rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
+ WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
+ return; /* Already non-idle. */
+ } else {
+ rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
+ }
+ }
+
+ /* Record end of idle period. */
+ smp_mb__before_atomic();
+ atomic_inc(&rdtp->dynticks_idle);
+ smp_mb__after_atomic();
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
+
+ /*
+ * If we are the timekeeping CPU, we are permitted to be non-idle
+ * during a system-idle state. This must be the case, because
+ * the timekeeping CPU has to take scheduling-clock interrupts
+ * during the time that the system is transitioning to full
+ * system-idle state. This means that the timekeeping CPU must
+ * invoke rcu_sysidle_force_exit() directly if it does anything
+ * more than take a scheduling-clock interrupt.
+ */
+ if (smp_processor_id() == tick_do_timer_cpu)
+ return;
+
+ /* Update system-idle state: We are clearly no longer fully idle! */
+ rcu_sysidle_force_exit();
+}
+
+/*
+ * Check to see if the current CPU is idle. Note that usermode execution
+ * does not count as idle. The caller must have disabled interrupts,
+ * and must be running on tick_do_timer_cpu.
+ */
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj)
+{
+ int cur;
+ unsigned long j;
+ struct rcu_dynticks *rdtp = rdp->dynticks;
+
+ /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
+ if (!tick_nohz_full_enabled())
+ return;
+
+ /*
+ * If some other CPU has already reported non-idle, if this is
+ * not the flavor of RCU that tracks sysidle state, or if this
+ * is an offline or the timekeeping CPU, nothing to do.
+ */
+ if (!*isidle || rdp->rsp != rcu_state_p ||
+ cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
+ return;
+ /* Verify affinity of current kthread. */
+ WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+
+ /* Pick up current idle and NMI-nesting counter and check. */
+ cur = atomic_read(&rdtp->dynticks_idle);
+ if (cur & 0x1) {
+ *isidle = false; /* We are not idle! */
+ return;
+ }
+ smp_mb(); /* Read counters before timestamps. */
+
+ /* Pick up timestamps. */
+ j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+ /* If this CPU entered idle more recently, update maxj timestamp. */
+ if (ULONG_CMP_LT(*maxj, j))
+ *maxj = j;
+}
+
+/*
+ * Is this the flavor of RCU that is handling full-system idle?
+ */
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+ return rsp == rcu_state_p;
+}
+
+/*
+ * Return a delay in jiffies based on the number of CPUs, rcu_node
+ * leaf fanout, and jiffies tick rate. The idea is to allow larger
+ * systems more time to transition to full-idle state in order to
+ * avoid the cache thrashing that otherwise occur on the state variable.
+ * Really small systems (less than a couple of tens of CPUs) should
+ * instead use a single global atomically incremented counter, and later
+ * versions of this will automatically reconfigure themselves accordingly.
+ */
+static unsigned long rcu_sysidle_delay(void)
+{
+ if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+ return 0;
+ return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
+}
+
+/*
+ * Advance the full-system-idle state. This is invoked when all of
+ * the non-timekeeping CPUs are idle.
+ */
+static void rcu_sysidle(unsigned long j)
+{
+ /* Check the current state. */
+ switch (ACCESS_ONCE(full_sysidle_state)) {
+ case RCU_SYSIDLE_NOT:
+
+ /* First time all are idle, so note a short idle period. */
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+ break;
+
+ case RCU_SYSIDLE_SHORT:
+
+ /*
+ * Idle for a bit, time to advance to next state?
+ * cmpxchg failure means race with non-idle, let them win.
+ */
+ if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+ (void)cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
+ break;
+
+ case RCU_SYSIDLE_LONG:
+
+ /*
+ * Do an additional check pass before advancing to full.
+ * cmpxchg failure means race with non-idle, let them win.
+ */
+ if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+ (void)cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/*
+ * Found a non-idle non-timekeeping CPU, so kick the system-idle state
+ * back to the beginning.
+ */
+static void rcu_sysidle_cancel(void)
+{
+ smp_mb();
+ if (full_sysidle_state > RCU_SYSIDLE_SHORT)
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+}
+
+/*
+ * Update the sysidle state based on the results of a force-quiescent-state
+ * scan of the CPUs' dyntick-idle state.
+ */
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+ unsigned long maxj, bool gpkt)
+{
+ if (rsp != rcu_state_p)
+ return; /* Wrong flavor, ignore. */
+ if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
+ return; /* Running state machine from timekeeping CPU. */
+ if (isidle)
+ rcu_sysidle(maxj); /* More idle! */
+ else
+ rcu_sysidle_cancel(); /* Idle is over. */
+}
+
+/*
+ * Wrapper for rcu_sysidle_report() when called from the grace-period
+ * kthread's context.
+ */
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj)
+{
+ /* If there are no nohz_full= CPUs, no need to track this. */
+ if (!tick_nohz_full_enabled())
+ return;
+
+ rcu_sysidle_report(rsp, isidle, maxj, true);
+}
+
+/* Callback and function for forcing an RCU grace period. */
+struct rcu_sysidle_head {
+ struct rcu_head rh;
+ int inuse;
+};
+
+static void rcu_sysidle_cb(struct rcu_head *rhp)
+{
+ struct rcu_sysidle_head *rshp;
+
+ /*
+ * The following memory barrier is needed to replace the
+ * memory barriers that would normally be in the memory
+ * allocator.
+ */
+ smp_mb(); /* grace period precedes setting inuse. */
+
+ rshp = container_of(rhp, struct rcu_sysidle_head, rh);
+ ACCESS_ONCE(rshp->inuse) = 0;
+}
+
+/*
+ * Check to see if the system is fully idle, other than the timekeeping CPU.
+ * The caller must have disabled interrupts. This is not intended to be
+ * called unless tick_nohz_full_enabled().
+ */
+bool rcu_sys_is_idle(void)
+{
+ static struct rcu_sysidle_head rsh;
+ int rss = ACCESS_ONCE(full_sysidle_state);
+
+ if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
+ return false;
+
+ /* Handle small-system case by doing a full scan of CPUs. */
+ if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
+ int oldrss = rss - 1;
+
+ /*
+ * One pass to advance to each state up to _FULL.
+ * Give up if any pass fails to advance the state.
+ */
+ while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
+ int cpu;
+ bool isidle = true;
+ unsigned long maxj = jiffies - ULONG_MAX / 4;
+ struct rcu_data *rdp;
+
+ /* Scan all the CPUs looking for nonidle CPUs. */
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+ rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
+ if (!isidle)
+ break;
+ }
+ rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
+ oldrss = rss;
+ rss = ACCESS_ONCE(full_sysidle_state);
+ }
+ }
+
+ /* If this is the first observation of an idle period, record it. */
+ if (rss == RCU_SYSIDLE_FULL) {
+ rss = cmpxchg(&full_sysidle_state,
+ RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
+ return rss == RCU_SYSIDLE_FULL;
+ }
+
+ smp_mb(); /* ensure rss load happens before later caller actions. */
+
+ /* If already fully idle, tell the caller (in case of races). */
+ if (rss == RCU_SYSIDLE_FULL_NOTED)
+ return true;
+
+ /*
+ * If we aren't there yet, and a grace period is not in flight,
+ * initiate a grace period. Either way, tell the caller that
+ * we are not there yet. We use an xchg() rather than an assignment
+ * to make up for the memory barriers that would otherwise be
+ * provided by the memory allocator.
+ */
+ if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
+ !rcu_gp_in_progress(rcu_state_p) &&
+ !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
+ call_rcu(&rsh.rh, rcu_sysidle_cb);
+ return false;
+}
+
+/*
+ * Initialize dynticks sysidle state for CPUs coming online.
+ */
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+ rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
+}
+
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+static void rcu_sysidle_enter(int irq)
+{
+}
+
+static void rcu_sysidle_exit(int irq)
+{
+}
+
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj)
+{
+}
+
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+ return false;
+}
+
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj)
+{
+}
+
+static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+/*
+ * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
+ * grace-period kthread will do force_quiescent_state() processing?
+ * The idea is to avoid waking up RCU core processing on such a
+ * CPU unless the grace period has extended for too long.
+ *
+ * This code relies on the fact that all NO_HZ_FULL CPUs are also
+ * CONFIG_RCU_NOCB_CPU CPUs.
+ */
+static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(smp_processor_id()) &&
+ (!rcu_gp_in_progress(rsp) ||
+ ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
+ return 1;
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+ return 0;
+}
+
+/*
+ * Bind the grace-period kthread for the sysidle flavor of RCU to the
+ * timekeeping CPU.
+ */
+static void rcu_bind_gp_kthread(void)
+{
+ int __maybe_unused cpu;
+
+ if (!tick_nohz_full_enabled())
+ return;
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ cpu = tick_do_timer_cpu;
+ if (cpu >= 0 && cpu < nr_cpu_ids)
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+ housekeeping_affine(current);
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+}
+
+/* Record the current task on dyntick-idle entry. */
+static void rcu_dynticks_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on dyntick-idle exit. */
+static void rcu_dynticks_task_exit(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
new file mode 100644
index 000000000..f92361efd
--- /dev/null
+++ b/kernel/rcu/tree_trace.c
@@ -0,0 +1,505 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers: http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#define RCU_TREE_NONCORE
+#include "tree.h"
+
+DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
+
+static int r_open(struct inode *inode, struct file *file,
+ const struct seq_operations *op)
+{
+ int ret = seq_open(file, op);
+ if (!ret) {
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ m->private = inode->i_private;
+ }
+ return ret;
+}
+
+static void *r_start(struct seq_file *m, loff_t *pos)
+{
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ *pos = cpumask_next(*pos - 1, cpu_possible_mask);
+ if ((*pos) < nr_cpu_ids)
+ return per_cpu_ptr(rsp->rda, *pos);
+ return NULL;
+}
+
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return r_start(m, pos);
+}
+
+static void r_stop(struct seq_file *m, void *v)
+{
+}
+
+static int show_rcubarrier(struct seq_file *m, void *v)
+{
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ seq_printf(m, "bcc: %d nbd: %lu\n",
+ atomic_read(&rsp->barrier_cpu_count),
+ rsp->n_barrier_done);
+ return 0;
+}
+
+static int rcubarrier_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcubarrier, inode->i_private);
+}
+
+static const struct file_operations rcubarrier_fops = {
+ .owner = THIS_MODULE,
+ .open = rcubarrier_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_RCU_BOOST
+
+static char convert_kthread_status(unsigned int kthread_status)
+{
+ if (kthread_status > RCU_KTHREAD_MAX)
+ return '?';
+ return "SRWOY"[kthread_status];
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
+{
+ long ql, qll;
+
+ if (!rdp->beenonline)
+ return;
+ seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
+ rdp->cpu,
+ cpu_is_offline(rdp->cpu) ? '!' : ' ',
+ ulong2long(rdp->completed), ulong2long(rdp->gpnum),
+ rdp->passed_quiesce,
+ rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
+ rdp->qs_pending);
+ seq_printf(m, " dt=%d/%llx/%d df=%lu",
+ atomic_read(&rdp->dynticks->dynticks),
+ rdp->dynticks->dynticks_nesting,
+ rdp->dynticks->dynticks_nmi_nesting,
+ rdp->dynticks_fqs);
+ seq_printf(m, " of=%lu", rdp->offline_fqs);
+ rcu_nocb_q_lengths(rdp, &ql, &qll);
+ qll += rdp->qlen_lazy;
+ ql += rdp->qlen;
+ seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
+ qll, ql,
+ ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
+ rdp->nxttail[RCU_NEXT_TAIL]],
+ ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
+ rdp->nxttail[RCU_NEXT_READY_TAIL]],
+ ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
+ rdp->nxttail[RCU_WAIT_TAIL]],
+ ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, " kt=%d/%c ktl=%x",
+ per_cpu(rcu_cpu_has_work, rdp->cpu),
+ convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
+ rdp->cpu)),
+ per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ seq_printf(m, " b=%ld", rdp->blimit);
+ seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
+ rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
+ rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
+}
+
+static int show_rcudata(struct seq_file *m, void *v)
+{
+ print_one_rcu_data(m, (struct rcu_data *)v);
+ return 0;
+}
+
+static const struct seq_operations rcudate_op = {
+ .start = r_start,
+ .next = r_next,
+ .stop = r_stop,
+ .show = show_rcudata,
+};
+
+static int rcudata_open(struct inode *inode, struct file *file)
+{
+ return r_open(inode, file, &rcudate_op);
+}
+
+static const struct file_operations rcudata_fops = {
+ .owner = THIS_MODULE,
+ .open = rcudata_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = seq_release,
+};
+
+static int show_rcuexp(struct seq_file *m, void *v)
+{
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+
+ seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
+ atomic_long_read(&rsp->expedited_start),
+ atomic_long_read(&rsp->expedited_done),
+ atomic_long_read(&rsp->expedited_wrap),
+ atomic_long_read(&rsp->expedited_tryfail),
+ atomic_long_read(&rsp->expedited_workdone1),
+ atomic_long_read(&rsp->expedited_workdone2),
+ atomic_long_read(&rsp->expedited_normal),
+ atomic_long_read(&rsp->expedited_stoppedcpus),
+ atomic_long_read(&rsp->expedited_done_tries),
+ atomic_long_read(&rsp->expedited_done_lost),
+ atomic_long_read(&rsp->expedited_done_exit));
+ return 0;
+}
+
+static int rcuexp_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcuexp, inode->i_private);
+}
+
+static const struct file_operations rcuexp_fops = {
+ .owner = THIS_MODULE,
+ .open = rcuexp_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_RCU_BOOST
+
+static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
+{
+ seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
+ rnp->grplo, rnp->grphi,
+ "T."[list_empty(&rnp->blkd_tasks)],
+ "N."[!rnp->gp_tasks],
+ "E."[!rnp->exp_tasks],
+ "B."[!rnp->boost_tasks],
+ convert_kthread_status(rnp->boost_kthread_status),
+ rnp->n_tasks_boosted, rnp->n_exp_boosts,
+ rnp->n_normal_boosts);
+ seq_printf(m, "j=%04x bt=%04x\n",
+ (int)(jiffies & 0xffff),
+ (int)(rnp->boost_time & 0xffff));
+ seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
+ rnp->n_balk_blkd_tasks,
+ rnp->n_balk_exp_gp_tasks,
+ rnp->n_balk_boost_tasks,
+ rnp->n_balk_notblocked,
+ rnp->n_balk_notyet,
+ rnp->n_balk_nos);
+}
+
+static int show_rcu_node_boost(struct seq_file *m, void *unused)
+{
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
+ print_one_rcu_node_boost(m, rnp);
+ return 0;
+}
+
+static int rcu_node_boost_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcu_node_boost, NULL);
+}
+
+static const struct file_operations rcu_node_boost_fops = {
+ .owner = THIS_MODULE,
+ .open = rcu_node_boost_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = single_release,
+};
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
+{
+ unsigned long gpnum;
+ int level = 0;
+ struct rcu_node *rnp;
+
+ gpnum = rsp->gpnum;
+ seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
+ ulong2long(rsp->completed), ulong2long(gpnum),
+ rsp->fqs_state,
+ (long)(rsp->jiffies_force_qs - jiffies),
+ (int)(jiffies & 0xffff));
+ seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
+ rsp->n_force_qs, rsp->n_force_qs_ngp,
+ rsp->n_force_qs - rsp->n_force_qs_ngp,
+ ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+ for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
+ if (rnp->level != level) {
+ seq_puts(m, "\n");
+ level = rnp->level;
+ }
+ seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
+ rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
+ ".G"[rnp->gp_tasks != NULL],
+ ".E"[rnp->exp_tasks != NULL],
+ ".T"[!list_empty(&rnp->blkd_tasks)],
+ rnp->grplo, rnp->grphi, rnp->grpnum);
+ }
+ seq_puts(m, "\n");
+}
+
+static int show_rcuhier(struct seq_file *m, void *v)
+{
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ print_one_rcu_state(m, rsp);
+ return 0;
+}
+
+static int rcuhier_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcuhier, inode->i_private);
+}
+
+static const struct file_operations rcuhier_fops = {
+ .owner = THIS_MODULE,
+ .open = rcuhier_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = single_release,
+};
+
+static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long completed;
+ unsigned long gpnum;
+ unsigned long gpage;
+ unsigned long gpmax;
+ struct rcu_node *rnp = &rsp->node[0];
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ completed = ACCESS_ONCE(rsp->completed);
+ gpnum = ACCESS_ONCE(rsp->gpnum);
+ if (completed == gpnum)
+ gpage = 0;
+ else
+ gpage = jiffies - rsp->gp_start;
+ gpmax = rsp->gp_max;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
+ ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
+}
+
+static int show_rcugp(struct seq_file *m, void *v)
+{
+ struct rcu_state *rsp = (struct rcu_state *)m->private;
+ show_one_rcugp(m, rsp);
+ return 0;
+}
+
+static int rcugp_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcugp, inode->i_private);
+}
+
+static const struct file_operations rcugp_fops = {
+ .owner = THIS_MODULE,
+ .open = rcugp_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = single_release,
+};
+
+static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
+{
+ if (!rdp->beenonline)
+ return;
+ seq_printf(m, "%3d%cnp=%ld ",
+ rdp->cpu,
+ cpu_is_offline(rdp->cpu) ? '!' : ' ',
+ rdp->n_rcu_pending);
+ seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
+ rdp->n_rp_qs_pending,
+ rdp->n_rp_report_qs,
+ rdp->n_rp_cb_ready,
+ rdp->n_rp_cpu_needs_gp);
+ seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
+ rdp->n_rp_gp_completed,
+ rdp->n_rp_gp_started,
+ rdp->n_rp_nocb_defer_wakeup,
+ rdp->n_rp_need_nothing);
+}
+
+static int show_rcu_pending(struct seq_file *m, void *v)
+{
+ print_one_rcu_pending(m, (struct rcu_data *)v);
+ return 0;
+}
+
+static const struct seq_operations rcu_pending_op = {
+ .start = r_start,
+ .next = r_next,
+ .stop = r_stop,
+ .show = show_rcu_pending,
+};
+
+static int rcu_pending_open(struct inode *inode, struct file *file)
+{
+ return r_open(inode, file, &rcu_pending_op);
+}
+
+static const struct file_operations rcu_pending_fops = {
+ .owner = THIS_MODULE,
+ .open = rcu_pending_open,
+ .read = seq_read,
+ .llseek = no_llseek,
+ .release = seq_release,
+};
+
+static int show_rcutorture(struct seq_file *m, void *unused)
+{
+ seq_printf(m, "rcutorture test sequence: %lu %s\n",
+ rcutorture_testseq >> 1,
+ (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
+ seq_printf(m, "rcutorture update version number: %lu\n",
+ rcutorture_vernum);
+ return 0;
+}
+
+static int rcutorture_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcutorture, NULL);
+}
+
+static const struct file_operations rcutorture_fops = {
+ .owner = THIS_MODULE,
+ .open = rcutorture_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutree_trace_init(void)
+{
+ struct rcu_state *rsp;
+ struct dentry *retval;
+ struct dentry *rspdir;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto free_out;
+
+ for_each_rcu_flavor(rsp) {
+ rspdir = debugfs_create_dir(rsp->name, rcudir);
+ if (!rspdir)
+ goto free_out;
+
+ retval = debugfs_create_file("rcudata", 0444,
+ rspdir, rsp, &rcudata_fops);
+ if (!retval)
+ goto free_out;
+
+ retval = debugfs_create_file("rcuexp", 0444,
+ rspdir, rsp, &rcuexp_fops);
+ if (!retval)
+ goto free_out;
+
+ retval = debugfs_create_file("rcu_pending", 0444,
+ rspdir, rsp, &rcu_pending_fops);
+ if (!retval)
+ goto free_out;
+
+ retval = debugfs_create_file("rcubarrier", 0444,
+ rspdir, rsp, &rcubarrier_fops);
+ if (!retval)
+ goto free_out;
+
+#ifdef CONFIG_RCU_BOOST
+ if (rsp == &rcu_preempt_state) {
+ retval = debugfs_create_file("rcuboost", 0444,
+ rspdir, NULL, &rcu_node_boost_fops);
+ if (!retval)
+ goto free_out;
+ }
+#endif
+
+ retval = debugfs_create_file("rcugp", 0444,
+ rspdir, rsp, &rcugp_fops);
+ if (!retval)
+ goto free_out;
+
+ retval = debugfs_create_file("rcuhier", 0444,
+ rspdir, rsp, &rcuhier_fops);
+ if (!retval)
+ goto free_out;
+ }
+
+ retval = debugfs_create_file("rcutorture", 0444, rcudir,
+ NULL, &rcutorture_fops);
+ if (!retval)
+ goto free_out;
+ return 0;
+free_out:
+ debugfs_remove_recursive(rcudir);
+ return 1;
+}
+
+static void __exit rcutree_trace_cleanup(void)
+{
+ debugfs_remove_recursive(rcudir);
+}
+
+
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
+MODULE_LICENSE("GPL");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
new file mode 100644
index 000000000..1f133350d
--- /dev/null
+++ b/kernel/rcu/update.c
@@ -0,0 +1,831 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2001
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ * Manfred Spraul <manfred@colorfullife.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/tick.h>
+
+#define CREATE_TRACE_POINTS
+
+#include "rcu.h"
+
+MODULE_ALIAS("rcupdate");
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "rcupdate."
+
+module_param(rcu_expedited, int, 0);
+
+#ifndef CONFIG_TINY_RCU
+
+static atomic_t rcu_expedited_nesting =
+ ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+
+/*
+ * Should normal grace-period primitives be expedited? Intended for
+ * use within RCU. Note that this function takes the rcu_expedited
+ * sysfs/boot variable into account as well as the rcu_expedite_gp()
+ * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
+ * returns false is a -really- bad idea.
+ */
+bool rcu_gp_is_expedited(void)
+{
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
+
+/**
+ * rcu_expedite_gp - Expedite future RCU grace periods
+ *
+ * After a call to this function, future calls to synchronize_rcu() and
+ * friends act as the corresponding synchronize_rcu_expedited() function
+ * had instead been called.
+ */
+void rcu_expedite_gp(void)
+{
+ atomic_inc(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_expedite_gp);
+
+/**
+ * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
+ *
+ * Undo a prior call to rcu_expedite_gp(). If all prior calls to
+ * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
+ * and if the rcu_expedited sysfs/boot parameter is not set, then all
+ * subsequent calls to synchronize_rcu() and friends will return to
+ * their normal non-expedited behavior.
+ */
+void rcu_unexpedite_gp(void)
+{
+ atomic_dec(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+/*
+ * Inform RCU of the end of the in-kernel boot sequence.
+ */
+void rcu_end_inkernel_boot(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
+ rcu_unexpedite_gp();
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting != 1) {
+ --t->rcu_read_lock_nesting;
+ } else {
+ barrier(); /* critical section before exit code. */
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+
+static struct lock_class_key rcu_bh_lock_key;
+struct lockdep_map rcu_bh_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
+EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
+
+static struct lock_class_key rcu_sched_lock_key;
+struct lockdep_map rcu_sched_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
+EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
+
+static struct lock_class_key rcu_callback_key;
+struct lockdep_map rcu_callback_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
+EXPORT_SYMBOL_GPL(rcu_callback_map);
+
+int notrace debug_lockdep_rcu_enabled(void)
+{
+ return rcu_scheduler_active && debug_locks &&
+ current->lockdep_recursion == 0;
+}
+EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
+
+/**
+ * rcu_read_lock_held() - might we be in RCU read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
+ * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
+ * this assumes we are in an RCU read-side critical section unless it can
+ * prove otherwise. This is useful for debug checks in functions that
+ * require that they be called within an RCU read-side critical section.
+ *
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that rcu_read_lock() and the matching rcu_read_unlock() must
+ * occur in the same context, for example, it is illegal to invoke
+ * rcu_read_unlock() in process context if the matching rcu_read_lock()
+ * was invoked from within an irq handler.
+ *
+ * Note that rcu_read_lock() is disallowed if the CPU is either idle or
+ * offline from an RCU perspective, so check for those as well.
+ */
+int rcu_read_lock_held(void)
+{
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
+ return 0;
+ return lock_is_held(&rcu_lock_map);
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_held);
+
+/**
+ * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
+ *
+ * Check for bottom half being disabled, which covers both the
+ * CONFIG_PROVE_RCU and not cases. Note that if someone uses
+ * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
+ * will show the situation. This is useful for debug checks in functions
+ * that require that they be called within an RCU read-side critical
+ * section.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
+ *
+ * Note that rcu_read_lock() is disallowed if the CPU is either idle or
+ * offline from an RCU perspective, so check for those as well.
+ */
+int rcu_read_lock_bh_held(void)
+{
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
+ return 0;
+ return in_softirq() || irqs_disabled();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * wakeme_after_rcu() - Callback function to awaken a task after grace period
+ * @head: Pointer to rcu_head member within rcu_synchronize structure
+ *
+ * Awaken the corresponding task now that a grace period has elapsed.
+ */
+void wakeme_after_rcu(struct rcu_head *head)
+{
+ struct rcu_synchronize *rcu;
+
+ rcu = container_of(head, struct rcu_synchronize, head);
+ complete(&rcu->completion);
+}
+
+void wait_rcu_gp(call_rcu_func_t crf)
+{
+ struct rcu_synchronize rcu;
+
+ init_rcu_head_on_stack(&rcu.head);
+ init_completion(&rcu.completion);
+ /* Will wake me after RCU finished. */
+ crf(&rcu.head, wakeme_after_rcu);
+ /* Wait for it. */
+ wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(wait_rcu_gp);
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+void init_rcu_head(struct rcu_head *head)
+{
+ debug_object_init(head, &rcuhead_debug_descr);
+}
+
+void destroy_rcu_head(struct rcu_head *head)
+{
+ debug_object_free(head, &rcuhead_debug_descr);
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ * Activation is performed internally by call_rcu().
+ */
+static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ struct rcu_head *head = addr;
+
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ /*
+ * This is not really a fixup. We just make sure that it is
+ * tracked in the object tracker.
+ */
+ debug_object_init(head, &rcuhead_debug_descr);
+ debug_object_activate(head, &rcuhead_debug_descr);
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+/**
+ * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects of a new rcu_head structure that
+ * has been allocated as an auto variable on the stack. This function
+ * is not required for rcu_head structures that are statically defined or
+ * that are dynamically allocated on the heap. This function has no
+ * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void init_rcu_head_on_stack(struct rcu_head *head)
+{
+ debug_object_init_on_stack(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
+
+/**
+ * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects that an on-stack rcu_head structure
+ * is about to go out of scope. As with init_rcu_head_on_stack(), this
+ * function is not required for rcu_head structures that are statically
+ * defined or that are dynamically allocated on the heap. Also as with
+ * init_rcu_head_on_stack(), this function has no effect for
+ * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void destroy_rcu_head_on_stack(struct rcu_head *head)
+{
+ debug_object_free(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
+
+struct debug_obj_descr rcuhead_debug_descr = {
+ .name = "rcu_head",
+ .fixup_activate = rcuhead_fixup_activate,
+};
+EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old, unsigned long c)
+{
+ trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
+}
+EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
+#endif
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+
+module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
+
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Simple variant of RCU whose quiescent states are voluntary context switch,
+ * user-space execution, and idle. As such, grace periods can take one good
+ * long time. There are no read-side primitives similar to rcu_read_lock()
+ * and rcu_read_unlock() because this implementation is intended to get
+ * the system into a safe state for some of the manipulations involved in
+ * tracing and the like. Finally, this implementation does not support
+ * high call_rcu_tasks() rates from multiple CPUs. If this is required,
+ * per-CPU callback lists will be needed.
+ */
+
+/* Global list of callbacks and associated lock. */
+static struct rcu_head *rcu_tasks_cbs_head;
+static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
+static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
+
+/* Track exiting tasks in order to allow them to be waited for. */
+DEFINE_SRCU(tasks_rcu_exit_srcu);
+
+/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
+module_param(rcu_task_stall_timeout, int, 0644);
+
+static void rcu_spawn_tasks_kthread(void);
+
+/*
+ * Post an RCU-tasks callback. First call must be from process context
+ * after the scheduler if fully operational.
+ */
+void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+{
+ unsigned long flags;
+ bool needwake;
+
+ rhp->next = NULL;
+ rhp->func = func;
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ needwake = !rcu_tasks_cbs_head;
+ *rcu_tasks_cbs_tail = rhp;
+ rcu_tasks_cbs_tail = &rhp->next;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+ if (needwake) {
+ rcu_spawn_tasks_kthread();
+ wake_up(&rcu_tasks_cbs_wq);
+ }
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
+
+/**
+ * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-tasks
+ * grace period has elapsed, in other words after all currently
+ * executing rcu-tasks read-side critical sections have elapsed. These
+ * read-side critical sections are delimited by calls to schedule(),
+ * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function
+ * preambles and profiling hooks. The synchronize_rcu_tasks() function
+ * is not (yet) intended for heavy use from multiple CPUs.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since the
+ * end of its last RCU-tasks read-side critical section whose beginning
+ * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
+ * having an RCU-tasks read-side critical section that extends beyond
+ * the return from synchronize_rcu_tasks() is guaranteed to have executed
+ * a full memory barrier after the beginning of synchronize_rcu_tasks()
+ * and before the beginning of that RCU-tasks read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
+ * (but again only if the system has more than one CPU).
+ */
+void synchronize_rcu_tasks(void)
+{
+ /* Complain if the scheduler has not started. */
+ rcu_lockdep_assert(!rcu_scheduler_active,
+ "synchronize_rcu_tasks called too soon");
+
+ /* Wait for the grace period. */
+ wait_rcu_gp(call_rcu_tasks);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
+
+/**
+ * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
+ *
+ * Although the current implementation is guaranteed to wait, it is not
+ * obligated to, for example, if there are no pending callbacks.
+ */
+void rcu_barrier_tasks(void)
+{
+ /* There is only one callback queue, so this is easy. ;-) */
+ synchronize_rcu_tasks();
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+
+/* See if tasks are still holding out, complain if so. */
+static void check_holdout_task(struct task_struct *t,
+ bool needreport, bool *firstreport)
+{
+ int cpu;
+
+ if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
+ !ACCESS_ONCE(t->on_rq) ||
+ (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
+ !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
+ ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+ list_del_init(&t->rcu_tasks_holdout_list);
+ put_task_struct(t);
+ return;
+ }
+ if (!needreport)
+ return;
+ if (*firstreport) {
+ pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
+ *firstreport = false;
+ }
+ cpu = task_cpu(t);
+ pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
+ t, ".I"[is_idle_task(t)],
+ "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
+ t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
+ t->rcu_tasks_idle_cpu, cpu);
+ sched_show_task(t);
+}
+
+/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+ unsigned long lastreport;
+ struct rcu_head *list;
+ struct rcu_head *next;
+ LIST_HEAD(rcu_tasks_holdouts);
+
+ /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
+ housekeeping_affine(current);
+
+ /*
+ * Each pass through the following loop makes one check for
+ * newly arrived callbacks, and, if there are some, waits for
+ * one RCU-tasks grace period and then invokes the callbacks.
+ * This loop is terminated by the system going down. ;-)
+ */
+ for (;;) {
+
+ /* Pick up any new callbacks. */
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ list = rcu_tasks_cbs_head;
+ rcu_tasks_cbs_head = NULL;
+ rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+
+ /* If there were none, wait a bit and start over. */
+ if (!list) {
+ wait_event_interruptible(rcu_tasks_cbs_wq,
+ rcu_tasks_cbs_head);
+ if (!rcu_tasks_cbs_head) {
+ WARN_ON(signal_pending(current));
+ schedule_timeout_interruptible(HZ/10);
+ }
+ continue;
+ }
+
+ /*
+ * Wait for all pre-existing t->on_rq and t->nvcsw
+ * transitions to complete. Invoking synchronize_sched()
+ * suffices because all these transitions occur with
+ * interrupts disabled. Without this synchronize_sched(),
+ * a read-side critical section that started before the
+ * grace period might be incorrectly seen as having started
+ * after the grace period.
+ *
+ * This synchronize_sched() also dispenses with the
+ * need for a memory barrier on the first store to
+ * ->rcu_tasks_holdout, as it forces the store to happen
+ * after the beginning of the grace period.
+ */
+ synchronize_sched();
+
+ /*
+ * There were callbacks, so we need to wait for an
+ * RCU-tasks grace period. Start off by scanning
+ * the task list for tasks that are not already
+ * voluntarily blocked. Mark these tasks and make
+ * a list of them in rcu_tasks_holdouts.
+ */
+ rcu_read_lock();
+ for_each_process_thread(g, t) {
+ if (t != current && ACCESS_ONCE(t->on_rq) &&
+ !is_idle_task(t)) {
+ get_task_struct(t);
+ t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
+ ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+ list_add(&t->rcu_tasks_holdout_list,
+ &rcu_tasks_holdouts);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Wait for tasks that are in the process of exiting.
+ * This does only part of the job, ensuring that all
+ * tasks that were previously exiting reach the point
+ * where they have disabled preemption, allowing the
+ * later synchronize_sched() to finish the job.
+ */
+ synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ /*
+ * Each pass through the following loop scans the list
+ * of holdout tasks, removing any that are no longer
+ * holdouts. When the list is empty, we are done.
+ */
+ lastreport = jiffies;
+ while (!list_empty(&rcu_tasks_holdouts)) {
+ bool firstreport;
+ bool needreport;
+ int rtst;
+ struct task_struct *t1;
+
+ schedule_timeout_interruptible(HZ);
+ rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+ needreport = rtst > 0 &&
+ time_after(jiffies, lastreport + rtst);
+ if (needreport)
+ lastreport = jiffies;
+ firstreport = true;
+ WARN_ON(signal_pending(current));
+ list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
+ rcu_tasks_holdout_list) {
+ check_holdout_task(t, needreport, &firstreport);
+ cond_resched();
+ }
+ }
+
+ /*
+ * Because ->on_rq and ->nvcsw are not guaranteed
+ * to have a full memory barriers prior to them in the
+ * schedule() path, memory reordering on other CPUs could
+ * cause their RCU-tasks read-side critical sections to
+ * extend past the end of the grace period. However,
+ * because these ->nvcsw updates are carried out with
+ * interrupts disabled, we can use synchronize_sched()
+ * to force the needed ordering on all such CPUs.
+ *
+ * This synchronize_sched() also confines all
+ * ->rcu_tasks_holdout accesses to be within the grace
+ * period, avoiding the need for memory barriers for
+ * ->rcu_tasks_holdout accesses.
+ *
+ * In addition, this synchronize_sched() waits for exiting
+ * tasks to complete their final preempt_disable() region
+ * of execution, cleaning up after the synchronize_srcu()
+ * above.
+ */
+ synchronize_sched();
+
+ /* Invoke the callbacks. */
+ while (list) {
+ next = list->next;
+ local_bh_disable();
+ list->func(list);
+ local_bh_enable();
+ list = next;
+ cond_resched();
+ }
+ schedule_timeout_uninterruptible(HZ/10);
+ }
+}
+
+/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
+static void rcu_spawn_tasks_kthread(void)
+{
+ static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
+ static struct task_struct *rcu_tasks_kthread_ptr;
+ struct task_struct *t;
+
+ if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+ smp_mb(); /* Ensure caller sees full kthread. */
+ return;
+ }
+ mutex_lock(&rcu_tasks_kthread_mutex);
+ if (rcu_tasks_kthread_ptr) {
+ mutex_unlock(&rcu_tasks_kthread_mutex);
+ return;
+ }
+ t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
+ BUG_ON(IS_ERR(t));
+ smp_mb(); /* Ensure others see full kthread. */
+ ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+ mutex_unlock(&rcu_tasks_kthread_mutex);
+}
+
+#endif /* #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_PROVE_RCU
+
+/*
+ * Early boot self test parameters, one for each flavor
+ */
+static bool rcu_self_test;
+static bool rcu_self_test_bh;
+static bool rcu_self_test_sched;
+
+module_param(rcu_self_test, bool, 0444);
+module_param(rcu_self_test_bh, bool, 0444);
+module_param(rcu_self_test_sched, bool, 0444);
+
+static int rcu_self_test_counter;
+
+static void test_callback(struct rcu_head *r)
+{
+ rcu_self_test_counter++;
+ pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
+}
+
+static void early_boot_test_call_rcu(void)
+{
+ static struct rcu_head head;
+
+ call_rcu(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_bh(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_bh(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_sched(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_sched(&head, test_callback);
+}
+
+void rcu_early_boot_tests(void)
+{
+ pr_info("Running RCU self tests\n");
+
+ if (rcu_self_test)
+ early_boot_test_call_rcu();
+ if (rcu_self_test_bh)
+ early_boot_test_call_rcu_bh();
+ if (rcu_self_test_sched)
+ early_boot_test_call_rcu_sched();
+}
+
+static int rcu_verify_early_boot_tests(void)
+{
+ int ret = 0;
+ int early_boot_test_counter = 0;
+
+ if (rcu_self_test) {
+ early_boot_test_counter++;
+ rcu_barrier();
+ }
+ if (rcu_self_test_bh) {
+ early_boot_test_counter++;
+ rcu_barrier_bh();
+ }
+ if (rcu_self_test_sched) {
+ early_boot_test_counter++;
+ rcu_barrier_sched();
+ }
+
+ if (rcu_self_test_counter != early_boot_test_counter) {
+ WARN_ON(1);
+ ret = -1;
+ }
+
+ return ret;
+}
+late_initcall(rcu_verify_early_boot_tests);
+#else
+void rcu_early_boot_tests(void) {}
+#endif /* CONFIG_PROVE_RCU */
diff --git a/kernel/reboot.c b/kernel/reboot.c
new file mode 100644
index 000000000..d20c85d9f
--- /dev/null
+++ b/kernel/reboot.c
@@ -0,0 +1,557 @@
+/*
+ * linux/kernel/reboot.c
+ *
+ * Copyright (C) 2013 Linus Torvalds
+ */
+
+#define pr_fmt(fmt) "reboot: " fmt
+
+#include <linux/ctype.h>
+#include <linux/export.h>
+#include <linux/kexec.h>
+#include <linux/kmod.h>
+#include <linux/kmsg_dump.h>
+#include <linux/reboot.h>
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/syscore_ops.h>
+#include <linux/uaccess.h>
+
+/*
+ * this indicates whether you can reboot with ctrl-alt-del: the default is yes
+ */
+
+int C_A_D = 1;
+struct pid *cad_pid;
+EXPORT_SYMBOL(cad_pid);
+
+#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#define DEFAULT_REBOOT_MODE = REBOOT_HARD
+#else
+#define DEFAULT_REBOOT_MODE
+#endif
+enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+
+/*
+ * This variable is used privately to keep track of whether or not
+ * reboot_type is still set to its default value (i.e., reboot= hasn't
+ * been set on the command line). This is needed so that we can
+ * suppress DMI scanning for reboot quirks. Without it, it's
+ * impossible to override a faulty reboot quirk without recompiling.
+ */
+int reboot_default = 1;
+int reboot_cpu;
+enum reboot_type reboot_type = BOOT_ACPI;
+int reboot_force;
+
+/*
+ * If set, this is used for preparing the system to power off.
+ */
+
+void (*pm_power_off_prepare)(void);
+
+/**
+ * emergency_restart - reboot the system
+ *
+ * Without shutting down any hardware or taking any locks
+ * reboot the system. This is called when we know we are in
+ * trouble so this is our best effort to reboot. This is
+ * safe to call in interrupt context.
+ */
+void emergency_restart(void)
+{
+ kmsg_dump(KMSG_DUMP_EMERG);
+ machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
+void kernel_restart_prepare(char *cmd)
+{
+ blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+ system_state = SYSTEM_RESTART;
+ usermodehelper_disable();
+ device_shutdown();
+}
+
+/**
+ * register_reboot_notifier - Register function to be called at reboot time
+ * @nb: Info about notifier function to be called
+ *
+ * Registers a function with the list of functions
+ * to be called at reboot time.
+ *
+ * Currently always returns zero, as blocking_notifier_chain_register()
+ * always returns zero.
+ */
+int register_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(register_reboot_notifier);
+
+/**
+ * unregister_reboot_notifier - Unregister previously registered reboot notifier
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered reboot
+ * notifier function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_reboot_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
+}
+EXPORT_SYMBOL(unregister_reboot_notifier);
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * to restart the system.
+ */
+static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
+
+/**
+ * register_restart_handler - Register function to be called to reset
+ * the system
+ * @nb: Info about handler function to be called
+ * @nb->priority: Handler priority. Handlers should follow the
+ * following guidelines for setting priorities.
+ * 0: Restart handler of last resort,
+ * with limited restart capabilities
+ * 128: Default restart handler; use if no other
+ * restart handler is expected to be available,
+ * and/or if restart functionality is
+ * sufficient to restart the entire system
+ * 255: Highest priority restart handler, will
+ * preempt all other restart handlers
+ *
+ * Registers a function with code to be called to restart the
+ * system.
+ *
+ * Registered functions will be called from machine_restart as last
+ * step of the restart sequence (if the architecture specific
+ * machine_restart function calls do_kernel_restart - see below
+ * for details).
+ * Registered functions are expected to restart the system immediately.
+ * If more than one function is registered, the restart handler priority
+ * selects which function will be called first.
+ *
+ * Restart handlers are expected to be registered from non-architecture
+ * code, typically from drivers. A typical use case would be a system
+ * where restart functionality is provided through a watchdog. Multiple
+ * restart handlers may exist; for example, one restart handler might
+ * restart the entire system, while another only restarts the CPU.
+ * In such cases, the restart handler which only restarts part of the
+ * hardware is expected to register with low priority to ensure that
+ * it only runs if no other means to restart the system is available.
+ *
+ * Currently always returns zero, as atomic_notifier_chain_register()
+ * always returns zero.
+ */
+int register_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_restart_handler);
+
+/**
+ * unregister_restart_handler - Unregister previously registered
+ * restart handler
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered restart handler function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_restart_handler);
+
+/**
+ * do_kernel_restart - Execute kernel restart handler call chain
+ *
+ * Calls functions registered with register_restart_handler.
+ *
+ * Expected to be called from machine_restart as last step of the restart
+ * sequence.
+ *
+ * Restarts the system immediately if a restart handler function has been
+ * registered. Otherwise does nothing.
+ */
+void do_kernel_restart(char *cmd)
+{
+ atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
+}
+
+void migrate_to_reboot_cpu(void)
+{
+ /* The boot cpu is always logical cpu 0 */
+ int cpu = reboot_cpu;
+
+ cpu_hotplug_disable();
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_online(cpu))
+ cpu = cpumask_first(cpu_online_mask);
+
+ /* Prevent races with other tasks migrating this task */
+ current->flags |= PF_NO_SETAFFINITY;
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
+/**
+ * kernel_restart - reboot the system
+ * @cmd: pointer to buffer containing command to execute for restart
+ * or %NULL
+ *
+ * Shutdown everything and perform a clean reboot.
+ * This is not safe to call in interrupt context.
+ */
+void kernel_restart(char *cmd)
+{
+ kernel_restart_prepare(cmd);
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
+ if (!cmd)
+ pr_emerg("Restarting system\n");
+ else
+ pr_emerg("Restarting system with command '%s'\n", cmd);
+ kmsg_dump(KMSG_DUMP_RESTART);
+ machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+static void kernel_shutdown_prepare(enum system_states state)
+{
+ blocking_notifier_call_chain(&reboot_notifier_list,
+ (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
+ system_state = state;
+ usermodehelper_disable();
+ device_shutdown();
+}
+/**
+ * kernel_halt - halt the system
+ *
+ * Shutdown everything and perform a clean system halt.
+ */
+void kernel_halt(void)
+{
+ kernel_shutdown_prepare(SYSTEM_HALT);
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
+ pr_emerg("System halted\n");
+ kmsg_dump(KMSG_DUMP_HALT);
+ machine_halt();
+}
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+/**
+ * kernel_power_off - power_off the system
+ *
+ * Shutdown everything and perform a clean system power_off.
+ */
+void kernel_power_off(void)
+{
+ kernel_shutdown_prepare(SYSTEM_POWER_OFF);
+ if (pm_power_off_prepare)
+ pm_power_off_prepare();
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
+ pr_emerg("Power down\n");
+ kmsg_dump(KMSG_DUMP_POWEROFF);
+ machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
+
+static DEFINE_MUTEX(reboot_mutex);
+
+/*
+ * Reboot system call: for obvious reasons only root may call it,
+ * and even root needs to set up some magic numbers in the registers
+ * so that some mistake won't make this reboot the whole machine.
+ * You can also set the meaning of the ctrl-alt-del-key here.
+ *
+ * reboot doesn't sync: do that yourself before calling this.
+ */
+SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
+ void __user *, arg)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(current);
+ char buffer[256];
+ int ret = 0;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
+ return -EPERM;
+
+ /* For safety, we require "magic" arguments. */
+ if (magic1 != LINUX_REBOOT_MAGIC1 ||
+ (magic2 != LINUX_REBOOT_MAGIC2 &&
+ magic2 != LINUX_REBOOT_MAGIC2A &&
+ magic2 != LINUX_REBOOT_MAGIC2B &&
+ magic2 != LINUX_REBOOT_MAGIC2C))
+ return -EINVAL;
+
+ /*
+ * If pid namespaces are enabled and the current task is in a child
+ * pid_namespace, the command is handled by reboot_pid_ns() which will
+ * call do_exit().
+ */
+ ret = reboot_pid_ns(pid_ns, cmd);
+ if (ret)
+ return ret;
+
+ /* Instead of trying to make the power_off code look like
+ * halt when pm_power_off is not set do it the easy way.
+ */
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+ cmd = LINUX_REBOOT_CMD_HALT;
+
+ mutex_lock(&reboot_mutex);
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART:
+ kernel_restart(NULL);
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_ON:
+ C_A_D = 1;
+ break;
+
+ case LINUX_REBOOT_CMD_CAD_OFF:
+ C_A_D = 0;
+ break;
+
+ case LINUX_REBOOT_CMD_HALT:
+ kernel_halt();
+ do_exit(0);
+ panic("cannot halt");
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ kernel_power_off();
+ do_exit(0);
+ break;
+
+ case LINUX_REBOOT_CMD_RESTART2:
+ ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
+ if (ret < 0) {
+ ret = -EFAULT;
+ break;
+ }
+ buffer[sizeof(buffer) - 1] = '\0';
+
+ kernel_restart(buffer);
+ break;
+
+#ifdef CONFIG_KEXEC
+ case LINUX_REBOOT_CMD_KEXEC:
+ ret = kernel_kexec();
+ break;
+#endif
+
+#ifdef CONFIG_HIBERNATION
+ case LINUX_REBOOT_CMD_SW_SUSPEND:
+ ret = hibernate();
+ break;
+#endif
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ mutex_unlock(&reboot_mutex);
+ return ret;
+}
+
+static void deferred_cad(struct work_struct *dummy)
+{
+ kernel_restart(NULL);
+}
+
+/*
+ * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
+ * As it's called within an interrupt, it may NOT sync: the only choice
+ * is whether to reboot at once, or just ignore the ctrl-alt-del.
+ */
+void ctrl_alt_del(void)
+{
+ static DECLARE_WORK(cad_work, deferred_cad);
+
+ if (C_A_D)
+ schedule_work(&cad_work);
+ else
+ kill_cad_pid(SIGINT, 1);
+}
+
+char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+static const char reboot_cmd[] = "/sbin/reboot";
+
+static int run_cmd(const char *cmd)
+{
+ char **argv;
+ static char *envp[] = {
+ "HOME=/",
+ "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+ NULL
+ };
+ int ret;
+ argv = argv_split(GFP_KERNEL, cmd, NULL);
+ if (argv) {
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ argv_free(argv);
+ } else {
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static int __orderly_reboot(void)
+{
+ int ret;
+
+ ret = run_cmd(reboot_cmd);
+
+ if (ret) {
+ pr_warn("Failed to start orderly reboot: forcing the issue\n");
+ emergency_sync();
+ kernel_restart(NULL);
+ }
+
+ return ret;
+}
+
+static int __orderly_poweroff(bool force)
+{
+ int ret;
+
+ ret = run_cmd(poweroff_cmd);
+
+ if (ret && force) {
+ pr_warn("Failed to start orderly shutdown: forcing the issue\n");
+
+ /*
+ * I guess this should try to kick off some daemon to sync and
+ * poweroff asap. Or not even bother syncing if we're doing an
+ * emergency shutdown?
+ */
+ emergency_sync();
+ kernel_power_off();
+ }
+
+ return ret;
+}
+
+static bool poweroff_force;
+
+static void poweroff_work_func(struct work_struct *work)
+{
+ __orderly_poweroff(poweroff_force);
+}
+
+static DECLARE_WORK(poweroff_work, poweroff_work_func);
+
+/**
+ * orderly_poweroff - Trigger an orderly system poweroff
+ * @force: force poweroff if command execution fails
+ *
+ * This may be called from any context to trigger a system shutdown.
+ * If the orderly shutdown fails, it will force an immediate shutdown.
+ */
+void orderly_poweroff(bool force)
+{
+ if (force) /* do not override the pending "true" */
+ poweroff_force = true;
+ schedule_work(&poweroff_work);
+}
+EXPORT_SYMBOL_GPL(orderly_poweroff);
+
+static void reboot_work_func(struct work_struct *work)
+{
+ __orderly_reboot();
+}
+
+static DECLARE_WORK(reboot_work, reboot_work_func);
+
+/**
+ * orderly_reboot - Trigger an orderly system reboot
+ *
+ * This may be called from any context to trigger a system reboot.
+ * If the orderly reboot fails, it will force an immediate reboot.
+ */
+void orderly_reboot(void)
+{
+ schedule_work(&reboot_work);
+}
+EXPORT_SYMBOL_GPL(orderly_reboot);
+
+static int __init reboot_setup(char *str)
+{
+ for (;;) {
+ /*
+ * Having anything passed on the command line via
+ * reboot= will cause us to disable DMI checking
+ * below.
+ */
+ reboot_default = 0;
+
+ switch (*str) {
+ case 'w':
+ reboot_mode = REBOOT_WARM;
+ break;
+
+ case 'c':
+ reboot_mode = REBOOT_COLD;
+ break;
+
+ case 'h':
+ reboot_mode = REBOOT_HARD;
+ break;
+
+ case 's':
+ {
+ int rc;
+
+ if (isdigit(*(str+1))) {
+ rc = kstrtoint(str+1, 0, &reboot_cpu);
+ if (rc)
+ return rc;
+ } else if (str[1] == 'm' && str[2] == 'p' &&
+ isdigit(*(str+3))) {
+ rc = kstrtoint(str+3, 0, &reboot_cpu);
+ if (rc)
+ return rc;
+ } else
+ reboot_mode = REBOOT_SOFT;
+ break;
+ }
+ case 'g':
+ reboot_mode = REBOOT_GPIO;
+ break;
+
+ case 'b':
+ case 'a':
+ case 'k':
+ case 't':
+ case 'e':
+ case 'p':
+ reboot_type = *str;
+ break;
+
+ case 'f':
+ reboot_force = 1;
+ break;
+ }
+
+ str = strchr(str, ',');
+ if (str)
+ str++;
+ else
+ break;
+ }
+ return 1;
+}
+__setup("reboot=", reboot_setup);
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 000000000..e9dbaeb8f
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,1360 @@
+/*
+ * Public API and common code for kernel->userspace relay file support.
+ *
+ * See Documentation/filesystems/relay.txt for an overview.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * Moved to kernel/relay.c by Paul Mundt, 2006.
+ * November 2006 - CPU hotplug support by Mathieu Desnoyers
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/relay.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/splice.h>
+
+/* list of open channels, for cpu hotplug */
+static DEFINE_MUTEX(relay_channels_mutex);
+static LIST_HEAD(relay_channels);
+
+/*
+ * close() vm_op implementation for relay file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+ struct rchan_buf *buf = vma->vm_private_data;
+ buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * fault() vm_op implementation for relay file mapping.
+ */
+static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page;
+ struct rchan_buf *buf = vma->vm_private_data;
+ pgoff_t pgoff = vmf->pgoff;
+
+ if (!buf)
+ return VM_FAULT_OOM;
+
+ page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT));
+ if (!page)
+ return VM_FAULT_SIGBUS;
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static const struct vm_operations_struct relay_file_mmap_ops = {
+ .fault = relay_buf_fault,
+ .close = relay_file_mmap_close,
+};
+
+/*
+ * allocate an array of pointers of struct page
+ */
+static struct page **relay_alloc_page_array(unsigned int n_pages)
+{
+ const size_t pa_size = n_pages * sizeof(struct page *);
+ if (pa_size > PAGE_SIZE)
+ return vzalloc(pa_size);
+ return kzalloc(pa_size, GFP_KERNEL);
+}
+
+/*
+ * free an array of pointers of struct page
+ */
+static void relay_free_page_array(struct page **array)
+{
+ if (is_vmalloc_addr(array))
+ vfree(array);
+ else
+ kfree(array);
+}
+
+/**
+ * relay_mmap_buf: - mmap channel buffer to process address space
+ * @buf: relay channel buffer
+ * @vma: vm_area_struct describing memory to be mapped
+ *
+ * Returns 0 if ok, negative on error
+ *
+ * Caller should already have grabbed mmap_sem.
+ */
+static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+ unsigned long length = vma->vm_end - vma->vm_start;
+ struct file *filp = vma->vm_file;
+
+ if (!buf)
+ return -EBADF;
+
+ if (length != (unsigned long)buf->chan->alloc_size)
+ return -EINVAL;
+
+ vma->vm_ops = &relay_file_mmap_ops;
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_private_data = buf;
+ buf->chan->cb->buf_mapped(buf, filp);
+
+ return 0;
+}
+
+/**
+ * relay_alloc_buf - allocate a channel buffer
+ * @buf: the buffer struct
+ * @size: total size of the buffer
+ *
+ * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
+ * passed in size will get page aligned, if it isn't already.
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
+{
+ void *mem;
+ unsigned int i, j, n_pages;
+
+ *size = PAGE_ALIGN(*size);
+ n_pages = *size >> PAGE_SHIFT;
+
+ buf->page_array = relay_alloc_page_array(n_pages);
+ if (!buf->page_array)
+ return NULL;
+
+ for (i = 0; i < n_pages; i++) {
+ buf->page_array[i] = alloc_page(GFP_KERNEL);
+ if (unlikely(!buf->page_array[i]))
+ goto depopulate;
+ set_page_private(buf->page_array[i], (unsigned long)buf);
+ }
+ mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
+ if (!mem)
+ goto depopulate;
+
+ memset(mem, 0, *size);
+ buf->page_count = n_pages;
+ return mem;
+
+depopulate:
+ for (j = 0; j < i; j++)
+ __free_page(buf->page_array[j]);
+ relay_free_page_array(buf->page_array);
+ return NULL;
+}
+
+/**
+ * relay_create_buf - allocate and initialize a channel buffer
+ * @chan: the relay channel
+ *
+ * Returns channel buffer if successful, %NULL otherwise.
+ */
+static struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+ struct rchan_buf *buf;
+
+ if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
+ return NULL;
+
+ buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+ buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+ if (!buf->padding)
+ goto free_buf;
+
+ buf->start = relay_alloc_buf(buf, &chan->alloc_size);
+ if (!buf->start)
+ goto free_buf;
+
+ buf->chan = chan;
+ kref_get(&buf->chan->kref);
+ return buf;
+
+free_buf:
+ kfree(buf->padding);
+ kfree(buf);
+ return NULL;
+}
+
+/**
+ * relay_destroy_channel - free the channel struct
+ * @kref: target kernel reference that contains the relay channel
+ *
+ * Should only be called from kref_put().
+ */
+static void relay_destroy_channel(struct kref *kref)
+{
+ struct rchan *chan = container_of(kref, struct rchan, kref);
+ kfree(chan);
+}
+
+/**
+ * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ * @buf: the buffer struct
+ */
+static void relay_destroy_buf(struct rchan_buf *buf)
+{
+ struct rchan *chan = buf->chan;
+ unsigned int i;
+
+ if (likely(buf->start)) {
+ vunmap(buf->start);
+ for (i = 0; i < buf->page_count; i++)
+ __free_page(buf->page_array[i]);
+ relay_free_page_array(buf->page_array);
+ }
+ chan->buf[buf->cpu] = NULL;
+ kfree(buf->padding);
+ kfree(buf);
+ kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ * relay_remove_buf - remove a channel buffer
+ * @kref: target kernel reference that contains the relay buffer
+ *
+ * Removes the file from the filesystem, which also frees the
+ * rchan_buf_struct and the channel buffer. Should only be called from
+ * kref_put().
+ */
+static void relay_remove_buf(struct kref *kref)
+{
+ struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+ relay_destroy_buf(buf);
+}
+
+/**
+ * relay_buf_empty - boolean, is the channel buffer empty?
+ * @buf: channel buffer
+ *
+ * Returns 1 if the buffer is empty, 0 otherwise.
+ */
+static int relay_buf_empty(struct rchan_buf *buf)
+{
+ return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+
+/**
+ * relay_buf_full - boolean, is the channel buffer full?
+ * @buf: channel buffer
+ *
+ * Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+ size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+ return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(relay_buf_full);
+
+/*
+ * High-level relay kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior. Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback. Does nothing.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+ void *subbuf,
+ void *prev_subbuf,
+ size_t prev_padding)
+{
+ if (relay_buf_full(buf))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * buf_mapped() default callback. Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+ struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback. Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+ struct file *filp)
+{
+}
+
+/*
+ * create_buf_file_create() default callback. Does nothing.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+ struct dentry *parent,
+ umode_t mode,
+ struct rchan_buf *buf,
+ int *is_global)
+{
+ return NULL;
+}
+
+/*
+ * remove_buf_file() default callback. Does nothing.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+ return -EINVAL;
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+ .subbuf_start = subbuf_start_default_callback,
+ .buf_mapped = buf_mapped_default_callback,
+ .buf_unmapped = buf_unmapped_default_callback,
+ .create_buf_file = create_buf_file_default_callback,
+ .remove_buf_file = remove_buf_file_default_callback,
+};
+
+/**
+ * wakeup_readers - wake up readers waiting on a channel
+ * @data: contains the channel buffer
+ *
+ * This is the timer function used to defer reader waking.
+ */
+static void wakeup_readers(unsigned long data)
+{
+ struct rchan_buf *buf = (struct rchan_buf *)data;
+ wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ * __relay_reset - reset a channel buffer
+ * @buf: the channel buffer
+ * @init: 1 if this is a first-time initialization
+ *
+ * See relay_reset() for description of effect.
+ */
+static void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+ size_t i;
+
+ if (init) {
+ init_waitqueue_head(&buf->read_wait);
+ kref_init(&buf->kref);
+ setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ } else
+ del_timer_sync(&buf->timer);
+
+ buf->subbufs_produced = 0;
+ buf->subbufs_consumed = 0;
+ buf->bytes_consumed = 0;
+ buf->finalized = 0;
+ buf->data = buf->start;
+ buf->offset = 0;
+
+ for (i = 0; i < buf->chan->n_subbufs; i++)
+ buf->padding[i] = 0;
+
+ buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ * relay_reset - reset the channel
+ * @chan: the channel
+ *
+ * This has the effect of erasing all data from all channel buffers
+ * and restarting the channel in its initial state. The buffers
+ * are not freed, so any mappings are still in effect.
+ *
+ * NOTE. Care should be taken that the channel isn't actually
+ * being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+ unsigned int i;
+
+ if (!chan)
+ return;
+
+ if (chan->is_global && chan->buf[0]) {
+ __relay_reset(chan->buf[0], 0);
+ return;
+ }
+
+ mutex_lock(&relay_channels_mutex);
+ for_each_possible_cpu(i)
+ if (chan->buf[i])
+ __relay_reset(chan->buf[i], 0);
+ mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_reset);
+
+static inline void relay_set_buf_dentry(struct rchan_buf *buf,
+ struct dentry *dentry)
+{
+ buf->dentry = dentry;
+ d_inode(buf->dentry)->i_size = buf->early_bytes;
+}
+
+static struct dentry *relay_create_buf_file(struct rchan *chan,
+ struct rchan_buf *buf,
+ unsigned int cpu)
+{
+ struct dentry *dentry;
+ char *tmpname;
+
+ tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!tmpname)
+ return NULL;
+ snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
+
+ /* Create file in fs */
+ dentry = chan->cb->create_buf_file(tmpname, chan->parent,
+ S_IRUSR, buf,
+ &chan->is_global);
+
+ kfree(tmpname);
+
+ return dentry;
+}
+
+/*
+ * relay_open_buf - create a new relay channel buffer
+ *
+ * used by relay_open() and CPU hotplug.
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
+{
+ struct rchan_buf *buf = NULL;
+ struct dentry *dentry;
+
+ if (chan->is_global)
+ return chan->buf[0];
+
+ buf = relay_create_buf(chan);
+ if (!buf)
+ return NULL;
+
+ if (chan->has_base_filename) {
+ dentry = relay_create_buf_file(chan, buf, cpu);
+ if (!dentry)
+ goto free_buf;
+ relay_set_buf_dentry(buf, dentry);
+ }
+
+ buf->cpu = cpu;
+ __relay_reset(buf, 1);
+
+ if(chan->is_global) {
+ chan->buf[0] = buf;
+ buf->cpu = 0;
+ }
+
+ return buf;
+
+free_buf:
+ relay_destroy_buf(buf);
+ return NULL;
+}
+
+/**
+ * relay_close_buf - close a channel buffer
+ * @buf: channel buffer
+ *
+ * Marks the buffer finalized and restores the default callbacks.
+ * The channel buffer and channel buffer data structure are then freed
+ * automatically when the last reference is given up.
+ */
+static void relay_close_buf(struct rchan_buf *buf)
+{
+ buf->finalized = 1;
+ del_timer_sync(&buf->timer);
+ buf->chan->cb->remove_buf_file(buf->dentry);
+ kref_put(&buf->kref, relay_remove_buf);
+}
+
+static void setup_callbacks(struct rchan *chan,
+ struct rchan_callbacks *cb)
+{
+ if (!cb) {
+ chan->cb = &default_channel_callbacks;
+ return;
+ }
+
+ if (!cb->subbuf_start)
+ cb->subbuf_start = subbuf_start_default_callback;
+ if (!cb->buf_mapped)
+ cb->buf_mapped = buf_mapped_default_callback;
+ if (!cb->buf_unmapped)
+ cb->buf_unmapped = buf_unmapped_default_callback;
+ if (!cb->create_buf_file)
+ cb->create_buf_file = create_buf_file_default_callback;
+ if (!cb->remove_buf_file)
+ cb->remove_buf_file = remove_buf_file_default_callback;
+ chan->cb = cb;
+}
+
+/**
+ * relay_hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
+ */
+static int relay_hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+ struct rchan *chan;
+
+ switch(action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ mutex_lock(&relay_channels_mutex);
+ list_for_each_entry(chan, &relay_channels, list) {
+ if (chan->buf[hotcpu])
+ continue;
+ chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
+ if(!chan->buf[hotcpu]) {
+ printk(KERN_ERR
+ "relay_hotcpu_callback: cpu %d buffer "
+ "creation failed\n", hotcpu);
+ mutex_unlock(&relay_channels_mutex);
+ return notifier_from_errno(-ENOMEM);
+ }
+ }
+ mutex_unlock(&relay_channels_mutex);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ /* No need to flush the cpu : will be flushed upon
+ * final relay_flush() call. */
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/**
+ * relay_open - create a new relay channel
+ * @base_filename: base name of files to create, %NULL for buffering only
+ * @parent: dentry of parent directory, %NULL for root directory or buffer
+ * @subbuf_size: size of sub-buffers
+ * @n_subbufs: number of sub-buffers
+ * @cb: client callback functions
+ * @private_data: user-defined data
+ *
+ * Returns channel pointer if successful, %NULL otherwise.
+ *
+ * Creates a channel buffer for each cpu using the sizes and
+ * attributes specified. The created channel buffer files
+ * will be named base_filename0...base_filenameN-1. File
+ * permissions will be %S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+ struct dentry *parent,
+ size_t subbuf_size,
+ size_t n_subbufs,
+ struct rchan_callbacks *cb,
+ void *private_data)
+{
+ unsigned int i;
+ struct rchan *chan;
+
+ if (!(subbuf_size && n_subbufs))
+ return NULL;
+ if (subbuf_size > UINT_MAX / n_subbufs)
+ return NULL;
+
+ chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
+ if (!chan)
+ return NULL;
+
+ chan->version = RELAYFS_CHANNEL_VERSION;
+ chan->n_subbufs = n_subbufs;
+ chan->subbuf_size = subbuf_size;
+ chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
+ chan->parent = parent;
+ chan->private_data = private_data;
+ if (base_filename) {
+ chan->has_base_filename = 1;
+ strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ }
+ setup_callbacks(chan, cb);
+ kref_init(&chan->kref);
+
+ mutex_lock(&relay_channels_mutex);
+ for_each_online_cpu(i) {
+ chan->buf[i] = relay_open_buf(chan, i);
+ if (!chan->buf[i])
+ goto free_bufs;
+ }
+ list_add(&chan->list, &relay_channels);
+ mutex_unlock(&relay_channels_mutex);
+
+ return chan;
+
+free_bufs:
+ for_each_possible_cpu(i) {
+ if (chan->buf[i])
+ relay_close_buf(chan->buf[i]);
+ }
+
+ kref_put(&chan->kref, relay_destroy_channel);
+ mutex_unlock(&relay_channels_mutex);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(relay_open);
+
+struct rchan_percpu_buf_dispatcher {
+ struct rchan_buf *buf;
+ struct dentry *dentry;
+};
+
+/* Called in atomic context. */
+static void __relay_set_buf_dentry(void *info)
+{
+ struct rchan_percpu_buf_dispatcher *p = info;
+
+ relay_set_buf_dentry(p->buf, p->dentry);
+}
+
+/**
+ * relay_late_setup_files - triggers file creation
+ * @chan: channel to operate on
+ * @base_filename: base name of files to create
+ * @parent: dentry of parent directory, %NULL for root directory
+ *
+ * Returns 0 if successful, non-zero otherwise.
+ *
+ * Use to setup files for a previously buffer-only channel.
+ * Useful to do early tracing in kernel, before VFS is up, for example.
+ */
+int relay_late_setup_files(struct rchan *chan,
+ const char *base_filename,
+ struct dentry *parent)
+{
+ int err = 0;
+ unsigned int i, curr_cpu;
+ unsigned long flags;
+ struct dentry *dentry;
+ struct rchan_percpu_buf_dispatcher disp;
+
+ if (!chan || !base_filename)
+ return -EINVAL;
+
+ strlcpy(chan->base_filename, base_filename, NAME_MAX);
+
+ mutex_lock(&relay_channels_mutex);
+ /* Is chan already set up? */
+ if (unlikely(chan->has_base_filename)) {
+ mutex_unlock(&relay_channels_mutex);
+ return -EEXIST;
+ }
+ chan->has_base_filename = 1;
+ chan->parent = parent;
+ curr_cpu = get_cpu();
+ /*
+ * The CPU hotplug notifier ran before us and created buffers with
+ * no files associated. So it's safe to call relay_setup_buf_file()
+ * on all currently online CPUs.
+ */
+ for_each_online_cpu(i) {
+ if (unlikely(!chan->buf[i])) {
+ WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
+ err = -EINVAL;
+ break;
+ }
+
+ dentry = relay_create_buf_file(chan, chan->buf[i], i);
+ if (unlikely(!dentry)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (curr_cpu == i) {
+ local_irq_save(flags);
+ relay_set_buf_dentry(chan->buf[i], dentry);
+ local_irq_restore(flags);
+ } else {
+ disp.buf = chan->buf[i];
+ disp.dentry = dentry;
+ smp_mb();
+ /* relay_channels_mutex must be held, so wait. */
+ err = smp_call_function_single(i,
+ __relay_set_buf_dentry,
+ &disp, 1);
+ }
+ if (unlikely(err))
+ break;
+ }
+ put_cpu();
+ mutex_unlock(&relay_channels_mutex);
+
+ return err;
+}
+
+/**
+ * relay_switch_subbuf - switch to a new sub-buffer
+ * @buf: channel buffer
+ * @length: size of current event
+ *
+ * Returns either the length passed in or 0 if full.
+ *
+ * Performs sub-buffer-switch tasks such as invoking callbacks,
+ * updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+ void *old, *new;
+ size_t old_subbuf, new_subbuf;
+
+ if (unlikely(length > buf->chan->subbuf_size))
+ goto toobig;
+
+ if (buf->offset != buf->chan->subbuf_size + 1) {
+ buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+ old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+ buf->padding[old_subbuf] = buf->prev_padding;
+ buf->subbufs_produced++;
+ if (buf->dentry)
+ d_inode(buf->dentry)->i_size +=
+ buf->chan->subbuf_size -
+ buf->padding[old_subbuf];
+ else
+ buf->early_bytes += buf->chan->subbuf_size -
+ buf->padding[old_subbuf];
+ smp_mb();
+ if (waitqueue_active(&buf->read_wait))
+ /*
+ * Calling wake_up_interruptible() from here
+ * will deadlock if we happen to be logging
+ * from the scheduler (trying to re-grab
+ * rq->lock), so defer it.
+ */
+ mod_timer(&buf->timer, jiffies + 1);
+ }
+
+ old = buf->data;
+ new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+ new = buf->start + new_subbuf * buf->chan->subbuf_size;
+ buf->offset = 0;
+ if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+ buf->offset = buf->chan->subbuf_size + 1;
+ return 0;
+ }
+ buf->data = new;
+ buf->padding[new_subbuf] = 0;
+
+ if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+ goto toobig;
+
+ return length;
+
+toobig:
+ buf->chan->last_toobig = length;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+
+/**
+ * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ * @chan: the channel
+ * @cpu: the cpu associated with the channel buffer to update
+ * @subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ * Adds to the channel buffer's consumed sub-buffer count.
+ * subbufs_consumed should be the number of sub-buffers newly consumed,
+ * not the total consumed.
+ *
+ * NOTE. Kernel clients don't need to call this function if the channel
+ * mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+ unsigned int cpu,
+ size_t subbufs_consumed)
+{
+ struct rchan_buf *buf;
+
+ if (!chan)
+ return;
+
+ if (cpu >= NR_CPUS || !chan->buf[cpu] ||
+ subbufs_consumed > chan->n_subbufs)
+ return;
+
+ buf = chan->buf[cpu];
+ if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
+ buf->subbufs_consumed = buf->subbufs_produced;
+ else
+ buf->subbufs_consumed += subbufs_consumed;
+}
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+
+/**
+ * relay_close - close the channel
+ * @chan: the channel
+ *
+ * Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+ unsigned int i;
+
+ if (!chan)
+ return;
+
+ mutex_lock(&relay_channels_mutex);
+ if (chan->is_global && chan->buf[0])
+ relay_close_buf(chan->buf[0]);
+ else
+ for_each_possible_cpu(i)
+ if (chan->buf[i])
+ relay_close_buf(chan->buf[i]);
+
+ if (chan->last_toobig)
+ printk(KERN_WARNING "relay: one or more items not logged "
+ "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+ chan->last_toobig, chan->subbuf_size);
+
+ list_del(&chan->list);
+ kref_put(&chan->kref, relay_destroy_channel);
+ mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_close);
+
+/**
+ * relay_flush - close the channel
+ * @chan: the channel
+ *
+ * Flushes all channel buffers, i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+ unsigned int i;
+
+ if (!chan)
+ return;
+
+ if (chan->is_global && chan->buf[0]) {
+ relay_switch_subbuf(chan->buf[0], 0);
+ return;
+ }
+
+ mutex_lock(&relay_channels_mutex);
+ for_each_possible_cpu(i)
+ if (chan->buf[i])
+ relay_switch_subbuf(chan->buf[i], 0);
+ mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_flush);
+
+/**
+ * relay_file_open - open file op for relay files
+ * @inode: the inode
+ * @filp: the file
+ *
+ * Increments the channel buffer refcount.
+ */
+static int relay_file_open(struct inode *inode, struct file *filp)
+{
+ struct rchan_buf *buf = inode->i_private;
+ kref_get(&buf->kref);
+ filp->private_data = buf;
+
+ return nonseekable_open(inode, filp);
+}
+
+/**
+ * relay_file_mmap - mmap file op for relay files
+ * @filp: the file
+ * @vma: the vma describing what to map
+ *
+ * Calls upon relay_mmap_buf() to map the file into user space.
+ */
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct rchan_buf *buf = filp->private_data;
+ return relay_mmap_buf(buf, vma);
+}
+
+/**
+ * relay_file_poll - poll file op for relay files
+ * @filp: the file
+ * @wait: poll table
+ *
+ * Poll implemention.
+ */
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+{
+ unsigned int mask = 0;
+ struct rchan_buf *buf = filp->private_data;
+
+ if (buf->finalized)
+ return POLLERR;
+
+ if (filp->f_mode & FMODE_READ) {
+ poll_wait(filp, &buf->read_wait, wait);
+ if (!relay_buf_empty(buf))
+ mask |= POLLIN | POLLRDNORM;
+ }
+
+ return mask;
+}
+
+/**
+ * relay_file_release - release file op for relay files
+ * @inode: the inode
+ * @filp: the file
+ *
+ * Decrements the channel refcount, as the filesystem is
+ * no longer using it.
+ */
+static int relay_file_release(struct inode *inode, struct file *filp)
+{
+ struct rchan_buf *buf = filp->private_data;
+ kref_put(&buf->kref, relay_remove_buf);
+
+ return 0;
+}
+
+/*
+ * relay_file_read_consume - update the consumed count for the buffer
+ */
+static void relay_file_read_consume(struct rchan_buf *buf,
+ size_t read_pos,
+ size_t bytes_consumed)
+{
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t read_subbuf;
+
+ if (buf->subbufs_produced == buf->subbufs_consumed &&
+ buf->offset == buf->bytes_consumed)
+ return;
+
+ if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+ relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+ buf->bytes_consumed = 0;
+ }
+
+ buf->bytes_consumed += bytes_consumed;
+ if (!read_pos)
+ read_subbuf = buf->subbufs_consumed % n_subbufs;
+ else
+ read_subbuf = read_pos / buf->chan->subbuf_size;
+ if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+ if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+ (buf->offset == subbuf_size))
+ return;
+ relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+ buf->bytes_consumed = 0;
+ }
+}
+
+/*
+ * relay_file_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t produced = buf->subbufs_produced;
+ size_t consumed = buf->subbufs_consumed;
+
+ relay_file_read_consume(buf, read_pos, 0);
+
+ consumed = buf->subbufs_consumed;
+
+ if (unlikely(buf->offset > subbuf_size)) {
+ if (produced == consumed)
+ return 0;
+ return 1;
+ }
+
+ if (unlikely(produced - consumed >= n_subbufs)) {
+ consumed = produced - n_subbufs + 1;
+ buf->subbufs_consumed = consumed;
+ buf->bytes_consumed = 0;
+ }
+
+ produced = (produced % n_subbufs) * subbuf_size + buf->offset;
+ consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
+
+ if (consumed > produced)
+ produced += n_subbufs * subbuf_size;
+
+ if (consumed == produced) {
+ if (buf->offset == subbuf_size &&
+ buf->subbufs_produced > buf->subbufs_consumed)
+ return 1;
+ return 0;
+ }
+
+ return 1;
+}
+
+/**
+ * relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ */
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+ struct rchan_buf *buf)
+{
+ size_t padding, avail = 0;
+ size_t read_subbuf, read_offset, write_subbuf, write_offset;
+ size_t subbuf_size = buf->chan->subbuf_size;
+
+ write_subbuf = (buf->data - buf->start) / subbuf_size;
+ write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+ read_subbuf = read_pos / subbuf_size;
+ read_offset = read_pos % subbuf_size;
+ padding = buf->padding[read_subbuf];
+
+ if (read_subbuf == write_subbuf) {
+ if (read_offset + padding < write_offset)
+ avail = write_offset - (read_offset + padding);
+ } else
+ avail = (subbuf_size - padding) - read_offset;
+
+ return avail;
+}
+
+/**
+ * relay_file_read_start_pos - find the first available byte to read
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ *
+ * If the @read_pos is in the middle of padding, return the
+ * position of the first actually available byte, otherwise
+ * return the original value.
+ */
+static size_t relay_file_read_start_pos(size_t read_pos,
+ struct rchan_buf *buf)
+{
+ size_t read_subbuf, padding, padding_start, padding_end;
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+ size_t consumed = buf->subbufs_consumed % n_subbufs;
+
+ if (!read_pos)
+ read_pos = consumed * subbuf_size + buf->bytes_consumed;
+ read_subbuf = read_pos / subbuf_size;
+ padding = buf->padding[read_subbuf];
+ padding_start = (read_subbuf + 1) * subbuf_size - padding;
+ padding_end = (read_subbuf + 1) * subbuf_size;
+ if (read_pos >= padding_start && read_pos < padding_end) {
+ read_subbuf = (read_subbuf + 1) % n_subbufs;
+ read_pos = read_subbuf * subbuf_size;
+ }
+
+ return read_pos;
+}
+
+/**
+ * relay_file_read_end_pos - return the new read position
+ * @read_pos: file read position
+ * @buf: relay channel buffer
+ * @count: number of bytes to be read
+ */
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+ size_t read_pos,
+ size_t count)
+{
+ size_t read_subbuf, padding, end_pos;
+ size_t subbuf_size = buf->chan->subbuf_size;
+ size_t n_subbufs = buf->chan->n_subbufs;
+
+ read_subbuf = read_pos / subbuf_size;
+ padding = buf->padding[read_subbuf];
+ if (read_pos % subbuf_size + count + padding == subbuf_size)
+ end_pos = (read_subbuf + 1) * subbuf_size;
+ else
+ end_pos = read_pos + count;
+ if (end_pos >= subbuf_size * n_subbufs)
+ end_pos = 0;
+
+ return end_pos;
+}
+
+/*
+ * subbuf_read_actor - read up to one subbuf's worth of data
+ */
+static int subbuf_read_actor(size_t read_start,
+ struct rchan_buf *buf,
+ size_t avail,
+ read_descriptor_t *desc)
+{
+ void *from;
+ int ret = 0;
+
+ from = buf->start + read_start;
+ ret = avail;
+ if (copy_to_user(desc->arg.buf, from, avail)) {
+ desc->error = -EFAULT;
+ ret = 0;
+ }
+ desc->arg.data += ret;
+ desc->written += ret;
+ desc->count -= ret;
+
+ return ret;
+}
+
+typedef int (*subbuf_actor_t) (size_t read_start,
+ struct rchan_buf *buf,
+ size_t avail,
+ read_descriptor_t *desc);
+
+/*
+ * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
+ */
+static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
+ subbuf_actor_t subbuf_actor,
+ read_descriptor_t *desc)
+{
+ struct rchan_buf *buf = filp->private_data;
+ size_t read_start, avail;
+ int ret;
+
+ if (!desc->count)
+ return 0;
+
+ mutex_lock(&file_inode(filp)->i_mutex);
+ do {
+ if (!relay_file_read_avail(buf, *ppos))
+ break;
+
+ read_start = relay_file_read_start_pos(*ppos, buf);
+ avail = relay_file_read_subbuf_avail(read_start, buf);
+ if (!avail)
+ break;
+
+ avail = min(desc->count, avail);
+ ret = subbuf_actor(read_start, buf, avail, desc);
+ if (desc->error < 0)
+ break;
+
+ if (ret) {
+ relay_file_read_consume(buf, read_start, ret);
+ *ppos = relay_file_read_end_pos(buf, read_start, ret);
+ }
+ } while (desc->count && ret);
+ mutex_unlock(&file_inode(filp)->i_mutex);
+
+ return desc->written;
+}
+
+static ssize_t relay_file_read(struct file *filp,
+ char __user *buffer,
+ size_t count,
+ loff_t *ppos)
+{
+ read_descriptor_t desc;
+ desc.written = 0;
+ desc.count = count;
+ desc.arg.buf = buffer;
+ desc.error = 0;
+ return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
+}
+
+static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
+{
+ rbuf->bytes_consumed += bytes_consumed;
+
+ if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
+ relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
+ rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
+ }
+}
+
+static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct rchan_buf *rbuf;
+
+ rbuf = (struct rchan_buf *)page_private(buf->page);
+ relay_consume_bytes(rbuf, buf->private);
+}
+
+static const struct pipe_buf_operations relay_pipe_buf_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = relay_pipe_buf_release,
+ .steal = generic_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
+/*
+ * subbuf_splice_actor - splice up to one subbuf's worth of data
+ */
+static ssize_t subbuf_splice_actor(struct file *in,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags,
+ int *nonpad_ret)
+{
+ unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
+ struct rchan_buf *rbuf = in->private_data;
+ unsigned int subbuf_size = rbuf->chan->subbuf_size;
+ uint64_t pos = (uint64_t) *ppos;
+ uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
+ size_t read_start = (size_t) do_div(pos, alloc_size);
+ size_t read_subbuf = read_start / subbuf_size;
+ size_t padding = rbuf->padding[read_subbuf];
+ size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .nr_pages = 0,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
+ .partial = partial,
+ .flags = flags,
+ .ops = &relay_pipe_buf_ops,
+ .spd_release = relay_page_release,
+ };
+ ssize_t ret;
+
+ if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
+ return 0;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ /*
+ * Adjust read len, if longer than what is available
+ */
+ if (len > (subbuf_size - read_start % subbuf_size))
+ len = subbuf_size - read_start % subbuf_size;
+
+ subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
+ pidx = (read_start / PAGE_SIZE) % subbuf_pages;
+ poff = read_start & ~PAGE_MASK;
+ nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
+
+ for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
+ unsigned int this_len, this_end, private;
+ unsigned int cur_pos = read_start + total_len;
+
+ if (!len)
+ break;
+
+ this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
+ private = this_len;
+
+ spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
+ spd.partial[spd.nr_pages].offset = poff;
+
+ this_end = cur_pos + this_len;
+ if (this_end >= nonpad_end) {
+ this_len = nonpad_end - cur_pos;
+ private = this_len + padding;
+ }
+ spd.partial[spd.nr_pages].len = this_len;
+ spd.partial[spd.nr_pages].private = private;
+
+ len -= this_len;
+ total_len += this_len;
+ poff = 0;
+ pidx = (pidx + 1) % subbuf_pages;
+
+ if (this_end >= nonpad_end) {
+ spd.nr_pages++;
+ break;
+ }
+ }
+
+ ret = 0;
+ if (!spd.nr_pages)
+ goto out;
+
+ ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
+ if (ret < 0 || ret < total_len)
+ goto out;
+
+ if (read_start + ret == nonpad_end)
+ ret += padding;
+
+out:
+ splice_shrink_spd(&spd);
+ return ret;
+}
+
+static ssize_t relay_file_splice_read(struct file *in,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags)
+{
+ ssize_t spliced;
+ int ret;
+ int nonpad_ret = 0;
+
+ ret = 0;
+ spliced = 0;
+
+ while (len && !spliced) {
+ ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
+ if (ret < 0)
+ break;
+ else if (!ret) {
+ if (flags & SPLICE_F_NONBLOCK)
+ ret = -EAGAIN;
+ break;
+ }
+
+ *ppos += ret;
+ if (ret > len)
+ len = 0;
+ else
+ len -= ret;
+ spliced += nonpad_ret;
+ nonpad_ret = 0;
+ }
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+
+const struct file_operations relay_file_operations = {
+ .open = relay_file_open,
+ .poll = relay_file_poll,
+ .mmap = relay_file_mmap,
+ .read = relay_file_read,
+ .llseek = no_llseek,
+ .release = relay_file_release,
+ .splice_read = relay_file_splice_read,
+};
+EXPORT_SYMBOL_GPL(relay_file_operations);
+
+static __init int relay_init(void)
+{
+
+ hotcpu_notifier(relay_hotcpu_callback, 0);
+ return 0;
+}
+
+early_initcall(relay_init);
diff --git a/kernel/resource.c b/kernel/resource.c
new file mode 100644
index 000000000..90552aab5
--- /dev/null
+++ b/kernel/resource.c
@@ -0,0 +1,1534 @@
+/*
+ * linux/kernel/resource.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 1999 Martin Mares <mj@ucw.cz>
+ *
+ * Arbitrary resource management.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/errno.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/resource_ext.h>
+#include <asm/io.h>
+
+
+struct resource ioport_resource = {
+ .name = "PCI IO",
+ .start = 0,
+ .end = IO_SPACE_LIMIT,
+ .flags = IORESOURCE_IO,
+};
+EXPORT_SYMBOL(ioport_resource);
+
+struct resource iomem_resource = {
+ .name = "PCI mem",
+ .start = 0,
+ .end = -1,
+ .flags = IORESOURCE_MEM,
+};
+EXPORT_SYMBOL(iomem_resource);
+
+/* constraints to be met while allocating resources */
+struct resource_constraint {
+ resource_size_t min, max, align;
+ resource_size_t (*alignf)(void *, const struct resource *,
+ resource_size_t, resource_size_t);
+ void *alignf_data;
+};
+
+static DEFINE_RWLOCK(resource_lock);
+
+/*
+ * For memory hotplug, there is no way to free resource entries allocated
+ * by boot mem after the system is up. So for reusing the resource entry
+ * we need to remember the resource.
+ */
+static struct resource *bootmem_resource_free;
+static DEFINE_SPINLOCK(bootmem_resource_lock);
+
+static struct resource *next_resource(struct resource *p, bool sibling_only)
+{
+ /* Caller wants to traverse through siblings only */
+ if (sibling_only)
+ return p->sibling;
+
+ if (p->child)
+ return p->child;
+ while (!p->sibling && p->parent)
+ p = p->parent;
+ return p->sibling;
+}
+
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+ (*pos)++;
+ return (void *)next_resource(p, false);
+}
+
+#ifdef CONFIG_PROC_FS
+
+enum { MAX_IORES_LEVEL = 5 };
+
+static void *r_start(struct seq_file *m, loff_t *pos)
+ __acquires(resource_lock)
+{
+ struct resource *p = m->private;
+ loff_t l = 0;
+ read_lock(&resource_lock);
+ for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
+ ;
+ return p;
+}
+
+static void r_stop(struct seq_file *m, void *v)
+ __releases(resource_lock)
+{
+ read_unlock(&resource_lock);
+}
+
+static int r_show(struct seq_file *m, void *v)
+{
+ struct resource *root = m->private;
+ struct resource *r = v, *p;
+ int width = root->end < 0x10000 ? 4 : 8;
+ int depth;
+
+ for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
+ if (p->parent == root)
+ break;
+ seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
+ depth * 2, "",
+ width, (unsigned long long) r->start,
+ width, (unsigned long long) r->end,
+ r->name ? r->name : "<BAD>");
+ return 0;
+}
+
+static const struct seq_operations resource_op = {
+ .start = r_start,
+ .next = r_next,
+ .stop = r_stop,
+ .show = r_show,
+};
+
+static int ioports_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &resource_op);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+ m->private = &ioport_resource;
+ }
+ return res;
+}
+
+static int iomem_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &resource_op);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+ m->private = &iomem_resource;
+ }
+ return res;
+}
+
+static const struct file_operations proc_ioports_operations = {
+ .open = ioports_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations proc_iomem_operations = {
+ .open = iomem_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init ioresources_init(void)
+{
+ proc_create("ioports", 0, NULL, &proc_ioports_operations);
+ proc_create("iomem", 0, NULL, &proc_iomem_operations);
+ return 0;
+}
+__initcall(ioresources_init);
+
+#endif /* CONFIG_PROC_FS */
+
+static void free_resource(struct resource *res)
+{
+ if (!res)
+ return;
+
+ if (!PageSlab(virt_to_head_page(res))) {
+ spin_lock(&bootmem_resource_lock);
+ res->sibling = bootmem_resource_free;
+ bootmem_resource_free = res;
+ spin_unlock(&bootmem_resource_lock);
+ } else {
+ kfree(res);
+ }
+}
+
+static struct resource *alloc_resource(gfp_t flags)
+{
+ struct resource *res = NULL;
+
+ spin_lock(&bootmem_resource_lock);
+ if (bootmem_resource_free) {
+ res = bootmem_resource_free;
+ bootmem_resource_free = res->sibling;
+ }
+ spin_unlock(&bootmem_resource_lock);
+
+ if (res)
+ memset(res, 0, sizeof(struct resource));
+ else
+ res = kzalloc(sizeof(struct resource), flags);
+
+ return res;
+}
+
+/* Return the conflict entry if you can't request it */
+static struct resource * __request_resource(struct resource *root, struct resource *new)
+{
+ resource_size_t start = new->start;
+ resource_size_t end = new->end;
+ struct resource *tmp, **p;
+
+ if (end < start)
+ return root;
+ if (start < root->start)
+ return root;
+ if (end > root->end)
+ return root;
+ p = &root->child;
+ for (;;) {
+ tmp = *p;
+ if (!tmp || tmp->start > end) {
+ new->sibling = tmp;
+ *p = new;
+ new->parent = root;
+ return NULL;
+ }
+ p = &tmp->sibling;
+ if (tmp->end < start)
+ continue;
+ return tmp;
+ }
+}
+
+static int __release_resource(struct resource *old)
+{
+ struct resource *tmp, **p;
+
+ p = &old->parent->child;
+ for (;;) {
+ tmp = *p;
+ if (!tmp)
+ break;
+ if (tmp == old) {
+ *p = tmp->sibling;
+ old->parent = NULL;
+ return 0;
+ }
+ p = &tmp->sibling;
+ }
+ return -EINVAL;
+}
+
+static void __release_child_resources(struct resource *r)
+{
+ struct resource *tmp, *p;
+ resource_size_t size;
+
+ p = r->child;
+ r->child = NULL;
+ while (p) {
+ tmp = p;
+ p = p->sibling;
+
+ tmp->parent = NULL;
+ tmp->sibling = NULL;
+ __release_child_resources(tmp);
+
+ printk(KERN_DEBUG "release child resource %pR\n", tmp);
+ /* need to restore size, and keep flags */
+ size = resource_size(tmp);
+ tmp->start = 0;
+ tmp->end = size - 1;
+ }
+}
+
+void release_child_resources(struct resource *r)
+{
+ write_lock(&resource_lock);
+ __release_child_resources(r);
+ write_unlock(&resource_lock);
+}
+
+/**
+ * request_resource_conflict - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, conflict resource on error.
+ */
+struct resource *request_resource_conflict(struct resource *root, struct resource *new)
+{
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+ conflict = __request_resource(root, new);
+ write_unlock(&resource_lock);
+ return conflict;
+}
+
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, negative error code on error.
+ */
+int request_resource(struct resource *root, struct resource *new)
+{
+ struct resource *conflict;
+
+ conflict = request_resource_conflict(root, new);
+ return conflict ? -EBUSY : 0;
+}
+
+EXPORT_SYMBOL(request_resource);
+
+/**
+ * release_resource - release a previously reserved resource
+ * @old: resource pointer
+ */
+int release_resource(struct resource *old)
+{
+ int retval;
+
+ write_lock(&resource_lock);
+ retval = __release_resource(old);
+ write_unlock(&resource_lock);
+ return retval;
+}
+
+EXPORT_SYMBOL(release_resource);
+
+/*
+ * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
+ * the caller must specify res->start, res->end, res->flags and "name".
+ * If found, returns 0, res is overwritten, if not found, returns -1.
+ * This walks through whole tree and not just first level children
+ * until and unless first_level_children_only is true.
+ */
+static int find_next_iomem_res(struct resource *res, char *name,
+ bool first_level_children_only)
+{
+ resource_size_t start, end;
+ struct resource *p;
+ bool sibling_only = false;
+
+ BUG_ON(!res);
+
+ start = res->start;
+ end = res->end;
+ BUG_ON(start >= end);
+
+ if (first_level_children_only)
+ sibling_only = true;
+
+ read_lock(&resource_lock);
+
+ for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
+ if (p->flags != res->flags)
+ continue;
+ if (name && strcmp(p->name, name))
+ continue;
+ if (p->start > end) {
+ p = NULL;
+ break;
+ }
+ if ((p->end >= start) && (p->start < end))
+ break;
+ }
+
+ read_unlock(&resource_lock);
+ if (!p)
+ return -1;
+ /* copy data */
+ if (res->start < p->start)
+ res->start = p->start;
+ if (res->end > p->end)
+ res->end = p->end;
+ return 0;
+}
+
+/*
+ * Walks through iomem resources and calls func() with matching resource
+ * ranges. This walks through whole tree and not just first level children.
+ * All the memory ranges which overlap start,end and also match flags and
+ * name are valid candidates.
+ *
+ * @name: name of resource
+ * @flags: resource flags
+ * @start: start addr
+ * @end: end addr
+ */
+int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
+ void *arg, int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = flags;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, name, false))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+/*
+ * This function calls callback against all memory range of "System RAM"
+ * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ * Now, this function is only for "System RAM". This function deals with
+ * full ranges and not pfn. If resources are not pfn aligned, dealing
+ * with pfn can truncate ranges.
+ */
+int walk_system_ram_res(u64 start, u64 end, void *arg,
+ int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, "System RAM", true))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+
+/*
+ * This function calls callback against all memory range of "System RAM"
+ * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ * Now, this function is only for "System RAM".
+ */
+int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg, int (*func)(unsigned long, unsigned long, void *))
+{
+ struct resource res;
+ unsigned long pfn, end_pfn;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = (u64) start_pfn << PAGE_SHIFT;
+ res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
+ pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ end_pfn = (res.end + 1) >> PAGE_SHIFT;
+ if (end_pfn > pfn)
+ ret = (*func)(pfn, end_pfn - pfn, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+#endif
+
+static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+ return 1;
+}
+/*
+ * This generic page_is_ram() returns true if specified address is
+ * registered as "System RAM" in iomem_resource list.
+ */
+int __weak page_is_ram(unsigned long pfn)
+{
+ return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
+}
+EXPORT_SYMBOL_GPL(page_is_ram);
+
+/*
+ * Search for a resouce entry that fully contains the specified region.
+ * If found, return 1 if it is RAM, 0 if not.
+ * If not found, or region is not fully contained, return -1
+ *
+ * Used by the ioremap functions to ensure the user is not remapping RAM and is
+ * a vast speed up over walking through the resource table page by page.
+ */
+int region_is_ram(resource_size_t start, unsigned long size)
+{
+ struct resource *p;
+ resource_size_t end = start + size - 1;
+ int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ const char *name = "System RAM";
+ int ret = -1;
+
+ read_lock(&resource_lock);
+ for (p = iomem_resource.child; p ; p = p->sibling) {
+ if (end < p->start)
+ continue;
+
+ if (p->start <= start && end <= p->end) {
+ /* resource fully contains region */
+ if ((p->flags != flags) || strcmp(p->name, name))
+ ret = 0;
+ else
+ ret = 1;
+ break;
+ }
+ if (p->end < start)
+ break; /* not found */
+ }
+ read_unlock(&resource_lock);
+ return ret;
+}
+
+void __weak arch_remove_reservations(struct resource *avail)
+{
+}
+
+static resource_size_t simple_align_resource(void *data,
+ const struct resource *avail,
+ resource_size_t size,
+ resource_size_t align)
+{
+ return avail->start;
+}
+
+static void resource_clip(struct resource *res, resource_size_t min,
+ resource_size_t max)
+{
+ if (res->start < min)
+ res->start = min;
+ if (res->end > max)
+ res->end = max;
+}
+
+/*
+ * Find empty slot in the resource tree with the given range and
+ * alignment constraints
+ */
+static int __find_resource(struct resource *root, struct resource *old,
+ struct resource *new,
+ resource_size_t size,
+ struct resource_constraint *constraint)
+{
+ struct resource *this = root->child;
+ struct resource tmp = *new, avail, alloc;
+
+ tmp.start = root->start;
+ /*
+ * Skip past an allocated resource that starts at 0, since the assignment
+ * of this->start - 1 to tmp->end below would cause an underflow.
+ */
+ if (this && this->start == root->start) {
+ tmp.start = (this == old) ? old->start : this->end + 1;
+ this = this->sibling;
+ }
+ for(;;) {
+ if (this)
+ tmp.end = (this == old) ? this->end : this->start - 1;
+ else
+ tmp.end = root->end;
+
+ if (tmp.end < tmp.start)
+ goto next;
+
+ resource_clip(&tmp, constraint->min, constraint->max);
+ arch_remove_reservations(&tmp);
+
+ /* Check for overflow after ALIGN() */
+ avail.start = ALIGN(tmp.start, constraint->align);
+ avail.end = tmp.end;
+ avail.flags = new->flags & ~IORESOURCE_UNSET;
+ if (avail.start >= tmp.start) {
+ alloc.flags = avail.flags;
+ alloc.start = constraint->alignf(constraint->alignf_data, &avail,
+ size, constraint->align);
+ alloc.end = alloc.start + size - 1;
+ if (resource_contains(&avail, &alloc)) {
+ new->start = alloc.start;
+ new->end = alloc.end;
+ return 0;
+ }
+ }
+
+next: if (!this || this->end == root->end)
+ break;
+
+ if (this != old)
+ tmp.start = this->end + 1;
+ this = this->sibling;
+ }
+ return -EBUSY;
+}
+
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ */
+static int find_resource(struct resource *root, struct resource *new,
+ resource_size_t size,
+ struct resource_constraint *constraint)
+{
+ return __find_resource(root, NULL, new, size, constraint);
+}
+
+/**
+ * reallocate_resource - allocate a slot in the resource tree given range & alignment.
+ * The resource will be relocated if the new size cannot be reallocated in the
+ * current location.
+ *
+ * @root: root resource descriptor
+ * @old: resource descriptor desired by caller
+ * @newsize: new size of the resource descriptor
+ * @constraint: the size and alignment constraints to be met.
+ */
+static int reallocate_resource(struct resource *root, struct resource *old,
+ resource_size_t newsize,
+ struct resource_constraint *constraint)
+{
+ int err=0;
+ struct resource new = *old;
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+
+ if ((err = __find_resource(root, old, &new, newsize, constraint)))
+ goto out;
+
+ if (resource_contains(&new, old)) {
+ old->start = new.start;
+ old->end = new.end;
+ goto out;
+ }
+
+ if (old->child) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (resource_contains(old, &new)) {
+ old->start = new.start;
+ old->end = new.end;
+ } else {
+ __release_resource(old);
+ *old = new;
+ conflict = __request_resource(root, old);
+ BUG_ON(conflict);
+ }
+out:
+ write_unlock(&resource_lock);
+ return err;
+}
+
+
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & alignment.
+ * The resource will be reallocated with a new size if it was already allocated
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ * @size: requested resource region size
+ * @min: minimum boundary to allocate
+ * @max: maximum boundary to allocate
+ * @align: alignment requested, in bytes
+ * @alignf: alignment function, optional, called if not NULL
+ * @alignf_data: arbitrary data to pass to the @alignf function
+ */
+int allocate_resource(struct resource *root, struct resource *new,
+ resource_size_t size, resource_size_t min,
+ resource_size_t max, resource_size_t align,
+ resource_size_t (*alignf)(void *,
+ const struct resource *,
+ resource_size_t,
+ resource_size_t),
+ void *alignf_data)
+{
+ int err;
+ struct resource_constraint constraint;
+
+ if (!alignf)
+ alignf = simple_align_resource;
+
+ constraint.min = min;
+ constraint.max = max;
+ constraint.align = align;
+ constraint.alignf = alignf;
+ constraint.alignf_data = alignf_data;
+
+ if ( new->parent ) {
+ /* resource is already allocated, try reallocating with
+ the new constraints */
+ return reallocate_resource(root, new, size, &constraint);
+ }
+
+ write_lock(&resource_lock);
+ err = find_resource(root, new, size, &constraint);
+ if (err >= 0 && __request_resource(root, new))
+ err = -EBUSY;
+ write_unlock(&resource_lock);
+ return err;
+}
+
+EXPORT_SYMBOL(allocate_resource);
+
+/**
+ * lookup_resource - find an existing resource by a resource start address
+ * @root: root resource descriptor
+ * @start: resource start address
+ *
+ * Returns a pointer to the resource if found, NULL otherwise
+ */
+struct resource *lookup_resource(struct resource *root, resource_size_t start)
+{
+ struct resource *res;
+
+ read_lock(&resource_lock);
+ for (res = root->child; res; res = res->sibling) {
+ if (res->start == start)
+ break;
+ }
+ read_unlock(&resource_lock);
+
+ return res;
+}
+
+/*
+ * Insert a resource into the resource tree. If successful, return NULL,
+ * otherwise return the conflicting resource (compare to __request_resource())
+ */
+static struct resource * __insert_resource(struct resource *parent, struct resource *new)
+{
+ struct resource *first, *next;
+
+ for (;; parent = first) {
+ first = __request_resource(parent, new);
+ if (!first)
+ return first;
+
+ if (first == parent)
+ return first;
+ if (WARN_ON(first == new)) /* duplicated insertion */
+ return first;
+
+ if ((first->start > new->start) || (first->end < new->end))
+ break;
+ if ((first->start == new->start) && (first->end == new->end))
+ break;
+ }
+
+ for (next = first; ; next = next->sibling) {
+ /* Partial overlap? Bad, and unfixable */
+ if (next->start < new->start || next->end > new->end)
+ return next;
+ if (!next->sibling)
+ break;
+ if (next->sibling->start > new->end)
+ break;
+ }
+
+ new->parent = parent;
+ new->sibling = next->sibling;
+ new->child = first;
+
+ next->sibling = NULL;
+ for (next = first; next; next = next->sibling)
+ next->parent = new;
+
+ if (parent->child == first) {
+ parent->child = new;
+ } else {
+ next = parent->child;
+ while (next->sibling != first)
+ next = next->sibling;
+ next->sibling = new;
+ }
+ return NULL;
+}
+
+/**
+ * insert_resource_conflict - Inserts resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, conflict resource if the resource can't be inserted.
+ *
+ * This function is equivalent to request_resource_conflict when no conflict
+ * happens. If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
+ */
+struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
+{
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+ conflict = __insert_resource(parent, new);
+ write_unlock(&resource_lock);
+ return conflict;
+}
+
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+ struct resource *conflict;
+
+ conflict = insert_resource_conflict(parent, new);
+ return conflict ? -EBUSY : 0;
+}
+
+/**
+ * insert_resource_expand_to_fit - Insert a resource into the resource tree
+ * @root: root resource descriptor
+ * @new: new resource to insert
+ *
+ * Insert a resource into the resource tree, possibly expanding it in order
+ * to make it encompass any conflicting resources.
+ */
+void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
+{
+ if (new->parent)
+ return;
+
+ write_lock(&resource_lock);
+ for (;;) {
+ struct resource *conflict;
+
+ conflict = __insert_resource(root, new);
+ if (!conflict)
+ break;
+ if (conflict == root)
+ break;
+
+ /* Ok, expand resource to cover the conflict, then try again .. */
+ if (conflict->start < new->start)
+ new->start = conflict->start;
+ if (conflict->end > new->end)
+ new->end = conflict->end;
+
+ printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ }
+ write_unlock(&resource_lock);
+}
+
+static int __adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
+{
+ struct resource *tmp, *parent = res->parent;
+ resource_size_t end = start + size - 1;
+ int result = -EBUSY;
+
+ if (!parent)
+ goto skip;
+
+ if ((start < parent->start) || (end > parent->end))
+ goto out;
+
+ if (res->sibling && (res->sibling->start <= end))
+ goto out;
+
+ tmp = parent->child;
+ if (tmp != res) {
+ while (tmp->sibling != res)
+ tmp = tmp->sibling;
+ if (start <= tmp->end)
+ goto out;
+ }
+
+skip:
+ for (tmp = res->child; tmp; tmp = tmp->sibling)
+ if ((tmp->start < start) || (tmp->end > end))
+ goto out;
+
+ res->start = start;
+ res->end = end;
+ result = 0;
+
+ out:
+ return result;
+}
+
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
+ * Given an existing resource, change its start and size to match the
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
+ */
+int adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
+{
+ int result;
+
+ write_lock(&resource_lock);
+ result = __adjust_resource(res, start, size);
+ write_unlock(&resource_lock);
+ return result;
+}
+EXPORT_SYMBOL(adjust_resource);
+
+static void __init __reserve_region_with_split(struct resource *root,
+ resource_size_t start, resource_size_t end,
+ const char *name)
+{
+ struct resource *parent = root;
+ struct resource *conflict;
+ struct resource *res = alloc_resource(GFP_ATOMIC);
+ struct resource *next_res = NULL;
+
+ if (!res)
+ return;
+
+ res->name = name;
+ res->start = start;
+ res->end = end;
+ res->flags = IORESOURCE_BUSY;
+
+ while (1) {
+
+ conflict = __request_resource(parent, res);
+ if (!conflict) {
+ if (!next_res)
+ break;
+ res = next_res;
+ next_res = NULL;
+ continue;
+ }
+
+ /* conflict covered whole area */
+ if (conflict->start <= res->start &&
+ conflict->end >= res->end) {
+ free_resource(res);
+ WARN_ON(next_res);
+ break;
+ }
+
+ /* failed, split and try again */
+ if (conflict->start > res->start) {
+ end = res->end;
+ res->end = conflict->start - 1;
+ if (conflict->end < end) {
+ next_res = alloc_resource(GFP_ATOMIC);
+ if (!next_res) {
+ free_resource(res);
+ break;
+ }
+ next_res->name = name;
+ next_res->start = conflict->end + 1;
+ next_res->end = end;
+ next_res->flags = IORESOURCE_BUSY;
+ }
+ } else {
+ res->start = conflict->end + 1;
+ }
+ }
+
+}
+
+void __init reserve_region_with_split(struct resource *root,
+ resource_size_t start, resource_size_t end,
+ const char *name)
+{
+ int abort = 0;
+
+ write_lock(&resource_lock);
+ if (root->start > start || root->end < end) {
+ pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
+ (unsigned long long)start, (unsigned long long)end,
+ root);
+ if (start > root->end || end < root->start)
+ abort = 1;
+ else {
+ if (end > root->end)
+ end = root->end;
+ if (start < root->start)
+ start = root->start;
+ pr_err("fixing request to [0x%llx-0x%llx]\n",
+ (unsigned long long)start,
+ (unsigned long long)end);
+ }
+ dump_stack();
+ }
+ if (!abort)
+ __reserve_region_with_split(root, start, end, name);
+ write_unlock(&resource_lock);
+}
+
+/**
+ * resource_alignment - calculate resource's alignment
+ * @res: resource pointer
+ *
+ * Returns alignment on success, 0 (invalid alignment) on failure.
+ */
+resource_size_t resource_alignment(struct resource *res)
+{
+ switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
+ case IORESOURCE_SIZEALIGN:
+ return resource_size(res);
+ case IORESOURCE_STARTALIGN:
+ return res->start;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * This is compatibility stuff for IO resources.
+ *
+ * Note how this, unlike the above, knows about
+ * the IO flag meanings (busy etc).
+ *
+ * request_region creates a new busy region.
+ *
+ * release_region releases a matching busy region.
+ */
+
+static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource * __request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n,
+ const char *name, int flags)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct resource *res = alloc_resource(GFP_KERNEL);
+
+ if (!res)
+ return NULL;
+
+ res->name = name;
+ res->start = start;
+ res->end = start + n - 1;
+ res->flags = resource_type(parent);
+ res->flags |= IORESOURCE_BUSY | flags;
+
+ write_lock(&resource_lock);
+
+ for (;;) {
+ struct resource *conflict;
+
+ conflict = __request_resource(parent, res);
+ if (!conflict)
+ break;
+ if (conflict != parent) {
+ parent = conflict;
+ if (!(conflict->flags & IORESOURCE_BUSY))
+ continue;
+ }
+ if (conflict->flags & flags & IORESOURCE_MUXED) {
+ add_wait_queue(&muxed_resource_wait, &wait);
+ write_unlock(&resource_lock);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ remove_wait_queue(&muxed_resource_wait, &wait);
+ write_lock(&resource_lock);
+ continue;
+ }
+ /* Uhhuh, that didn't work out.. */
+ free_resource(res);
+ res = NULL;
+ break;
+ }
+ write_unlock(&resource_lock);
+ return res;
+}
+EXPORT_SYMBOL(__request_region);
+
+/**
+ * __release_region - release a previously reserved resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * The described resource region must match a currently busy region.
+ */
+void __release_region(struct resource *parent, resource_size_t start,
+ resource_size_t n)
+{
+ struct resource **p;
+ resource_size_t end;
+
+ p = &parent->child;
+ end = start + n - 1;
+
+ write_lock(&resource_lock);
+
+ for (;;) {
+ struct resource *res = *p;
+
+ if (!res)
+ break;
+ if (res->start <= start && res->end >= end) {
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+ if (res->start != start || res->end != end)
+ break;
+ *p = res->sibling;
+ write_unlock(&resource_lock);
+ if (res->flags & IORESOURCE_MUXED)
+ wake_up(&muxed_resource_wait);
+ free_resource(res);
+ return;
+ }
+ p = &res->sibling;
+ }
+
+ write_unlock(&resource_lock);
+
+ printk(KERN_WARNING "Trying to free nonexistent resource "
+ "<%016llx-%016llx>\n", (unsigned long long)start,
+ (unsigned long long)end);
+}
+EXPORT_SYMBOL(__release_region);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/**
+ * release_mem_region_adjustable - release a previously reserved memory region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @size: resource region size
+ *
+ * This interface is intended for memory hot-delete. The requested region
+ * is released from a currently busy memory resource. The requested region
+ * must either match exactly or fit into a single busy resource entry. In
+ * the latter case, the remaining resource is adjusted accordingly.
+ * Existing children of the busy memory resource must be immutable in the
+ * request.
+ *
+ * Note:
+ * - Additional release conditions, such as overlapping region, can be
+ * supported after they are confirmed as valid cases.
+ * - When a busy memory resource gets split into two entries, the code
+ * assumes that all children remain in the lower address entry for
+ * simplicity. Enhance this logic when necessary.
+ */
+int release_mem_region_adjustable(struct resource *parent,
+ resource_size_t start, resource_size_t size)
+{
+ struct resource **p;
+ struct resource *res;
+ struct resource *new_res;
+ resource_size_t end;
+ int ret = -EINVAL;
+
+ end = start + size - 1;
+ if ((start < parent->start) || (end > parent->end))
+ return ret;
+
+ /* The alloc_resource() result gets checked later */
+ new_res = alloc_resource(GFP_KERNEL);
+
+ p = &parent->child;
+ write_lock(&resource_lock);
+
+ while ((res = *p)) {
+ if (res->start >= end)
+ break;
+
+ /* look for the next resource if it does not fit into */
+ if (res->start > start || res->end < end) {
+ p = &res->sibling;
+ continue;
+ }
+
+ if (!(res->flags & IORESOURCE_MEM))
+ break;
+
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+
+ /* found the target resource; let's adjust accordingly */
+ if (res->start == start && res->end == end) {
+ /* free the whole entry */
+ *p = res->sibling;
+ free_resource(res);
+ ret = 0;
+ } else if (res->start == start && res->end != end) {
+ /* adjust the start */
+ ret = __adjust_resource(res, end + 1,
+ res->end - end);
+ } else if (res->start != start && res->end == end) {
+ /* adjust the end */
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ } else {
+ /* split into two entries */
+ if (!new_res) {
+ ret = -ENOMEM;
+ break;
+ }
+ new_res->name = res->name;
+ new_res->start = end + 1;
+ new_res->end = res->end;
+ new_res->flags = res->flags;
+ new_res->parent = res->parent;
+ new_res->sibling = res->sibling;
+ new_res->child = NULL;
+
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ if (ret)
+ break;
+ res->sibling = new_res;
+ new_res = NULL;
+ }
+
+ break;
+ }
+
+ write_unlock(&resource_lock);
+ free_resource(new_res);
+ return ret;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+/*
+ * Managed region resource
+ */
+static void devm_resource_release(struct device *dev, void *ptr)
+{
+ struct resource **r = ptr;
+
+ release_resource(*r);
+}
+
+/**
+ * devm_request_resource() - request and reserve an I/O or memory resource
+ * @dev: device for which to request the resource
+ * @root: root of the resource tree from which to request the resource
+ * @new: descriptor of the resource to request
+ *
+ * This is a device-managed version of request_resource(). There is usually
+ * no need to release resources requested by this function explicitly since
+ * that will be taken care of when the device is unbound from its driver.
+ * If for some reason the resource needs to be released explicitly, because
+ * of ordering issues for example, drivers must call devm_release_resource()
+ * rather than the regular release_resource().
+ *
+ * When a conflict is detected between any existing resources and the newly
+ * requested resource, an error message will be printed.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int devm_request_resource(struct device *dev, struct resource *root,
+ struct resource *new)
+{
+ struct resource *conflict, **ptr;
+
+ ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ *ptr = new;
+
+ conflict = request_resource_conflict(root, new);
+ if (conflict) {
+ dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
+ new, conflict->name, conflict);
+ devres_free(ptr);
+ return -EBUSY;
+ }
+
+ devres_add(dev, ptr);
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_resource);
+
+static int devm_resource_match(struct device *dev, void *res, void *data)
+{
+ struct resource **ptr = res;
+
+ return *ptr == data;
+}
+
+/**
+ * devm_release_resource() - release a previously requested resource
+ * @dev: device for which to release the resource
+ * @new: descriptor of the resource to release
+ *
+ * Releases a resource previously requested using devm_request_resource().
+ */
+void devm_release_resource(struct device *dev, struct resource *new)
+{
+ WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
+ new));
+}
+EXPORT_SYMBOL(devm_release_resource);
+
+struct region_devres {
+ struct resource *parent;
+ resource_size_t start;
+ resource_size_t n;
+};
+
+static void devm_region_release(struct device *dev, void *res)
+{
+ struct region_devres *this = res;
+
+ __release_region(this->parent, this->start, this->n);
+}
+
+static int devm_region_match(struct device *dev, void *res, void *match_data)
+{
+ struct region_devres *this = res, *match = match_data;
+
+ return this->parent == match->parent &&
+ this->start == match->start && this->n == match->n;
+}
+
+struct resource * __devm_request_region(struct device *dev,
+ struct resource *parent, resource_size_t start,
+ resource_size_t n, const char *name)
+{
+ struct region_devres *dr = NULL;
+ struct resource *res;
+
+ dr = devres_alloc(devm_region_release, sizeof(struct region_devres),
+ GFP_KERNEL);
+ if (!dr)
+ return NULL;
+
+ dr->parent = parent;
+ dr->start = start;
+ dr->n = n;
+
+ res = __request_region(parent, start, n, name, 0);
+ if (res)
+ devres_add(dev, dr);
+ else
+ devres_free(dr);
+
+ return res;
+}
+EXPORT_SYMBOL(__devm_request_region);
+
+void __devm_release_region(struct device *dev, struct resource *parent,
+ resource_size_t start, resource_size_t n)
+{
+ struct region_devres match_data = { parent, start, n };
+
+ __release_region(parent, start, n);
+ WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
+ &match_data));
+}
+EXPORT_SYMBOL(__devm_release_region);
+
+/*
+ * Called from init/main.c to reserve IO ports.
+ */
+#define MAXRESERVE 4
+static int __init reserve_setup(char *str)
+{
+ static int reserved;
+ static struct resource reserve[MAXRESERVE];
+
+ for (;;) {
+ unsigned int io_start, io_num;
+ int x = reserved;
+
+ if (get_option (&str, &io_start) != 2)
+ break;
+ if (get_option (&str, &io_num) == 0)
+ break;
+ if (x < MAXRESERVE) {
+ struct resource *res = reserve + x;
+ res->name = "reserved";
+ res->start = io_start;
+ res->end = io_start + io_num - 1;
+ res->flags = IORESOURCE_BUSY;
+ res->child = NULL;
+ if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
+ reserved = x+1;
+ }
+ }
+ return 1;
+}
+
+__setup("reserve=", reserve_setup);
+
+/*
+ * Check if the requested addr and size spans more than any slot in the
+ * iomem resource tree.
+ */
+int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
+{
+ struct resource *p = &iomem_resource;
+ int err = 0;
+ loff_t l;
+
+ read_lock(&resource_lock);
+ for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ /*
+ * We can probably skip the resources without
+ * IORESOURCE_IO attribute?
+ */
+ if (p->start >= addr + size)
+ continue;
+ if (p->end < addr)
+ continue;
+ if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
+ PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ continue;
+ /*
+ * if a resource is "BUSY", it's not a hardware resource
+ * but a driver mapping of such a resource; we don't want
+ * to warn for those; some drivers legitimately map only
+ * partial hardware resources. (example: vesafb)
+ */
+ if (p->flags & IORESOURCE_BUSY)
+ continue;
+
+ printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
+ (unsigned long long)addr,
+ (unsigned long long)(addr + size - 1),
+ p->name, p);
+ err = -1;
+ break;
+ }
+ read_unlock(&resource_lock);
+
+ return err;
+}
+
+#ifdef CONFIG_STRICT_DEVMEM
+static int strict_iomem_checks = 1;
+#else
+static int strict_iomem_checks;
+#endif
+
+/*
+ * check if an address is reserved in the iomem resource tree
+ * returns 1 if reserved, 0 if not reserved.
+ */
+int iomem_is_exclusive(u64 addr)
+{
+ struct resource *p = &iomem_resource;
+ int err = 0;
+ loff_t l;
+ int size = PAGE_SIZE;
+
+ if (!strict_iomem_checks)
+ return 0;
+
+ addr = addr & PAGE_MASK;
+
+ read_lock(&resource_lock);
+ for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ /*
+ * We can probably skip the resources without
+ * IORESOURCE_IO attribute?
+ */
+ if (p->start >= addr + size)
+ break;
+ if (p->end < addr)
+ continue;
+ if (p->flags & IORESOURCE_BUSY &&
+ p->flags & IORESOURCE_EXCLUSIVE) {
+ err = 1;
+ break;
+ }
+ }
+ read_unlock(&resource_lock);
+
+ return err;
+}
+
+struct resource_entry *resource_list_create_entry(struct resource *res,
+ size_t extra_size)
+{
+ struct resource_entry *entry;
+
+ entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL);
+ if (entry) {
+ INIT_LIST_HEAD(&entry->node);
+ entry->res = res ? res : &entry->__res;
+ }
+
+ return entry;
+}
+EXPORT_SYMBOL(resource_list_create_entry);
+
+void resource_list_free(struct list_head *head)
+{
+ struct resource_entry *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, head, node)
+ resource_list_destroy_entry(entry);
+}
+EXPORT_SYMBOL(resource_list_free);
+
+static int __init strict_iomem(char *str)
+{
+ if (strstr(str, "relaxed"))
+ strict_iomem_checks = 0;
+ if (strstr(str, "strict"))
+ strict_iomem_checks = 1;
+ return 1;
+}
+
+__setup("iomem=", strict_iomem);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000..54b88a1c0
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,26 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
+endif
+
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only. Why this used to be enabled for all architectures is beyond
+# me. I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+ifdef CONFIG_SCHED_BFS
+obj-y += bfs.o clock.o
+else
+obj-y += core.o proc.o clock.o cputime.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-$(CONFIG_SMP) += cpudeadline.o
+obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
+obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+endif
+obj-y += wait.o completion.o idle.o
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
new file mode 100644
index 000000000..eae160dd6
--- /dev/null
+++ b/kernel/sched/auto_group.c
@@ -0,0 +1,253 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include "sched.h"
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+#include <linux/security.h>
+#include <linux/export.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+void __init autogroup_init(struct task_struct *init_task)
+{
+ autogroup_default.tg = &root_task_group;
+ kref_init(&autogroup_default.kref);
+ init_rwsem(&autogroup_default.lock);
+ init_task->signal->autogroup = &autogroup_default;
+}
+
+void autogroup_free(struct task_group *tg)
+{
+ kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+ struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* We've redirected RT tasks to the root task group... */
+ ag->tg->rt_se = NULL;
+ ag->tg->rt_rq = NULL;
+#endif
+ sched_offline_group(ag->tg);
+ sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+ kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+ kref_get(&ag->kref);
+ return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+ struct autogroup *ag;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return autogroup_kref_get(&autogroup_default);
+
+ ag = autogroup_kref_get(p->signal->autogroup);
+ unlock_task_sighand(p, &flags);
+
+ return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+ struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+ struct task_group *tg;
+
+ if (!ag)
+ goto out_fail;
+
+ tg = sched_create_group(&root_task_group);
+
+ if (IS_ERR(tg))
+ goto out_free;
+
+ kref_init(&ag->kref);
+ init_rwsem(&ag->lock);
+ ag->id = atomic_inc_return(&autogroup_seq_nr);
+ ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Autogroup RT tasks are redirected to the root task group
+ * so we don't have to move tasks around upon policy change,
+ * or flail around trying to allocate bandwidth on the fly.
+ * A bandwidth exception in __sched_setscheduler() allows
+ * the policy change to proceed.
+ */
+ free_rt_sched_group(tg);
+ tg->rt_se = root_task_group.rt_se;
+ tg->rt_rq = root_task_group.rt_rq;
+#endif
+ tg->autogroup = ag;
+
+ sched_online_group(tg, &root_task_group);
+ return ag;
+
+out_free:
+ kfree(ag);
+out_fail:
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "autogroup_create: %s failure.\n",
+ ag ? "sched_create_group()" : "kmalloc()");
+ }
+
+ return autogroup_kref_get(&autogroup_default);
+}
+
+bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+ if (tg != &root_task_group)
+ return false;
+
+ /*
+ * We can only assume the task group can't go away on us if
+ * autogroup_move_group() can see us on ->thread_group list.
+ */
+ if (p->flags & PF_EXITING)
+ return false;
+
+ return true;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+ struct autogroup *prev;
+ struct task_struct *t;
+ unsigned long flags;
+
+ BUG_ON(!lock_task_sighand(p, &flags));
+
+ prev = p->signal->autogroup;
+ if (prev == ag) {
+ unlock_task_sighand(p, &flags);
+ return;
+ }
+
+ p->signal->autogroup = autogroup_kref_get(ag);
+
+ if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ goto out;
+
+ for_each_thread(p, t)
+ sched_move_task(t);
+out:
+ unlock_task_sighand(p, &flags);
+ autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+ struct autogroup *ag = autogroup_create();
+
+ autogroup_move_group(p, ag);
+ /* drop extra reference added by autogroup_create() */
+ autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock. Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+ autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+ sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+ autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;
+
+ return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
+{
+ static unsigned long next = INITIAL_JIFFIES;
+ struct autogroup *ag;
+ int err;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -EINVAL;
+
+ err = security_task_setnice(current, nice);
+ if (err)
+ return err;
+
+ if (nice < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ /* this is a heavy operation taking global locks.. */
+ if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+ return -EAGAIN;
+
+ next = HZ / 10 + jiffies;
+ ag = autogroup_task_get(p);
+
+ down_write(&ag->lock);
+ err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
+ if (!err)
+ ag->nice = nice;
+ up_write(&ag->lock);
+
+ autogroup_kref_put(ag);
+
+ return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+ struct autogroup *ag = autogroup_task_get(p);
+
+ if (!task_group_is_autogroup(ag->tg))
+ goto out;
+
+ down_read(&ag->lock);
+ seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+ up_read(&ag->lock);
+
+out:
+ autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ if (!task_group_is_autogroup(tg))
+ return 0;
+
+ return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
new file mode 100644
index 000000000..8bd047142
--- /dev/null
+++ b/kernel/sched/auto_group.h
@@ -0,0 +1,64 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/kref.h>
+#include <linux/rwsem.h>
+
+struct autogroup {
+ /*
+ * reference doesn't mean how many thread attach to this
+ * autogroup now. It just stands for the number of task
+ * could use this autogroup.
+ */
+ struct kref kref;
+ struct task_group *tg;
+ struct rw_semaphore lock;
+ unsigned long id;
+ int nice;
+};
+
+extern void autogroup_init(struct task_struct *init_task);
+extern void autogroup_free(struct task_group *tg);
+
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return !!tg->autogroup;
+}
+
+extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (enabled && task_wants_autogroup(p, tg))
+ return p->signal->autogroup->tg;
+
+ return tg;
+}
+
+extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) { }
+static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return 0;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
new file mode 100644
index 000000000..a6d06efc1
--- /dev/null
+++ b/kernel/sched/bfs.c
@@ -0,0 +1,7429 @@
+/*
+ * kernel/sched/bfs.c, was kernel/sched.c
+ *
+ * Kernel scheduler and related syscalls
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ *
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
+ * make semaphores SMP safe
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
+ * hybrid priority-list and round-robin design with
+ * an array-switch method of distributing timeslices
+ * and per-CPU runqueues. Cleanups and useful suggestions
+ * by Davide Libenzi, preemptible kernel bits by Robert Love.
+ * 2003-09-03 Interactivity tuning by Con Kolivas.
+ * 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
+ * fair scheduling design by Con Kolivas.
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
+ * by Peter Williams
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ * Thomas Gleixner, Mike Kravetz
+ * now Brainfuck deadline scheduling policy by Con Kolivas deletes
+ * a whole lot of those previous things.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/log2.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
+#include <linux/sched/prio.h>
+
+#include <asm/irq_regs.h>
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+#include <asm/unistd.h>
+#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+#include "cpupri.h"
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+#include "bfs_sched.h"
+
+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p) rt_prio((p)->prio)
+#define rt_queue(rq) rt_prio((rq)->rq_prio)
+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
+ (policy) == SCHED_RR)
+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
+
+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO)
+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy))
+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO)
+#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy)))
+
+#define is_iso_policy(policy) ((policy) == SCHED_ISO)
+#define iso_task(p) unlikely(is_iso_policy((p)->policy))
+#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy))
+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO)
+#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO)
+
+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
+
+#define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
+
+#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO)
+#define STOP_PRIO (MAX_RT_PRIO - 1)
+
+/*
+ * Some helpers for converting to/from various scales. Use shifts to get
+ * approximate multiples of ten for less overhead.
+ */
+#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
+#define JIFFY_NS (1000000000 / HZ)
+#define HALF_JIFFY_NS (1000000000 / HZ / 2)
+#define HALF_JIFFY_US (1000000 / HZ / 2)
+#define MS_TO_NS(TIME) ((TIME) << 20)
+#define MS_TO_US(TIME) ((TIME) << 10)
+#define NS_TO_MS(TIME) ((TIME) >> 20)
+#define NS_TO_US(TIME) ((TIME) >> 10)
+
+#define RESCHED_US (100) /* Reschedule if less than this many μs left */
+
+void print_scheduler_version(void)
+{
+ printk(KERN_INFO "BFS CPU scheduler v0.463 by Con Kolivas.\n");
+}
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
+ * Tunable via /proc interface.
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+int rr_interval __read_mostly = 3;
+#else
+int rr_interval __read_mostly = 6;
+#endif
+
+/*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run five seconds as real time tasks. This is the total over
+ * all online cpus.
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+int sched_iso_cpu __read_mostly = 25;
+#else
+int sched_iso_cpu __read_mostly = 70;
+#endif
+
+/*
+ * The relative length of deadline for each priority(nice) level.
+ */
+static int prio_ratios[NICE_WIDTH] __read_mostly;
+
+/*
+ * The quota handed out to tasks of all priority levels when refilling their
+ * time_slice.
+ */
+static inline int timeslice(void)
+{
+ return MS_TO_US(rr_interval);
+}
+
+/*
+ * The global runqueue data that all CPUs work off. Data is protected either
+ * by the global grq lock, or the discrete lock that precedes the data in this
+ * struct.
+ */
+struct global_rq {
+ raw_spinlock_t lock;
+ unsigned long nr_running;
+ unsigned long nr_uninterruptible;
+ unsigned long long nr_switches;
+ struct list_head queue[PRIO_LIMIT];
+ DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
+ unsigned long qnr; /* queued not running */
+#ifdef CONFIG_SMP
+ cpumask_t cpu_idle_map;
+ bool idle_cpus;
+#endif
+ int noc; /* num_online_cpus stored and updated when it changes */
+ u64 niffies; /* Nanosecond jiffies */
+ unsigned long last_jiffy; /* Last jiffy we updated niffies */
+
+ raw_spinlock_t iso_lock;
+ int iso_ticks;
+ bool iso_refractory;
+};
+
+#ifdef CONFIG_SMP
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
+
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_var_t rto_mask;
+ struct cpupri cpupri;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/* There can be only one */
+static struct global_rq grq;
+
+static DEFINE_MUTEX(sched_hotcpu_mutex);
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu)
+{
+ return &per_cpu(runqueues, (cpu));
+}
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+/*
+ * sched_domains_mutex serialises calls to init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+static DEFINE_MUTEX(sched_domains_mutex);
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
+}
+#else
+struct rq *uprq;
+#endif /* CONFIG_SMP */
+
+static inline void update_rq_clock(struct rq *rq);
+
+/*
+ * Sanity check should sched_clock return bogus values. We make sure it does
+ * not appear to go backwards, and use jiffies to determine the maximum and
+ * minimum it could possibly have increased, and round down to the nearest
+ * jiffy when it falls outside this.
+ */
+static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
+{
+ unsigned long min_diff, max_diff;
+
+ if (jiff_diff > 1)
+ min_diff = JIFFIES_TO_NS(jiff_diff - 1);
+ else
+ min_diff = 1;
+ /* Round up to the nearest tick for maximum */
+ max_diff = JIFFIES_TO_NS(jiff_diff + 1);
+
+ if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
+ *niff_diff = min_diff;
+}
+
+#ifdef CONFIG_SMP
+static inline int cpu_of(struct rq *rq)
+{
+ return rq->cpu;
+}
+
+/*
+ * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
+ * clock is updated with the grq.lock held, it is an opportunity to update the
+ * niffies value. Any CPU can update it by adding how much its clock has
+ * increased since it last updated niffies, minus any added niffies by other
+ * CPUs.
+ */
+static inline void update_clocks(struct rq *rq)
+{
+ s64 ndiff;
+ long jdiff;
+
+ update_rq_clock(rq);
+ ndiff = rq->clock - rq->old_clock;
+ /* old_clock is only updated when we are updating niffies */
+ rq->old_clock = rq->clock;
+ ndiff -= grq.niffies - rq->last_niffy;
+ jdiff = jiffies - grq.last_jiffy;
+ niffy_diff(&ndiff, jdiff);
+ grq.last_jiffy += jdiff;
+ grq.niffies += ndiff;
+ rq->last_niffy = grq.niffies;
+}
+#else /* CONFIG_SMP */
+static inline int cpu_of(struct rq *rq)
+{
+ return 0;
+}
+
+static inline void update_clocks(struct rq *rq)
+{
+ s64 ndiff;
+ long jdiff;
+
+ update_rq_clock(rq);
+ ndiff = rq->clock - rq->old_clock;
+ rq->old_clock = rq->clock;
+ jdiff = jiffies - grq.last_jiffy;
+ niffy_diff(&ndiff, jdiff);
+ grq.last_jiffy += jdiff;
+ grq.niffies += ndiff;
+}
+#endif
+
+#include "stats.h"
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
+#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
+
+/*
+ * All common locking functions performed on grq.lock. rq->clock is local to
+ * the CPU accessing it so it can be modified just with interrupts disabled
+ * when we're not updating niffies.
+ * Looking up task_rq must be done under grq.lock to be safe.
+ */
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static inline void update_rq_clock(struct rq *rq)
+{
+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+
+ if (unlikely(delta < 0))
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+static inline bool task_running(struct task_struct *p)
+{
+ return p->on_cpu;
+}
+
+static inline void grq_lock(void)
+ __acquires(grq.lock)
+{
+ raw_spin_lock(&grq.lock);
+}
+
+static inline void grq_unlock(void)
+ __releases(grq.lock)
+{
+ raw_spin_unlock(&grq.lock);
+}
+
+static inline void grq_lock_irq(void)
+ __acquires(grq.lock)
+{
+ raw_spin_lock_irq(&grq.lock);
+}
+
+static inline void time_lock_grq(struct rq *rq)
+ __acquires(grq.lock)
+{
+ grq_lock();
+ update_clocks(rq);
+}
+
+static inline void grq_unlock_irq(void)
+ __releases(grq.lock)
+{
+ raw_spin_unlock_irq(&grq.lock);
+}
+
+static inline void grq_lock_irqsave(unsigned long *flags)
+ __acquires(grq.lock)
+{
+ raw_spin_lock_irqsave(&grq.lock, *flags);
+}
+
+static inline void grq_unlock_irqrestore(unsigned long *flags)
+ __releases(grq.lock)
+{
+ raw_spin_unlock_irqrestore(&grq.lock, *flags);
+}
+
+static inline struct rq
+*task_grq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(grq.lock)
+{
+ grq_lock_irqsave(flags);
+ return task_rq(p);
+}
+
+static inline struct rq
+*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(grq.lock)
+{
+ struct rq *rq = task_grq_lock(p, flags);
+ update_clocks(rq);
+ return rq;
+}
+
+static inline struct rq *task_grq_lock_irq(struct task_struct *p)
+ __acquires(grq.lock)
+{
+ grq_lock_irq();
+ return task_rq(p);
+}
+
+static inline void time_task_grq_lock_irq(struct task_struct *p)
+ __acquires(grq.lock)
+{
+ struct rq *rq = task_grq_lock_irq(p);
+ update_clocks(rq);
+}
+
+static inline void task_grq_unlock_irq(void)
+ __releases(grq.lock)
+{
+ grq_unlock_irq();
+}
+
+static inline void task_grq_unlock(unsigned long *flags)
+ __releases(grq.lock)
+{
+ grq_unlock_irqrestore(flags);
+}
+
+/**
+ * grunqueue_is_locked
+ *
+ * Returns true if the global runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+bool grunqueue_is_locked(void)
+{
+ return raw_spin_is_locked(&grq.lock);
+}
+
+void grq_unlock_wait(void)
+ __releases(grq.lock)
+{
+ smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+ raw_spin_unlock_wait(&grq.lock);
+}
+
+static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
+ __acquires(grq.lock)
+{
+ local_irq_save(*flags);
+ time_lock_grq(rq);
+}
+
+static inline struct rq *__task_grq_lock(struct task_struct *p)
+ __acquires(grq.lock)
+{
+ grq_lock();
+ return task_rq(p);
+}
+
+static inline void __task_grq_unlock(void)
+ __releases(grq.lock)
+{
+ grq_unlock();
+}
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ grq.lock.owner = current;
+#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
+
+ grq_unlock_irq();
+}
+
+static inline bool deadline_before(u64 deadline, u64 time)
+{
+ return (deadline < time);
+}
+
+static inline bool deadline_after(u64 deadline, u64 time)
+{
+ return (deadline > time);
+}
+
+/*
+ * A task that is queued but not running will be on the grq run list.
+ * A task that is not running or queued will not be on the grq run list.
+ * A task that is currently running will have ->on_cpu set but not on the
+ * grq run list.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+ return (!list_empty(&p->run_list));
+}
+
+/*
+ * Removing from the global runqueue. Enter with grq locked.
+ */
+static void dequeue_task(struct task_struct *p)
+{
+ list_del_init(&p->run_list);
+ if (list_empty(grq.queue + p->prio))
+ __clear_bit(p->prio, grq.prio_bitmap);
+ sched_info_dequeued(task_rq(p), p);
+}
+
+/*
+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
+ * an idle task, we ensure none of the following conditions are met.
+ */
+static bool idleprio_suitable(struct task_struct *p)
+{
+ return (!freezing(p) && !signal_pending(p) &&
+ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
+}
+
+/*
+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
+ * that the iso_refractory flag is not set.
+ */
+static bool isoprio_suitable(void)
+{
+ return !grq.iso_refractory;
+}
+
+/*
+ * Adding to the global runqueue. Enter with grq locked.
+ */
+static void enqueue_task(struct task_struct *p, struct rq *rq)
+{
+ if (!rt_task(p)) {
+ /* Check it hasn't gotten rt from PI */
+ if ((idleprio_task(p) && idleprio_suitable(p)) ||
+ (iso_task(p) && isoprio_suitable()))
+ p->prio = p->normal_prio;
+ else
+ p->prio = NORMAL_PRIO;
+ }
+ __set_bit(p->prio, grq.prio_bitmap);
+ list_add_tail(&p->run_list, grq.queue + p->prio);
+ sched_info_queued(rq, p);
+}
+
+static inline void requeue_task(struct task_struct *p)
+{
+ sched_info_queued(task_rq(p), p);
+}
+
+/*
+ * Returns the relative length of deadline all compared to the shortest
+ * deadline which is that of nice -20.
+ */
+static inline int task_prio_ratio(struct task_struct *p)
+{
+ return prio_ratios[TASK_USER_PRIO(p)];
+}
+
+/*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+static inline int task_timeslice(struct task_struct *p)
+{
+ return (rr_interval * task_prio_ratio(p) / 128);
+}
+
+static void resched_task(struct task_struct *p);
+
+static inline void resched_curr(struct rq *rq)
+{
+ resched_task(rq->curr);
+}
+
+/*
+ * qnr is the "queued but not running" count which is the total number of
+ * tasks on the global runqueue list waiting for cpu time but not actually
+ * currently running on a cpu.
+ */
+static inline void inc_qnr(void)
+{
+ grq.qnr++;
+}
+
+static inline void dec_qnr(void)
+{
+ grq.qnr--;
+}
+
+static inline int queued_notrunning(void)
+{
+ return grq.qnr;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
+ * allow easy lookup of whether any suitable idle CPUs are available.
+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
+ * idle_cpus variable than to do a full bitmask check when we are busy.
+ */
+static inline void set_cpuidle_map(int cpu)
+{
+ if (likely(cpu_online(cpu))) {
+ cpumask_set_cpu(cpu, &grq.cpu_idle_map);
+ grq.idle_cpus = true;
+ }
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+ cpumask_clear_cpu(cpu, &grq.cpu_idle_map);
+ if (cpumask_empty(&grq.cpu_idle_map))
+ grq.idle_cpus = false;
+}
+
+static bool suitable_idle_cpus(struct task_struct *p)
+{
+ if (!grq.idle_cpus)
+ return false;
+ return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
+}
+
+#define CPUIDLE_DIFF_THREAD (1)
+#define CPUIDLE_DIFF_CORE (2)
+#define CPUIDLE_CACHE_BUSY (4)
+#define CPUIDLE_DIFF_CPU (8)
+#define CPUIDLE_THREAD_BUSY (16)
+#define CPUIDLE_THROTTLED (32)
+#define CPUIDLE_DIFF_NODE (64)
+
+static inline bool scaling_rq(struct rq *rq);
+
+/*
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
+ *
+ * Same core, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+ */
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THROTTLED |
+ CPUIDLE_THREAD_BUSY | CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY |
+ CPUIDLE_DIFF_CORE | CPUIDLE_DIFF_THREAD;
+ int cpu_tmp;
+
+ if (cpumask_test_cpu(best_cpu, tmpmask))
+ goto out;
+
+ for_each_cpu(cpu_tmp, tmpmask) {
+ int ranking, locality;
+ struct rq *tmp_rq;
+
+ ranking = 0;
+ tmp_rq = cpu_rq(cpu_tmp);
+
+ locality = rq->cpu_locality[cpu_tmp];
+#ifdef CONFIG_NUMA
+ if (locality > 3)
+ ranking |= CPUIDLE_DIFF_NODE;
+ else
+#endif
+ if (locality > 2)
+ ranking |= CPUIDLE_DIFF_CPU;
+#ifdef CONFIG_SCHED_MC
+ else if (locality == 2)
+ ranking |= CPUIDLE_DIFF_CORE;
+ if (!(tmp_rq->cache_idle(cpu_tmp)))
+ ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+ if (locality == 1)
+ ranking |= CPUIDLE_DIFF_THREAD;
+ if (!(tmp_rq->siblings_idle(cpu_tmp)))
+ ranking |= CPUIDLE_THREAD_BUSY;
+#endif
+ if (scaling_rq(tmp_rq))
+ ranking |= CPUIDLE_THROTTLED;
+
+ if (ranking < best_ranking) {
+ best_cpu = cpu_tmp;
+ best_ranking = ranking;
+ }
+ }
+out:
+ return best_cpu;
+}
+
+static void resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+ best_cpu = best_mask_cpu(best_cpu, rq, tmpmask);
+ resched_curr(cpu_rq(best_cpu));
+}
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+
+ return (this_rq->cpu_locality[that_cpu] < 3);
+}
+
+#ifdef CONFIG_SCHED_SMT
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues is done lockless as the occasional wrong
+ * value would be harmless. */
+static int best_smt_bias(int cpu)
+{
+ int other_cpu, best_bias = 0;
+
+ for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ struct rq *rq;
+
+ if (other_cpu == cpu)
+ continue;
+ rq = cpu_rq(other_cpu);
+ if (rq_idle(rq))
+ continue;
+ if (!rq->online)
+ continue;
+ if (!rq->rq_mm)
+ continue;
+ if (likely(rq->rq_smt_bias > best_bias))
+ best_bias = rq->rq_smt_bias;
+ }
+ return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+ if (rt_task(p))
+ return 1 << 30;
+ else if (task_running_iso(p))
+ return 1 << 29;
+ else if (task_running_idle(p))
+ return 0;
+ return MAX_PRIO - p->static_prio;
+}
+
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
+ * nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, int cpu)
+{
+ int best_bias, task_bias;
+
+ /* Kernel threads always run */
+ if (unlikely(!p->mm))
+ return true;
+ if (rt_task(p))
+ return true;
+ if (!idleprio_suitable(p))
+ return true;
+ best_bias = best_smt_bias(cpu);
+ /* The smt siblings are all idle or running IDLEPRIO */
+ if (best_bias < 1)
+ return true;
+ task_bias = task_prio_bias(p);
+ if (task_bias < 1)
+ return false;
+ if (task_bias >= best_bias)
+ return true;
+ /* Dither 25% cpu of normal tasks regardless of nice difference */
+ if (best_bias % 4 == 1)
+ return true;
+ /* Sorry, you lose */
+ return false;
+}
+#endif
+#endif
+
+static bool resched_best_idle(struct task_struct *p)
+{
+ cpumask_t tmpmask;
+ int best_cpu;
+
+ cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
+ best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
+#ifdef CONFIG_SMT_NICE
+ if (!smt_should_schedule(p, best_cpu))
+ return false;
+#endif
+ resched_curr(cpu_rq(best_cpu));
+ return true;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p);
+}
+/*
+ * Flags to tell us whether this CPU is running a CPU frequency governor that
+ * has slowed its speed or not. No locking required as the very rare wrongly
+ * read value would be harmless.
+ */
+void cpu_scaling(int cpu)
+{
+ cpu_rq(cpu)->scaling = true;
+}
+
+void cpu_nonscaling(int cpu)
+{
+ cpu_rq(cpu)->scaling = false;
+}
+
+static inline bool scaling_rq(struct rq *rq)
+{
+ return rq->scaling;
+}
+
+static inline int locality_diff(struct task_struct *p, struct rq *rq)
+{
+ return rq->cpu_locality[task_cpu(p)];
+}
+#else /* CONFIG_SMP */
+static inline void set_cpuidle_map(int cpu)
+{
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+}
+
+static inline bool suitable_idle_cpus(struct task_struct *p)
+{
+ return uprq->curr == uprq->idle;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+}
+
+void cpu_scaling(int __unused)
+{
+}
+
+void cpu_nonscaling(int __unused)
+{
+}
+
+/*
+ * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
+ * always returns 0.
+ */
+static inline bool scaling_rq(struct rq *rq)
+{
+ return false;
+}
+
+static inline int locality_diff(struct task_struct *p, struct rq *rq)
+{
+ return 0;
+}
+#endif /* CONFIG_SMP */
+EXPORT_SYMBOL_GPL(cpu_scaling);
+EXPORT_SYMBOL_GPL(cpu_nonscaling);
+
+static inline int normal_prio(struct task_struct *p)
+{
+ if (has_rt_policy(p))
+ return MAX_RT_PRIO - 1 - p->rt_priority;
+ if (idleprio_task(p))
+ return IDLE_PRIO;
+ if (iso_task(p))
+ return ISO_PRIO;
+ return NORMAL_PRIO;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks as it will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with grq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+ update_clocks(rq);
+
+ /*
+ * Sleep time is in units of nanosecs, so shift by 20 to get a
+ * milliseconds-range estimation of the amount of time that the task
+ * spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ if (p->state == TASK_UNINTERRUPTIBLE)
+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+ (rq->clock_task - p->last_ran) >> 20);
+ }
+
+ p->prio = effective_prio(p);
+ if (task_contributes_to_load(p))
+ grq.nr_uninterruptible--;
+ enqueue_task(p, rq);
+ rq->soft_affined++;
+ p->on_rq = 1;
+ grq.nr_running++;
+ inc_qnr();
+}
+
+static inline void clear_sticky(struct task_struct *p);
+
+/*
+ * deactivate_task - If it's running, it's not on the grq and we can just
+ * decrement the nr_running. Enter with grq locked.
+ */
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+ if (task_contributes_to_load(p))
+ grq.nr_uninterruptible++;
+ rq->soft_affined--;
+ p->on_rq = 0;
+ grq.nr_running--;
+ clear_sticky(p);
+}
+
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold grq lock.
+ */
+ WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
+#endif
+ if (task_cpu(p) == cpu)
+ return;
+ trace_sched_migrate_task(p, cpu);
+ perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+ /*
+ * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
+ * successfully executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+ if (p->on_rq) {
+ task_rq(p)->soft_affined--;
+ cpu_rq(cpu)->soft_affined++;
+ }
+ task_thread_info(p)->cpu = cpu;
+}
+
+static inline void clear_sticky(struct task_struct *p)
+{
+ p->sticky = false;
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+ return p->sticky;
+}
+
+/* Reschedule the best idle CPU that is not this one. */
+static void
+resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
+{
+ cpumask_t tmpmask;
+
+ cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
+ cpumask_clear_cpu(cpu, &tmpmask);
+ if (cpumask_empty(&tmpmask))
+ return;
+ resched_best_mask(cpu, rq, &tmpmask);
+}
+
+/*
+ * We set the sticky flag on a task that is descheduled involuntarily meaning
+ * it is awaiting further CPU time. If the last sticky task is still sticky
+ * but unlucky enough to not be the next task scheduled, we unstick it and try
+ * to find it an idle CPU. Realtime tasks do not stick to minimise their
+ * latency at all times.
+ */
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+ if (rq->sticky_task) {
+ if (rq->sticky_task == p) {
+ p->sticky = true;
+ return;
+ }
+ if (task_sticky(rq->sticky_task)) {
+ clear_sticky(rq->sticky_task);
+ resched_closest_idle(rq, cpu, rq->sticky_task);
+ }
+ }
+ if (!rt_task(p)) {
+ p->sticky = true;
+ rq->sticky_task = p;
+ } else {
+ resched_closest_idle(rq, cpu, p);
+ rq->sticky_task = NULL;
+ }
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+ rq->sticky_task = NULL;
+ clear_sticky(p);
+}
+#else
+static inline void clear_sticky(struct task_struct *p)
+{
+}
+
+static inline bool task_sticky(struct task_struct *p)
+{
+ return false;
+}
+
+static inline void
+swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
+{
+}
+
+static inline void unstick_task(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
+/*
+ * Move a task off the global queue and take it to a cpu for it will
+ * become the running task.
+ */
+static inline void take_task(int cpu, struct task_struct *p)
+{
+ set_task_cpu(p, cpu);
+ dequeue_task(p);
+ clear_sticky(p);
+ dec_qnr();
+}
+
+/*
+ * Returns a descheduling task to the grq runqueue unless it is being
+ * deactivated.
+ */
+static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate)
+{
+ if (deactivate)
+ deactivate_task(p, rq);
+ else {
+ inc_qnr();
+ enqueue_task(p, rq);
+ }
+}
+
+/* Enter with grq lock held. We know p is on the local cpu */
+static inline void __set_tsk_resched(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
+{
+ int cpu;
+
+ lockdep_assert_held(&grq.lock);
+
+ if (test_tsk_need_resched(p))
+ return;
+
+ set_tsk_need_resched(p);
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id()) {
+ set_preempt_need_resched();
+ return;
+ }
+
+ smp_send_reschedule(cpu);
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+ return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+struct migration_req {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+ unsigned long flags;
+ bool running, on_rq;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since this will return false
+ * if the runqueue has changed and p is actually now
+ * running somewhere else!
+ */
+ while (task_running(p) && p == rq->curr) {
+ if (match_state && unlikely(p->state != match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the grq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_grq_lock(p, &flags);
+ trace_sched_wait_task(p);
+ running = task_running(p);
+ on_rq = p->on_rq;
+ ncsw = 0;
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ task_grq_unlock(&flags);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(on_rq)) {
+ ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if ((cpu != smp_processor_id()) && task_curr(p))
+ smp_send_reschedule(cpu);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif
+
+/*
+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
+ * between themselves, they cooperatively multitask. An idle rq scores as
+ * prio PRIO_LIMIT so it is always preempted.
+ */
+static inline bool
+can_preempt(struct task_struct *p, int prio, u64 deadline)
+{
+ /* Better static priority RT task or better policy preemption */
+ if (p->prio < prio)
+ return true;
+ if (p->prio > prio)
+ return false;
+ /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
+ if (!deadline_before(p->deadline, deadline))
+ return false;
+ return true;
+}
+
+#ifdef CONFIG_SMP
+#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Check to see if there is a task that is affined only to offline CPUs but
+ * still wants runtime. This happens to kernel threads during suspend/halt and
+ * disabling of CPUs.
+ */
+static inline bool online_cpus(struct task_struct *p)
+{
+ return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed)));
+}
+#else /* CONFIG_HOTPLUG_CPU */
+/* All available CPUs are always online without hotplug. */
+static inline bool online_cpus(struct task_struct *p)
+{
+ return true;
+}
+#endif
+
+/*
+ * Check to see if p can run on cpu, and if not, whether there are any online
+ * CPUs it can run on instead.
+ */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
+ return true;
+ return false;
+}
+
+/*
+ * When all else is equal, still prefer this_rq.
+ */
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+ struct rq *highest_prio_rq = NULL;
+ int cpu, highest_prio;
+ u64 latest_deadline;
+ cpumask_t tmp;
+
+ /*
+ * We clear the sticky flag here because for a task to have called
+ * try_preempt with the sticky flag enabled means some complicated
+ * re-scheduling has occurred and we should ignore the sticky flag.
+ */
+ clear_sticky(p);
+
+ if (suitable_idle_cpus(p) && resched_best_idle(p))
+ return;
+
+ /* IDLEPRIO tasks never preempt anything but idle */
+ if (p->policy == SCHED_IDLEPRIO)
+ return;
+
+ if (likely(online_cpus(p)))
+ cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
+ else
+ return;
+
+ highest_prio = latest_deadline = 0;
+
+ for_each_cpu(cpu, &tmp) {
+ struct rq *rq;
+ int rq_prio;
+
+ rq = cpu_rq(cpu);
+ rq_prio = rq->rq_prio;
+ if (rq_prio < highest_prio)
+ continue;
+
+ if (rq_prio > highest_prio ||
+ deadline_after(rq->rq_deadline, latest_deadline)) {
+ latest_deadline = rq->rq_deadline;
+ highest_prio = rq_prio;
+ highest_prio_rq = rq;
+ }
+ }
+
+ if (likely(highest_prio_rq)) {
+#ifdef CONFIG_SMT_NICE
+ cpu = cpu_of(highest_prio_rq);
+ if (!smt_should_schedule(p, cpu))
+ return;
+#endif
+ if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
+ resched_curr(highest_prio_rq);
+ }
+}
+#else /* CONFIG_SMP */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+ return false;
+}
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+ if (p->policy == SCHED_IDLEPRIO)
+ return;
+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
+ resched_curr(uprq);
+}
+#endif /* CONFIG_SMP */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+#ifdef CONFIG_SCHEDSTATS
+ struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+
+ if (cpu == this_cpu)
+ schedstat_inc(rq, ttwu_local);
+ else {
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+#endif /* CONFIG_SMP */
+
+ schedstat_inc(rq, ttwu_count);
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ grq_lock_irqsave(&flags);
+ if (likely(is_idle_task(rq->curr)))
+ smp_send_reschedule(cpu);
+ /* Else cpu is not in idle, do nothing here */
+ grq_unlock_irqrestore(&flags);
+
+out:
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_SMP
+void scheduler_ipi(void)
+{
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ preempt_fold_need_resched();
+}
+#endif
+
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+ bool is_sync)
+{
+ activate_task(p, rq);
+
+ /*
+ * Sync wakeups (i.e. those types of wakeups where the waker
+ * has indicated that it will leave the CPU in short order)
+ * don't trigger a preemption if there are no idle cpus,
+ * instead waiting for current to deschedule.
+ */
+ if (!is_sync || suitable_idle_cpus(p))
+ try_preempt(p, rq);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+ bool success)
+{
+ trace_sched_wakeup(p, success);
+ p->state = TASK_RUNNING;
+
+ /*
+ * if a worker is waking up, notify workqueue. Note that on BFS, we
+ * don't really know what cpu it will be, so we fake it for
+ * wq_worker_waking_up :/
+ */
+ if ((p->flags & PF_WQ_WORKER) && success)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Return: %true if @p was woken up, %false if it was already running.
+ * or @state didn't match @p's state.
+ */
+static bool try_to_wake_up(struct task_struct *p, unsigned int state,
+ int wake_flags)
+{
+ bool success = false;
+ unsigned long flags;
+ struct rq *rq;
+ int cpu;
+
+ get_cpu();
+
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
+
+ /*
+ * No need to do time_lock_grq as we only need to update the rq clock
+ * if we activate the task
+ */
+ rq = task_grq_lock(p, &flags);
+ cpu = task_cpu(p);
+
+ /* state is a volatile long, ă©ă†ă—ă¦ă€åˆ†ă‹ă‚‰ăªă„ */
+ if (!((unsigned int)p->state & state))
+ goto out_unlock;
+
+ if (task_queued(p) || task_running(p))
+ goto out_running;
+
+ ttwu_activate(p, rq, wake_flags & WF_SYNC);
+ success = true;
+
+out_running:
+ ttwu_post_activation(p, rq, success);
+out_unlock:
+ task_grq_unlock(&flags);
+
+ ttwu_stat(p, cpu, wake_flags);
+
+ put_cpu();
+
+ return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with grq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that grq is locked and, @p is not the current task.
+ * grq stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+ bool success = false;
+
+ lockdep_assert_held(&grq.lock);
+
+ if (!(p->state & TASK_NORMAL))
+ return;
+
+ if (!task_queued(p)) {
+ if (likely(!task_running(p))) {
+ schedstat_inc(rq, ttwu_count);
+ schedstat_inc(rq, ttwu_local);
+ }
+ ttwu_activate(p, rq, false);
+ ttwu_stat(p, smp_processor_id(), 0);
+ success = true;
+ }
+ ttwu_post_activation(p, rq, success);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+ return try_to_wake_up(p, state, 0);
+}
+
+static void time_slice_expired(struct task_struct *p);
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+ /*
+ * The process state is set to the same value of the process executing
+ * do_fork() code. That is running. This guarantees that nobody will
+ * actually run it, and a signal or other external event cannot wake
+ * it up and insert it on the runqueue either.
+ */
+
+ /* Should be reset in fork.c but done here for ease of bfs patching */
+ p->on_rq =
+ p->utime =
+ p->stime =
+ p->utimescaled =
+ p->stimescaled =
+ p->sched_time =
+ p->stime_pc =
+ p->utime_pc = 0;
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+ p->policy = SCHED_NORMAL;
+ p->normal_prio = normal_prio(p);
+ }
+
+ if (PRIO_TO_NICE(p->static_prio) < 0) {
+ p->static_prio = NICE_TO_PRIO(0);
+ p->normal_prio = p->static_prio;
+ }
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
+ INIT_LIST_HEAD(&p->run_list);
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ if (unlikely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+ p->on_cpu = false;
+ clear_sticky(p);
+ init_task_preempt_count(p);
+ return 0;
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+ struct task_struct *parent;
+ unsigned long flags;
+ struct rq *rq;
+
+ parent = p->parent;
+ rq = task_grq_lock(p, &flags);
+
+ /*
+ * Reinit new task deadline as its creator deadline could have changed
+ * since call to dup_task_struct().
+ */
+ p->deadline = rq->rq_deadline;
+
+ /*
+ * If the task is a new process, current and parent are the same. If
+ * the task is a new thread in the thread group, it will have much more
+ * in common with current than with the parent.
+ */
+ set_task_cpu(p, task_cpu(rq->curr));
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child.
+ */
+ p->prio = rq->curr->normal_prio;
+
+ activate_task(p, rq);
+ trace_sched_wakeup_new(p, 1);
+ if (unlikely(p->policy == SCHED_FIFO))
+ goto after_ts_init;
+
+ /*
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness. If it's negative, it won't
+ * matter since that's the same as being 0. current's time_slice is
+ * actually in rq_time_slice when it's running, as is its last_ran
+ * value. rq->rq_deadline is only modified within schedule() so it
+ * is always equal to current->deadline.
+ */
+ p->last_ran = rq->rq_last_ran;
+ if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+ rq->rq_time_slice /= 2;
+ p->time_slice = rq->rq_time_slice;
+after_ts_init:
+ if (rq->curr == parent && !suitable_idle_cpus(p)) {
+ /*
+ * The VM isn't cloned, so we're in a good position to
+ * do child-runs-first in anticipation of an exec. This
+ * usually avoids a lot of COW overhead.
+ */
+ __set_tsk_resched(parent);
+ } else
+ try_preempt(p, rq);
+ } else {
+ if (rq->curr == parent) {
+ /*
+ * Forking task has run out of timeslice. Reschedule it and
+ * start its child with a new time slice and deadline. The
+ * child will end up running first because its deadline will
+ * be slightly earlier.
+ */
+ rq->rq_time_slice = 0;
+ __set_tsk_resched(parent);
+ }
+ time_slice_expired(p);
+ }
+ task_grq_unlock(&flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ sched_info_switch(rq, prev, next);
+ perf_event_task_sched_out(prev, next);
+ fire_sched_out_preempt_notifiers(prev, next);
+ prepare_lock_switch(rq, next);
+ prepare_arch_switch(next);
+ trace_sched_switch(prev, next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
+ */
+static struct rq *finish_task_switch(struct task_struct *prev)
+ __releases(grq.lock)
+{
+ struct rq *rq = this_rq();
+ struct mm_struct *mm = rq->prev_mm;
+ long prev_state;
+
+ rq->prev_mm = NULL;
+
+ /*
+ * A task struct has one reference for the use as "current".
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ * The test for TASK_DEAD must occur while the runqueue locks are
+ * still held, otherwise prev could be scheduled on another cpu, die
+ * there before we look at prev->state, and then the reference would
+ * be dropped twice.
+ * Manfred Spraul <manfred@colorfullife.com>
+ */
+ prev_state = prev->state;
+ vtime_task_switch(prev);
+ finish_arch_switch(prev);
+ perf_event_task_sched_in(prev, current);
+ finish_lock_switch(rq, prev);
+ finish_arch_post_lock_switch();
+
+ fire_sched_in_preempt_notifiers(current);
+ if (mm)
+ mmdrop(mm);
+ if (unlikely(prev_state == TASK_DEAD)) {
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+ put_task_struct(prev);
+ }
+ return rq;
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
+ __releases(grq.lock)
+{
+ struct rq *rq;
+
+ /* finish_task_switch() drops rq->lock and enables preemption */
+ preempt_disable();
+ rq = finish_task_switch(prev);
+ preempt_enable();
+
+ if (current->set_child_tid)
+ put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct mm_struct *mm, *oldmm;
+
+ prepare_task_switch(rq, prev, next);
+
+ mm = next->mm;
+ oldmm = prev->active_mm;
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_start_context_switch(prev);
+
+ if (!mm) {
+ next->active_mm = oldmm;
+ atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next);
+ } else
+ switch_mm(oldmm, mm, next);
+
+ if (!prev->mm) {
+ prev->active_mm = NULL;
+ rq->prev_mm = oldmm;
+ }
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+
+ /* Here we just switch the register state and the stack. */
+ context_tracking_task_switch(prev, next);
+ switch_to(prev, next, prev);
+
+ barrier();
+
+ return finish_task_switch(prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup. All are
+ * measured without grabbing the grq lock but the occasional inaccurate result
+ * doesn't matter so long as it's positive.
+ */
+unsigned long nr_running(void)
+{
+ long nr = grq.nr_running;
+
+ if (unlikely(nr < 0))
+ nr = 0;
+ return (unsigned long)nr;
+}
+
+static unsigned long nr_uninterruptible(void)
+{
+ long nu = grq.nr_uninterruptible;
+
+ if (unlikely(nu < 0))
+ nu = 0;
+ return nu;
+}
+
+/*
+ * Check if only the current task is running on the cpu.
+ */
+bool single_task_running(void)
+{
+ if (cpu_rq(smp_processor_id())->soft_affined == 1)
+ return true;
+ else
+ return false;
+}
+EXPORT_SYMBOL(single_task_running);
+
+unsigned long long nr_context_switches(void)
+{
+ long long ns = grq.nr_switches;
+
+ /* This is of course impossible */
+ if (unlikely(ns < 0))
+ ns = 1;
+ return (unsigned long long)ns;
+}
+
+unsigned long nr_iowait(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+ return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+ struct rq *this = cpu_rq(cpu);
+ return atomic_read(&this->nr_iowait);
+}
+
+unsigned long nr_active(void)
+{
+ return nr_running() + nr_uninterruptible();
+}
+
+/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we
+ * base it on the number of running or queued tasks with their ->rq pointer
+ * set to this cpu as being the CPU they're more likely to run on. */
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+ struct rq *this = this_rq();
+
+ *nr_waiters = atomic_read(&this->nr_iowait);
+ *load = this->soft_affined;
+}
+
+/* Variables and functions for calc_load */
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
+ */
+void calc_global_load(unsigned long ticks)
+{
+ long active;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+ active = nr_active() * FIXED_1;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update = jiffies + LOAD_FREQ;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void irqtime_account_irq(struct task_struct *curr)
+{
+ unsigned long flags;
+ s64 delta;
+ int cpu;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ __this_cpu_add(cpu_softirq_time, delta);
+
+ irq_time_write_end();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(irqtime_account_irq);
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ s64 steal = paravirt_steal_clock(cpu_of(rq));
+
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+
+ delta -= steal;
+ }
+#endif
+
+ rq->clock_task += delta;
+}
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static void irqtime_account_hi_si(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 latest_ns;
+
+ latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
+ if (latest_ns > cpustat[CPUTIME_IRQ])
+ cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
+
+ latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
+ if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime (0)
+
+static inline void irqtime_account_hi_si(void)
+{
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_key_false(&paravirt_steal_enabled)) {
+ u64 steal;
+ cputime_t steal_ct;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ /*
+ * cputime_t may be less precise than nsecs (eg: if it's
+ * based on jiffies). Lets cast the result to cputime
+ * granularity and account the rest on the next rounds.
+ */
+ steal_ct = nsecs_to_cputime(steal);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+
+ account_steal_time(steal_ct);
+ return steal_ct;
+ }
+#endif
+ return false;
+}
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct signal_struct *sig = tsk->signal;
+ cputime_t utime, stime;
+ struct task_struct *t;
+ unsigned int seq, nextseq;
+ unsigned long flags;
+
+ rcu_read_lock();
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
+ do {
+ seq = nextseq;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ for_each_thread(tsk, t) {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ }
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+}
+
+/*
+ * On each tick, see what percentage of that tick was attributed to each
+ * component and add the percentage to the _pc values. Once a _pc value has
+ * accumulated one tick's worth, account for that. This means the total
+ * percentage of load components will always be 128 (pseudo 100) per tick.
+ */
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ if (atomic_read(&rq->nr_iowait) > 0) {
+ rq->iowait_pc += pc;
+ if (rq->iowait_pc >= 128) {
+ cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
+ rq->iowait_pc %= 128;
+ }
+ } else {
+ rq->idle_pc += pc;
+ if (rq->idle_pc >= 128) {
+ cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
+ rq->idle_pc %= 128;
+ }
+ }
+ acct_update_integrals(idle);
+}
+
+static void
+pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
+ unsigned long pc, unsigned long ns)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+ p->stime_pc += pc;
+ if (p->stime_pc >= 128) {
+ int jiffs = p->stime_pc / 128;
+
+ p->stime_pc %= 128;
+ p->stime += (__force u64)cputime_one_jiffy * jiffs;
+ p->stimescaled += one_jiffy_scaled * jiffs;
+ account_group_system_time(p, cputime_one_jiffy * jiffs);
+ }
+ p->sched_time += ns;
+ account_group_exec_runtime(p, ns);
+
+ if (hardirq_count() - hardirq_offset) {
+ rq->irq_pc += pc;
+ if (rq->irq_pc >= 128) {
+ cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
+ rq->irq_pc %= 128;
+ }
+ } else if (in_serving_softirq()) {
+ rq->softirq_pc += pc;
+ if (rq->softirq_pc >= 128) {
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
+ rq->softirq_pc %= 128;
+ }
+ } else {
+ rq->system_pc += pc;
+ if (rq->system_pc >= 128) {
+ cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
+ rq->system_pc %= 128;
+ }
+ }
+ acct_update_integrals(p);
+}
+
+static void pc_user_time(struct rq *rq, struct task_struct *p,
+ unsigned long pc, unsigned long ns)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+
+ p->utime_pc += pc;
+ if (p->utime_pc >= 128) {
+ int jiffs = p->utime_pc / 128;
+
+ p->utime_pc %= 128;
+ p->utime += (__force u64)cputime_one_jiffy * jiffs;
+ p->utimescaled += one_jiffy_scaled * jiffs;
+ account_group_user_time(p, cputime_one_jiffy * jiffs);
+ }
+ p->sched_time += ns;
+ account_group_exec_runtime(p, ns);
+
+ if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ */
+ rq->softirq_pc += pc;
+ if (rq->softirq_pc >= 128) {
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
+ rq->softirq_pc %= 128;
+ }
+ }
+
+ if (task_nice(p) > 0 || idleprio_task(p)) {
+ rq->nice_pc += pc;
+ if (rq->nice_pc >= 128) {
+ cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
+ rq->nice_pc %= 128;
+ }
+ } else {
+ rq->user_pc += pc;
+ if (rq->user_pc >= 128) {
+ cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
+ rq->user_pc %= 128;
+ }
+ }
+ acct_update_integrals(p);
+}
+
+/*
+ * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
+ * shifts instead of 100
+ */
+#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS)
+
+/*
+ * This is called on clock ticks.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void
+update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
+{
+ long account_ns = rq->clock_task - rq->rq_last_ran;
+ struct task_struct *idle = rq->idle;
+ unsigned long account_pc;
+
+ if (unlikely(account_ns < 0) || steal_account_process_tick())
+ goto ts_account;
+
+ account_pc = NS_TO_PC(account_ns);
+
+ /* Accurate tick timekeeping */
+ if (user_mode(get_irq_regs()))
+ pc_user_time(rq, p, account_pc, account_ns);
+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
+ pc_system_time(rq, p, HARDIRQ_OFFSET,
+ account_pc, account_ns);
+ else
+ pc_idle_time(rq, idle, account_pc);
+
+ if (sched_clock_irqtime)
+ irqtime_account_hi_si();
+
+ts_account:
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+ if (rq->rq_policy != SCHED_FIFO && p != idle) {
+ s64 time_diff = rq->clock - rq->timekeep_clock;
+
+ niffy_diff(&time_diff, 1);
+ rq->rq_time_slice -= NS_TO_US(time_diff);
+ }
+
+ rq->rq_last_ran = rq->clock_task;
+ rq->timekeep_clock = rq->clock;
+}
+
+/*
+ * This is called on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void
+update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
+{
+ long account_ns = rq->clock_task - rq->rq_last_ran;
+ struct task_struct *idle = rq->idle;
+ unsigned long account_pc;
+
+ if (unlikely(account_ns < 0))
+ goto ts_account;
+
+ account_pc = NS_TO_PC(account_ns);
+
+ /* Accurate subtick timekeeping */
+ if (p != idle) {
+ pc_user_time(rq, p, account_pc, account_ns);
+ }
+ else
+ pc_idle_time(rq, idle, account_pc);
+
+ts_account:
+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
+ if (rq->rq_policy != SCHED_FIFO && p != idle) {
+ s64 time_diff = rq->clock - rq->timekeep_clock;
+
+ niffy_diff(&time_diff, 1);
+ rq->rq_time_slice -= NS_TO_US(time_diff);
+ }
+
+ rq->rq_last_ran = rq->clock_task;
+ rq->timekeep_clock = rq->clock;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_grq_lock() held.
+ */
+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+ u64 ns = 0;
+
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (p == rq->curr && p->on_rq) {
+ update_clocks(rq);
+ ns = rq->clock_task - rq->rq_last_ran;
+ if (unlikely((s64)ns < 0))
+ ns = 0;
+ }
+
+ return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * Return separately the current's pending runtime that have not been
+ * accounted yet.
+ *
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+ /*
+ * 64-bit doesn't need locks to atomically read a 64bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
+ *
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
+ * If we race with it entering cpu, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
+ */
+ if (!p->on_cpu || !p->on_rq)
+ return tsk_seruntime(p);
+#endif
+
+ rq = task_grq_lock(p, &flags);
+ ns = p->sched_time + do_task_delta_exec(p, rq);
+ task_grq_unlock(&flags);
+
+ return ns;
+}
+
+/* Compatibility crap */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+}
+
+void account_idle_time(cputime_t cputime)
+{
+}
+
+void update_cpu_load_nohz(void)
+{
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+void calc_load_enter_idle(void)
+{
+}
+
+void calc_load_exit_idle(void)
+{
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ /* Add guest time to process. */
+ p->utime += (__force u64)cputime;
+ p->utimescaled += (__force u64)cputime_scaled;
+ account_group_user_time(p, cputime);
+ p->gtime += (__force u64)cputime;
+
+ /* Add guest time to cpustat. */
+ if (task_nice(p) > 0) {
+ cpustat[CPUTIME_NICE] += (__force u64)cputime;
+ cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
+ } else {
+ cpustat[CPUTIME_USER] += (__force u64)cputime;
+ cpustat[CPUTIME_GUEST] += (__force u64)cputime;
+ }
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+ /* Add system time to process. */
+ p->stime += (__force u64)cputime;
+ p->stimescaled += (__force u64)cputime_scaled;
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ *target_cputime64 += (__force u64)cputime;
+
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * This is for guest only now.
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+ cputime_t cputime, cputime_t cputime_scaled)
+{
+
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+ account_guest_time(p, cputime, cputime_scaled);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @steal: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ cpustat[CPUTIME_STEAL] += (__force u64)cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+static void account_idle_times(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ struct rq *rq = this_rq();
+
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
+ else
+ cpustat[CPUTIME_IDLE] += (__force u64)cputime;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+ account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+ account_idle_times(jiffies_to_cputime(ticks));
+}
+#endif
+
+static inline void grq_iso_lock(void)
+ __acquires(grq.iso_lock)
+{
+ raw_spin_lock(&grq.iso_lock);
+}
+
+static inline void grq_iso_unlock(void)
+ __releases(grq.iso_lock)
+{
+ raw_spin_unlock(&grq.iso_lock);
+}
+
+/*
+ * Functions to test for when SCHED_ISO tasks have used their allocated
+ * quota as real time scheduling and convert them back to SCHED_NORMAL.
+ * Where possible, the data is tested lockless, to avoid grabbing iso_lock
+ * because the occasional inaccurate result won't matter. However the
+ * tick data is only ever modified under lock. iso_refractory is only simply
+ * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
+ */
+static bool set_iso_refractory(void)
+{
+ grq.iso_refractory = true;
+ return grq.iso_refractory;
+}
+
+static bool clear_iso_refractory(void)
+{
+ grq.iso_refractory = false;
+ return grq.iso_refractory;
+}
+
+/*
+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
+ * slow division.
+ */
+static bool test_ret_isorefractory(struct rq *rq)
+{
+ if (likely(!grq.iso_refractory)) {
+ if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
+ return set_iso_refractory();
+ } else {
+ if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
+ return clear_iso_refractory();
+ }
+ return grq.iso_refractory;
+}
+
+static void iso_tick(void)
+{
+ grq_iso_lock();
+ grq.iso_ticks += 100;
+ grq_iso_unlock();
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(void)
+{
+ if (grq.iso_ticks) {
+ grq_iso_lock();
+ grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
+ if (unlikely(grq.iso_refractory && grq.iso_ticks <
+ ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
+ clear_iso_refractory();
+ grq_iso_unlock();
+ }
+}
+
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+static void task_running_tick(struct rq *rq)
+{
+ struct task_struct *p;
+
+ /*
+ * If a SCHED_ISO task is running we increment the iso_ticks. In
+ * order to prevent SCHED_ISO tasks from causing starvation in the
+ * presence of true RT tasks we account those as iso_ticks as well.
+ */
+ if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
+ if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
+ iso_tick();
+ } else
+ no_iso_tick();
+
+ if (iso_queue(rq)) {
+ if (unlikely(test_ret_isorefractory(rq))) {
+ if (rq_running_iso(rq)) {
+ /*
+ * SCHED_ISO task is running as RT and limit
+ * has been hit. Force it to reschedule as
+ * SCHED_NORMAL by zeroing its time_slice
+ */
+ rq->rq_time_slice = 0;
+ }
+ }
+ }
+
+ /* SCHED_FIFO tasks never run out of timeslice. */
+ if (rq->rq_policy == SCHED_FIFO)
+ return;
+ /*
+ * Tasks that were scheduled in the first half of a tick are not
+ * allowed to run into the 2nd half of the next tick if they will
+ * run out of time slice in the interim. Otherwise, if they have
+ * less than RESCHED_US μs of time slice left they will be rescheduled.
+ */
+ if (rq->dither) {
+ if (rq->rq_time_slice > HALF_JIFFY_US)
+ return;
+ else
+ rq->rq_time_slice = 0;
+ } else if (rq->rq_time_slice >= RESCHED_US)
+ return;
+
+ /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
+ p = rq->curr;
+
+ grq_lock();
+ requeue_task(p);
+ __set_tsk_resched(p);
+ grq_unlock();
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled. The data modified is all
+ * local to struct rq so we don't need to grab grq lock.
+ */
+void scheduler_tick(void)
+{
+ int cpu __maybe_unused = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+
+ sched_clock_tick();
+ /* grq lock not grabbed, so only update rq clock */
+ update_rq_clock(rq);
+ update_cpu_clock_tick(rq, rq->curr);
+ if (!rq_idle(rq))
+ task_running_tick(rq);
+ else
+ no_iso_tick();
+ rq->last_tick = rq->clock;
+ perf_event_task_tick();
+}
+
+notrace unsigned long get_parent_ip(unsigned long addr)
+{
+ if (in_lock_functions(addr)) {
+ addr = CALLER_ADDR2;
+ if (in_lock_functions(addr))
+ addr = CALLER_ADDR3;
+ }
+ return addr;
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+#endif
+ __preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+#endif
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ __preempt_count_sub(val);
+}
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
+#endif
+
+/*
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+ * ensuring that interactive tasks get low latency on wake up. The CPU
+ * proportion works out to the square of the virtual deadline difference, so
+ * this equation will give nice 19 3% CPU compared to nice 0.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+ return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+ return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+ return NS_TO_MS(longest_deadline_diff());
+}
+
+/*
+ * The time_slice is only refilled when it is empty and that is when we set a
+ * new deadline.
+ */
+static void time_slice_expired(struct task_struct *p)
+{
+ p->time_slice = timeslice();
+ p->deadline = grq.niffies + task_deadline_diff(p);
+#ifdef CONFIG_SMT_NICE
+ if (!p->mm)
+ p->smt_bias = 0;
+ else if (rt_task(p))
+ p->smt_bias = 1 << 30;
+ else if (task_running_iso(p))
+ p->smt_bias = 1 << 29;
+ else if (idleprio_task(p)) {
+ if (task_running_idle(p))
+ p->smt_bias = 0;
+ else
+ p->smt_bias = 1;
+ } else if (--p->smt_bias < 1)
+ p->smt_bias = MAX_PRIO - p->static_prio;
+#endif
+}
+
+/*
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
+ * have been flagged be not latency sensitive and likely to be fully CPU
+ * bound so every time they're rescheduled they have their time_slice
+ * refilled, but get a new later deadline to have little effect on
+ * SCHED_NORMAL tasks.
+
+ */
+static inline void check_deadline(struct task_struct *p)
+{
+ if (p->time_slice < RESCHED_US || batch_task(p))
+ time_slice_expired(p);
+}
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/*
+ * Scheduler queue bitmap specific find next bit.
+ */
+static inline unsigned long
+next_sched_bit(const unsigned long *addr, unsigned long offset)
+{
+ const unsigned long *p;
+ unsigned long result;
+ unsigned long size;
+ unsigned long tmp;
+
+ size = PRIO_LIMIT;
+ if (offset >= size)
+ return size;
+
+ p = addr + BITOP_WORD(offset);
+ result = offset & ~(BITS_PER_LONG-1);
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if ((tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __ffs(tmp);
+}
+
+/*
+ * O(n) lookup of all tasks in the global runqueue. The real brainfuck
+ * of lock contention and O(n). It's not really O(n) as only the queued,
+ * but not running tasks are scanned, and is O(n) queued in the worst case
+ * scenario only because the right task can be found before scanning all of
+ * them.
+ * Tasks are selected in this order:
+ * Real time tasks are selected purely by their static priority and in the
+ * order they were queued, so the lowest value idx, and the first queued task
+ * of that priority value is chosen.
+ * If no real time tasks are found, the SCHED_ISO priority is checked, and
+ * all SCHED_ISO tasks have the same priority value, so they're selected by
+ * the earliest deadline value.
+ * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
+ * earliest deadline.
+ * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
+ * selected by the earliest deadline.
+ */
+static inline struct
+task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+ struct task_struct *edt = NULL;
+ unsigned long idx = -1;
+
+ do {
+ struct list_head *queue;
+ struct task_struct *p;
+ u64 earliest_deadline;
+
+ idx = next_sched_bit(grq.prio_bitmap, ++idx);
+ if (idx >= PRIO_LIMIT)
+ return idle;
+ queue = grq.queue + idx;
+
+ if (idx < MAX_RT_PRIO) {
+ /* We found an rt task */
+ list_for_each_entry(p, queue, run_list) {
+ /* Make sure cpu affinity is ok */
+ if (needs_other_cpu(p, cpu))
+ continue;
+ edt = p;
+ goto out_take;
+ }
+ /*
+ * None of the RT tasks at this priority can run on
+ * this cpu
+ */
+ continue;
+ }
+
+ /*
+ * No rt tasks. Find the earliest deadline task. Now we're in
+ * O(n) territory.
+ */
+ earliest_deadline = ~0ULL;
+ list_for_each_entry(p, queue, run_list) {
+ u64 dl;
+
+ /* Make sure cpu affinity is ok */
+ if (needs_other_cpu(p, cpu))
+ continue;
+
+#ifdef CONFIG_SMT_NICE
+ if (!smt_should_schedule(p, cpu))
+ continue;
+#endif
+ /*
+ * Soft affinity happens here by not scheduling a task
+ * with its sticky flag set that ran on a different CPU
+ * last when the CPU is scaling, or by greatly biasing
+ * against its deadline when not, based on cpu cache
+ * locality.
+ */
+ if (task_sticky(p) && task_rq(p) != rq) {
+ if (scaling_rq(rq))
+ continue;
+ dl = p->deadline << locality_diff(p, rq);
+ } else
+ dl = p->deadline;
+
+ if (deadline_before(dl, earliest_deadline)) {
+ earliest_deadline = dl;
+ edt = p;
+ }
+ }
+ } while (!edt);
+
+out_take:
+ take_task(cpu, edt);
+ return edt;
+}
+
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+ if (oops_in_progress)
+ return;
+
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+
+ debug_show_held_locks(prev);
+ print_modules();
+ if (irqs_disabled())
+ print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+ BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+#endif
+ /*
+ * Test if we are atomic. Since do_exit() needs to call into
+ * schedule() atomically, we ignore that path. Otherwise whine
+ * if we are scheduling when we should not.
+ */
+ if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+ __schedule_bug(prev);
+ rcu_sleep_check();
+
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+ schedstat_inc(this_rq(), sched_count);
+}
+
+/*
+ * The currently running task's information is all stored in rq local data
+ * which is only modified by the local CPU, thereby allowing the data to be
+ * changed without grabbing the grq lock.
+ */
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
+{
+ rq->rq_time_slice = p->time_slice;
+ rq->rq_deadline = p->deadline;
+ rq->rq_last_ran = p->last_ran = rq->clock_task;
+ rq->rq_policy = p->policy;
+ rq->rq_prio = p->prio;
+#ifdef CONFIG_SMT_NICE
+ rq->rq_mm = p->mm;
+ rq->rq_smt_bias = p->smt_bias;
+#endif
+ if (p != rq->idle)
+ rq->rq_running = true;
+ else
+ rq->rq_running = false;
+}
+
+static void reset_rq_task(struct rq *rq, struct task_struct *p)
+{
+ rq->rq_policy = p->policy;
+ rq->rq_prio = p->prio;
+#ifdef CONFIG_SMT_NICE
+ rq->rq_smt_bias = p->smt_bias;
+#endif
+}
+
+#ifdef CONFIG_SMT_NICE
+/* Iterate over smt siblings when we've scheduled a process on cpu and decide
+ * whether they should continue running or be descheduled. */
+static void check_smt_siblings(int cpu)
+{
+ int other_cpu;
+
+ for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ struct task_struct *p;
+ struct rq *rq;
+
+ if (other_cpu == cpu)
+ continue;
+ rq = cpu_rq(other_cpu);
+ if (rq_idle(rq))
+ continue;
+ if (!rq->online)
+ continue;
+ p = rq->curr;
+ if (!smt_should_schedule(p, cpu)) {
+ set_tsk_need_resched(p);
+ smp_send_reschedule(other_cpu);
+ }
+ }
+}
+
+static void wake_smt_siblings(int cpu)
+{
+ int other_cpu;
+
+ if (!queued_notrunning())
+ return;
+
+ for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ struct rq *rq;
+
+ if (other_cpu == cpu)
+ continue;
+ rq = cpu_rq(other_cpu);
+ if (rq_idle(rq)) {
+ struct task_struct *p = rq->curr;
+
+ set_tsk_need_resched(p);
+ smp_send_reschedule(other_cpu);
+ }
+ }
+}
+#else
+static void check_smt_siblings(int __maybe_unused cpu) {}
+static void wake_smt_siblings(int __maybe_unused cpu) {}
+#endif
+
+/*
+ * schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
+ */
+static void __sched __schedule(void)
+{
+ struct task_struct *prev, *next, *idle;
+ unsigned long *switch_count;
+ bool deactivate = false;
+ struct rq *rq;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ rcu_note_context_switch();
+ prev = rq->curr;
+
+ schedule_debug(prev);
+
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
+ grq_lock_irq();
+
+ switch_count = &prev->nivcsw;
+ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ if (unlikely(signal_pending_state(prev->state, prev))) {
+ prev->state = TASK_RUNNING;
+ } else {
+ deactivate = true;
+ prev->on_rq = 0;
+
+ /*
+ * If a worker is going to sleep, notify and
+ * ask workqueue whether it wants to wake up a
+ * task to maintain concurrency. If so, wake
+ * up the task.
+ */
+ if (prev->flags & PF_WQ_WORKER) {
+ struct task_struct *to_wakeup;
+
+ to_wakeup = wq_worker_sleeping(prev, cpu);
+ if (to_wakeup) {
+ /* This shouldn't happen, but does */
+ if (unlikely(to_wakeup == prev))
+ deactivate = false;
+ else
+ try_to_wake_up_local(to_wakeup);
+ }
+ }
+ }
+ switch_count = &prev->nvcsw;
+ }
+
+ update_clocks(rq);
+ update_cpu_clock_switch(rq, prev);
+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
+ rq->dither = false;
+ else
+ rq->dither = true;
+
+ clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
+
+ idle = rq->idle;
+ if (idle != prev) {
+ /* Update all the information stored on struct rq */
+ prev->time_slice = rq->rq_time_slice;
+ prev->deadline = rq->rq_deadline;
+ check_deadline(prev);
+ prev->last_ran = rq->clock_task;
+
+ /* Task changed affinity off this CPU */
+ if (likely(!needs_other_cpu(prev, cpu))) {
+ if (!deactivate) {
+ if (!queued_notrunning()) {
+ /*
+ * We now know prev is the only thing that is
+ * awaiting CPU so we can bypass rechecking for
+ * the earliest deadline task and just run it
+ * again.
+ */
+ set_rq_task(rq, prev);
+ check_smt_siblings(cpu);
+ grq_unlock_irq();
+ goto rerun_prev_unlocked;
+ } else
+ swap_sticky(rq, cpu, prev);
+ }
+ }
+ return_task(prev, rq, deactivate);
+ }
+
+ if (unlikely(!queued_notrunning())) {
+ /*
+ * This CPU is now truly idle as opposed to when idle is
+ * scheduled as a high priority task in its own right.
+ */
+ next = idle;
+ schedstat_inc(rq, sched_goidle);
+ set_cpuidle_map(cpu);
+ } else {
+ next = earliest_deadline_task(rq, cpu, idle);
+ if (likely(next->prio != PRIO_LIMIT))
+ clear_cpuidle_map(cpu);
+ else
+ set_cpuidle_map(cpu);
+ }
+
+ if (likely(prev != next)) {
+ /*
+ * Don't reschedule an idle task or deactivated tasks
+ */
+ if (prev != idle && !deactivate)
+ resched_suitable_idle(prev);
+ /*
+ * Don't stick tasks when a real time task is going to run as
+ * they may literally get stuck.
+ */
+ if (rt_task(next))
+ unstick_task(rq, prev);
+ set_rq_task(rq, next);
+ if (next != idle)
+ check_smt_siblings(cpu);
+ else
+ wake_smt_siblings(cpu);
+ grq.nr_switches++;
+ prev->on_cpu = false;
+ next->on_cpu = true;
+ rq->curr = next;
+ ++*switch_count;
+
+ rq = context_switch(rq, prev, next); /* unlocks the grq */
+ cpu = cpu_of(rq);
+ idle = rq->idle;
+ } else {
+ check_smt_siblings(cpu);
+ grq_unlock_irq();
+ }
+
+rerun_prev_unlocked:
+ sched_preempt_enable_no_resched();
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+ if (!tsk->state || tsk_is_pi_blocked(tsk))
+ return;
+ /*
+ * If we are going to sleep and we have plugged IO queued,
+ * make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(tsk))
+ blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk);
+ do {
+ __schedule();
+ } while (need_resched());
+}
+
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
+{
+ /*
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != IN_USER, but that will trigger
+ * too frequently to make sense yet.
+ */
+ enum ctx_state prev_state = exception_enter();
+ schedule();
+ exception_exit(prev_state);
+}
+#endif
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+}
+
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule(void)
+{
+ /*
+ * If there is a non-zero preempt_count or interrupts are disabled,
+ * we do not want to preempt the current task. Just return..
+ */
+ if (likely(!preemptible()))
+ return;
+
+ preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule();
+ exception_exit(prev_ctx);
+
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ barrier();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage __visible void __sched preempt_schedule_irq(void)
+{
+ enum ctx_state prev_state;
+
+ /* Catch callers which need to be fixed */
+ BUG_ON(preempt_count() || !irqs_disabled());
+
+ prev_state = exception_enter();
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+
+ exception_exit(prev_state);
+}
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+ void *key)
+{
+ return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+ unsigned long flags;
+ int queued, oldprio;
+ struct rq *rq;
+
+ BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+ rq = task_grq_lock(p, &flags);
+
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
+ trace_sched_pi_setprio(p, prio);
+ oldprio = p->prio;
+ queued = task_queued(p);
+ if (queued)
+ dequeue_task(p);
+ p->prio = prio;
+ if (task_running(p) && prio > oldprio)
+ resched_task(p);
+ if (queued) {
+ enqueue_task(p, rq);
+ try_preempt(p, rq);
+ }
+
+out_unlock:
+ task_grq_unlock(&flags);
+}
+
+#endif
+
+/*
+ * Adjust the deadline for when the priority is to change, before it's
+ * changed.
+ */
+static inline void adjust_deadline(struct task_struct *p, int new_prio)
+{
+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
+}
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ int queued, new_static, old_static;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ new_static = NICE_TO_PRIO(nice);
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ rq = time_task_grq_lock(p, &flags);
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it wont have any effect on scheduling until the task is
+ * not SCHED_NORMAL/SCHED_BATCH:
+ */
+ if (has_rt_policy(p)) {
+ p->static_prio = new_static;
+ goto out_unlock;
+ }
+ queued = task_queued(p);
+ if (queued)
+ dequeue_task(p);
+
+ adjust_deadline(p, new_static);
+ old_static = p->static_prio;
+ p->static_prio = new_static;
+ p->prio = effective_prio(p);
+
+ if (queued) {
+ enqueue_task(p, rq);
+ if (new_static < old_static)
+ try_preempt(p, rq);
+ } else if (task_running(p)) {
+ reset_rq_task(rq, p);
+ if (old_static < new_static)
+ resched_task(p);
+ }
+out_unlock:
+ task_grq_unlock(&flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+ capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
+ */
+int task_prio(const struct task_struct *p)
+{
+ int delta, prio = p->prio - MAX_RT_PRIO;
+
+ /* rt tasks and iso tasks */
+ if (prio <= 0)
+ goto out;
+
+ /* Convert to ms to avoid overflows */
+ delta = NS_TO_MS(p->deadline - grq.niffies);
+ delta = delta * 40 / ms_longest_deadline_diff();
+ if (delta > 0 && delta <= 80)
+ prio += delta;
+ if (idleprio_task(p))
+ prio += 40;
+out:
+ return prio;
+}
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold grq lock. */
+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
+ int prio, bool keep_boost)
+{
+ int oldrtprio, oldprio;
+
+ p->policy = policy;
+ oldrtprio = p->rt_priority;
+ p->rt_priority = prio;
+ p->normal_prio = normal_prio(p);
+ oldprio = p->prio;
+ /*
+ * Keep a potential priority boosting if called from
+ * sched_setscheduler().
+ */
+ if (keep_boost)
+ p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
+ else
+ p->prio = p->normal_prio;
+ if (task_running(p)) {
+ reset_rq_task(rq, p);
+ /* Resched only if we might now be preempted */
+ if (p->prio > oldprio || p->rt_priority > oldrtprio)
+ resched_task(p);
+ }
+}
+
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+ rcu_read_unlock();
+ return match;
+}
+
+static int __sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool user)
+{
+ struct sched_param zero_param = { .sched_priority = 0 };
+ int queued, retval, oldpolicy = -1;
+ unsigned long flags, rlim_rtprio = 0;
+ int reset_on_fork;
+ struct rq *rq;
+
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
+
+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+ unsigned long lflags;
+
+ if (!lock_task_sighand(p, &lflags))
+ return -ESRCH;
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+ unlock_task_sighand(p, &lflags);
+ if (rlim_rtprio)
+ goto recheck;
+ /*
+ * If the caller requested an RT policy without having the
+ * necessary rights, we downgrade the policy to SCHED_ISO.
+ * We also set the parameter to zero to pass the checks.
+ */
+ policy = SCHED_ISO;
+ param = &zero_param;
+ }
+recheck:
+ /* double check policy once rq lock held */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+ policy &= ~SCHED_RESET_ON_FORK;
+
+ if (!SCHED_RANGE(policy))
+ return -EINVAL;
+ }
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+ * SCHED_BATCH is 0.
+ */
+ if (param->sched_priority < 0 ||
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
+ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
+ return -EINVAL;
+ if (is_rt_policy(policy) != (param->sched_priority != 0))
+ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (user && !capable(CAP_SYS_NICE)) {
+ if (is_rt_policy(policy)) {
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (param->sched_priority > p->rt_priority &&
+ param->sched_priority > rlim_rtprio)
+ return -EPERM;
+ } else {
+ switch (p->policy) {
+ /*
+ * Can only downgrade policies but not back to
+ * SCHED_NORMAL
+ */
+ case SCHED_ISO:
+ if (policy == SCHED_ISO)
+ goto out;
+ if (policy == SCHED_NORMAL)
+ return -EPERM;
+ break;
+ case SCHED_BATCH:
+ if (policy == SCHED_BATCH)
+ goto out;
+ if (policy != SCHED_IDLEPRIO)
+ return -EPERM;
+ break;
+ case SCHED_IDLEPRIO:
+ if (policy == SCHED_IDLEPRIO)
+ goto out;
+ return -EPERM;
+ default:
+ break;
+ }
+ }
+
+ /* can't change other user's priorities */
+ if (!check_same_owner(p))
+ return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
+ }
+
+ if (user) {
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /*
+ * To be able to change p->policy safely, the grunqueue lock must be
+ * held.
+ */
+ rq = __task_grq_lock(p);
+
+ /*
+ * Changing the policy of the stop threads its a very bad idea
+ */
+ if (p == rq->stop) {
+ __task_grq_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EINVAL;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further:
+ */
+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
+ param->sched_priority == p->rt_priority))) {
+
+ __task_grq_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return 0;
+ }
+
+ /* recheck policy now with rq lock held */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ __task_grq_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ goto recheck;
+ }
+ update_clocks(rq);
+ p->sched_reset_on_fork = reset_on_fork;
+
+ queued = task_queued(p);
+ if (queued)
+ dequeue_task(p);
+ __setscheduler(p, rq, policy, param->sched_priority, true);
+ if (queued) {
+ enqueue_task(p, rq);
+ try_preempt(p, rq);
+ }
+ __task_grq_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ rt_mutex_adjust_pi(p);
+out:
+ return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return __sched_setscheduler(p, policy, param, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ const struct sched_param param = { .sched_priority = attr->sched_priority };
+ int policy = attr->sched_policy;
+
+ return __sched_setscheduler(p, policy, &param, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return __sched_setscheduler(p, policy, param, false);
+}
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * XXX: do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+ /* sched/core.c uses zero here but we already know ret is zero */
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ *
+ * Return: 0 on success. An error code otherwise.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+ struct sched_param __user *param)
+{
+ /* negative values for policy are not valid */
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setattr(p, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (pid < 0)
+ goto out_nounlock;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (p) {
+ retval = security_task_getscheduler(p);
+ if (!retval)
+ retval = p->policy;
+ }
+ rcu_read_unlock();
+
+out_nounlock:
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval = -EINVAL;
+
+ if (!param || pid < 0)
+ goto out_nounlock;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ if (has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ rcu_read_unlock();
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get uncomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ return -EFBIG;
+ }
+
+ attr->size = usize;
+ }
+
+ ret = copy_to_user(uattr, attr, attr->size);
+ if (ret)
+ return -EFAULT;
+
+ /* sched/core.c uses zero here but we already know ret is zero */
+ return ret;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_policy = p->policy;
+ if (rt_task(p))
+ attr.sched_priority = p->rt_priority;
+ else
+ attr.sched_nice = task_nice(p);
+
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ cpumask_var_t cpus_allowed, new_mask;
+ struct task_struct *p;
+ int retval;
+
+ get_online_cpus();
+ rcu_read_lock();
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ rcu_read_unlock();
+ put_online_cpus();
+ return -ESRCH;
+ }
+
+ /* Prevent p going away */
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+ retval = -EPERM;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+ retval = set_cpus_allowed_ptr(p, new_mask);
+
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+ }
+out_unlock:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+out_put_task:
+ put_task_struct(p);
+ put_online_cpus();
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ cpumask_t *new_mask)
+{
+ if (len < sizeof(cpumask_t)) {
+ memset(new_mask, 0, sizeof(cpumask_t));
+ } else if (len > sizeof(cpumask_t)) {
+ len = sizeof(cpumask_t);
+ }
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+ struct task_struct *p;
+ unsigned long flags;
+ int retval;
+
+ get_online_cpus();
+ rcu_read_lock();
+
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ grq_lock_irqsave(&flags);
+ cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
+ grq_unlock_irqrestore(&flags);
+
+out_unlock:
+ rcu_read_unlock();
+ put_online_cpus();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. It does this by
+ * scheduling away the current task. If it still has the earliest deadline
+ * it will be scheduled again as the next task.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ struct task_struct *p;
+
+ p = current;
+ grq_lock_irq();
+ schedstat_inc(task_rq(p), yld_count);
+ requeue_task(p);
+
+ /*
+ * Since we are going to call schedule() anyway, there's
+ * no need to preempt or enable interrupts:
+ */
+ __release(grq.lock);
+ spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+ do_raw_spin_unlock(&grq.lock);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+
+ return 0;
+}
+
+int __sched _cond_resched(void)
+{
+ if (should_resched()) {
+ preempt_schedule_common();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+ int resched = should_resched();
+ int ret = 0;
+
+ lockdep_assert_held(lock);
+
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+int __sched __cond_resched_softirq(void)
+{
+ BUG_ON(!in_softirq());
+
+ if (should_resched()) {
+ local_bh_enable();
+ preempt_schedule_common();
+ local_bh_disable();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ int yielded = 0;
+
+ rq = this_rq();
+ grq_lock_irqsave(&flags);
+ if (task_running(p) || p->state) {
+ yielded = -ESRCH;
+ goto out_unlock;
+ }
+
+ p_rq = task_rq(p);
+ yielded = 1;
+ if (p->deadline > rq->rq_deadline)
+ p->deadline = rq->rq_deadline;
+ p->time_slice += rq->rq_time_slice;
+ rq->rq_time_slice = 0;
+ if (p->time_slice > timeslice())
+ p->time_slice = timeslice();
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+out_unlock:
+ grq_unlock_irqrestore(&flags);
+
+ if (yielded > 0)
+ schedule();
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+
+long __sched io_schedule_timeout(long timeout)
+{
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
+ long ret;
+
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
+ delayacct_blkio_start();
+ rq = raw_rq();
+ atomic_inc(&rq->nr_iowait);
+ ret = schedule_timeout(timeout);
+ current->in_iowait = old_iowait;
+ atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
+
+ return ret;
+}
+EXPORT_SYMBOL(io_schedule_timeout);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_USER_RT_PRIO-1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_ISO:
+ case SCHED_IDLEPRIO:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_ISO:
+ case SCHED_IDLEPRIO:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct task_struct *p;
+ unsigned int time_slice;
+ unsigned long flags;
+ int retval;
+ struct timespec t;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ grq_lock_irqsave(&flags);
+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
+ grq_unlock_irqrestore(&flags);
+
+ rcu_read_unlock();
+ t = ns_to_timespec(time_slice);
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+ unsigned long free = 0;
+ int ppid;
+ unsigned long state = p->state;
+
+ if (state)
+ state = __ffs(state) + 1;
+ printk(KERN_INFO "%-15.15s %c", p->comm,
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+ if (state == TASK_RUNNING)
+ printk(KERN_CONT " running ");
+ else
+ printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+ if (state == TASK_RUNNING)
+ printk(KERN_CONT " running task ");
+ else
+ printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ free = stack_not_used(p);
+#endif
+ ppid = 0;
+ rcu_read_lock();
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ task_pid_nr(p), ppid,
+ (unsigned long)task_thread_info(p)->flags);
+
+ print_worker_info(KERN_INFO, p);
+ show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+ struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#else
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#endif
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take a lot of time:
+ */
+ touch_nmi_watchdog();
+ if (!state_filter || (p->state & state_filter))
+ sched_show_task(p);
+ }
+
+ touch_all_softlockup_watchdogs();
+
+ rcu_read_unlock();
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (!state_filter)
+ debug_show_all_locks();
+}
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}
+
+#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ cpumask_copy(tsk_cpus_allowed(p), new_mask);
+}
+#endif
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ time_grq_lock(rq, &flags);
+ idle->last_ran = rq->clock_task;
+ idle->state = TASK_RUNNING;
+ /* Setting prio to illegal value shouldn't matter when never queued */
+ idle->prio = PRIO_LIMIT;
+#ifdef CONFIG_SMT_NICE
+ idle->smt_bias = 0;
+#endif
+ set_rq_task(rq, idle);
+ do_set_cpus_allowed(idle, get_cpu_mask(cpu));
+ /* Silence PROVE_RCU */
+ rcu_read_lock();
+ set_task_cpu(idle, cpu);
+ rcu_read_unlock();
+ rq->curr = rq->idle = idle;
+ idle->on_cpu = 1;
+ grq_unlock_irqrestore(&flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+ init_idle_preempt_count(idle, cpu);
+
+ ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
+ const struct cpumask __maybe_unused *trial)
+{
+ return 1;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+void resched_cpu(int cpu)
+{
+ unsigned long flags;
+
+ grq_lock_irqsave(&flags);
+ resched_task(cpu_curr(cpu));
+ grq_unlock_irqrestore(&flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+void nohz_balance_enter_idle(int cpu)
+{
+}
+
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu: The cpu whose lowest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ * for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd)
+ if (sd && (sd->flags & flag))
+ break;
+
+ return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu: The cpu whose domains we're iterating over.
+ * @sd: variable holding the value of the power_savings_sd
+ * for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+ for (sd = lowest_flag_domain(cpu, flag); \
+ (sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(int pinned)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ return cpu;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+ return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+ if (cpu == smp_processor_id())
+ return;
+
+ set_tsk_need_resched(cpu_rq(cpu)->idle);
+ smp_send_reschedule(cpu);
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+ wake_up_idle_cpu(cpu);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ bool running_wrong = false;
+ bool queued = false;
+ unsigned long flags;
+ struct rq *rq;
+ int ret = 0;
+
+ rq = task_grq_lock(p, &flags);
+
+ if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ queued = task_queued(p);
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ if (task_running(p)) {
+ /* Task is running on the wrong cpu now, reschedule it. */
+ if (rq == this_rq()) {
+ set_tsk_need_resched(p);
+ running_wrong = true;
+ } else
+ resched_task(p);
+ } else
+ set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
+
+out:
+ if (queued)
+ try_preempt(p, rq);
+ task_grq_unlock(&flags);
+
+ if (running_wrong)
+ preempt_schedule_common();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern struct task_struct *cpu_stopper_task;
+/* Run through task list and find tasks affined to the dead cpu, then remove
+ * that cpu from the list, enable cpu0 and set the zerobound flag. */
+static void bind_zero(int src_cpu)
+{
+ struct task_struct *p, *t, *stopper;
+ int bound = 0;
+
+ if (src_cpu == 0)
+ return;
+
+ stopper = per_cpu(cpu_stopper_task, src_cpu);
+ do_each_thread(t, p) {
+ if (p != stopper && cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
+ cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p));
+ cpumask_set_cpu(0, tsk_cpus_allowed(p));
+ p->zerobound = true;
+ bound++;
+ }
+ clear_sticky(p);
+ } while_each_thread(t, p);
+
+ if (bound) {
+ printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n",
+ bound, src_cpu);
+ }
+}
+
+/* Find processes with the zerobound flag and reenable their affinity for the
+ * CPU coming alive. */
+static void unbind_zero(int src_cpu)
+{
+ int unbound = 0, zerobound = 0;
+ struct task_struct *p, *t;
+
+ if (src_cpu == 0)
+ return;
+
+ do_each_thread(t, p) {
+ if (!p->mm)
+ p->zerobound = false;
+ if (p->zerobound) {
+ unbound++;
+ cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p));
+ /* Once every CPU affinity has been re-enabled, remove
+ * the zerobound flag */
+ if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) {
+ p->zerobound = false;
+ zerobound++;
+ }
+ }
+ } while_each_thread(t, p);
+
+ if (unbound) {
+ printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
+ unbound, src_cpu);
+ }
+ if (zerobound) {
+ printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
+ zerobound);
+ }
+}
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+ struct mm_struct *mm = current->active_mm;
+
+ BUG_ON(cpu_online(smp_processor_id()));
+
+ if (mm != &init_mm) {
+ switch_mm(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
+ mmdrop(mm);
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void unbind_zero(int src_cpu) {}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
+ struct sched_param start_param = { .sched_priority = 0 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling policy so that
+ * it can die in pieces.
+ */
+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
+ }
+}
+
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ mode_t mode, proc_handler *proc_handler)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[9], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[10], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[12], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[13] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_possible_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = entry;
+
+ if (entry == NULL)
+ return;
+
+ for_each_possible_cpu(i) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ entry++;
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+ if (sd_sysctl_header)
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+ if (sd_ctl_dir[0].child)
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#else
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+static void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ cpumask_set_cpu(cpu_of(rq), rq->rd->online);
+ rq->online = true;
+ }
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
+ rq->online = false;
+ }
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ */
+static int
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ unsigned long flags;
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ struct task_struct *idle = rq->idle;
+#endif
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_STARTING:
+ return NOTIFY_OK;
+ case CPU_UP_PREPARE:
+ break;
+
+ case CPU_ONLINE:
+ /* Update our root-domain */
+ grq_lock_irqsave(&flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+
+ set_rq_online(rq);
+ }
+ unbind_zero(cpu);
+ grq.noc = num_online_cpus();
+ grq_unlock_irqrestore(&flags);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ grq_lock_irq();
+ set_rq_task(rq, idle);
+ update_clocks(rq);
+ grq_unlock_irq();
+ break;
+
+ case CPU_DYING:
+ /* Update our root-domain */
+ grq_lock_irqsave(&flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ bind_zero(cpu);
+ grq.noc = num_online_cpus();
+ grq_unlock_irqrestore(&flags);
+ break;
+#endif
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else. This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
+ */
+static struct notifier_block migration_notifier = {
+ .notifier_call = migration_call,
+ .priority = CPU_PRI_MIGRATION,
+};
+
+static int sched_cpu_active(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
+ set_cpu_active((long)hcpu, true);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static int sched_cpu_inactive(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ set_cpu_active((long)hcpu, false);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+int __init migration_init(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ int err;
+
+ /* Initialise migration for the boot CPU */
+ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
+ migration_call(&migration_notifier, CPU_ONLINE, cpu);
+ register_cpu_notifier(&migration_notifier);
+
+ /* Register cpu active notifiers */
+ cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+ cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
+ return 0;
+}
+early_initcall(migration_init);
+#endif
+
+#ifdef CONFIG_SMP
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_debug_enabled;
+
+static int __init sched_debug_setup(char *str)
+{
+ sched_debug_enabled = 1;
+
+ return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+ struct cpumask *groupmask)
+{
+ cpumask_clear(groupmask);
+
+ printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+ if (sd->parent)
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
+ return -1;
+ }
+
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
+ }
+
+ printk(KERN_CONT "\n");
+
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent &&
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
+ return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ int level = 0;
+
+ if (!sched_debug_enabled)
+ return;
+
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
+ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+ break;
+ level++;
+ sd = sd->parent;
+ if (!sd)
+ break;
+ }
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
+ return 1;
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_AFFINE))
+ return 0;
+
+ return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+ return 0;
+
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+ cpupri_cleanup(&rd->cpupri);
+ free_cpumask_var(rd->rto_mask);
+ free_cpumask_var(rd->online);
+ free_cpumask_var(rd->span);
+ kfree(rd);
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ struct root_domain *old_rd = NULL;
+ unsigned long flags;
+
+ grq_lock_irqsave(&flags);
+
+ if (rq->rd) {
+ old_rd = rq->rd;
+
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we dont want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
+
+ grq_unlock_irqrestore(&flags);
+
+ if (old_rd)
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+ memset(rd, 0, sizeof(*rd));
+
+ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto out;
+ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_online;
+
+ if (cpupri_init(&rd->cpupri) != 0)
+ goto free_rto_mask;
+ return 0;
+
+free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+free_online:
+ free_cpumask_var(rd->online);
+free_span:
+ free_cpumask_var(rd->span);
+out:
+ return -ENOMEM;
+}
+
+static void init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ if (init_rootdomain(rd) != 0) {
+ kfree(rd);
+ return NULL;
+ }
+
+ return rd;
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+ call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+ for (; sd; sd = sd->parent)
+ destroy_sched_domain(sd, cpu);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
+ destroy_sched_domain(parent, cpu);
+ } else
+ tmp = tmp->parent;
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ tmp = sd;
+ sd = sd->parent;
+ destroy_sched_domain(tmp, cpu);
+ if (sd)
+ sd->child = NULL;
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ tmp = rq->sd;
+ rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp, cpu);
+}
+
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ cpulist_parse(str, cpu_isolated_map);
+ return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+struct s_data {
+ struct sched_domain ** __percpu sd;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_rootdomain,
+ sa_sd,
+ sa_sd_storage,
+ sa_none,
+};
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
+
+ return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+ struct sched_domain_attr *attr)
+{
+ int request;
+
+ if (!attr || attr->relax_domain_level < 0) {
+ if (default_relax_domain_level < 0)
+ return;
+ else
+ request = default_relax_domain_level;
+ } else
+ request = attr->relax_domain_level;
+ if (request < sd->level) {
+ /* turn off idle balance on this domain */
+ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ } else {
+ /* turn on idle balance on this domain */
+ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ }
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_rootdomain:
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map); /* fall through */
+ case sa_none:
+ break;
+ }
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+ const struct cpumask *cpu_map)
+{
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
+ d->rd = alloc_rootdomain();
+ if (!d->rd)
+ return sa_sd;
+ return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain
+ * structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+ struct sd_data *sdd = sd->private;
+
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl, int cpu)
+{
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
+ };
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
+
+ return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * cpus of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int i, j;
+ int node = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ sched_domains_numa_masks_set(cpu);
+ break;
+
+ case CPU_DEAD:
+ sched_domains_numa_masks_clear(cpu);
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+ }
+ free_percpu(sdd->sd);
+ sdd->sd = NULL;
+ }
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
+{
+ struct sched_domain *sd = sd_init(tl, cpu);
+ if (!sd)
+ return child;
+
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+ sd->child = child;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
+ }
+ set_domain_attribute(sd, attr);
+
+ return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state;
+ struct sched_domain *sd;
+ struct s_data d;
+ int i, ret = -ENOMEM;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for cpus specified by the cpu_map. */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (tl->flags & SDTL_OVERLAP)
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ sd = *per_cpu_ptr(d.sd, i);
+ cpu_attach_domain(sd, d.rd, i);
+ }
+ rcu_read_unlock();
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return ret;
+}
+
+static cpumask_var_t *doms_cur; /* current sched domains */
+static int ndoms_cur; /* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+ /* attribues of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+ return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+ int i;
+ cpumask_var_t *doms;
+
+ doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+ if (!doms)
+ return NULL;
+ for (i = 0; i < ndoms; i++) {
+ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+ free_sched_domains(doms, i);
+ return NULL;
+ }
+ }
+ return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+ unsigned int i;
+ for (i = 0; i < ndoms; i++)
+ free_cpumask_var(doms[i]);
+ kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+ int err;
+
+ arch_update_cpu_topology();
+ ndoms_cur = 1;
+ doms_cur = alloc_sched_domains(ndoms_cur);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ err = build_sched_domains(doms_cur[0], NULL);
+ register_sched_domain_sysctl();
+
+ return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+ int i;
+
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map)
+ cpu_attach_domain(NULL, &def_root_domain, i);
+ rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+ struct sched_domain_attr *new, int idx_new)
+{
+ struct sched_domain_attr tmp;
+
+ /* fast path */
+ if (!new && !cur)
+ return 1;
+
+ tmp = SD_ATTR_INIT;
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
+ new ? (new + idx_new) : &tmp,
+ sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains. This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ int i, j, n;
+ int new_topology;
+
+ mutex_lock(&sched_domains_mutex);
+
+ /* always unregister in case we don't destroy any domains */
+ unregister_sched_domain_sysctl();
+
+ /* Let architecture update cpu core mappings. */
+ new_topology = arch_update_cpu_topology();
+
+ n = doms_new ? ndoms_new : 0;
+
+ /* Destroy deleted domains */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_cur[i], doms_new[j])
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* no match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur[i]);
+match1:
+ ;
+ }
+
+ n = ndoms_cur;
+ if (doms_new == NULL) {
+ n = 0;
+ doms_new = &fallback_doms;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ WARN_ON_ONCE(dattr_new);
+ }
+
+ /* Build new domains */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j])
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* no match - add a new doms_new */
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains */
+ if (doms_cur != &fallback_doms)
+ free_sched_domains(doms_cur, ndoms_cur);
+ kfree(dattr_cur); /* kfree(NULL) is safe */
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ mutex_unlock(&sched_domains_mutex);
+}
+
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
+/*
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
+ */
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
+ case CPU_ONLINE:
+ cpuset_update_active_cpus(true);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+ return NOTIFY_OK;
+}
+
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action) {
+ case CPU_DOWN_PREPARE:
+ cpuset_update_active_cpus(false);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+ return NOTIFY_OK;
+}
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+/*
+ * Cheaper version of the below functions in case support for SMT and MC is
+ * compiled in but CPUs have no siblings.
+ */
+static bool sole_cpu_idle(int cpu)
+{
+ return rq_idle(cpu_rq(cpu));
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static const cpumask_t *thread_cpumask(int cpu)
+{
+ return topology_thread_cpumask(cpu);
+}
+/* All this CPU's SMT siblings are idle */
+static bool siblings_cpu_idle(int cpu)
+{
+ return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map);
+}
+#endif
+#ifdef CONFIG_SCHED_MC
+static const cpumask_t *core_cpumask(int cpu)
+{
+ return topology_core_cpumask(cpu);
+}
+/* All this CPU's shared cache siblings are idle */
+static bool cache_cpu_idle(int cpu)
+{
+ return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map);
+}
+#endif
+
+enum sched_domain_level {
+ SD_LV_NONE = 0,
+ SD_LV_SIBLING,
+ SD_LV_MC,
+ SD_LV_BOOK,
+ SD_LV_CPU,
+ SD_LV_NODE,
+ SD_LV_ALLNODES,
+ SD_LV_MAX
+};
+
+void __init sched_init_smp(void)
+{
+ struct sched_domain *sd;
+ int cpu, other_cpu;
+
+ cpumask_var_t non_isolated_cpus;
+
+ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+ alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+ sched_init_numa();
+
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * cpu masks are stable and all blatant races in the below code cannot
+ * happen.
+ */
+ mutex_lock(&sched_domains_mutex);
+ init_sched_domains(cpu_active_mask);
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+ if (cpumask_empty(non_isolated_cpus))
+ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+ mutex_unlock(&sched_domains_mutex);
+
+ hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
+ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+ hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ BUG();
+ free_cpumask_var(non_isolated_cpus);
+
+ grq_lock_irq();
+ /*
+ * Set up the relative cache distance of each online cpu from each
+ * other in a simple array for quick lookup. Locality is determined
+ * by the closest sched_domain that CPUs are separated by. CPUs with
+ * shared cache in SMT and MC are treated as local. Separate CPUs
+ * (within the same package or physically) within the same node are
+ * treated as not local. CPUs not even in the same domain (different
+ * nodes) are treated as very distant.
+ */
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ /* First check if this cpu is in the same node */
+ for_each_domain(cpu, sd) {
+ if (sd->level > SD_LV_NODE)
+ continue;
+ /* Set locality to local node if not already found lower */
+ for_each_cpu(other_cpu, sched_domain_span(sd)) {
+ if (rq->cpu_locality[other_cpu] > 3)
+ rq->cpu_locality[other_cpu] = 3;
+ }
+ }
+
+ /*
+ * Each runqueue has its own function in case it doesn't have
+ * siblings of its own allowing mixed topologies.
+ */
+#ifdef CONFIG_SCHED_MC
+ for_each_cpu(other_cpu, core_cpumask(cpu)) {
+ if (rq->cpu_locality[other_cpu] > 2)
+ rq->cpu_locality[other_cpu] = 2;
+ }
+ if (cpumask_weight(core_cpumask(cpu)) > 1)
+ rq->cache_idle = cache_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_SMT
+ for_each_cpu(other_cpu, thread_cpumask(cpu))
+ rq->cpu_locality[other_cpu] = 1;
+ if (cpumask_weight(thread_cpumask(cpu)) > 1)
+ rq->siblings_idle = siblings_cpu_idle;
+#endif
+ }
+ grq_unlock_irq();
+
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ for_each_online_cpu(other_cpu) {
+ if (other_cpu <= cpu)
+ continue;
+ printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
+ }
+ }
+}
+#else
+void __init sched_init_smp(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+unsigned int sysctl_timer_migration = 1;
+
+int in_sched_functions(unsigned long addr)
+{
+ return in_lock_functions(addr) ||
+ (addr >= (unsigned long)__sched_text_start
+ && addr < (unsigned long)__sched_text_end);
+}
+
+void __init sched_init(void)
+{
+#ifdef CONFIG_SMP
+ int cpu_ids;
+#endif
+ int i;
+ struct rq *rq;
+
+ prio_ratios[0] = 128;
+ for (i = 1 ; i < NICE_WIDTH ; i++)
+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+
+ raw_spin_lock_init(&grq.lock);
+ grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
+ grq.niffies = 0;
+ grq.last_jiffy = jiffies;
+ raw_spin_lock_init(&grq.iso_lock);
+ grq.iso_ticks = 0;
+ grq.iso_refractory = false;
+ grq.noc = 1;
+#ifdef CONFIG_SMP
+ init_defrootdomain();
+ grq.qnr = grq.idle_cpus = 0;
+ cpumask_clear(&grq.cpu_idle_map);
+#else
+ uprq = &per_cpu(runqueues, 0);
+#endif
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ rq->grq_lock = &grq.lock;
+ rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
+ rq->iowait_pc = rq->idle_pc = 0;
+ rq->dither = false;
+#ifdef CONFIG_SMP
+ rq->sticky_task = NULL;
+ rq->last_niffy = 0;
+ rq->sd = NULL;
+ rq->rd = NULL;
+ rq->online = false;
+ rq->cpu = i;
+ rq_attach_root(rq, &def_root_domain);
+#endif
+ atomic_set(&rq->nr_iowait, 0);
+ }
+
+#ifdef CONFIG_SMP
+ cpu_ids = i;
+ /*
+ * Set the base locality for cpu cache distance calculation to
+ * "distant" (3). Make sure the distance from a CPU to itself is 0.
+ */
+ for_each_possible_cpu(i) {
+ int j;
+
+ rq = cpu_rq(i);
+#ifdef CONFIG_SCHED_SMT
+ rq->siblings_idle = sole_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_MC
+ rq->cache_idle = sole_cpu_idle;
+#endif
+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC);
+ for_each_possible_cpu(j) {
+ if (i == j)
+ rq->cpu_locality[j] = 0;
+ else
+ rq->cpu_locality[j] = 4;
+ }
+ }
+#endif
+
+ for (i = 0; i < PRIO_LIMIT; i++)
+ INIT_LIST_HEAD(grq.queue + i);
+ /* delimiter for bitsearch */
+ __set_bit(PRIO_LIMIT, grq.prio_bitmap);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ atomic_inc(&init_mm.mm_count);
+ enter_lazy_tlb(&init_mm, current);
+
+ /*
+ * Make us the idle thread. Technically, schedule() should not be
+ * called from this thread, however somewhere below it might be,
+ * but because we are the idle thread, we just pick up running again
+ * when this runqueue becomes "idle".
+ */
+ init_idle(current, smp_processor_id());
+
+#ifdef CONFIG_SMP
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+ /* May be allocated at isolcpus cmdline parse time */
+ if (cpu_isolated_map == NULL)
+ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
+#endif /* SMP */
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+
+ return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy; /* ratelimiting */
+
+ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
+ dump_stack();
+}
+EXPORT_SYMBOL(___might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+void normalize_rt_tasks(void)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+ struct rq *rq;
+ int queued;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ if (!rt_task(p) && !iso_task(p))
+ continue;
+
+ rq = task_grq_lock(p, &flags);
+ queued = task_queued(p);
+ if (queued)
+ dequeue_task(p);
+ __setscheduler(p, rq, SCHED_NORMAL, 0, false);
+ if (queued) {
+ enqueue_task(p, rq);
+ try_preempt(p, rq);
+ }
+
+ task_grq_unlock(&flags);
+ }
+ read_unlock(&tasklist_lock);
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
+ */
+struct task_struct *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPU's synchronised, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+
+void vtime_account_system_irqsafe(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ vtime_account_system(tsk);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+ vtime_account_user(prev);
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+#else
+/*
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * losing precision when the numbers are big.
+ */
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+{
+ u64 scaled;
+
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime) {
+ u64 tmp = rtime; rtime = stime; stime = tmp;
+ }
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
+ }
+
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return (__force cputime_t) scaled;
+}
+
+/*
+ * Adjust tick based cputime random precision against scheduler
+ * runtime accounting.
+ */
+static void cputime_adjust(struct task_cputime *curr,
+ struct cputime *prev,
+ cputime_t *ut, cputime_t *st)
+{
+ cputime_t rtime, stime, utime, total;
+
+ stime = curr->stime;
+ total = stime + curr->utime;
+
+ /*
+ * Tick based cputime accounting depend on random scheduling
+ * timeslices of a task to be interrupted or not by the timer.
+ * Depending on these circumstances, the number of these interrupts
+ * may be over or under-optimistic, matching the real user and system
+ * cputime with a variable precision.
+ *
+ * Fix this by scaling these tick based values against the total
+ * runtime accounted by the CFS scheduler.
+ */
+ rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that can happen, that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ if (total) {
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ } else {
+ stime = rtime;
+ utime = 0;
+ }
+
+ /*
+ * If the tick based count grows faster than the scheduler one,
+ * the result of the scaling may go backward.
+ * Let's enforce monotonicity.
+ */
+ prev->stime = max(prev->stime, stime);
+ prev->utime = max(prev->utime, utime);
+
+out:
+ *ut = prev->utime;
+ *st = prev->stime;
+}
+
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime = {
+ .sum_exec_runtime = tsk_seruntime(p),
+ };
+
+ task_cputime(p, &cputime.utime, &cputime.stime);
+ cputime_adjust(&cputime, &p->prev_cputime, ut, st);
+}
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+ cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
+}
+#endif
+
+void init_idle_bootup_task(struct task_struct *idle)
+{}
+
+#ifdef CONFIG_SCHED_DEBUG
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{}
+
+void proc_sched_set_task(struct task_struct *p)
+{}
+#endif
+
+#ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT (10)
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long smt_gain = sd->smt_gain;
+
+ smt_gain /= weight;
+
+ return smt_gain;
+}
+#endif
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h
new file mode 100644
index 000000000..876969fff
--- /dev/null
+++ b/kernel/sched/bfs_sched.h
@@ -0,0 +1,172 @@
+#include <linux/sched.h>
+#include <linux/cpuidle.h>
+
+#ifndef BFS_SCHED_H
+#define BFS_SCHED_H
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * This data should only be modified by the local cpu.
+ */
+struct rq {
+ struct task_struct *curr, *idle, *stop;
+ struct mm_struct *prev_mm;
+
+ /* Pointer to grq spinlock */
+ raw_spinlock_t *grq_lock;
+
+ /* Stored data about rq->curr to work outside grq lock */
+ u64 rq_deadline;
+ unsigned int rq_policy;
+ int rq_time_slice;
+ u64 rq_last_ran;
+ int rq_prio;
+ bool rq_running; /* There is a task running */
+ int soft_affined; /* Running or queued tasks with this set as their rq */
+#ifdef CONFIG_SMT_NICE
+ struct mm_struct *rq_mm;
+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+ /* Accurate timekeeping data */
+ u64 timekeep_clock;
+ unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
+ iowait_pc, idle_pc;
+ atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+ int cpu; /* cpu of this runqueue */
+ bool online;
+ bool scaling; /* This CPU is managed by a scaling CPU freq governor */
+ struct task_struct *sticky_task;
+
+ struct root_domain *rd;
+ struct sched_domain *sd;
+ int *cpu_locality; /* CPU relative cache distance */
+#ifdef CONFIG_SCHED_SMT
+ bool (*siblings_idle)(int cpu);
+ /* See if all smt siblings are idle */
+#endif /* CONFIG_SCHED_SMT */
+#ifdef CONFIG_SCHED_MC
+ bool (*cache_idle)(int cpu);
+ /* See if all cache siblings are idle */
+#endif /* CONFIG_SCHED_MC */
+ u64 last_niffy; /* Last time this RQ updated grq.niffies */
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif /* CONFIG_PARAVIRT */
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
+
+ u64 clock, old_clock, last_tick;
+ u64 clock_task;
+ bool dither;
+
+#ifdef CONFIG_SCHEDSTATS
+
+ /* latency stats */
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+ /* sys_sched_yield() stats */
+ unsigned int yld_count;
+
+ /* schedule() stats */
+ unsigned int sched_switch;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
+
+ /* try_to_wake_up() stats */
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_CPU_IDLE
+ /* Must be inspected within a rcu lock section */
+ struct cpuidle_state *idle_state;
+#endif
+};
+
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu);
+#endif
+
+#ifndef CONFIG_SMP
+extern struct rq *uprq;
+#define cpu_rq(cpu) (uprq)
+#define this_rq() (uprq)
+#define raw_rq() (uprq)
+#define task_rq(p) (uprq)
+#define cpu_curr(cpu) ((uprq)->curr)
+#else /* CONFIG_SMP */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define this_rq() this_cpu_ptr(&runqueues)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+#endif /* CONFIG_SMP */
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return ACCESS_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+ lockdep_assert_held(rq->grq_lock);
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ lockdep_assert_held(rq->grq_lock);
+ return rq->clock_task;
+}
+
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+static inline void sched_ttwu_pending(void) { }
+
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq;
+}
+
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+ rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ return NULL;
+}
+#endif
+#endif /* BFS_SCHED_H */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
new file mode 100644
index 000000000..c0a205101
--- /dev/null
+++ b/kernel/sched/clock.c
@@ -0,0 +1,435 @@
+/*
+ * sched_clock for unstable cpu clocks
+ *
+ * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Updates and enhancements:
+ * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
+ *
+ * Based on code by:
+ * Ingo Molnar <mingo@redhat.com>
+ * Guillaume Chazarain <guichaz@gmail.com>
+ *
+ *
+ * What:
+ *
+ * cpu_clock(i) provides a fast (execution time) high resolution
+ * clock with bounded drift between CPUs. The value of cpu_clock(i)
+ * is monotonic for constant i. The timestamp returned is in nanoseconds.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ *
+ * There is no strict promise about the base, although it tends to start
+ * at 0 on boot (but people really shouldn't rely on that).
+ *
+ * cpu_clock(i) -- can be used from any context, including NMI.
+ * local_clock() -- is cpu_clock() on the current cpu.
+ *
+ * sched_clock_cpu(i)
+ *
+ * How:
+ *
+ * The implementation either uses sched_clock() when
+ * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
+ * sched_clock() is assumed to provide these properties (mostly it means
+ * the architecture provides a globally synchronized highres time source).
+ *
+ * Otherwise it tries to create a semi stable clock from a mixture of other
+ * clocks, including:
+ *
+ * - GTOD (clock monotomic)
+ * - sched_clock()
+ * - explicit idle events
+ *
+ * We use GTOD as base and use sched_clock() deltas to improve resolution. The
+ * deltas are filtered to provide monotonicity and keeping it within an
+ * expected window.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ */
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/static_key.h>
+#include <linux/workqueue.h>
+#include <linux/compiler.h>
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __weak sched_clock(void)
+{
+ return (unsigned long long)(jiffies - INITIAL_JIFFIES)
+ * (NSEC_PER_SEC / HZ);
+}
+EXPORT_SYMBOL_GPL(sched_clock);
+
+__read_mostly int sched_clock_running;
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
+static int __sched_clock_stable_early;
+
+int sched_clock_stable(void)
+{
+ return static_key_false(&__sched_clock_stable);
+}
+
+static void __set_sched_clock_stable(void)
+{
+ if (!sched_clock_stable())
+ static_key_slow_inc(&__sched_clock_stable);
+}
+
+void set_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 1;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ __set_sched_clock_stable();
+}
+
+static void __clear_sched_clock_stable(struct work_struct *work)
+{
+ /* XXX worry about clock continuity */
+ if (sched_clock_stable())
+ static_key_slow_dec(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+
+void clear_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 0;
+
+ smp_mb(); /* matches sched_clock_init() */
+
+ if (!sched_clock_running)
+ return;
+
+ schedule_work(&sched_clock_work);
+}
+
+struct sched_clock_data {
+ u64 tick_raw;
+ u64 tick_gtod;
+ u64 clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+ return this_cpu_ptr(&sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+ return &per_cpu(sched_clock_data, cpu);
+}
+
+void sched_clock_init(void)
+{
+ u64 ktime_now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct sched_clock_data *scd = cpu_sdc(cpu);
+
+ scd->tick_raw = 0;
+ scd->tick_gtod = ktime_now;
+ scd->clock = ktime_now;
+ }
+
+ sched_clock_running = 1;
+
+ /*
+ * Ensure that it is impossible to not do a static_key update.
+ *
+ * Either {set,clear}_sched_clock_stable() must see sched_clock_running
+ * and do the update, or we must see their __sched_clock_stable_early
+ * and do the update, or both.
+ */
+ smp_mb(); /* matches {set,clear}_sched_clock_stable() */
+
+ if (__sched_clock_stable_early)
+ __set_sched_clock_stable();
+ else
+ __clear_sched_clock_stable(NULL);
+}
+
+/*
+ * min, max except they take wrapping into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+ return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+ return (s64)(x - y) > 0 ? x : y;
+}
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ * - filter out backward motion
+ * - use the GTOD tick value to create a window to filter crazy TSC values
+ */
+static u64 sched_clock_local(struct sched_clock_data *scd)
+{
+ u64 now, clock, old_clock, min_clock, max_clock;
+ s64 delta;
+
+again:
+ now = sched_clock();
+ delta = now - scd->tick_raw;
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ old_clock = scd->clock;
+
+ /*
+ * scd->clock = clamp(scd->tick_gtod + delta,
+ * max(scd->tick_gtod, scd->clock),
+ * scd->tick_gtod + TICK_NSEC);
+ */
+
+ clock = scd->tick_gtod + delta;
+ min_clock = wrap_max(scd->tick_gtod, old_clock);
+ max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+
+ clock = wrap_max(clock, min_clock);
+ clock = wrap_min(clock, max_clock);
+
+ if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ goto again;
+
+ return clock;
+}
+
+static u64 sched_clock_remote(struct sched_clock_data *scd)
+{
+ struct sched_clock_data *my_scd = this_scd();
+ u64 this_clock, remote_clock;
+ u64 *ptr, old_val, val;
+
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
+ sched_clock_local(my_scd);
+again:
+ this_clock = my_scd->clock;
+ remote_clock = scd->clock;
+#endif
+
+ /*
+ * Use the opportunity that we have both locks
+ * taken to couple the two clocks: we take the
+ * larger time as the latest time for both
+ * runqueues. (this creates monotonic movement)
+ */
+ if (likely((s64)(remote_clock - this_clock) < 0)) {
+ ptr = &scd->clock;
+ old_val = remote_clock;
+ val = this_clock;
+ } else {
+ /*
+ * Should be rare, but possible:
+ */
+ ptr = &my_scd->clock;
+ old_val = this_clock;
+ val = remote_clock;
+ }
+
+ if (cmpxchg64(ptr, old_val, val) != old_val)
+ goto again;
+
+ return val;
+}
+
+/*
+ * Similar to cpu_clock(), but requires local IRQs to be disabled.
+ *
+ * See cpu_clock().
+ */
+u64 sched_clock_cpu(int cpu)
+{
+ struct sched_clock_data *scd;
+ u64 clock;
+
+ if (sched_clock_stable())
+ return sched_clock();
+
+ if (unlikely(!sched_clock_running))
+ return 0ull;
+
+ preempt_disable_notrace();
+ scd = cpu_sdc(cpu);
+
+ if (cpu != smp_processor_id())
+ clock = sched_clock_remote(scd);
+ else
+ clock = sched_clock_local(scd);
+ preempt_enable_notrace();
+
+ return clock;
+}
+
+void sched_clock_tick(void)
+{
+ struct sched_clock_data *scd;
+ u64 now, now_gtod;
+
+ if (sched_clock_stable())
+ return;
+
+ if (unlikely(!sched_clock_running))
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ scd = this_scd();
+ now_gtod = ktime_to_ns(ktime_get());
+ now = sched_clock();
+
+ scd->tick_raw = now;
+ scd->tick_gtod = now_gtod;
+ sched_clock_local(scd);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+ sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+ if (timekeeping_suspended)
+ return;
+
+ sched_clock_tick();
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+/*
+ * As outlined at the top, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ */
+u64 cpu_clock(int cpu)
+{
+ if (!sched_clock_stable())
+ return sched_clock_cpu(cpu);
+
+ return sched_clock();
+}
+
+/*
+ * Similar to cpu_clock() for the current cpu. Time will only be observed
+ * to be monotonic if care is taken to only compare timestampt taken on the
+ * same CPU.
+ *
+ * See cpu_clock().
+ */
+u64 local_clock(void)
+{
+ if (!sched_clock_stable())
+ return sched_clock_cpu(raw_smp_processor_id());
+
+ return sched_clock();
+}
+
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void sched_clock_init(void)
+{
+ sched_clock_running = 1;
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+ if (unlikely(!sched_clock_running))
+ return 0;
+
+ return sched_clock();
+}
+
+u64 cpu_clock(int cpu)
+{
+ return sched_clock();
+}
+
+u64 local_clock(void)
+{
+ return sched_clock();
+}
+
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+EXPORT_SYMBOL_GPL(cpu_clock);
+EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+ return local_clock();
+}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000..8d0f35deb
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,317 @@
+/*
+ * Generic wait-for-completion handler;
+ *
+ * It differs from semaphores in that their default case is the opposite,
+ * wait_for_completion default blocks whereas semaphore default non-block. The
+ * interface also makes it easy to 'complete' multiple waiting threads,
+ * something which isn't entirely natural for semaphores.
+ *
+ * But more importantly, the primitive documents the usage. Semaphores would
+ * typically be used for exclusion which gives rise to priority inversion.
+ * Waiting for completion is a typically sync point, but not an exclusion point.
+ */
+
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 0);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ if (!x->done) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
+ do {
+ if (signal_pending_state(state, current)) {
+ timeout = -ERESTARTSYS;
+ break;
+ }
+ __set_current_state(state);
+ spin_unlock_irq(&x->wait.lock);
+ timeout = action(timeout);
+ spin_lock_irq(&x->wait.lock);
+ } while (!x->done && timeout);
+ __remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
+ }
+ x->done--;
+ return timeout ?: 1;
+}
+
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
+{
+ might_sleep();
+
+ spin_lock_irq(&x->wait.lock);
+ timeout = do_wait_for_common(x, action, timeout, state);
+ spin_unlock_irq(&x->wait.lock);
+ return timeout;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+ wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO (which traditionally means blkio only).
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
+ *
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+ unsigned long timeout)
+{
+ return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Return: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ /*
+ * Since x->done will need to be locked only
+ * in the non-blocking case, we check x->done
+ * first without taking the lock so we can
+ * return early in the blocking case.
+ */
+ if (!READ_ONCE(x->done))
+ return 0;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Return: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ if (!READ_ONCE(x->done))
+ return false;
+
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ spin_unlock_wait(&x->wait.lock);
+ return true;
+}
+EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
new file mode 100644
index 000000000..123673291
--- /dev/null
+++ b/kernel/sched/core.c
@@ -0,0 +1,8381 @@
+/*
+ * kernel/sched/core.c
+ *
+ * Kernel scheduler and related syscalls
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ *
+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
+ * make semaphores SMP safe
+ * 1998-11-19 Implemented schedule_timeout() and related stuff
+ * by Andrea Arcangeli
+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
+ * hybrid priority-list and round-robin design with
+ * an array-switch method of distributing timeslices
+ * and per-CPU runqueues. Cleanups and useful suggestions
+ * by Davide Libenzi, preemptible kernel bits by Robert Love.
+ * 2003-09-03 Interactivity tuning by Con Kolivas.
+ * 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
+ * fair scheduling design by Con Kolivas.
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
+ * by Peter Williams
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ * Thomas Gleixner, Mike Kravetz
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/pid_namespace.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sysctl.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/unistd.h>
+#include <linux/pagemap.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/debugfs.h>
+#include <linux/ctype.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
+#include <linux/compiler.h>
+
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+#include <asm/irq_regs.h>
+#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+#include "sched.h"
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+{
+ unsigned long delta;
+ ktime_t soft, hard, now;
+
+ for (;;) {
+ if (hrtimer_active(period_timer))
+ break;
+
+ now = hrtimer_cb_get_time(period_timer);
+ hrtimer_forward(period_timer, now, period);
+
+ soft = hrtimer_get_softexpires(period_timer);
+ hard = hrtimer_get_expires(period_timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(period_timer, soft, delta,
+ HRTIMER_MODE_ABS_PINNED, 0);
+ }
+}
+
+DEFINE_MUTEX(sched_domains_mutex);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+void update_rq_clock(struct rq *rq)
+{
+ s64 delta;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_skip_update & RQCF_ACT_SKIP)
+ return;
+
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+/*
+ * Debugging: various feature bits
+ */
+
+#define SCHED_FEAT(name, enabled) \
+ (1UL << __SCHED_FEAT_##name) * enabled |
+
+const_debug unsigned int sysctl_sched_features =
+#include "features.h"
+ 0;
+
+#undef SCHED_FEAT
+
+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_FEAT(name, enabled) \
+ #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (!(sysctl_sched_features & (1UL << i)))
+ seq_puts(m, "NO_");
+ seq_printf(m, "%s ", sched_feat_names[i]);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled) \
+ jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+ if (static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_dec(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+ if (!static_key_enabled(&sched_feat_keys[i]))
+ static_key_slow_inc(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+ int i;
+ int neg = 0;
+
+ if (strncmp(cmp, "NO_", 3) == 0) {
+ neg = 1;
+ cmp += 3;
+ }
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (strcmp(cmp, sched_feat_names[i]) == 0) {
+ if (neg) {
+ sysctl_sched_features &= ~(1UL << i);
+ sched_feat_disable(i);
+ } else {
+ sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
+ }
+ break;
+ }
+ }
+
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+ struct inode *inode;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ mutex_lock(&inode->i_mutex);
+ i = sched_feat_set(cmp);
+ mutex_unlock(&inode->i_mutex);
+ if (i == __SCHED_FEAT_NR)
+ return -EINVAL;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+ .open = sched_feat_open,
+ .write = sched_feat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+ debugfs_create_file("sched_features", 0644, NULL, NULL,
+ &sched_feat_fops);
+
+ return 0;
+}
+late_initcall(sched_init_debug);
+#endif /* CONFIG_SCHED_DEBUG */
+
+/*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
+/*
+ * period over which we measure -rt task cpu usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+__read_mostly int scheduler_running;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
+/*
+ * this_rq_lock - lock this runqueue and disable interrupts.
+ */
+static struct rq *this_rq_lock(void)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ raw_spin_lock(&rq->lock);
+
+ return rq;
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ */
+
+static void hrtick_clear(struct rq *rq)
+{
+ if (hrtimer_active(&rq->hrtick_timer))
+ hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+ struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ raw_spin_unlock(&rq->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_SMP
+
+static int __hrtick_restart(struct rq *rq)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = hrtimer_get_softexpires(timer);
+
+ return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+}
+
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
+{
+ struct rq *rq = arg;
+
+ raw_spin_lock(&rq->lock);
+ __hrtick_restart(rq);
+ rq->hrtick_csd_pending = 0;
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+ struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time;
+ s64 delta;
+
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense and can cause timer DoS.
+ */
+ delta = max_t(s64, delay, 10000LL);
+ time = ktime_add_ns(timer->base->get_time(), delta);
+
+ hrtimer_set_expires(timer, time);
+
+ if (rq == this_rq()) {
+ __hrtick_restart(rq);
+ } else if (!rq->hrtick_csd_pending) {
+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
+ rq->hrtick_csd_pending = 1;
+ }
+}
+
+static int
+hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (int)(long)hcpu;
+
+ switch (action) {
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ hrtick_clear(cpu_rq(cpu));
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static __init void init_hrtick(void)
+{
+ hotcpu_notifier(hotplug_hrtick, 0);
+}
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ delay = max_t(u64, delay, 10000LL);
+ __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static inline void init_hrtick(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void init_rq_hrtick(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ rq->hrtick_csd_pending = 0;
+
+ rq->hrtick_csd.flags = 0;
+ rq->hrtick_csd.func = __hrtick_start;
+ rq->hrtick_csd.info = rq;
+#endif
+
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrtick_timer.function = hrtick;
+}
+#else /* CONFIG_SCHED_HRTICK */
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+
+static inline void init_hrtick(void)
+{
+}
+#endif /* CONFIG_SCHED_HRTICK */
+
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val) \
+({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+/*
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_curr(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ int cpu;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (test_tsk_need_resched(curr))
+ return;
+
+ cpu = cpu_of(rq);
+
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(curr);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(curr))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!raw_spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_curr(rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(int pinned)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ return cpu;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+ return cpu;
+}
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+static void wake_up_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (cpu == smp_processor_id())
+ return;
+
+ if (set_nr_and_not_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ /*
+ * We just need the target to call irq_exit() and re-evaluate
+ * the next tick. The nohz full kick at least implies that.
+ * If needed we can still optimize that later with an
+ * empty IRQ.
+ */
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ tick_nohz_full_kick_cpu(cpu);
+ return true;
+ }
+
+ return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
+}
+
+static inline bool got_nohz_idle_kick(void)
+{
+ int cpu = smp_processor_id();
+
+ if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ return false;
+
+ if (idle_cpu(cpu) && !need_resched())
+ return true;
+
+ /*
+ * We can't run Idle Load Balance on this CPU for this time so we
+ * cancel it and clear NOHZ_BALANCE_KICK
+ */
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+ return false;
+}
+
+#else /* CONFIG_NO_HZ_COMMON */
+
+static inline bool got_nohz_idle_kick(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+ /*
+ * FIFO realtime policy runs the highest priority task. Other runnable
+ * tasks are of a lower priority. The scheduler tick does nothing.
+ */
+ if (current->policy == SCHED_FIFO)
+ return true;
+
+ /*
+ * Round-robin realtime tasks time slice with other tasks at the same
+ * realtime priority. Is this task the only one at this priority?
+ */
+ if (current->policy == SCHED_RR) {
+ struct sched_rt_entity *rt_se = &current->rt;
+
+ return rt_se->run_list.prev == rt_se->run_list.next;
+ }
+
+ /*
+ * More than one running task need preemption.
+ * nr_running update is assumed to be visible
+ * after IPI is sent from wakers.
+ */
+ if (this_rq()->nr_running > 1)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
+
+void sched_avg_update(struct rq *rq)
+{
+ s64 period = sched_avg_period();
+
+ while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (rq->age_stamp));
+ rq->age_stamp += period;
+ rq->rt_avg /= 2;
+ }
+}
+
+#endif /* CONFIG_SMP */
+
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+/*
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+int walk_tg_tree_from(struct task_group *from,
+ tg_visitor down, tg_visitor up, void *data)
+{
+ struct task_group *parent, *child;
+ int ret;
+
+ parent = from;
+
+down:
+ ret = (*down)(parent, data);
+ if (ret)
+ goto out;
+ list_for_each_entry_rcu(child, &parent->children, siblings) {
+ parent = child;
+ goto down;
+
+up:
+ continue;
+ }
+ ret = (*up)(parent, data);
+ if (ret || parent == from)
+ goto out;
+
+ child = parent;
+ parent = parent->parent;
+ if (parent)
+ goto up;
+out:
+ return ret;
+}
+
+int tg_nop(struct task_group *tg, void *data)
+{
+ return 0;
+}
+#endif
+
+static void set_load_weight(struct task_struct *p)
+{
+ int prio = p->static_prio - MAX_RT_PRIO;
+ struct load_weight *load = &p->se.load;
+
+ /*
+ * SCHED_IDLE tasks get minimal weight:
+ */
+ if (p->policy == SCHED_IDLE) {
+ load->weight = scale_load(WEIGHT_IDLEPRIO);
+ load->inv_weight = WMULT_IDLEPRIO;
+ return;
+ }
+
+ load->weight = scale_load(prio_to_weight[prio]);
+ load->inv_weight = prio_to_wmult[prio];
+}
+
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_rq_clock(rq);
+ sched_info_queued(rq, p);
+ p->sched_class->enqueue_task(rq, p, flags);
+}
+
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_rq_clock(rq);
+ sched_info_dequeued(rq, p);
+ p->sched_class->dequeue_task(rq, p, flags);
+}
+
+void activate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible--;
+
+ enqueue_task(rq, p, flags);
+}
+
+void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible++;
+
+ dequeue_task(rq, p, flags);
+}
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+ delta -= steal;
+ }
+#endif
+
+ rq->clock_task += delta;
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
+}
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
+}
+
+/*
+ * __normal_prio - return the priority that is based on the static prio
+ */
+static inline int __normal_prio(struct task_struct *p)
+{
+ return p->static_prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ int prio;
+
+ if (task_has_dl_policy(p))
+ prio = MAX_DL_PRIO-1;
+ else if (task_has_rt_policy(p))
+ prio = MAX_RT_PRIO-1 - p->rt_priority;
+ else
+ prio = __normal_prio(p);
+ return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+ return cpu_curr(task_cpu(p)) == p;
+}
+
+/*
+ * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ */
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio)
+{
+ if (prev_class != p->sched_class) {
+ if (prev_class->switched_from)
+ prev_class->switched_from(rq, p);
+ /* Possble rq->lock 'hole'. */
+ p->sched_class->switched_to(rq, p);
+ } else if (oldprio != p->prio || dl_task(p))
+ p->sched_class->prio_changed(rq, p, oldprio);
+}
+
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+ const struct sched_class *class;
+
+ if (p->sched_class == rq->curr->sched_class) {
+ rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ } else {
+ for_each_class(class) {
+ if (class == rq->curr->sched_class)
+ break;
+ if (class == p->sched_class) {
+ resched_curr(rq);
+ break;
+ }
+ }
+ }
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
+ rq_clock_skip_update(rq, true);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * We should never call set_task_cpu() on a blocked task,
+ * ttwu() will sort out the placement.
+ */
+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
+ !p->on_rq);
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * sched_move_task() holds both and thus holding either pins the cgroup,
+ * see task_group().
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
+#endif
+
+ trace_sched_migrate_task(p, new_cpu);
+
+ if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
+ p->se.nr_migrations++;
+ perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+ }
+
+ __set_task_cpu(p, new_cpu);
+}
+
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+ if (task_on_rq_queued(p)) {
+ struct rq *src_rq, *dst_rq;
+
+ src_rq = task_rq(p);
+ dst_rq = cpu_rq(cpu);
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu);
+ activate_task(dst_rq, p, 0);
+ check_preempt_curr(dst_rq, p, 0);
+ } else {
+ /*
+ * Task isn't running anymore; make it appear like we migrated
+ * it before it went to sleep. This means on wakeup we make the
+ * previous cpu our targer instead of where it really is.
+ */
+ p->wake_cpu = cpu;
+ }
+}
+
+struct migration_swap_arg {
+ struct task_struct *src_task, *dst_task;
+ int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+ struct migration_swap_arg *arg = data;
+ struct rq *src_rq, *dst_rq;
+ int ret = -EAGAIN;
+
+ src_rq = cpu_rq(arg->src_cpu);
+ dst_rq = cpu_rq(arg->dst_cpu);
+
+ double_raw_lock(&arg->src_task->pi_lock,
+ &arg->dst_task->pi_lock);
+ double_rq_lock(src_rq, dst_rq);
+ if (task_cpu(arg->dst_task) != arg->dst_cpu)
+ goto unlock;
+
+ if (task_cpu(arg->src_task) != arg->src_cpu)
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+ goto unlock;
+
+ if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+ goto unlock;
+
+ __migrate_swap_task(arg->src_task, arg->dst_cpu);
+ __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+ ret = 0;
+
+unlock:
+ double_rq_unlock(src_rq, dst_rq);
+ raw_spin_unlock(&arg->dst_task->pi_lock);
+ raw_spin_unlock(&arg->src_task->pi_lock);
+
+ return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+ struct migration_swap_arg arg;
+ int ret = -EINVAL;
+
+ arg = (struct migration_swap_arg){
+ .src_task = cur,
+ .src_cpu = task_cpu(cur),
+ .dst_task = p,
+ .dst_cpu = task_cpu(p),
+ };
+
+ if (arg.src_cpu == arg.dst_cpu)
+ goto out;
+
+ /*
+ * These three tests are all lockless; this is OK since all of them
+ * will be re-checked with proper locks held further down the line.
+ */
+ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+ goto out;
+
+ trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
+ ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+ return ret;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+static int migration_cpu_stop(void *data);
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+ unsigned long flags;
+ int running, queued;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_running()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_running(rq, p)) {
+ if (match_state && unlikely(p->state != match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &flags);
+ trace_sched_wait_task(p);
+ running = task_running(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ task_rq_unlock(rq, p, &flags);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = task_cpu(p);
+ if ((cpu != smp_processor_id()) && task_curr(p))
+ smp_send_reschedule(cpu);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ */
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
+ enum { cpuset, possible, fail } state = cpuset;
+ int dest_cpu;
+
+ /*
+ * If the node that the cpu is on has been offlined, cpu_to_node()
+ * will return -1. There is no cpu on the node, and we should
+ * select the cpu on the other node.
+ */
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return dest_cpu;
+ }
+ }
+
+ for (;;) {
+ /* Any allowed, online CPU? */
+ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ goto out;
+ }
+
+ switch (state) {
+ case cpuset:
+ /* No more Mr. Nice Guy. */
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+
+ case possible:
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+
+ case fail:
+ BUG();
+ break;
+ }
+ }
+
+out:
+ if (state != cpuset) {
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
+ }
+
+ return dest_cpu;
+}
+
+/*
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ */
+static inline
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+{
+ if (p->nr_cpus_allowed > 1)
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+
+ /*
+ * In order not to call set_task_cpu() on a blocking task we need
+ * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * cpu.
+ *
+ * Since this is common to all placement strategies, this lives here.
+ *
+ * [ this allows ->select_task() to simply return task_cpu(p) and
+ * not worry about this generic constraint ]
+ */
+ if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
+ !cpu_online(cpu)))
+ cpu = select_fallback_rq(task_cpu(p), p);
+
+ return cpu;
+}
+
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
+#endif
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+#ifdef CONFIG_SCHEDSTATS
+ struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ } else {
+ struct sched_domain *sd;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ if (wake_flags & WF_MIGRATED)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
+#endif /* CONFIG_SMP */
+
+ schedstat_inc(rq, ttwu_count);
+ schedstat_inc(p, se.statistics.nr_wakeups);
+
+ if (wake_flags & WF_SYNC)
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
+
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
+ activate_task(rq, p, en_flags);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ check_preempt_curr(rq, p, wake_flags);
+ trace_sched_wakeup(p, true);
+
+ p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_woken)
+ p->sched_class->task_woken(rq, p);
+
+ if (rq->idle_stamp) {
+ u64 delta = rq_clock(rq) - rq->idle_stamp;
+ u64 max = 2*rq->max_idle_balance_cost;
+
+ update_avg(&rq->avg_idle, delta);
+
+ if (rq->avg_idle > max)
+ rq->avg_idle = max;
+
+ rq->idle_stamp = 0;
+ }
+#endif
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+#endif
+
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (task_on_rq_queued(p)) {
+ /* check_preempt_curr() may use rq clock */
+ update_rq_clock(rq);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct llist_node *llist = llist_del_all(&rq->wake_list);
+ struct task_struct *p;
+ unsigned long flags;
+
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ while (llist) {
+ p = llist_entry(llist, struct task_struct, wake_entry);
+ llist = llist_next(llist);
+ ttwu_do_activate(rq, p, 0);
+ }
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void scheduler_ipi(void)
+{
+ /*
+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+ * TIF_NEED_RESCHED remotely (for the first time) will also send
+ * this IPI.
+ */
+ preempt_fold_need_resched();
+
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ return;
+
+ /*
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already do call them, luckily irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
+ */
+ irq_enter();
+ sched_ttwu_pending();
+
+ /*
+ * Check if someone kicked us for doing the nohz idle load balance.
+ */
+ if (unlikely(got_nohz_idle_kick())) {
+ this_rq()->idle_balance = 1;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
+ irq_exit();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ if (set_nr_if_polling(rq->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ } else {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (is_idle_task(rq->curr))
+ smp_send_reschedule(cpu);
+ /* Else cpu is not in idle, do nothing here */
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+ if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ ttwu_queue_remote(p, cpu);
+ return;
+ }
+#endif
+
+ raw_spin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, 0);
+ raw_spin_unlock(&rq->lock);
+}
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Return: %true if @p was woken up, %false if it was already running.
+ * or @state didn't match @p's state.
+ */
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+ unsigned long flags;
+ int cpu, success = 0;
+
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ if (!(p->state & state))
+ goto out;
+
+ success = 1; /* we're going to change ->state */
+ cpu = task_cpu(p);
+
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
+
+#ifdef CONFIG_SMP
+ /*
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until its done referencing the task.
+ */
+ while (p->on_cpu)
+ cpu_relax();
+ /*
+ * Pairs with the smp_wmb() in finish_lock_switch().
+ */
+ smp_rmb();
+
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
+ if (p->sched_class->task_waking)
+ p->sched_class->task_waking(p);
+
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ if (task_cpu(p) != cpu) {
+ wake_flags |= WF_MIGRATED;
+ set_task_cpu(p, cpu);
+ }
+#endif /* CONFIG_SMP */
+
+ ttwu_queue(p, cpu);
+stat:
+ ttwu_stat(p, cpu, wake_flags);
+out:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ }
+
+ if (!(p->state & TASK_NORMAL))
+ goto out;
+
+ if (!task_on_rq_queued(p))
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+ ttwu_do_wakeup(rq, p, 0);
+ ttwu_stat(p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+ return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ *
+ * __sched_fork() is basic setup used by init_idle() too:
+ */
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+{
+ p->on_rq = 0;
+
+ p->se.on_rq = 0;
+ p->se.exec_start = 0;
+ p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
+ p->se.nr_migrations = 0;
+ p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+ p->se.avg.decay_count = 0;
+#endif
+ INIT_LIST_HEAD(&p->se.group_node);
+
+#ifdef CONFIG_SCHEDSTATS
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+#endif
+
+ RB_CLEAR_NODE(&p->dl.rb_node);
+ init_dl_task_timer(&p->dl);
+ __dl_clear_params(p);
+
+ INIT_LIST_HEAD(&p->rt.run_list);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ p->mm->numa_scan_seq = 0;
+ }
+
+ if (clone_flags & CLONE_VM)
+ p->numa_preferred_nid = current->numa_preferred_nid;
+ else
+ p->numa_preferred_nid = -1;
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ p->numa_group = NULL;
+#endif /* CONFIG_NUMA_BALANCING */
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_SCHED_DEBUG
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sched_feat_set("NUMA");
+ else
+ sched_feat_set("NO_NUMA");
+}
+#else
+__read_mostly bool numabalancing_enabled;
+
+void set_numabalancing_state(bool enabled)
+{
+ numabalancing_enabled = enabled;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = numabalancing_enabled;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_numabalancing_state(state);
+ return err;
+}
+#endif
+#endif
+
+/*
+ * fork()/clone()-time setup:
+ */
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
+{
+ unsigned long flags;
+ int cpu = get_cpu();
+
+ __sched_fork(clone_flags, p);
+ /*
+ * We mark the process as running here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_RUNNING;
+
+ /*
+ * Make sure we do not leak PI boosting priority to the child.
+ */
+ p->prio = current->normal_prio;
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->policy = SCHED_NORMAL;
+ p->static_prio = NICE_TO_PRIO(0);
+ p->rt_priority = 0;
+ } else if (PRIO_TO_NICE(p->static_prio) < 0)
+ p->static_prio = NICE_TO_PRIO(0);
+
+ p->prio = p->normal_prio = __normal_prio(p);
+ set_load_weight(p);
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
+ if (dl_prio(p->prio)) {
+ put_cpu();
+ return -EAGAIN;
+ } else if (rt_prio(p->prio)) {
+ p->sched_class = &rt_sched_class;
+ } else {
+ p->sched_class = &fair_sched_class;
+ }
+
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+
+ /*
+ * The child is not yet in the pid-hash so no cgroup attach races,
+ * and the cgroup is pinned to this child due to cgroup_fork()
+ * is ran before sched_fork().
+ *
+ * Silence PROVE_RCU.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ set_task_cpu(p, cpu);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ if (likely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#if defined(CONFIG_SMP)
+ p->on_cpu = 0;
+#endif
+ init_task_preempt_count(p);
+#ifdef CONFIG_SMP
+ plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
+#endif
+
+ put_cpu();
+ return 0;
+}
+
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 20;
+
+ /*
+ * Doing this here saves a lot of checks in all
+ * the calling paths, and returning zero seems
+ * safe for them anyway.
+ */
+ if (period == 0)
+ return 0;
+
+ return div64_u64(runtime << 20, period);
+}
+
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+ rcu_lockdep_assert(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ int cpus = 0;
+
+ rcu_lockdep_assert(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cpus++;
+
+ return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ return 1;
+}
+#endif
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ u64 period = attr->sched_period ?: attr->sched_deadline;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1;
+
+ if (new_bw == p->dl.dl_bw)
+ return 0;
+
+ /*
+ * Either if a task, enters, leave, or stays -deadline but changes
+ * its parameters, we may need to update accordingly the total
+ * allocated bandwidth of the container.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(task_cpu(p));
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_SMP
+ /*
+ * Fork balancing, do it here and not earlier because:
+ * - cpus_allowed can change in the fork path
+ * - any previously selected cpu might disappear through hotplug
+ */
+ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+#endif
+
+ /* Initialize new task's runnable average */
+ init_task_runnable_average(p);
+ rq = __task_rq_lock(p);
+ activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ trace_sched_wakeup_new(p, true);
+ check_preempt_curr(rq, p, WF_FORK);
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_woken)
+ p->sched_class->task_woken(rq, p);
+#endif
+ task_rq_unlock(rq, p, &flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ trace_sched_switch(prev, next);
+ sched_info_switch(rq, prev, next);
+ perf_event_task_sched_out(prev, next);
+ fire_sched_out_preempt_notifiers(prev, next);
+ prepare_lock_switch(rq, next);
+ prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
+ */
+static struct rq *finish_task_switch(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq = this_rq();
+ struct mm_struct *mm = rq->prev_mm;
+ long prev_state;
+
+ rq->prev_mm = NULL;
+
+ /*
+ * A task struct has one reference for the use as "current".
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+ * The test for TASK_DEAD must occur while the runqueue locks are
+ * still held, otherwise prev could be scheduled on another cpu, die
+ * there before we look at prev->state, and then the reference would
+ * be dropped twice.
+ * Manfred Spraul <manfred@colorfullife.com>
+ */
+ prev_state = prev->state;
+ vtime_task_switch(prev);
+ finish_arch_switch(prev);
+ perf_event_task_sched_in(prev, current);
+ finish_lock_switch(rq, prev);
+ finish_arch_post_lock_switch();
+
+ fire_sched_in_preempt_notifiers(current);
+ if (mm)
+ mmdrop(mm);
+ if (unlikely(prev_state == TASK_DEAD)) {
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
+
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);
+ put_task_struct(prev);
+ }
+
+ tick_nohz_task_switch(current);
+ return rq;
+}
+
+#ifdef CONFIG_SMP
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+ if (rq->post_schedule) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->curr->sched_class->post_schedule)
+ rq->curr->sched_class->post_schedule(rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ rq->post_schedule = 0;
+ }
+}
+
+#else
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
+ __releases(rq->lock)
+{
+ struct rq *rq;
+
+ /* finish_task_switch() drops rq->lock and enables preemtion */
+ preempt_disable();
+ rq = finish_task_switch(prev);
+ post_schedule(rq);
+ preempt_enable();
+
+ if (current->set_child_tid)
+ put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct mm_struct *mm, *oldmm;
+
+ prepare_task_switch(rq, prev, next);
+
+ mm = next->mm;
+ oldmm = prev->active_mm;
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_start_context_switch(prev);
+
+ if (!mm) {
+ next->active_mm = oldmm;
+ atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next);
+ } else
+ switch_mm(oldmm, mm, next);
+
+ if (!prev->mm) {
+ prev->active_mm = NULL;
+ rq->prev_mm = oldmm;
+ }
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+
+ context_tracking_task_switch(prev, next);
+ /* Here we just switch the register state and the stack. */
+ switch_to(prev, next, prev);
+ barrier();
+
+ return finish_task_switch(prev);
+}
+
+/*
+ * nr_running and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_running;
+
+ return sum;
+}
+
+/*
+ * Check if only the current task is running on the cpu.
+ */
+bool single_task_running(void)
+{
+ if (cpu_rq(smp_processor_id())->nr_running == 1)
+ return true;
+ else
+ return false;
+}
+EXPORT_SYMBOL(single_task_running);
+
+unsigned long long nr_context_switches(void)
+{
+ int i;
+ unsigned long long sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += cpu_rq(i)->nr_switches;
+
+ return sum;
+}
+
+unsigned long nr_iowait(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_possible_cpu(i)
+ sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+ return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+ struct rq *this = cpu_rq(cpu);
+ return atomic_read(&this->nr_iowait);
+}
+
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+ struct rq *this = this_rq();
+ *nr_waiters = atomic_read(&this->nr_iowait);
+ *load = this->cpu_load[0];
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
+ */
+void sched_exec(void)
+{
+ struct task_struct *p = current;
+ unsigned long flags;
+ int dest_cpu;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ if (dest_cpu == smp_processor_id())
+ goto unlock;
+
+ if (likely(cpu_active(dest_cpu))) {
+ struct migration_arg arg = { p, dest_cpu };
+
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+ return;
+ }
+unlock:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that have not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+ unsigned long flags;
+ struct rq *rq;
+ u64 ns;
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+ /*
+ * 64-bit doesn't need locks to atomically read a 64bit value.
+ * So we have a optimization chance when the task's delta_exec is 0.
+ * Reading ->on_cpu is racy, but this is ok.
+ *
+ * If we race with it leaving cpu, we'll take a lock. So we're correct.
+ * If we race with it entering cpu, unaccounted time is 0. This is
+ * indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
+ */
+ if (!p->on_cpu || !task_on_rq_queued(p))
+ return p->se.sum_exec_runtime;
+#endif
+
+ rq = task_rq_lock(p, &flags);
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ p->sched_class->update_curr(rq);
+ }
+ ns = p->se.sum_exec_runtime;
+ task_rq_unlock(rq, p, &flags);
+
+ return ns;
+}
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *curr = rq->curr;
+
+ sched_clock_tick();
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ curr->sched_class->task_tick(rq, curr, 0);
+ update_cpu_load_active(rq);
+ raw_spin_unlock(&rq->lock);
+
+ perf_event_task_tick();
+
+#ifdef CONFIG_SMP
+ rq->idle_balance = idle_cpu(cpu);
+ trigger_load_balance(rq);
+#endif
+ rq_last_tick_reset(rq);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = ACCESS_ONCE(jiffies);
+
+ next = rq->last_sched_tick + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_nsecs(next - now);
+}
+#endif
+
+notrace unsigned long get_parent_ip(unsigned long addr)
+{
+ if (in_lock_functions(addr)) {
+ addr = CALLER_ADDR2;
+ if (in_lock_functions(addr))
+ addr = CALLER_ADDR3;
+ }
+ return addr;
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_PREEMPT_TRACER))
+
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
+#endif
+ __preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Spinlock count overflowing soon?
+ */
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+ PREEMPT_MASK - 10);
+#endif
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Underflow?
+ */
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
+ /*
+ * Is the spinlock portion underflowing?
+ */
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+#endif
+
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ __preempt_count_sub(val);
+}
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
+
+#endif
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+ if (oops_in_progress)
+ return;
+
+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+ prev->comm, prev->pid, preempt_count());
+
+ debug_show_held_locks(prev);
+ print_modules();
+ if (irqs_disabled())
+ print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+ BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+#endif
+ /*
+ * Test if we are atomic. Since do_exit() needs to call into
+ * schedule() atomically, we ignore that path. Otherwise whine
+ * if we are scheduling when we should not.
+ */
+ if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+ __schedule_bug(prev);
+ rcu_sleep_check();
+
+ profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+ schedstat_inc(this_rq(), sched_count);
+}
+
+/*
+ * Pick up the highest-prio task:
+ */
+static inline struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev)
+{
+ const struct sched_class *class = &fair_sched_class;
+ struct task_struct *p;
+
+ /*
+ * Optimization: we know that if all tasks are in
+ * the fair class we can call that function directly:
+ */
+ if (likely(prev->sched_class == class &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev);
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
+ }
+
+again:
+ for_each_class(class) {
+ p = class->pick_next_task(rq, prev);
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+ return p;
+ }
+ }
+
+ BUG(); /* the idle class will always have a runnable task */
+}
+
+/*
+ * __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
+ */
+static void __sched __schedule(void)
+{
+ struct task_struct *prev, *next;
+ unsigned long *switch_count;
+ struct rq *rq;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+ rcu_note_context_switch();
+ prev = rq->curr;
+
+ schedule_debug(prev);
+
+ if (sched_feat(HRTICK))
+ hrtick_clear(rq);
+
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
+ raw_spin_lock_irq(&rq->lock);
+
+ rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
+ switch_count = &prev->nivcsw;
+ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ if (unlikely(signal_pending_state(prev->state, prev))) {
+ prev->state = TASK_RUNNING;
+ } else {
+ deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ prev->on_rq = 0;
+
+ /*
+ * If a worker went to sleep, notify and ask workqueue
+ * whether it wants to wake up a task to maintain
+ * concurrency.
+ */
+ if (prev->flags & PF_WQ_WORKER) {
+ struct task_struct *to_wakeup;
+
+ to_wakeup = wq_worker_sleeping(prev, cpu);
+ if (to_wakeup)
+ try_to_wake_up_local(to_wakeup);
+ }
+ }
+ switch_count = &prev->nvcsw;
+ }
+
+ if (task_on_rq_queued(prev))
+ update_rq_clock(rq);
+
+ next = pick_next_task(rq, prev);
+ clear_tsk_need_resched(prev);
+ clear_preempt_need_resched();
+ rq->clock_skip_update = 0;
+
+ if (likely(prev != next)) {
+ rq->nr_switches++;
+ rq->curr = next;
+ ++*switch_count;
+
+ rq = context_switch(rq, prev, next); /* unlocks the rq */
+ cpu = cpu_of(rq);
+ } else
+ raw_spin_unlock_irq(&rq->lock);
+
+ post_schedule(rq);
+
+ sched_preempt_enable_no_resched();
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+ if (!tsk->state || tsk_is_pi_blocked(tsk))
+ return;
+ /*
+ * If we are going to sleep and we have plugged IO queued,
+ * make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(tsk))
+ blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+ sched_submit_work(tsk);
+ do {
+ __schedule();
+ } while (need_resched());
+}
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
+{
+ /*
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != CONTEXT_USER, but that will trigger
+ * too frequently to make sense yet.
+ */
+ enum ctx_state prev_state = exception_enter();
+ schedule();
+ exception_exit(prev_state);
+}
+#endif
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+ sched_preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+}
+
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule(void)
+{
+ /*
+ * If there is a non-zero preempt_count or interrupts are disabled,
+ * we do not want to preempt the current task. Just return..
+ */
+ if (likely(!preemptible()))
+ return;
+
+ preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule();
+ exception_exit(prev_ctx);
+
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ barrier();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and return with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage __visible void __sched preempt_schedule_irq(void)
+{
+ enum ctx_state prev_state;
+
+ /* Catch callers which need to be fixed */
+ BUG_ON(preempt_count() || !irqs_disabled());
+
+ prev_state = exception_enter();
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ local_irq_enable();
+ __schedule();
+ local_irq_disable();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+
+ exception_exit(prev_state);
+}
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+ void *key)
+{
+ return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+ int oldprio, queued, running, enqueue_flag = 0;
+ struct rq *rq;
+ const struct sched_class *prev_class;
+
+ BUG_ON(prio > MAX_PRIO);
+
+ rq = __task_rq_lock(p);
+
+ /*
+ * Idle task boosting is a nono in general. There is one
+ * exception, when PREEMPT_RT and NOHZ is active:
+ *
+ * The idle task calls get_next_timer_interrupt() and holds
+ * the timer wheel base->lock on the CPU and another CPU wants
+ * to access the timer (probably to cancel it). We can safely
+ * ignore the boosting request, as the idle CPU runs this code
+ * with interrupts disabled and will complete the lock
+ * protected section without being interrupted. So there is no
+ * real need to boost.
+ */
+ if (unlikely(p == rq->idle)) {
+ WARN_ON(p != rq->curr);
+ WARN_ON(p->pi_blocked_on);
+ goto out_unlock;
+ }
+
+ trace_sched_pi_setprio(p, prio);
+ oldprio = p->prio;
+ prev_class = p->sched_class;
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, 0);
+ if (running)
+ put_prev_task(rq, p);
+
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.dl_boosted = 1;
+ p->dl.dl_throttled = 0;
+ enqueue_flag = ENQUEUE_REPLENISH;
+ } else
+ p->dl.dl_boosted = 0;
+ p->sched_class = &dl_sched_class;
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
+ if (oldprio < prio)
+ enqueue_flag = ENQUEUE_HEAD;
+ p->sched_class = &rt_sched_class;
+ } else {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
+ p->sched_class = &fair_sched_class;
+ }
+
+ p->prio = prio;
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (queued)
+ enqueue_task(rq, p, enqueue_flag);
+
+ check_class_changed(rq, p, prev_class, oldprio);
+out_unlock:
+ __task_rq_unlock(rq);
+}
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ int old_prio, delta, queued;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ rq = task_rq_lock(p, &flags);
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it wont have any effect on scheduling until the task is
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
+ */
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ goto out_unlock;
+ }
+ queued = task_on_rq_queued(p);
+ if (queued)
+ dequeue_task(rq, p, 0);
+
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ delta = p->prio - old_prio;
+
+ if (queued) {
+ enqueue_task(rq, p, 0);
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
+ resched_curr(rq);
+ }
+out_unlock:
+ task_rq_unlock(rq, p, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+ capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle)
+ return 0;
+
+ if (rq->nr_running)
+ return 0;
+
+#ifdef CONFIG_SMP
+ if (!llist_empty(&rq->wake_list))
+ return 0;
+#endif
+
+ return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
+static void
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time; so admission control is
+ * not on the safe side. It does however guarantee tasks will never
+ * consume more than promised.
+ */
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == SETPARAM_POLICY)
+ policy = p->policy;
+
+ p->policy = policy;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
+ p->normal_prio = normal_prio(p);
+ set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr, bool keep_boost)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * Keep a potential priority boosting if called from
+ * sched_setscheduler().
+ */
+ if (keep_boost)
+ p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
+ else
+ p->prio = normal_prio(p);
+
+ if (dl_prio(p->prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(p->prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+}
+
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
+}
+
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+ rcu_read_unlock();
+ return match;
+}
+
+static bool dl_param_changed(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != attr->sched_flags)
+ return true;
+
+ return false;
+}
+
+static int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user)
+{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
+ int retval, oldprio, oldpolicy = -1, queued, running;
+ int new_effective_prio, policy = attr->sched_policy;
+ unsigned long flags;
+ const struct sched_class *prev_class;
+ struct rq *rq;
+ int reset_on_fork;
+
+ /* may grab non-irq protected spin_locks */
+ BUG_ON(in_interrupt());
+recheck:
+ /* double check policy once rq lock held */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
+
+ if (policy != SCHED_DEADLINE &&
+ policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_IDLE)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ return -EINVAL;
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLE is 0.
+ */
+ if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+ return -EINVAL;
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
+ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (user && !capable(CAP_SYS_NICE)) {
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !can_nice(p, attr->sched_nice))
+ return -EPERM;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* can't set/change the rt policy */
+ if (policy != p->policy && !rlim_rtprio)
+ return -EPERM;
+
+ /* can't increase priority */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ return -EPERM;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ return -EPERM;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (!can_nice(p, task_nice(p)))
+ return -EPERM;
+ }
+
+ /* can't change other user's priorities */
+ if (!check_same_owner(p))
+ return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
+ }
+
+ if (user) {
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ *
+ * To be able to change p->policy safely, the appropriate
+ * runqueue lock must be held.
+ */
+ rq = task_rq_lock(p, &flags);
+
+ /*
+ * Changing the policy of the stop threads its a very bad idea
+ */
+ if (p == rq->stop) {
+ task_rq_unlock(rq, p, &flags);
+ return -EINVAL;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+ */
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy) && dl_param_changed(p, attr))
+ goto change;
+
+ p->sched_reset_on_fork = reset_on_fork;
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+change:
+
+ if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
+ task_rq_unlock(rq, p, &flags);
+ return -EPERM;
+ }
+#endif
+#ifdef CONFIG_SMP
+ if (dl_bandwidth_enabled() && dl_policy(policy)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, &p->cpus_allowed) ||
+ rq->rd->dl_bw.bw == 0) {
+ task_rq_unlock(rq, p, &flags);
+ return -EPERM;
+ }
+ }
+#endif
+ }
+
+ /* recheck policy now with rq lock held */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ task_rq_unlock(rq, p, &flags);
+ goto recheck;
+ }
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+ task_rq_unlock(rq, p, &flags);
+ return -EBUSY;
+ }
+
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, 0);
+ if (running)
+ put_prev_task(rq, p);
+
+ prev_class = p->sched_class;
+ __setscheduler(rq, p, attr, true);
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ }
+
+ check_class_changed(rq, p, prev_class, oldprio);
+ task_rq_unlock(rq, p, &flags);
+
+ rt_mutex_adjust_pi(p);
+
+ return 0;
+}
+
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check);
+}
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, false);
+}
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setscheduler(p, policy, &lparam);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * XXX: do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+ struct sched_param __user *, param)
+{
+ /* negative values for policy are not valid */
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setattr(p, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (p) {
+ retval = security_task_getscheduler(p);
+ if (!retval)
+ retval = p->policy
+ | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ }
+ rcu_read_unlock();
+ return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ rcu_read_unlock();
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get uncomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ return -EFBIG;
+ }
+
+ attr->size = usize;
+ }
+
+ ret = copy_to_user(uattr, attr, attr->size);
+ if (ret)
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, &attr);
+ else if (task_has_rt_policy(p))
+ attr.sched_priority = p->rt_priority;
+ else
+ attr.sched_nice = task_nice(p);
+
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ cpumask_var_t cpus_allowed, new_mask;
+ struct task_struct *p;
+ int retval;
+
+ rcu_read_lock();
+
+ p = find_process_by_pid(pid);
+ if (!p) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ /* Prevent p going away */
+ get_task_struct(p);
+ rcu_read_unlock();
+
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+ retval = -EPERM;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_free_new_mask;
+ }
+ rcu_read_unlock();
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ goto out_free_new_mask;
+
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, in_mask, cpus_allowed);
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+#ifdef CONFIG_SMP
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
+ retval = -EBUSY;
+ rcu_read_unlock();
+ goto out_free_new_mask;
+ }
+ rcu_read_unlock();
+ }
+#endif
+again:
+ retval = set_cpus_allowed_ptr(p, new_mask);
+
+ if (!retval) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset
+ * update. Just reset the cpus_allowed to the
+ * cpuset's cpus_allowed
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+ }
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+out_put_task:
+ put_task_struct(p);
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ struct cpumask *new_mask)
+{
+ if (len < cpumask_size())
+ cpumask_clear(new_mask);
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+ struct task_struct *p;
+ unsigned long flags;
+ int retval;
+
+ rcu_read_lock();
+
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ struct rq *rq = this_rq_lock();
+
+ schedstat_inc(rq, yld_count);
+ current->sched_class->yield_task(rq);
+
+ /*
+ * Since we are going to call schedule() anyway, there's
+ * no need to preempt or enable interrupts:
+ */
+ __release(rq->lock);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ do_raw_spin_unlock(&rq->lock);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+
+ return 0;
+}
+
+int __sched _cond_resched(void)
+{
+ if (should_resched()) {
+ preempt_schedule_common();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+ int resched = should_resched();
+ int ret = 0;
+
+ lockdep_assert_held(lock);
+
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+int __sched __cond_resched_softirq(void)
+{
+ BUG_ON(!in_softirq());
+
+ if (should_resched()) {
+ local_bh_enable();
+ preempt_schedule_common();
+ local_bh_disable();
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, its already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ int yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
+ double_rq_lock(rq, p_rq);
+ if (task_rq(p) != p_rq) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ if (!curr->sched_class->yield_to_task)
+ goto out_unlock;
+
+ if (curr->sched_class != p->sched_class)
+ goto out_unlock;
+
+ if (task_running(p_rq, p) || p->state)
+ goto out_unlock;
+
+ yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ if (yielded) {
+ schedstat_inc(rq, yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity takes care of
+ * fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
+
+out_unlock:
+ double_rq_unlock(rq, p_rq);
+out_irq:
+ local_irq_restore(flags);
+
+ if (yielded > 0)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ */
+long __sched io_schedule_timeout(long timeout)
+{
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
+ long ret;
+
+ current->in_iowait = 1;
+ blk_schedule_flush_plug(current);
+
+ delayacct_blkio_start();
+ rq = raw_rq();
+ atomic_inc(&rq->nr_iowait);
+ ret = schedule_timeout(timeout);
+ current->in_iowait = old_iowait;
+ atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
+
+ return ret;
+}
+EXPORT_SYMBOL(io_schedule_timeout);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_USER_RT_PRIO-1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct task_struct *p;
+ unsigned int time_slice;
+ unsigned long flags;
+ struct rq *rq;
+ int retval;
+ struct timespec t;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ retval = -ESRCH;
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ rq = task_rq_lock(p, &flags);
+ time_slice = 0;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ task_rq_unlock(rq, p, &flags);
+
+ rcu_read_unlock();
+ jiffies_to_timespec(time_slice, &t);
+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+ unsigned long free = 0;
+ int ppid;
+ unsigned long state = p->state;
+
+ if (state)
+ state = __ffs(state) + 1;
+ printk(KERN_INFO "%-15.15s %c", p->comm,
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+ if (state == TASK_RUNNING)
+ printk(KERN_CONT " running ");
+ else
+ printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+ if (state == TASK_RUNNING)
+ printk(KERN_CONT " running task ");
+ else
+ printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ free = stack_not_used(p);
+#endif
+ ppid = 0;
+ rcu_read_lock();
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+ task_pid_nr(p), ppid,
+ (unsigned long)task_thread_info(p)->flags);
+
+ print_worker_info(KERN_INFO, p);
+ show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+ struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#else
+ printk(KERN_INFO
+ " task PC stack pid father\n");
+#endif
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take a lot of time:
+ */
+ touch_nmi_watchdog();
+ if (!state_filter || (p->state & state_filter))
+ sched_show_task(p);
+ }
+
+ touch_all_softlockup_watchdogs();
+
+#ifdef CONFIG_SCHED_DEBUG
+ sysrq_sched_debug_show();
+#endif
+ rcu_read_unlock();
+ /*
+ * Only show locks if all tasks are dumped:
+ */
+ if (!state_filter)
+ debug_show_all_locks();
+}
+
+void init_idle_bootup_task(struct task_struct *idle)
+{
+ idle->sched_class = &idle_sched_class;
+}
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ __sched_fork(0, idle);
+ idle->state = TASK_RUNNING;
+ idle->se.exec_start = sched_clock();
+
+ do_set_cpus_allowed(idle, cpumask_of(cpu));
+ /*
+ * We're having a chicken and egg problem, even though we are
+ * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * lockdep check in task_group() will fail.
+ *
+ * Similar case to sched_fork(). / Alternatively we could
+ * use task_rq_lock() here and obtain the other rq->lock.
+ *
+ * Silence PROVE_RCU
+ */
+ rcu_read_lock();
+ __set_task_cpu(idle, cpu);
+ rcu_read_unlock();
+
+ rq->curr = rq->idle = idle;
+ idle->on_rq = TASK_ON_RQ_QUEUED;
+#if defined(CONFIG_SMP)
+ idle->on_cpu = 1;
+#endif
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+ init_idle_preempt_count(idle, cpu);
+
+ /*
+ * The idle tasks have their own, simple scheduling class:
+ */
+ idle->sched_class = &idle_sched_class;
+ ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle, cpu);
+#if defined(CONFIG_SMP)
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ if (!cpumask_weight(cur))
+ return ret;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+#ifdef CONFIG_SMP
+ if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+ cs_cpus_allowed)) {
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw);
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ }
+#endif
+out:
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, new_cpu);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ unsigned long flags;
+ struct rq *rq;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &flags);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
+out:
+ task_rq_unlock(rq, p, &flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ if (unlikely(!cpu_active(dest_cpu)))
+ return ret;
+
+ rq = cpu_rq(src_cpu);
+
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ /* Already moved. */
+ if (task_cpu(p) != src_cpu)
+ goto done;
+
+ /* Affinity changed (again). */
+ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ goto fail;
+
+ /*
+ * If we're not on a rq, the next wake-up will ensure we're
+ * placed properly.
+ */
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(p, dest_cpu);
+done:
+ ret = 1;
+fail:
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(&p->pi_lock);
+ return ret;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+ struct migration_arg arg = { p, target_cpu };
+ int curr_cpu = task_cpu(p);
+
+ if (curr_cpu == target_cpu)
+ return 0;
+
+ if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ return -EINVAL;
+
+ /* TODO: This is not properly updating schedstats */
+
+ trace_sched_move_numa(p, curr_cpu, target_cpu);
+ return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+ struct rq *rq;
+ unsigned long flags;
+ bool queued, running;
+
+ rq = task_rq_lock(p, &flags);
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued)
+ dequeue_task(rq, p, 0);
+ if (running)
+ put_prev_task(rq, p);
+
+ p->numa_preferred_nid = nid;
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (queued)
+ enqueue_task(rq, p, 0);
+ task_rq_unlock(rq, p, &flags);
+}
+#endif
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+ __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+ local_irq_enable();
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+ struct mm_struct *mm = current->active_mm;
+
+ BUG_ON(cpu_online(smp_processor_id()));
+
+ if (mm != &init_mm) {
+ switch_mm(mm, &init_mm, current);
+ finish_arch_post_lock_switch();
+ }
+ mmdrop(mm);
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+ long delta = calc_load_fold_active(rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+}
+
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+ .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+ /*
+ * Avoid pull_{rt,dl}_task()
+ */
+ .prio = MAX_PRIO + 1,
+ .sched_class = &fake_sched_class,
+};
+
+/*
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
+ */
+static void migrate_tasks(unsigned int dead_cpu)
+{
+ struct rq *rq = cpu_rq(dead_cpu);
+ struct task_struct *next, *stop = rq->stop;
+ int dest_cpu;
+
+ /*
+ * Fudge the rq selection such that the below task selection loop
+ * doesn't get stuck on the currently eligible stop task.
+ *
+ * We're currently inside stop_machine() and the rq is either stuck
+ * in the stop_machine_cpu_stop() loop, or we're executing this code,
+ * either way we should never end up calling schedule() until we're
+ * done here.
+ */
+ rq->stop = NULL;
+
+ /*
+ * put_prev_task() and pick_next_task() sched
+ * class method both need to have an up-to-date
+ * value of rq->clock[_task]
+ */
+ update_rq_clock(rq);
+
+ for ( ; ; ) {
+ /*
+ * There's this thread running, bail when that's the only
+ * remaining thread.
+ */
+ if (rq->nr_running == 1)
+ break;
+
+ next = pick_next_task(rq, &fake_task);
+ BUG_ON(!next);
+ next->sched_class->put_prev_task(rq, next);
+
+ /* Find suitable destination for @next, with force if needed. */
+ dest_cpu = select_fallback_rq(dead_cpu, next);
+ raw_spin_unlock(&rq->lock);
+
+ __migrate_task(next, dead_cpu, dest_cpu);
+
+ raw_spin_lock(&rq->lock);
+ }
+
+ rq->stop = stop;
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {
+ .procname = "sched_domain",
+ .mode = 0555,
+ },
+ {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = sd_ctl_dir,
+ },
+ {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+ return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+ struct ctl_table *entry;
+
+ /*
+ * In the intermediate directories, both the child directory and
+ * procname are dynamically allocated and could fail but the mode
+ * will always be set. In the lowest directory the names are
+ * static strings and all have proc handlers.
+ */
+ for (entry = *tablep; entry->mode; entry++) {
+ if (entry->child)
+ sd_free_ctl_entry(&entry->child);
+ if (entry->proc_handler == NULL)
+ kfree(entry->procname);
+ }
+
+ kfree(*tablep);
+ *tablep = NULL;
+}
+
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+ const char *procname, void *data, int maxlen,
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
+{
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax, true);
+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[9], "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[10], "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+ /* &table[13] is terminator */
+
+ return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+ if (table == NULL)
+ return NULL;
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_possible_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = entry;
+
+ if (entry == NULL)
+ return;
+
+ for_each_possible_cpu(i) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ entry++;
+ }
+
+ WARN_ON(sd_sysctl_header);
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+ if (sd_sysctl_header)
+ unregister_sysctl_table(sd_sysctl_header);
+ sd_sysctl_header = NULL;
+ if (sd_ctl_dir[0].child)
+ sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#else
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+static void set_rq_online(struct rq *rq)
+{
+ if (!rq->online) {
+ const struct sched_class *class;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->online);
+ rq->online = 1;
+
+ for_each_class(class) {
+ if (class->rq_online)
+ class->rq_online(rq);
+ }
+ }
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+ if (rq->online) {
+ const struct sched_class *class;
+
+ for_each_class(class) {
+ if (class->rq_offline)
+ class->rq_offline(rq);
+ }
+
+ cpumask_clear_cpu(rq->cpu, rq->rd->online);
+ rq->online = 0;
+ }
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ unsigned long flags;
+ struct rq *rq = cpu_rq(cpu);
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+
+ case CPU_UP_PREPARE:
+ rq->calc_load_update = calc_load_update;
+ break;
+
+ case CPU_ONLINE:
+ /* Update our root-domain */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+
+ set_rq_online(rq);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DYING:
+ sched_ttwu_pending();
+ /* Update our root-domain */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ migrate_tasks(cpu);
+ BUG_ON(rq->nr_running != 1); /* the migration thread */
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ break;
+
+ case CPU_DEAD:
+ calc_load_migrate(rq);
+ break;
+#endif
+ }
+
+ update_max_interval();
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else. This has to be lower priority than
+ * the notifier in the perf_event subsystem, though.
+ */
+static struct notifier_block migration_notifier = {
+ .notifier_call = migration_call,
+ .priority = CPU_PRI_MIGRATION,
+};
+
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ rq->age_stamp = sched_clock_cpu(cpu);
+}
+
+static int sched_cpu_active(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_STARTING:
+ set_cpu_rq_start_time();
+ return NOTIFY_OK;
+ case CPU_DOWN_FAILED:
+ set_cpu_active((long)hcpu, true);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static int sched_cpu_inactive(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ set_cpu_active((long)hcpu, false);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static int __init migration_init(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ int err;
+
+ /* Initialize migration for the boot CPU */
+ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+ BUG_ON(err == NOTIFY_BAD);
+ migration_call(&migration_notifier, CPU_ONLINE, cpu);
+ register_cpu_notifier(&migration_notifier);
+
+ /* Register cpu active notifiers */
+ cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+ cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
+ return 0;
+}
+early_initcall(migration_init);
+#endif
+
+#ifdef CONFIG_SMP
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_debug_enabled;
+
+static int __init sched_debug_setup(char *str)
+{
+ sched_debug_enabled = 1;
+
+ return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_enabled;
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+ struct cpumask *groupmask)
+{
+ struct sched_group *group = sd->groups;
+
+ cpumask_clear(groupmask);
+
+ printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+ if (sd->parent)
+ printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+ " has parent");
+ return -1;
+ }
+
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ printk(KERN_ERR "ERROR: domain->span does not contain "
+ "CPU%d\n", cpu);
+ }
+ if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not contain"
+ " CPU%d\n", cpu);
+ }
+
+ printk(KERN_DEBUG "%*s groups:", level + 1, "");
+ do {
+ if (!group) {
+ printk("\n");
+ printk(KERN_ERR "ERROR: group is NULL\n");
+ break;
+ }
+
+ if (!cpumask_weight(sched_group_cpus(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: empty group\n");
+ break;
+ }
+
+ if (!(sd->flags & SD_OVERLAP) &&
+ cpumask_intersects(groupmask, sched_group_cpus(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: repeated CPUs\n");
+ break;
+ }
+
+ cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+
+ printk(KERN_CONT " %*pbl",
+ cpumask_pr_args(sched_group_cpus(group)));
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %d)",
+ group->sgc->capacity);
+ }
+
+ group = group->next;
+ } while (group != sd->groups);
+ printk(KERN_CONT "\n");
+
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent &&
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+ printk(KERN_ERR "ERROR: parent span is not a superset "
+ "of domain->span\n");
+ return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ int level = 0;
+
+ if (!sched_debug_enabled)
+ return;
+
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
+ printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+ break;
+ level++;
+ sd = sd->parent;
+ if (!sd)
+ break;
+ }
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+ return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if (sd->flags & (SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
+ if (sd->groups != sd->groups->next)
+ return 0;
+ }
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_AFFINE))
+ return 0;
+
+ return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+ return 0;
+
+ /* Flags needing groups don't count if only 1 group in parent */
+ if (parent->groups == parent->groups->next) {
+ pflags &= ~(SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUCAPACITY |
+ SD_SHARE_PKG_RESOURCES |
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
+ if (nr_node_ids == 1)
+ pflags &= ~SD_SERIALIZE;
+ }
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+ cpupri_cleanup(&rd->cpupri);
+ cpudl_cleanup(&rd->cpudl);
+ free_cpumask_var(rd->dlo_mask);
+ free_cpumask_var(rd->rto_mask);
+ free_cpumask_var(rd->online);
+ free_cpumask_var(rd->span);
+ kfree(rd);
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ struct root_domain *old_rd = NULL;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ if (rq->rd) {
+ old_rd = rq->rd;
+
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we dont want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ if (old_rd)
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+ memset(rd, 0, sizeof(*rd));
+
+ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto out;
+ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ goto free_online;
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_dlo_mask;
+
+ if (cpupri_init(&rd->cpupri) != 0)
+ goto free_rto_mask;
+ return 0;
+
+free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
+free_online:
+ free_cpumask_var(rd->online);
+free_span:
+ free_cpumask_var(rd->span);
+out:
+ return -ENOMEM;
+}
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
+static void init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ if (init_rootdomain(rd) != 0) {
+ kfree(rd);
+ return NULL;
+ }
+
+ return rd;
+}
+
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
+
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ /*
+ * If its an overlapping domain it has private groups, iterate and
+ * nuke them all.
+ */
+ if (sd->flags & SD_OVERLAP) {
+ free_sched_groups(sd->groups, 1);
+ } else if (atomic_dec_and_test(&sd->groups->ref)) {
+ kfree(sd->groups->sgc);
+ kfree(sd->groups);
+ }
+ kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+ call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+ for (; sd; sd = sd->parent)
+ destroy_sched_domain(sd, cpu);
+}
+
+/*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
+ * allows us to avoid some pointer chasing select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first cpu number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two cpus are in the same cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
+DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+
+static void update_top_cache_domain(int cpu)
+{
+ struct sched_domain *sd;
+ struct sched_domain *busy_sd = NULL;
+ int id = cpu;
+ int size = 1;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ if (sd) {
+ id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ busy_sd = sd->parent; /* sd_busy */
+ }
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
+
+ rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
+ per_cpu(sd_llc_id, cpu) = id;
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
+ destroy_sched_domain(parent, cpu);
+ } else
+ tmp = tmp->parent;
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ tmp = sd;
+ sd = sd->parent;
+ destroy_sched_domain(tmp, cpu);
+ if (sd)
+ sd->child = NULL;
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ tmp = rq->sd;
+ rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp, cpu);
+
+ update_top_cache_domain(cpu);
+}
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ cpulist_parse(str, cpu_isolated_map);
+ return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+struct s_data {
+ struct sched_domain ** __percpu sd;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_rootdomain,
+ sa_sd,
+ sa_sd_storage,
+ sa_none,
+};
+
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sibling = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_cpus(sg);
+ if (sibling->child)
+ cpumask_copy(sg_span, sched_domain_span(sibling->child));
+ else
+ cpumask_set_cpu(i, sg_span);
+
+ cpumask_or(covered, covered, sg_span);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ build_group_mask(sd, sg);
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
+ if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+ group_balance_cpu(sg) == cpu)
+ groups = sg;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = groups;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+{
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
+
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
+
+ if (sg) {
+ *sg = *per_cpu_ptr(sdd->sg, cpu);
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
+ }
+
+ return cpu;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_capacity to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
+
+ get_group(cpu, sdd, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ if (cpu != cpumask_first(span))
+ return 0;
+
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+
+ cpumask_clear(covered);
+
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group, j;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ group = get_group(i, sdd, &sg);
+ cpumask_setall(sched_group_mask(sg));
+
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
+ continue;
+
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
+ }
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ }
+ last->next = first;
+
+ return 0;
+}
+
+/*
+ * Initialize sched groups cpu_capacity.
+ *
+ * cpu_capacity indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
+ */
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+{
+ struct sched_group *sg = sd->groups;
+
+ WARN_ON(!sg);
+
+ do {
+ sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_balance_cpu(sg))
+ return;
+
+ update_group_capacity(sd, cpu);
+ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
+}
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
+
+ return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+ struct sched_domain_attr *attr)
+{
+ int request;
+
+ if (!attr || attr->relax_domain_level < 0) {
+ if (default_relax_domain_level < 0)
+ return;
+ else
+ request = default_relax_domain_level;
+ } else
+ request = attr->relax_domain_level;
+ if (request < sd->level) {
+ /* turn off idle balance on this domain */
+ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ } else {
+ /* turn on idle balance on this domain */
+ sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ }
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_rootdomain:
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map); /* fall through */
+ case sa_none:
+ break;
+ }
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+ const struct cpumask *cpu_map)
+{
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
+ d->rd = alloc_rootdomain();
+ if (!d->rd)
+ return sa_sd;
+ return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+ struct sd_data *sdd = sd->private;
+
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
+static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl, int cpu)
+{
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 32,
+ .imbalance_pct = 125,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
+ .newidle_idx = 0,
+ .wake_idx = 0,
+ .forkexec_idx = 0,
+
+ .flags = 1*SD_LOAD_BALANCE
+ | 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SERIALIZE
+ | 0*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
+ };
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
+
+ return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+ return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++)
+ printk(KERN_CONT "%02d ", node_distance(i,j));
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+bool find_numa_distance(int distance)
+{
+ int i;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (sched_domains_numa_distance[i] == distance)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (n <= 1)
+ sched_numa_topology_type = NUMA_DIRECT;
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
+
+static void sched_init_numa(void)
+{
+ int next_distance, curr_distance = node_distance(0, 0);
+ struct sched_domain_topology_level *tl;
+ int level = 0;
+ int i, j, k;
+
+ sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+ if (!sched_domains_numa_distance)
+ return;
+
+ /*
+ * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ *
+ * Assumes node_distance(0,j) includes all distances in
+ * node_distance(i,j) in order to avoid cubic time.
+ */
+ next_distance = curr_distance;
+ for (i = 0; i < nr_node_ids; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ for (k = 0; k < nr_node_ids; k++) {
+ int distance = node_distance(i, k);
+
+ if (distance > curr_distance &&
+ (distance < next_distance ||
+ next_distance == curr_distance))
+ next_distance = distance;
+
+ /*
+ * While not a strong assumption it would be nice to know
+ * about cases where if node A is connected to B, B is not
+ * equally connected to A.
+ */
+ if (sched_debug() && node_distance(k, i) != distance)
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (sched_debug() && i && !find_numa_distance(distance))
+ sched_numa_warn("Node-0 not representative");
+ }
+ if (next_distance != curr_distance) {
+ sched_domains_numa_distance[level++] = next_distance;
+ sched_domains_numa_levels = level;
+ curr_distance = next_distance;
+ } else break;
+ }
+
+ /*
+ * In case of sched_debug() we verify the above assumption.
+ */
+ if (!sched_debug())
+ break;
+ }
+
+ if (!level)
+ return;
+
+ /*
+ * 'level' contains the number of unique distances, excluding the
+ * identity distance node_distance(i,i).
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'level' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'level' at the end of this function.
+ */
+ sched_domains_numa_levels = 0;
+
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ if (!sched_domains_numa_masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * cpus of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < level; i++) {
+ sched_domains_numa_masks[i] =
+ kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!sched_domains_numa_masks[i])
+ return;
+
+ for (j = 0; j < nr_node_ids; j++) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ if (!mask)
+ return;
+
+ sched_domains_numa_masks[i][j] = mask;
+
+ for (k = 0; k < nr_node_ids; k++) {
+ if (node_distance(j, k) > sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 0; j < level; i++, j++) {
+ tl[i] = (struct sched_domain_topology_level){
+ .mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
+ .flags = SDTL_OVERLAP,
+ .numa_level = j,
+ SD_INIT_NAME(NUMA)
+ };
+ }
+
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = level;
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+ int i, j;
+ int node = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (node_distance(j, node) <= sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+ int i, j;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++)
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+}
+
+/*
+ * Update sched_domains_numa_masks[level][node] array when new cpus
+ * are onlined.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ sched_domains_numa_masks_set(cpu);
+ break;
+
+ case CPU_DEAD:
+ sched_domains_numa_masks_clear(cpu);
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ struct sched_group_capacity *sgc;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ sg->next = sg;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgc)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_OVERLAP))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
+ }
+ free_percpu(sdd->sd);
+ sdd->sd = NULL;
+ free_percpu(sdd->sg);
+ sdd->sg = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
+ }
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
+{
+ struct sched_domain *sd = sd_init(tl, cpu);
+ if (!sd)
+ return child;
+
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+ sd->child = child;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
+ }
+ set_domain_attribute(sd, attr);
+
+ return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state;
+ struct sched_domain *sd;
+ struct s_data d;
+ int i, ret = -ENOMEM;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for cpus specified by the cpu_map. */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+ sd->flags |= SD_OVERLAP;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ /* Build the groups for the domains */
+ for_each_cpu(i, cpu_map) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ if (sd->flags & SD_OVERLAP) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_capacity(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ sd = *per_cpu_ptr(d.sd, i);
+ cpu_attach_domain(sd, d.rd, i);
+ }
+ rcu_read_unlock();
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return ret;
+}
+
+static cpumask_var_t *doms_cur; /* current sched domains */
+static int ndoms_cur; /* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+ /* attribues of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+ return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+ int i;
+ cpumask_var_t *doms;
+
+ doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+ if (!doms)
+ return NULL;
+ for (i = 0; i < ndoms; i++) {
+ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+ free_sched_domains(doms, i);
+ return NULL;
+ }
+ }
+ return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+ unsigned int i;
+ for (i = 0; i < ndoms; i++)
+ free_cpumask_var(doms[i]);
+ kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+ int err;
+
+ arch_update_cpu_topology();
+ ndoms_cur = 1;
+ doms_cur = alloc_sched_domains(ndoms_cur);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ err = build_sched_domains(doms_cur[0], NULL);
+ register_sched_domain_sysctl();
+
+ return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+ int i;
+
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map)
+ cpu_attach_domain(NULL, &def_root_domain, i);
+ rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+ struct sched_domain_attr *new, int idx_new)
+{
+ struct sched_domain_attr tmp;
+
+ /* fast path */
+ if (!new && !cur)
+ return 1;
+
+ tmp = SD_ATTR_INIT;
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
+ new ? (new + idx_new) : &tmp,
+ sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains. This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ int i, j, n;
+ int new_topology;
+
+ mutex_lock(&sched_domains_mutex);
+
+ /* always unregister in case we don't destroy any domains */
+ unregister_sched_domain_sysctl();
+
+ /* Let architecture update cpu core mappings. */
+ new_topology = arch_update_cpu_topology();
+
+ n = doms_new ? ndoms_new : 0;
+
+ /* Destroy deleted domains */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_cur[i], doms_new[j])
+ && dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* no match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur[i]);
+match1:
+ ;
+ }
+
+ n = ndoms_cur;
+ if (doms_new == NULL) {
+ n = 0;
+ doms_new = &fallback_doms;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ WARN_ON_ONCE(dattr_new);
+ }
+
+ /* Build new domains */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j])
+ && dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* no match - add a new doms_new */
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+ /* Remember the new sched domains */
+ if (doms_cur != &fallback_doms)
+ free_sched_domains(doms_cur, ndoms_cur);
+ kfree(dattr_cur); /* kfree(NULL) is safe */
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ register_sched_domain_sysctl();
+
+ mutex_unlock(&sched_domains_mutex);
+}
+
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
+/*
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
+ */
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
+ case CPU_ONLINE:
+ cpuset_update_active_cpus(true);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+ return NOTIFY_OK;
+}
+
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ unsigned long flags;
+ long cpu = (long)hcpu;
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+
+ switch (action) {
+ case CPU_DOWN_PREPARE:
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
+ cpuset_update_active_cpus(false);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+ return NOTIFY_OK;
+}
+
+void __init sched_init_smp(void)
+{
+ cpumask_var_t non_isolated_cpus;
+
+ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+ alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+ sched_init_numa();
+
+ /*
+ * There's no userspace yet to cause hotplug operations; hence all the
+ * cpu masks are stable and all blatant races in the below code cannot
+ * happen.
+ */
+ mutex_lock(&sched_domains_mutex);
+ init_sched_domains(cpu_active_mask);
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+ if (cpumask_empty(non_isolated_cpus))
+ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+ mutex_unlock(&sched_domains_mutex);
+
+ hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
+ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+ hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+
+ init_hrtick();
+
+ /* Move init over to a non-isolated CPU */
+ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ BUG();
+ sched_init_granularity();
+ free_cpumask_var(non_isolated_cpus);
+
+ init_sched_rt_class();
+ init_sched_dl_class();
+}
+#else
+void __init sched_init_smp(void)
+{
+ sched_init_granularity();
+}
+#endif /* CONFIG_SMP */
+
+const_debug unsigned int sysctl_timer_migration = 1;
+
+int in_sched_functions(unsigned long addr)
+{
+ return in_lock_functions(addr) ||
+ (addr >= (unsigned long)__sched_text_start
+ && addr < (unsigned long)__sched_text_end);
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
+struct task_group root_task_group;
+LIST_HEAD(task_groups);
+#endif
+
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+
+void __init sched_init(void)
+{
+ int i, j;
+ unsigned long alloc_size = 0, ptr;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+ if (alloc_size) {
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ root_task_group.se = (struct sched_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ root_task_group.cfs_rq = (struct cfs_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_RT_GROUP_SCHED
+ root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+ root_task_group.rt_rq = (struct rt_rq **)ptr;
+ ptr += nr_cpu_ids * sizeof(void **);
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+ }
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ for_each_possible_cpu(i) {
+ per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+ cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+ }
+#endif /* CONFIG_CPUMASK_OFFSTACK */
+
+ init_rt_bandwidth(&def_rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth,
+ global_rt_period(), global_rt_runtime());
+
+#ifdef CONFIG_SMP
+ init_defrootdomain();
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_rt_bandwidth(&root_task_group.rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_SCHED
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ INIT_LIST_HEAD(&root_task_group.siblings);
+ autogroup_init(&init_task);
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+ for_each_possible_cpu(i) {
+ struct rq *rq;
+
+ rq = cpu_rq(i);
+ raw_spin_lock_init(&rq->lock);
+ rq->nr_running = 0;
+ rq->calc_load_active = 0;
+ rq->calc_load_update = jiffies + LOAD_FREQ;
+ init_cfs_rq(&rq->cfs);
+ init_rt_rq(&rq->rt);
+ init_dl_rq(&rq->dl);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ /*
+ * How much cpu bandwidth does root_task_group get?
+ *
+ * In case of task-groups formed thr' the cgroup filesystem, it
+ * gets 100% of the cpu resources in the system. This overall
+ * system cpu resource is divided among the tasks of
+ * root_task_group and its child task-groups in a fair manner,
+ * based on each entity's (task or task-group's) weight
+ * (se->load.weight).
+ *
+ * In other words, if root_task_group has 10 tasks of weight
+ * 1024) and two child groups A0 and A1 (of weight 1024 each),
+ * then A0's share of the cpu resource is:
+ *
+ * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+ *
+ * We achieve this by letting root_task_group's tasks sit
+ * directly in rq->cfs (i.e root_task_group->se[] = NULL).
+ */
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
+#endif
+
+ for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+ rq->cpu_load[j] = 0;
+
+ rq->last_load_update_tick = jiffies;
+
+#ifdef CONFIG_SMP
+ rq->sd = NULL;
+ rq->rd = NULL;
+ rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ rq->post_schedule = 0;
+ rq->active_balance = 0;
+ rq->next_balance = jiffies;
+ rq->push_cpu = 0;
+ rq->cpu = i;
+ rq->online = 0;
+ rq->idle_stamp = 0;
+ rq->avg_idle = 2*sysctl_sched_migration_cost;
+ rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+
+ INIT_LIST_HEAD(&rq->cfs_tasks);
+
+ rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->nohz_flags = 0;
+#endif
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = 0;
+#endif
+#endif
+ init_rq_hrtick(rq);
+ atomic_set(&rq->nr_iowait, 0);
+ }
+
+ set_load_weight(&init_task);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+ */
+ atomic_inc(&init_mm.mm_count);
+ enter_lazy_tlb(&init_mm, current);
+
+ /*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ /*
+ * Make us the idle thread. Technically, schedule() should not be
+ * called from this thread, however somewhere below it might be,
+ * but because we are the idle thread, we just pick up running again
+ * when this runqueue becomes "idle".
+ */
+ init_idle(current, smp_processor_id());
+
+ calc_load_update = jiffies + LOAD_FREQ;
+
+#ifdef CONFIG_SMP
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+ /* May be allocated at isolcpus cmdline parse time */
+ if (cpu_isolated_map == NULL)
+ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+ idle_thread_set_boot_cpu();
+ set_cpu_rq_start_time();
+#endif
+ init_sched_fair_class();
+
+ scheduler_running = 1;
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+
+ return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy; /* ratelimiting */
+
+ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
+ dump_stack();
+}
+EXPORT_SYMBOL(___might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void normalize_task(struct rq *rq, struct task_struct *p)
+{
+ const struct sched_class *prev_class = p->sched_class;
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ };
+ int old_prio = p->prio;
+ int queued;
+
+ queued = task_on_rq_queued(p);
+ if (queued)
+ dequeue_task(rq, p, 0);
+ __setscheduler(rq, p, &attr, false);
+ if (queued) {
+ enqueue_task(rq, p, 0);
+ resched_curr(rq);
+ }
+
+ check_class_changed(rq, p, prev_class, old_prio);
+}
+
+void normalize_rt_tasks(void)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+ struct rq *rq;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ /*
+ * Only normalize user tasks:
+ */
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ p->se.exec_start = 0;
+#ifdef CONFIG_SCHEDSTATS
+ p->se.statistics.wait_start = 0;
+ p->se.statistics.sleep_start = 0;
+ p->se.statistics.block_start = 0;
+#endif
+
+ if (!dl_task(p) && !rt_task(p)) {
+ /*
+ * Renice negative nice level userspace
+ * tasks back to 0:
+ */
+ if (task_nice(p) < 0)
+ set_user_nice(p, 0);
+ continue;
+ }
+
+ rq = task_rq_lock(p, &flags);
+ normalize_task(rq, p);
+ task_rq_unlock(rq, p, &flags);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
+ */
+struct task_struct *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPU's synchronized, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+
+static void free_sched_group(struct task_group *tg)
+{
+ free_fair_sched_group(tg);
+ free_rt_sched_group(tg);
+ autogroup_free(tg);
+ kfree(tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+ struct task_group *tg;
+
+ tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ if (!alloc_fair_sched_group(tg, parent))
+ goto err;
+
+ if (!alloc_rt_sched_group(tg, parent))
+ goto err;
+
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_add_rcu(&tg->list, &task_groups);
+
+ WARN_ON(!parent); /* root should already exist */
+
+ tg->parent = parent;
+ INIT_LIST_HEAD(&tg->children);
+ list_add_rcu(&tg->siblings, &parent->children);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void free_sched_group_rcu(struct rcu_head *rhp)
+{
+ /* now it should be safe to free those cfs_rqs */
+ free_sched_group(container_of(rhp, struct task_group, rcu));
+}
+
+/* Destroy runqueue etc associated with a task group */
+void sched_destroy_group(struct task_group *tg)
+{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
+ unsigned long flags;
+ int i;
+
+ /* end participation in shares distribution */
+ for_each_possible_cpu(i)
+ unregister_fair_sched_group(tg, i);
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_del_rcu(&tg->list);
+ list_del_rcu(&tg->siblings);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+}
+
+/* change task's runqueue when it moves between groups.
+ * The caller of this function should have put the task in its new group
+ * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
+ * reflect its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ struct task_group *tg;
+ int queued, running;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, 0);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ /*
+ * All callers are synchronized by task_rq_lock(); we do not use RCU
+ * which is pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, queued);
+ else
+#endif
+ set_task_rq(tsk, task_cpu(tsk));
+
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ if (queued)
+ enqueue_task(rq, tsk, 0);
+
+ task_rq_unlock(rq, tsk, &flags);
+}
+#endif /* CONFIG_CGROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
+ for_each_process_thread(g, p) {
+ if (rt_task(p) && task_group(p) == tg)
+ return 1;
+ }
+
+ return 0;
+}
+
+struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+};
+
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+{
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
+
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
+
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
+
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
+ }
+
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+ int ret;
+
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int tg_set_rt_bandwidth(struct task_group *tg,
+ u64 rt_period, u64 rt_runtime)
+{
+ int i, err = 0;
+
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
+ goto unlock;
+
+ raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+ tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+ tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = tg->rt_rq[i];
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_runtime;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+unlock:
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
+}
+
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us < 0)
+ rt_runtime = RUNTIME_INF;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+static long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+static long sched_group_rt_period(struct task_group *tg)
+{
+ u64 rt_period_us;
+
+ rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ do_div(rt_period_us, NSEC_PER_USEC);
+ return rt_period_us;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static int sched_rt_global_constraints(void)
+{
+ int ret = 0;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return ret;
+}
+
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+ unsigned long flags;
+ int i, ret = 0;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = global_rt_runtime();
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+ return ret;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static int sched_dl_global_validate(void)
+{
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ struct dl_bw *dl_b;
+ int cpu, ret = 0;
+ unsigned long flags;
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ if (new_bw < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ struct dl_bw *dl_b;
+ int cpu;
+ unsigned long flags;
+
+ def_dl_bandwidth.dl_period = global_rt_period();
+ def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ dl_b->bw = new_bw;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+ }
+}
+
+static int sched_rt_global_validate(void)
+{
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
+int sched_rt_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_period, old_runtime;
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ old_period = sysctl_sched_rt_period;
+ old_runtime = sysctl_sched_rt_runtime;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_rt_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
+
+ if (!parent) {
+ /* This is early initialization for the top cgroup */
+ return &root_task_group.css;
+ }
+
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+
+ return &tg->css;
+}
+
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_destroy_group(tg);
+}
+
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+ sched_move_task(task);
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (!sched_rt_can_attach(css_tg(css), task))
+ return -EINVAL;
+#else
+ /* We don't support RT-tasks being in separate groups */
+ if (task->sched_class != &fair_sched_class)
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ sched_move_task(task);
+}
+
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ sched_move_task(task);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
+{
+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
+}
+
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+
+ return (u64) scale_load_down(tg->shares);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+ int i, ret = 0, runtime_enabled, runtime_was_enabled;
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /*
+ * Ensure we have at some amount of bandwidth every period. This is
+ * to prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the otherside by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period > max_cfs_quota_period)
+ return -EINVAL;
+
+ /*
+ * Prevent race between setting of cfs_rq->runtime_enabled and
+ * unthrottle_offline_cfs_rqs().
+ */
+ get_online_cpus();
+ mutex_lock(&cfs_constraints_mutex);
+ ret = __cfs_schedulable(tg, period, quota);
+ if (ret)
+ goto out_unlock;
+
+ runtime_enabled = quota != RUNTIME_INF;
+ runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+ /*
+ * If we need to toggle cfs_bandwidth_used, off->on must occur
+ * before making related changes, and on->off must occur afterwards
+ */
+ if (runtime_enabled && !runtime_was_enabled)
+ cfs_bandwidth_usage_inc();
+ raw_spin_lock_irq(&cfs_b->lock);
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+
+ __refill_cfs_bandwidth_runtime(cfs_b);
+ /* restart the period timer (if active) to handle new period expiry */
+ if (runtime_enabled && cfs_b->timer_active) {
+ /* force a reprogram */
+ __start_cfs_bandwidth(cfs_b, true);
+ }
+ raw_spin_unlock_irq(&cfs_b->lock);
+
+ for_each_online_cpu(i) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+ struct rq *rq = cfs_rq->rq;
+
+ raw_spin_lock_irq(&rq->lock);
+ cfs_rq->runtime_enabled = runtime_enabled;
+ cfs_rq->runtime_remaining = 0;
+
+ if (cfs_rq->throttled)
+ unthrottle_cfs_rq(cfs_rq);
+ raw_spin_unlock_irq(&rq->lock);
+ }
+ if (runtime_was_enabled && !runtime_enabled)
+ cfs_bandwidth_usage_dec();
+out_unlock:
+ mutex_unlock(&cfs_constraints_mutex);
+ put_online_cpus();
+
+ return ret;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+ u64 quota, period;
+
+ period = ktime_to_ns(tg->cfs_bandwidth.period);
+ if (cfs_quota_us < 0)
+ quota = RUNTIME_INF;
+ else
+ quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+ return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+ u64 quota_us;
+
+ if (tg->cfs_bandwidth.quota == RUNTIME_INF)
+ return -1;
+
+ quota_us = tg->cfs_bandwidth.quota;
+ do_div(quota_us, NSEC_PER_USEC);
+
+ return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+ u64 quota, period;
+
+ period = (u64)cfs_period_us * NSEC_PER_USEC;
+ quota = tg->cfs_bandwidth.quota;
+
+ return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+ u64 cfs_period_us;
+
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
+
+ return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return tg_get_cfs_quota(css_tg(css));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 cfs_quota_us)
+{
+ return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return tg_get_cfs_period(css_tg(css));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_period_us)
+{
+ return tg_set_cfs_period(css_tg(css), cfs_period_us);
+}
+
+struct cfs_schedulable_data {
+ struct task_group *tg;
+ u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+ struct cfs_schedulable_data *d)
+{
+ u64 quota, period;
+
+ if (tg == d->tg) {
+ period = d->period;
+ quota = d->quota;
+ } else {
+ period = tg_get_cfs_period(tg);
+ quota = tg_get_cfs_quota(tg);
+ }
+
+ /* note: these should typically be equivalent */
+ if (quota == RUNTIME_INF || quota == -1)
+ return RUNTIME_INF;
+
+ return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+ struct cfs_schedulable_data *d = data;
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ s64 quota = 0, parent_quota = -1;
+
+ if (!tg->parent) {
+ quota = RUNTIME_INF;
+ } else {
+ struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
+
+ quota = normalize_cfs_quota(tg, d);
+ parent_quota = parent_b->hierarchical_quota;
+
+ /*
+ * ensure max(child_quota) <= parent_quota, inherit when no
+ * limit is set
+ */
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+ return -EINVAL;
+ }
+ cfs_b->hierarchical_quota = quota;
+
+ return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+ int ret;
+ struct cfs_schedulable_data data = {
+ .tg = tg,
+ .period = period,
+ .quota = quota,
+ };
+
+ if (quota != RUNTIME_INF) {
+ do_div(data.period, NSEC_PER_USEC);
+ do_div(data.quota, NSEC_PER_USEC);
+ }
+
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int cpu_stats_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+ seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+ seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+ seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
+
+ return 0;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
+{
+ return sched_group_set_rt_runtime(css_tg(css), val);
+}
+
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return sched_group_rt_runtime(css_tg(css));
+}
+
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 rt_period_us)
+{
+ return sched_group_set_rt_period(css_tg(css), rt_period_us);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return sched_group_rt_period(css_tg(css));
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ {
+ .name = "shares",
+ .read_u64 = cpu_shares_read_u64,
+ .write_u64 = cpu_shares_write_u64,
+ },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .name = "cfs_quota_us",
+ .read_s64 = cpu_cfs_quota_read_s64,
+ .write_s64 = cpu_cfs_quota_write_s64,
+ },
+ {
+ .name = "cfs_period_us",
+ .read_u64 = cpu_cfs_period_read_u64,
+ .write_u64 = cpu_cfs_period_write_u64,
+ },
+ {
+ .name = "stat",
+ .seq_show = cpu_stats_show,
+ },
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ {
+ .name = "rt_runtime_us",
+ .read_s64 = cpu_rt_runtime_read,
+ .write_s64 = cpu_rt_runtime_write,
+ },
+ {
+ .name = "rt_period_us",
+ .read_u64 = cpu_rt_period_read_uint,
+ .write_u64 = cpu_rt_period_write_uint,
+ },
+#endif
+ { } /* terminate */
+};
+
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
+ .fork = cpu_cgroup_fork,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .exit = cpu_cgroup_exit,
+ .legacy_cftypes = cpu_files,
+ .early_init = 1,
+};
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000..dd7cbb55b
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,283 @@
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel_stat.h>
+#include <linux/err.h>
+
+#include "sched.h"
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuacct, css) : NULL;
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return css_ca(task_css(tsk, cpuacct_cgrp_id));
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ return css_ca(ca->css.parent);
+}
+
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+ .cpustat = &kernel_cpustat,
+ .cpuusage = &root_cpuacct_cpuusage,
+};
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *
+cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cpuacct *ca;
+
+ if (!parent_css)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto out;
+
+ ca->cpuusage = alloc_percpu(u64);
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
+
+ return &ca->css;
+
+out_free_cpuusage:
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void cpuacct_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuacct *ca = css_ca(css);
+
+ free_percpu(ca->cpustat);
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuacct *ca = css_ca(css);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+ return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 reset)
+{
+ struct cpuacct *ca = css_ca(css);
+ int err = 0;
+ int i;
+
+ if (reset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+ return err;
+}
+
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+ struct cpuacct *ca = css_ca(seq_css(sf));
+ int cpu;
+ s64 val = 0;
+
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
+ }
+ val = cputime64_to_clock_t(val);
+ seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+ {
+ .name = "usage_percpu",
+ .seq_show = cpuacct_percpu_seq_show,
+ },
+ {
+ .name = "stat",
+ .seq_show = cpuacct_stats_show,
+ },
+ { } /* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+ int cpu;
+
+ cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
+ ca = task_ca(tsk);
+
+ while (true) {
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+
+ ca = parent_ca(ca);
+ if (!ca)
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca != &root_cpuacct) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += val;
+ ca = parent_ca(ca);
+ }
+ rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_cgrp_subsys = {
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000..ed605624a
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000..c6acb0746
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,246 @@
+/*
+ * kernel/sched/cpudl.c
+ *
+ * Global CPU deadline management
+ *
+ * Author: Juri Lelli <j.lelli@sssup.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "cpudeadline.h"
+
+static inline int parent(int i)
+{
+ return (i - 1) >> 1;
+}
+
+static inline int left_child(int i)
+{
+ return (i << 1) + 1;
+}
+
+static inline int right_child(int i)
+{
+ return (i << 1) + 2;
+}
+
+static inline int dl_time_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+static void cpudl_exchange(struct cpudl *cp, int a, int b)
+{
+ int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+ int l, r, largest;
+
+ /* adapted from lib/prio_heap.c */
+ while(1) {
+ l = left_child(idx);
+ r = right_child(idx);
+ largest = idx;
+
+ if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
+ cp->elements[l].dl))
+ largest = l;
+ if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
+ cp->elements[r].dl))
+ largest = r;
+ if (largest == idx)
+ break;
+
+ /* Push idx down the heap one level and bump one up */
+ cpudl_exchange(cp, largest, idx);
+ idx = largest;
+ }
+}
+
+static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+{
+ WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+
+ if (dl_time_before(new_dl, cp->elements[idx].dl)) {
+ cp->elements[idx].dl = new_dl;
+ cpudl_heapify(cp, idx);
+ } else {
+ cp->elements[idx].dl = new_dl;
+ while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+ cp->elements[idx].dl)) {
+ cpudl_exchange(cp, idx, parent(idx));
+ idx = parent(idx);
+ }
+ }
+}
+
+static inline int cpudl_maximum(struct cpudl *cp)
+{
+ return cp->elements[0].cpu;
+}
+
+/*
+ * cpudl_find - find the best (later-dl) CPU in the system
+ * @cp: the cpudl max-heap context
+ * @p: the task
+ * @later_mask: a mask to fill in with the selected CPUs (or NULL)
+ *
+ * Returns: int - best CPU (heap maximum if suitable)
+ */
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+ struct cpumask *later_mask)
+{
+ int best_cpu = -1;
+ const struct sched_dl_entity *dl_se = &p->dl;
+
+ if (later_mask &&
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ best_cpu = cpumask_any(later_mask);
+ goto out;
+ } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ best_cpu = cpudl_maximum(cp);
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
+ }
+
+out:
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+
+ return best_cpu;
+}
+
+/*
+ * cpudl_set - update the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ * @dl: the new earliest deadline for this cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+{
+ int old_idx, new_cpu;
+ unsigned long flags;
+
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+ old_idx = cp->elements[cpu].idx;
+ if (!is_valid) {
+ /* remove item */
+ if (old_idx == IDX_INVALID) {
+ /*
+ * Nothing to remove if old_idx was invalid.
+ * This could happen if a rq_offline_dl is
+ * called for a CPU without -dl tasks running.
+ */
+ goto out;
+ }
+ new_cpu = cp->elements[cp->size - 1].cpu;
+ cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
+ cp->elements[old_idx].cpu = new_cpu;
+ cp->size--;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
+ while (old_idx > 0 && dl_time_before(
+ cp->elements[parent(old_idx)].dl,
+ cp->elements[old_idx].dl)) {
+ cpudl_exchange(cp, old_idx, parent(old_idx));
+ old_idx = parent(old_idx);
+ }
+ cpumask_set_cpu(cpu, cp->free_cpus);
+ cpudl_heapify(cp, old_idx);
+
+ goto out;
+ }
+
+ if (old_idx == IDX_INVALID) {
+ cp->size++;
+ cp->elements[cp->size - 1].dl = 0;
+ cp->elements[cp->size - 1].cpu = cpu;
+ cp->elements[cpu].idx = cp->size - 1;
+ cpudl_change_key(cp, cp->size - 1, dl);
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+ } else {
+ cpudl_change_key(cp, old_idx, dl);
+ }
+
+out:
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
+/*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_init - initialize the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+int cpudl_init(struct cpudl *cp)
+{
+ int i;
+
+ memset(cp, 0, sizeof(*cp));
+ raw_spin_lock_init(&cp->lock);
+ cp->size = 0;
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
+ return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
+ return 0;
+}
+
+/*
+ * cpudl_cleanup - clean up the cpudl structure
+ * @cp: the cpudl max-heap context
+ */
+void cpudl_cleanup(struct cpudl *cp)
+{
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
+}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000..1a0a6ef2f
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,32 @@
+#ifndef _LINUX_CPUDL_H
+#define _LINUX_CPUDL_H
+
+#include <linux/sched.h>
+
+#define IDX_INVALID -1
+
+struct cpudl_item {
+ u64 dl;
+ int cpu;
+ int idx;
+};
+
+struct cpudl {
+ raw_spinlock_t lock;
+ int size;
+ cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
+};
+
+
+#ifdef CONFIG_SMP
+int cpudl_find(struct cpudl *cp, struct task_struct *p,
+ struct cpumask *later_mask);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
+void cpudl_cleanup(struct cpudl *cp);
+#endif /* CONFIG_SMP */
+
+#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
new file mode 100644
index 000000000..981fcd7dc
--- /dev/null
+++ b/kernel/sched/cpupri.c
@@ -0,0 +1,248 @@
+/*
+ * kernel/sched/cpupri.c
+ *
+ * CPU priority management
+ *
+ * Copyright (C) 2007-2008 Novell
+ *
+ * Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ * This code tracks the priority of each CPU so that global migration
+ * decisions are easy to calculate. Each CPU can be in a state as follows:
+ *
+ * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ * going from the lowest priority to the highest. CPUs in the INVALID state
+ * are not eligible for routing. The system maintains this state with
+ * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * in that class). Therefore a typical application without affinity
+ * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ * searches). For tasks with affinity restrictions, the algorithm has a
+ * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * yields the worst case search is fairly contrived.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/slab.h>
+#include "cpupri.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+ int cpupri;
+
+ if (prio == CPUPRI_INVALID)
+ cpupri = CPUPRI_INVALID;
+ else if (prio == MAX_PRIO)
+ cpupri = CPUPRI_IDLE;
+ else if (prio >= MAX_RT_PRIO)
+ cpupri = CPUPRI_NORMAL;
+ else
+ cpupri = MAX_RT_PRIO - prio + 1;
+
+ return cpupri;
+}
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invocation. By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times. While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Return: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask)
+{
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
+
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+
+ for (idx = 0; idx < task_pri; idx++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ int skip = 0;
+
+ if (!atomic_read(&(vec)->count))
+ skip = 1;
+ /*
+ * When looking at the vector, we need to read the counter,
+ * do a memory barrier, then read the mask.
+ *
+ * Note: This is still all racey, but we can deal with it.
+ * Ideally, we only want to look at masks that are set.
+ *
+ * If a mask is not set, then the only thing wrong is that we
+ * did a little more work than necessary.
+ *
+ * If we read a zero count but the mask is set, because of the
+ * memory barriers, that can only happen when the highest prio
+ * task for a run queue has left the run queue, in which case,
+ * it will be followed by a pull. If the task we are processing
+ * fails to find a proper place to go, that pull request will
+ * pull this task if the run queue is running at a lower
+ * priority.
+ */
+ smp_rmb();
+
+ /* Need to do the rmb for every iteration */
+ if (skip)
+ continue;
+
+ if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ continue;
+
+ if (lowest_mask) {
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+ continue;
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ int do_mb = 0;
+
+ newpri = convert_prio(newpri);
+
+ BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+ if (newpri == oldpri)
+ return;
+
+ /*
+ * If the cpu was currently mapped to a different value, we
+ * need to map it to the new value then remove the old value.
+ * Note, we must add the new value first, otherwise we risk the
+ * cpu being missed by the priority loop in cpupri_find.
+ */
+ if (likely(newpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+ cpumask_set_cpu(cpu, vec->mask);
+ /*
+ * When adding a new vector, we update the mask first,
+ * do a write memory barrier, and then update the count, to
+ * make sure the vector is visible when count is set.
+ */
+ smp_mb__before_atomic();
+ atomic_inc(&(vec)->count);
+ do_mb = 1;
+ }
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
+
+ /*
+ * Because the order of modification of the vec->count
+ * is important, we must make sure that the update
+ * of the new prio is seen before we decrement the
+ * old prio. This makes sure that the loop sees
+ * one or the other when we raise the priority of
+ * the run queue. We don't care about when we lower the
+ * priority, as that will trigger an rt pull anyway.
+ *
+ * We only need to do a memory barrier if we updated
+ * the new priority vec.
+ */
+ if (do_mb)
+ smp_mb__after_atomic();
+
+ /*
+ * When removing from the vector, we decrement the counter first
+ * do a memory barrier and then clear the mask.
+ */
+ atomic_dec(&(vec)->count);
+ smp_mb__after_atomic();
+ cpumask_clear_cpu(cpu, vec->mask);
+ }
+
+ *currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Return: -ENOMEM on memory allocation failure.
+ */
+int cpupri_init(struct cpupri *cp)
+{
+ int i;
+
+ memset(cp, 0, sizeof(*cp));
+
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+ struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+ atomic_set(&vec->count, 0);
+ if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ goto cleanup;
+ }
+
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
+ for_each_possible_cpu(i)
+ cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
+ return 0;
+
+cleanup:
+ for (i--; i >= 0; i--)
+ free_cpumask_var(cp->pri_to_cpu[i].mask);
+ return -ENOMEM;
+}
+
+/**
+ * cpupri_cleanup - clean up the cpupri structure
+ * @cp: The cpupri context
+ */
+void cpupri_cleanup(struct cpupri *cp)
+{
+ int i;
+
+ kfree(cp->cpu_to_pri);
+ for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+ free_cpumask_var(cp->pri_to_cpu[i].mask);
+}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
new file mode 100644
index 000000000..63cbb9ca0
--- /dev/null
+++ b/kernel/sched/cpupri.h
@@ -0,0 +1,31 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE 0
+#define CPUPRI_NORMAL 1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+ atomic_t count;
+ cpumask_var_t mask;
+};
+
+struct cpupri {
+ struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ int *cpu_to_pri;
+};
+
+#ifdef CONFIG_SMP
+int cpupri_find(struct cpupri *cp,
+ struct task_struct *p, struct cpumask *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+int cpupri_init(struct cpupri *cp);
+void cpupri_cleanup(struct cpupri *cp);
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 000000000..8394b1ee6
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,852 @@
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kernel_stat.h>
+#include <linux/static_key.h>
+#include <linux/context_tracking.h>
+#include "sched.h"
+
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in vtime_account, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/vtime_account on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+DEFINE_PER_CPU(u64, cpu_hardirq_time);
+DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void irqtime_account_irq(struct task_struct *curr)
+{
+ unsigned long flags;
+ s64 delta;
+ int cpu;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ __this_cpu_add(cpu_softirq_time, delta);
+
+ irq_time_write_end();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(irqtime_account_irq);
+
+static int irqtime_account_hi_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime (0)
+
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+ u64 tmp)
+{
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
+
+ cpuacct_account_field(p, index, tmp);
+}
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ int index;
+
+ /* Add user time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+
+ index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
+ /* Add user time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for user time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ /* Add guest time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+ p->gtime += cputime;
+
+ /* Add guest time to cpustat. */
+ if (task_nice(p) > 0) {
+ cpustat[CPUTIME_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+ } else {
+ cpustat[CPUTIME_USER] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+ }
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, int index)
+{
+ /* Add system time to process. */
+ p->stime += cputime;
+ p->stimescaled += cputime_scaled;
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for system time used */
+ acct_account_cputime(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+ cputime_t cputime, cputime_t cputime_scaled)
+{
+ int index;
+
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+ account_guest_time(p, cputime, cputime_scaled);
+ return;
+ }
+
+ if (hardirq_count() - hardirq_offset)
+ index = CPUTIME_IRQ;
+ else if (in_serving_softirq())
+ index = CPUTIME_SOFTIRQ;
+ else
+ index = CPUTIME_SYSTEM;
+
+ __account_system_time(p, cputime, cputime_scaled, index);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ struct rq *rq = this_rq();
+
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+ else
+ cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+}
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_key_false(&paravirt_steal_enabled)) {
+ u64 steal;
+ cputime_t steal_ct;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ /*
+ * cputime_t may be less precise than nsecs (eg: if it's
+ * based on jiffies). Lets cast the result to cputime
+ * granularity and account the rest on the next rounds.
+ */
+ steal_ct = nsecs_to_cputime(steal);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+
+ account_steal_time(steal_ct);
+ return steal_ct;
+ }
+#endif
+ return false;
+}
+
+/*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
+ */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct signal_struct *sig = tsk->signal;
+ cputime_t utime, stime;
+ struct task_struct *t;
+ unsigned int seq, nextseq;
+ unsigned long flags;
+
+ rcu_read_lock();
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
+ do {
+ seq = nextseq;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ for_each_thread(tsk, t) {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ }
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int ticks)
+{
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ if (steal_account_process_tick())
+ return;
+
+ cputime *= ticks;
+ scaled *= ticks;
+
+ if (irqtime_account_hi_update()) {
+ cpustat[CPUTIME_IRQ] += cputime;
+ } else if (irqtime_account_si_update()) {
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
+ } else if (user_tick) {
+ account_user_time(p, cputime, scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime, scaled);
+ } else {
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ struct rq *rq = this_rq();
+
+ irqtime_account_process_tick(current, 0, rq, ticks);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq, int nr_ticks) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ vtime_account_user(prev);
+#endif
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_common_account_irq_enter(struct task_struct *tsk)
+{
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because the context tracking don't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ struct rq *rq = this_rq();
+
+ if (vtime_accounting_enabled())
+ return;
+
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq, 1);
+ return;
+ }
+
+ if (steal_account_process_tick())
+ return;
+
+ if (user_tick)
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
+ one_jiffy_scaled);
+ else
+ account_idle_time(cputime_one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+ account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
+ account_idle_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * loosing precision when the numbers are big.
+ */
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+{
+ u64 scaled;
+
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime)
+ swap(rtime, stime);
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
+ }
+
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return (__force cputime_t) scaled;
+}
+
+/*
+ * Atomically advance counter to the new value. Interrupts, vcpu
+ * scheduling, and scaling inaccuracies can cause cputime_advance
+ * to be occasionally called with a new value smaller than counter.
+ * Let's enforce atomicity.
+ *
+ * Normally a caller will only go through this loop once, or not
+ * at all in case a previous caller updated counter the same jiffy.
+ */
+static void cputime_advance(cputime_t *counter, cputime_t new)
+{
+ cputime_t old;
+
+ while (new > (old = ACCESS_ONCE(*counter)))
+ cmpxchg_cputime(counter, old, new);
+}
+
+/*
+ * Adjust tick based cputime random precision against scheduler
+ * runtime accounting.
+ */
+static void cputime_adjust(struct task_cputime *curr,
+ struct cputime *prev,
+ cputime_t *ut, cputime_t *st)
+{
+ cputime_t rtime, stime, utime;
+
+ /*
+ * Tick based cputime accounting depend on random scheduling
+ * timeslices of a task to be interrupted or not by the timer.
+ * Depending on these circumstances, the number of these interrupts
+ * may be over or under-optimistic, matching the real user and system
+ * cputime with a variable precision.
+ *
+ * Fix this by scaling these tick based values against the total
+ * runtime accounted by the CFS scheduler.
+ */
+ rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that can happen, that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ stime = curr->stime;
+ utime = curr->utime;
+
+ if (utime == 0) {
+ stime = rtime;
+ } else if (stime == 0) {
+ utime = rtime;
+ } else {
+ cputime_t total = stime + utime;
+
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ }
+
+ cputime_advance(&prev->stime, stime);
+ cputime_advance(&prev->utime, utime);
+
+out:
+ *ut = prev->utime;
+ *st = prev->stime;
+}
+
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime = {
+ .sum_exec_runtime = p->se.sum_exec_runtime,
+ };
+
+ task_cputime(p, &cputime.utime, &cputime.stime);
+ cputime_adjust(&cputime, &p->prev_cputime, ut, st);
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+ cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
+}
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long clock;
+
+ clock = local_clock();
+ if (clock < tsk->vtime_snap)
+ return 0;
+
+ return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long delta = vtime_delta(tsk);
+
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ tsk->vtime_snap += delta;
+
+ /* CHECKME: always safe to convert nsecs to cputime? */
+ return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ if (context_tracking_in_user())
+ tsk->vtime_snap_whence = VTIME_USER;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+ cputime_t delta_cpu;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ delta_cpu = get_vtime_delta(tsk);
+ tsk->vtime_snap_whence = VTIME_SYS;
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ tsk->vtime_snap_whence = VTIME_USER;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_snap flush and update.
+ * That enforces a right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags |= PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags &= ~PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_idle_time(delta_cpu);
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+ write_seqlock(&prev->vtime_seqlock);
+ prev->vtime_snap_whence = VTIME_SLEEPING;
+ write_sequnlock(&prev->vtime_seqlock);
+
+ write_seqlock(&current->vtime_seqlock);
+ current->vtime_snap_whence = VTIME_SYS;
+ current->vtime_snap = sched_clock_cpu(smp_processor_id());
+ write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t, int cpu)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ t->vtime_snap_whence = VTIME_SYS;
+ t->vtime_snap = sched_clock_cpu(cpu);
+ write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+ unsigned int seq;
+ cputime_t gtime;
+
+ do {
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ gtime = t->gtime;
+ if (t->flags & PF_VCPU)
+ gtime += vtime_delta(t);
+
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+
+ return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+ cputime_t *u_dst, cputime_t *s_dst,
+ cputime_t *u_src, cputime_t *s_src,
+ cputime_t *udelta, cputime_t *sdelta)
+{
+ unsigned int seq;
+ unsigned long long delta;
+
+ do {
+ *udelta = 0;
+ *sdelta = 0;
+
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ if (u_dst)
+ *u_dst = *u_src;
+ if (s_dst)
+ *s_dst = *s_src;
+
+ /* Task is sleeping, nothing to add */
+ if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(t);
+
+ /*
+ * Task runs either in user or kernel space, add pending nohz time to
+ * the right place.
+ */
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+ *udelta = delta;
+ } else {
+ if (t->vtime_snap_whence == VTIME_SYS)
+ *sdelta = delta;
+ }
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utime, stime, &t->utime,
+ &t->stime, &udelta, &sdelta);
+ if (utime)
+ *utime += udelta;
+ if (stime)
+ *stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled, cputime_t *stimescaled)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utimescaled, stimescaled,
+ &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+ if (utimescaled)
+ *utimescaled += cputime_to_scaled(udelta);
+ if (stimescaled)
+ *stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000..5e9514508
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1818 @@
+/*
+ * Deadline Scheduling Class (SCHED_DEADLINE)
+ *
+ * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
+ *
+ * Tasks that periodically executes their instances for less than their
+ * runtime won't miss any of their deadlines.
+ * Tasks that are not periodic or sporadic or that tries to execute more
+ * than their reserved bandwidth will be slowed down (and may potentially
+ * miss some of their deadlines), and won't affect any other task.
+ *
+ * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
+ * Juri Lelli <juri.lelli@gmail.com>,
+ * Michael Trimarchi <michael@amarulasolutions.com>,
+ * Fabio Checconi <fchecconi@gmail.com>
+ */
+#include "sched.h"
+
+#include <linux/slab.h>
+
+struct dl_bandwidth def_dl_bandwidth;
+
+static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
+{
+ return container_of(dl_se, struct task_struct, dl);
+}
+
+static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
+{
+ return container_of(dl_rq, struct rq, dl);
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->dl;
+}
+
+static inline int on_dl_rq(struct sched_dl_entity *dl_se)
+{
+ return !RB_EMPTY_NODE(&dl_se->rb_node);
+}
+
+static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ return dl_rq->rb_leftmost == &dl_se->rb_node;
+}
+
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+{
+ raw_spin_lock_init(&dl_b->dl_runtime_lock);
+ dl_b->dl_period = period;
+ dl_b->dl_runtime = runtime;
+}
+
+void init_dl_bw(struct dl_bw *dl_b)
+{
+ raw_spin_lock_init(&dl_b->lock);
+ raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
+ if (global_rt_runtime() == RUNTIME_INF)
+ dl_b->bw = -1;
+ else
+ dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
+ raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+ dl_b->total_bw = 0;
+}
+
+void init_dl_rq(struct dl_rq *dl_rq)
+{
+ dl_rq->rb_root = RB_ROOT;
+
+#ifdef CONFIG_SMP
+ /* zero means no -deadline tasks */
+ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
+
+ dl_rq->dl_nr_migratory = 0;
+ dl_rq->overloaded = 0;
+ dl_rq->pushable_dl_tasks_root = RB_ROOT;
+#else
+ init_dl_bw(&dl_rq->dl_bw);
+#endif
+}
+
+#ifdef CONFIG_SMP
+
+static inline int dl_overloaded(struct rq *rq)
+{
+ return atomic_read(&rq->rd->dlo_count);
+}
+
+static inline void dl_set_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
+ /*
+ * Must be visible before the overload count is
+ * set (as in sched_rt.c).
+ *
+ * Matched by the barrier in pull_dl_task().
+ */
+ smp_wmb();
+ atomic_inc(&rq->rd->dlo_count);
+}
+
+static inline void dl_clear_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ atomic_dec(&rq->rd->dlo_count);
+ cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
+}
+
+static void update_dl_migration(struct dl_rq *dl_rq)
+{
+ if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
+ if (!dl_rq->overloaded) {
+ dl_set_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 1;
+ }
+ } else if (dl_rq->overloaded) {
+ dl_clear_overload(rq_of_dl_rq(dl_rq));
+ dl_rq->overloaded = 0;
+ }
+}
+
+static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory++;
+
+ update_dl_migration(dl_rq);
+}
+
+static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (p->nr_cpus_allowed > 1)
+ dl_rq->dl_nr_migratory--;
+
+ update_dl_migration(dl_rq);
+}
+
+/*
+ * The list of pushable -deadline task is not a plist, like in
+ * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
+ */
+static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+ struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct task_struct *entry;
+ int leftmost = 1;
+
+ BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct task_struct,
+ pushable_dl_tasks);
+ if (dl_entity_preempt(&p->dl, &entry->dl))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+
+ rb_link_node(&p->pushable_dl_tasks, parent, link);
+ rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+}
+
+static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
+ return;
+
+ if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&p->pushable_dl_tasks);
+ dl_rq->pushable_dl_tasks_leftmost = next_node;
+ }
+
+ rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
+}
+
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+}
+
+static int push_dl_task(struct rq *rq);
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return dl_task(prev);
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+ rq->post_schedule = has_pushable_dl_tasks(rq);
+}
+
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
+
+static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+{
+ struct rq *later_rq = NULL;
+ bool fallback = false;
+
+ later_rq = find_lock_later_rq(p, rq);
+
+ if (!later_rq) {
+ int cpu;
+
+ /*
+ * If we cannot preempt any rq, fall back to pick any
+ * online cpu.
+ */
+ fallback = true;
+ cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Fail to find any suitable cpu.
+ * The task will never come back!
+ */
+ BUG_ON(dl_bandwidth_enabled());
+
+ /*
+ * If admission control is disabled we
+ * try a little harder to let the task
+ * run.
+ */
+ cpu = cpumask_any(cpu_active_mask);
+ }
+ later_rq = cpu_rq(cpu);
+ double_lock_balance(rq, later_rq);
+ }
+
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, later_rq->cpu);
+ activate_task(later_rq, p, ENQUEUE_REPLENISH);
+
+ if (!fallback)
+ resched_curr(later_rq);
+
+ double_unlock_balance(rq, later_rq);
+}
+
+#else
+
+static inline
+void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline
+void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+}
+
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_dl_task(struct rq *rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+ int flags);
+
+/*
+ * We are being explicitly informed that a new instance is starting,
+ * and this means that:
+ * - the absolute deadline of the entity has to be placed at
+ * current time + relative deadline;
+ * - the runtime of the entity has to be set to the maximum value.
+ *
+ * The capability of specifying such event is useful whenever a -deadline
+ * entity wants to (try to!) synchronize its behaviour with the scheduler's
+ * one, and to (try to!) reconcile itself with its own scheduling
+ * parameters.
+ */
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+
+ /*
+ * We use the regular wall clock time to set deadlines in the
+ * future; in fact, we must consider execution overheads (time
+ * spent on hardirq context, etc.).
+ */
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ dl_se->dl_new = 0;
+}
+
+/*
+ * Pure Earliest Deadline First (EDF) scheduling does not deal with the
+ * possibility of a entity lasting more than what it declared, and thus
+ * exhausting its runtime.
+ *
+ * Here we are interested in making runtime overrun possible, but we do
+ * not want a entity which is misbehaving to affect the scheduling of all
+ * other entities.
+ * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
+ * is used, in order to confine each entity within its own bandwidth.
+ *
+ * This function deals exactly with that, and ensures that when the runtime
+ * of a entity is replenished, its deadline is also postponed. That ensures
+ * the overrunning entity can't interfere with other entity in the system and
+ * can't make them miss their deadlines. Reasons why this kind of overruns
+ * could happen are, typically, a entity voluntarily trying to overcome its
+ * runtime, or it just underestimated it during sched_setattr().
+ */
+static void replenish_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ BUG_ON(pi_se->dl_runtime <= 0);
+
+ /*
+ * This could be the case for a !-dl task that is boosted.
+ * Just go with full inherited parameters.
+ */
+ if (dl_se->dl_deadline == 0) {
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+
+ /*
+ * We keep moving the deadline away until we get some
+ * available runtime for the entity. This ensures correct
+ * handling of situations where the runtime overrun is
+ * arbitrary large.
+ */
+ while (dl_se->runtime <= 0) {
+ dl_se->deadline += pi_se->dl_period;
+ dl_se->runtime += pi_se->dl_runtime;
+ }
+
+ /*
+ * At this point, the deadline really should be "in
+ * the future" with respect to rq->clock. If it's
+ * not, we are, for some reason, lagging too much!
+ * Anyway, after having warn userspace abut that,
+ * we still try to keep the things running by
+ * resetting the deadline and the budget of the
+ * entity.
+ */
+ if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
+ printk_deferred_once("sched: DL replenish lagged to much\n");
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+
+ if (dl_se->dl_yielded)
+ dl_se->dl_yielded = 0;
+ if (dl_se->dl_throttled)
+ dl_se->dl_throttled = 0;
+}
+
+/*
+ * Here we check if --at time t-- an entity (which is probably being
+ * [re]activated or, in general, enqueued) can use its remaining runtime
+ * and its current deadline _without_ exceeding the bandwidth it is
+ * assigned (function returns true if it can't). We are in fact applying
+ * one of the CBS rules: when a task wakes up, if the residual runtime
+ * over residual deadline fits within the allocated bandwidth, then we
+ * can keep the current (absolute) deadline and residual budget without
+ * disrupting the schedulability of the system. Otherwise, we should
+ * refill the runtime and set the deadline a period in the future,
+ * because keeping the current (absolute) deadline of the task would
+ * result in breaking guarantees promised to other tasks (refer to
+ * Documentation/scheduler/sched-deadline.txt for more informations).
+ *
+ * This function returns true if:
+ *
+ * runtime / (deadline - t) > dl_runtime / dl_period ,
+ *
+ * IOW we can't recycle current parameters.
+ *
+ * Notice that the bandwidth check is done against the period. For
+ * task with deadline equal to period this is the same of using
+ * dl_deadline instead of dl_period in the equation above.
+ */
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, u64 t)
+{
+ u64 left, right;
+
+ /*
+ * left and right are the two sides of the equation above,
+ * after a bit of shuffling to use multiplications instead
+ * of divisions.
+ *
+ * Note that none of the time values involved in the two
+ * multiplications are absolute: dl_deadline and dl_runtime
+ * are the relative deadline and the maximum runtime of each
+ * instance, runtime is the runtime left for the last instance
+ * and (deadline - t), since t is rq->clock, is the time left
+ * to the (absolute) deadline. Even if overflowing the u64 type
+ * is very unlikely to occur in both cases, here we scale down
+ * as we want to avoid that risk at all. Scaling down by 10
+ * means that we reduce granularity to 1us. We are fine with it,
+ * since this is only a true/false check and, anyway, thinking
+ * of anything below microseconds resolution is actually fiction
+ * (but still we want to give the user that illusion >;).
+ */
+ left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ right = ((dl_se->deadline - t) >> DL_SCALE) *
+ (pi_se->dl_runtime >> DL_SCALE);
+
+ return dl_time_before(right, left);
+}
+
+/*
+ * When a -deadline entity is queued back on the runqueue, its runtime and
+ * deadline might need updating.
+ *
+ * The policy here is that we update the deadline of the entity only if:
+ * - the current deadline is in the past,
+ * - using the remaining runtime with the current deadline would make
+ * the entity exceed its bandwidth.
+ */
+static void update_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ /*
+ * The arrival of a new instance needs special treatment, i.e.,
+ * the actual scheduling parameters have to be "renewed".
+ */
+ if (dl_se->dl_new) {
+ setup_new_dl_entity(dl_se, pi_se);
+ return;
+ }
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
+ dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+ dl_se->runtime = pi_se->dl_runtime;
+ }
+}
+
+/*
+ * If the entity depleted all its runtime, and if we want it to sleep
+ * while waiting for some new execution time to become available, we
+ * set the bandwidth enforcement timer to the replenishment instant
+ * and try to activate it.
+ *
+ * Notice that it is important for the caller to know if the timer
+ * actually started or not (i.e., the replenishment instant is in
+ * the future or in the past).
+ */
+static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+ ktime_t now, act;
+ ktime_t soft, hard;
+ unsigned long range;
+ s64 delta;
+
+ if (boosted)
+ return 0;
+ /*
+ * We want the timer to fire at the deadline, but considering
+ * that it is actually coming from rq->clock and not from
+ * hrtimer's time base reading.
+ */
+ act = ns_to_ktime(dl_se->deadline);
+ now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ delta = ktime_to_ns(now) - rq_clock(rq);
+ act = ktime_add_ns(act, delta);
+
+ /*
+ * If the expiry time already passed, e.g., because the value
+ * chosen as the deadline is too small, don't even try to
+ * start the timer in the past!
+ */
+ if (ktime_us_delta(act, now) < 0)
+ return 0;
+
+ hrtimer_set_expires(&dl_se->dl_timer, act);
+
+ soft = hrtimer_get_softexpires(&dl_se->dl_timer);
+ hard = hrtimer_get_expires(&dl_se->dl_timer);
+ range = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
+ range, HRTIMER_MODE_ABS, 0);
+
+ return hrtimer_active(&dl_se->dl_timer);
+}
+
+/*
+ * This is the bandwidth enforcement timer callback. If here, we know
+ * a task is not on its dl_rq, since the fact that the timer was running
+ * means the task is throttled and needs a runtime replenishment.
+ *
+ * However, what we actually do depends on the fact the task is active,
+ * (it is on its rq) or has been removed from there by a call to
+ * dequeue_task_dl(). In the former case we must issue the runtime
+ * replenishment and add the task back to the dl_rq; in the latter, we just
+ * do nothing but clearing dl_throttled, so that runtime and deadline
+ * updating (and the queueing back to dl_rq) will be done by the
+ * next call to enqueue_task_dl().
+ */
+static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ dl_timer);
+ struct task_struct *p = dl_task_of(dl_se);
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &flags);
+
+ /*
+ * We need to take care of several possible races here:
+ *
+ * - the task might have changed its scheduling policy
+ * to something different than SCHED_DEADLINE
+ * - the task might have changed its reservation parameters
+ * (through sched_setattr())
+ * - the task might have been boosted by someone else and
+ * might be in the boosting/deboosting path
+ *
+ * In all this cases we bail out, as the task is already
+ * in the runqueue or is going to be enqueued back anyway.
+ */
+ if (!dl_task(p) || dl_se->dl_new ||
+ dl_se->dl_boosted || !dl_se->dl_throttled)
+ goto unlock;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+#ifdef CONFIG_SMP
+ /*
+ * If we find that the rq the task was on is no longer
+ * available, we need to select a new rq.
+ */
+ if (unlikely(!rq->online)) {
+ dl_task_offline_migration(rq, p);
+ goto unlock;
+ }
+#endif
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
+ }
+
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq,
+ * check if we need to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
+#endif
+unlock:
+ task_rq_unlock(rq, p, &flags);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_dl_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->dl_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = dl_task_timer;
+}
+
+static
+int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+{
+ return (dl_se->runtime <= 0);
+}
+
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_dl_entity *dl_se = &curr->dl;
+ u64 delta_exec;
+
+ if (!dl_task(curr) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cpuacct_charge(curr, delta_exec);
+
+ sched_rt_avg_update(rq, delta_exec);
+
+ dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
+ if (dl_runtime_exceeded(rq, dl_se)) {
+ dl_se->dl_throttled = 1;
+ __dequeue_task_dl(rq, curr, 0);
+ if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+ enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+
+ if (!is_leftmost(curr, &rq->dl))
+ resched_curr(rq);
+ }
+
+ /*
+ * Because -- for now -- we share the rt bandwidth, we need to
+ * account our runtime there too, otherwise actual rt tasks
+ * would be able to exceed the shared quota.
+ *
+ * Account to the root rt group for now.
+ *
+ * The solution we're working towards is having the RT groups scheduled
+ * using deadline servers -- however there's a few nasties to figure
+ * out before that can happen.
+ */
+ if (rt_bandwidth_enabled()) {
+ struct rt_rq *rt_rq = &rq->rt;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We'll let actual RT tasks worry about the overflow here, we
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
+ */
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+}
+
+#ifdef CONFIG_SMP
+
+static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
+
+static inline u64 next_deadline(struct rq *rq)
+{
+ struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
+
+ if (next && dl_prio(next->prio))
+ return next->dl.deadline;
+ else
+ return 0;
+}
+
+static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ if (dl_rq->earliest_dl.curr == 0 ||
+ dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+ /*
+ * If the dl_rq had no -deadline tasks, or if the new task
+ * has shorter deadline than the current one on dl_rq, we
+ * know that the previous earliest becomes our next earliest,
+ * as the new task becomes the earliest itself.
+ */
+ dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
+ dl_rq->earliest_dl.curr = deadline;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+ } else if (dl_rq->earliest_dl.next == 0 ||
+ dl_time_before(deadline, dl_rq->earliest_dl.next)) {
+ /*
+ * On the other hand, if the new -deadline task has a
+ * a later deadline than the earliest one on dl_rq, but
+ * it is earlier than the next (if any), we must
+ * recompute the next-earliest.
+ */
+ dl_rq->earliest_dl.next = next_deadline(rq);
+ }
+}
+
+static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
+{
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ /*
+ * Since we may have removed our earliest (and/or next earliest)
+ * task we must recompute them.
+ */
+ if (!dl_rq->dl_nr_running) {
+ dl_rq->earliest_dl.curr = 0;
+ dl_rq->earliest_dl.next = 0;
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ } else {
+ struct rb_node *leftmost = dl_rq->rb_leftmost;
+ struct sched_dl_entity *entry;
+
+ entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
+ dl_rq->earliest_dl.curr = entry->deadline;
+ dl_rq->earliest_dl.next = next_deadline(rq);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+ }
+}
+
+#else
+
+static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
+
+#endif /* CONFIG_SMP */
+
+static inline
+void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+ u64 deadline = dl_se->deadline;
+
+ WARN_ON(!dl_prio(prio));
+ dl_rq->dl_nr_running++;
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ inc_dl_deadline(dl_rq, deadline);
+ inc_dl_migration(dl_se, dl_rq);
+}
+
+static inline
+void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ int prio = dl_task_of(dl_se)->prio;
+
+ WARN_ON(!dl_prio(prio));
+ WARN_ON(!dl_rq->dl_nr_running);
+ dl_rq->dl_nr_running--;
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ dec_dl_deadline(dl_rq, dl_se->deadline);
+ dec_dl_migration(dl_se, dl_rq);
+}
+
+static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rb_node **link = &dl_rq->rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_dl_entity *entry;
+ int leftmost = 1;
+
+ BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
+
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_dl_entity, rb_node);
+ if (dl_time_before(dl_se->deadline, entry->deadline))
+ link = &parent->rb_left;
+ else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ if (leftmost)
+ dl_rq->rb_leftmost = &dl_se->rb_node;
+
+ rb_link_node(&dl_se->rb_node, parent, link);
+ rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+
+ inc_dl_tasks(dl_se, dl_rq);
+}
+
+static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ if (RB_EMPTY_NODE(&dl_se->rb_node))
+ return;
+
+ if (dl_rq->rb_leftmost == &dl_se->rb_node) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&dl_se->rb_node);
+ dl_rq->rb_leftmost = next_node;
+ }
+
+ rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
+ RB_CLEAR_NODE(&dl_se->rb_node);
+
+ dec_dl_tasks(dl_se, dl_rq);
+}
+
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, int flags)
+{
+ BUG_ON(on_dl_rq(dl_se));
+
+ /*
+ * If this is a wakeup or a new instance, the scheduling
+ * parameters of the task might need updating. Otherwise,
+ * we want a replenishment of its runtime.
+ */
+ if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
+ update_dl_entity(dl_se, pi_se);
+ else if (flags & ENQUEUE_REPLENISH)
+ replenish_dl_entity(dl_se, pi_se);
+
+ __enqueue_dl_entity(dl_se);
+}
+
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+{
+ __dequeue_dl_entity(dl_se);
+}
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ struct sched_dl_entity *pi_se = &p->dl;
+
+ /*
+ * Use the scheduling parameters of the top pi-waiter
+ * task if we have one and its (relative) deadline is
+ * smaller than our one... OTW we keep our runtime and
+ * deadline.
+ */
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
+ pi_se = &pi_task->dl;
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task
+ * that is going to be deboosted, but exceedes its
+ * runtime while doing so. No point in replenishing
+ * it, as it's going to return back to its original
+ * scheduling class after this.
+ */
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
+
+ /*
+ * If p is throttled, we do nothing. In fact, if it exhausted
+ * its budget it needs a replenishment and, since it now is on
+ * its rq, the bandwidth timer callback (which clearly has not
+ * run yet) will take care of this.
+ */
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
+ return;
+
+ enqueue_dl_entity(&p->dl, pi_se, flags);
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ dequeue_dl_entity(&p->dl);
+ dequeue_pushable_dl_task(rq, p);
+}
+
+static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_curr_dl(rq);
+ __dequeue_task_dl(rq, p, flags);
+}
+
+/*
+ * Yield task semantic for -deadline tasks is:
+ *
+ * get off from the CPU until our next instance, with
+ * a new runtime. This is of little use now, since we
+ * don't have a bandwidth reclaiming mechanism. Anyway,
+ * bandwidth reclaiming is planned for the future, and
+ * yield_task_dl will indicate that some spare budget
+ * is available for other task instances to use it.
+ */
+static void yield_task_dl(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ /*
+ * We make the task go to sleep until its current deadline by
+ * forcing its runtime to zero. This way, update_curr_dl() stops
+ * it and the bandwidth timer will wake it up and will give it
+ * new scheduling parameters (thanks to dl_yielded=1).
+ */
+ if (p->dl.runtime > 0) {
+ rq->curr->dl.dl_yielded = 1;
+ p->dl.runtime = 0;
+ }
+ update_rq_clock(rq);
+ update_curr_dl(rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq, true);
+}
+
+#ifdef CONFIG_SMP
+
+static int find_later_rq(struct task_struct *task);
+
+static int
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ struct task_struct *curr;
+ struct rq *rq;
+
+ if (sd_flag != SD_BALANCE_WAKE)
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+
+ /*
+ * If we are dealing with a -deadline task, we must
+ * decide where to wake it up.
+ * If it has a later deadline and the current task
+ * on this rq can't move (provided the waking task
+ * can!) we prefer to send it somewhere else. On the
+ * other hand, if it has a shorter deadline, we
+ * try to make it stay here, it might be important.
+ */
+ if (unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ (p->nr_cpus_allowed > 1)) {
+ int target = find_later_rq(p);
+
+ if (target != -1)
+ cpu = target;
+ }
+ rcu_read_unlock();
+
+out:
+ return cpu;
+}
+
+static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ return;
+
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
+ if (p->nr_cpus_allowed != 1 &&
+ cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ return;
+
+ resched_curr(rq);
+}
+
+static int pull_dl_task(struct rq *this_rq);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Only called when both the current and waking task are -deadline
+ * tasks.
+ */
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+ int flags)
+{
+ if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ resched_curr(rq);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * In the unlikely case current and p have the same deadline
+ * let us try to decide what's the best thing to do...
+ */
+ if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ !test_tsk_need_resched(rq->curr))
+ check_preempt_equal_dl(rq, p);
+#endif /* CONFIG_SMP */
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+ hrtick_start(rq, p->dl.runtime);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
+static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
+ struct dl_rq *dl_rq)
+{
+ struct rb_node *left = dl_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_dl_entity, rb_node);
+}
+
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_dl_entity *dl_se;
+ struct task_struct *p;
+ struct dl_rq *dl_rq;
+
+ dl_rq = &rq->dl;
+
+ if (need_pull_dl_task(rq, prev)) {
+ pull_dl_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a stop task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (rq->stop && task_on_rq_queued(rq->stop))
+ return RETRY_TASK;
+ }
+
+ /*
+ * When prev is DL, we may throttle it in put_prev_task().
+ * So, we update time before we check for dl_nr_running.
+ */
+ if (prev->sched_class == &dl_sched_class)
+ update_curr_dl(rq);
+
+ if (unlikely(!dl_rq->dl_nr_running))
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ dl_se = pick_next_dl_entity(rq, dl_rq);
+ BUG_ON(!dl_se);
+
+ p = dl_task_of(dl_se);
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* Running task will never be pushed. */
+ dequeue_pushable_dl_task(rq, p);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+
+ set_post_schedule(rq);
+
+ return p;
+}
+
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+{
+ update_curr_dl(rq);
+
+ if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
+{
+ update_curr_dl(rq);
+
+ /*
+ * Even when we have runtime, update_curr_dl() might have resulted in us
+ * not being the leftmost task anymore. In that case NEED_RESCHED will
+ * be set and schedule() will start a new hrtick for the next task.
+ */
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(p, &rq->dl))
+ start_hrtick_dl(rq, p);
+}
+
+static void task_fork_dl(struct task_struct *p)
+{
+ /*
+ * SCHED_DEADLINE tasks cannot fork and this is achieved through
+ * sched_fork()
+ */
+}
+
+static void task_dead_dl(struct task_struct *p)
+{
+ struct hrtimer *timer = &p->dl.dl_timer;
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ /*
+ * Since we are TASK_DEAD we won't slip out of the domain!
+ */
+ raw_spin_lock_irq(&dl_b->lock);
+ /* XXX we should retain the bw until 0-lag */
+ dl_b->total_bw -= p->dl.dl_bw;
+ raw_spin_unlock_irq(&dl_b->lock);
+
+ hrtimer_cancel(timer);
+}
+
+static void set_curr_task_dl(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* You can't push away the running task */
+ dequeue_pushable_dl_task(rq, p);
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define DL_MAX_TRIES 3
+
+static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_running(rq, p) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ return 1;
+ return 0;
+}
+
+/* Returns the second earliest -deadline task, NULL otherwise */
+static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
+{
+ struct rb_node *next_node = rq->dl.rb_leftmost;
+ struct sched_dl_entity *dl_se;
+ struct task_struct *p = NULL;
+
+next_node:
+ next_node = rb_next(next_node);
+ if (next_node) {
+ dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
+ p = dl_task_of(dl_se);
+
+ if (pick_dl_task(rq, p, cpu))
+ return p;
+
+ goto next_node;
+ }
+
+ return NULL;
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
+
+static int find_later_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
+ int this_cpu = smp_processor_id();
+ int best_cpu, cpu = task_cpu(task);
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!later_mask))
+ return -1;
+
+ if (task->nr_cpus_allowed == 1)
+ return -1;
+
+ /*
+ * We have to consider system topology and task affinity
+ * first, then we can look for a suitable cpu.
+ */
+ best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
+ task, later_mask);
+ if (best_cpu == -1)
+ return -1;
+
+ /*
+ * If we are here, some target has been found,
+ * the most suitable of which is cached in best_cpu.
+ * This is, among the runqueues where the current tasks
+ * have later deadlines than the task's one, the rq
+ * with the latest possible one.
+ *
+ * Now we check how well this matches with task's
+ * affinity and system topology.
+ *
+ * The last cpu where the task run is our first
+ * guess, since it is most likely cache-hot there.
+ */
+ if (cpumask_test_cpu(cpu, later_mask))
+ return cpu;
+ /*
+ * Check if this_cpu is to be skipped (i.e., it is
+ * not in the mask) or not.
+ */
+ if (!cpumask_test_cpu(this_cpu, later_mask))
+ this_cpu = -1;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+
+ /*
+ * If possible, preempting this_cpu is
+ * cheaper than migrating.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+
+ /*
+ * Last chance: if best_cpu is valid and is
+ * in the mask, that becomes our choice.
+ */
+ if (best_cpu < nr_cpu_ids &&
+ cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * At this point, all our guesses failed, we just return
+ * 'something', and let the caller sort the things out.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
+
+ cpu = cpumask_any(later_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ return -1;
+}
+
+/* Locks the rq it finds */
+static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *later_rq = NULL;
+ int tries;
+ int cpu;
+
+ for (tries = 0; tries < DL_MAX_TRIES; tries++) {
+ cpu = find_later_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ later_rq = cpu_rq(cpu);
+
+ /* Retry if something changed. */
+ if (double_lock_balance(rq, later_rq)) {
+ if (unlikely(task_rq(task) != rq ||
+ !cpumask_test_cpu(later_rq->cpu,
+ &task->cpus_allowed) ||
+ task_running(rq, task) ||
+ !task_on_rq_queued(task))) {
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ break;
+ }
+ }
+
+ /*
+ * If the rq we found has no -deadline task, or
+ * its earliest one has a later deadline than our
+ * task, the rq is a good one.
+ */
+ if (!later_rq->dl.dl_nr_running ||
+ dl_time_before(task->dl.deadline,
+ later_rq->dl.earliest_dl.curr))
+ break;
+
+ /* Otherwise we try again. */
+ double_unlock_balance(rq, later_rq);
+ later_rq = NULL;
+ }
+
+ return later_rq;
+}
+
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
+ struct task_struct, pushable_dl_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!dl_task(p));
+
+ return p;
+}
+
+/*
+ * See if the non running -deadline tasks on this rq
+ * can be sent to some other CPU where they can preempt
+ * and start executing.
+ */
+static int push_dl_task(struct rq *rq)
+{
+ struct task_struct *next_task;
+ struct rq *later_rq;
+ int ret = 0;
+
+ if (!rq->dl.overloaded)
+ return 0;
+
+ next_task = pick_next_pushable_dl_task(rq);
+ if (!next_task)
+ return 0;
+
+retry:
+ if (unlikely(next_task == rq->curr)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * If next_task preempts rq->curr, and rq->curr
+ * can move away, it makes sense to just reschedule
+ * without going further in pushing next_task.
+ */
+ if (dl_task(rq->curr) &&
+ dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ rq->curr->nr_cpus_allowed > 1) {
+ resched_curr(rq);
+ return 0;
+ }
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* Will lock the rq it'll find */
+ later_rq = find_lock_later_rq(next_task, rq);
+ if (!later_rq) {
+ struct task_struct *task;
+
+ /*
+ * We must check all this again, since
+ * find_lock_later_rq releases rq->lock and it is
+ * then possible that next_task has migrated.
+ */
+ task = pick_next_pushable_dl_task(rq);
+ if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ /*
+ * The task is still there. We don't try
+ * again, some other cpu will pull it when ready.
+ */
+ goto out;
+ }
+
+ if (!task)
+ /* No more tasks */
+ goto out;
+
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+
+ deactivate_task(rq, next_task, 0);
+ set_task_cpu(next_task, later_rq->cpu);
+ activate_task(later_rq, next_task, 0);
+ ret = 1;
+
+ resched_curr(later_rq);
+
+ double_unlock_balance(rq, later_rq);
+
+out:
+ put_task_struct(next_task);
+
+ return ret;
+}
+
+static void push_dl_tasks(struct rq *rq)
+{
+ /* Terminates as it moves a -deadline task */
+ while (push_dl_task(rq))
+ ;
+}
+
+static int pull_dl_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, ret = 0, cpu;
+ struct task_struct *p;
+ struct rq *src_rq;
+ u64 dmin = LONG_MAX;
+
+ if (likely(!dl_overloaded(this_rq)))
+ return 0;
+
+ /*
+ * Match the barrier from dl_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the dlo_mask bit.
+ */
+ smp_rmb();
+
+ for_each_cpu(cpu, this_rq->rd->dlo_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ src_rq = cpu_rq(cpu);
+
+ /*
+ * It looks racy, abd it is! However, as in sched_rt.c,
+ * we are fine with this.
+ */
+ if (this_rq->dl.dl_nr_running &&
+ dl_time_before(this_rq->dl.earliest_dl.curr,
+ src_rq->dl.earliest_dl.next))
+ continue;
+
+ /* Might drop this_rq->lock */
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+ * If there are no more pullable tasks on the
+ * rq, we're done with it.
+ */
+ if (src_rq->dl.dl_nr_running <= 1)
+ goto skip;
+
+ p = pick_next_earliest_dl_task(src_rq, this_cpu);
+
+ /*
+ * We found a task to be pulled if:
+ * - it preempts our current (if there's one),
+ * - it will preempt the last one we pulled (if any).
+ */
+ if (p && dl_time_before(p->dl.deadline, dmin) &&
+ (!this_rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ this_rq->dl.earliest_dl.curr))) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!task_on_rq_queued(p));
+
+ /*
+ * Then we pull iff p has actually an earlier
+ * deadline than the current task of its runqueue.
+ */
+ if (dl_time_before(p->dl.deadline,
+ src_rq->curr->dl.deadline))
+ goto skip;
+
+ ret = 1;
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ dmin = p->dl.deadline;
+
+ /* Is there any other task even earlier? */
+ }
+skip:
+ double_unlock_balance(this_rq, src_rq);
+ }
+
+ return ret;
+}
+
+static void post_schedule_dl(struct rq *rq)
+{
+ push_dl_tasks(rq);
+}
+
+/*
+ * Since the task is not running and a reschedule is not going to happen
+ * anytime soon on its runqueue, we try pushing it away now.
+ */
+static void task_woken_dl(struct rq *rq, struct task_struct *p)
+{
+ if (!task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ has_pushable_dl_tasks(rq) &&
+ p->nr_cpus_allowed > 1 &&
+ dl_task(rq->curr) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
+ push_dl_tasks(rq);
+ }
+}
+
+static void set_cpus_allowed_dl(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ struct rq *rq;
+ struct root_domain *src_rd;
+ int weight;
+
+ BUG_ON(!dl_task(p));
+
+ rq = task_rq(p);
+ src_rd = rq->rd;
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, new_mask)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporary fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_clear(src_dl_b, p->dl.dl_bw);
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+
+ /*
+ * Update only if the task is actually running (i.e.,
+ * it is on the rq AND it is not throttled).
+ */
+ if (!on_dl_rq(&p->dl))
+ return;
+
+ weight = cpumask_weight(new_mask);
+
+ /*
+ * Only update if the process changes its state from whether it
+ * can migrate or not.
+ */
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
+ return;
+
+ /*
+ * The process used to be able to migrate OR it can now migrate
+ */
+ if (weight <= 1) {
+ if (!task_current(rq, p))
+ dequeue_pushable_dl_task(rq, p);
+ BUG_ON(!rq->dl.dl_nr_migratory);
+ rq->dl.dl_nr_migratory--;
+ } else {
+ if (!task_current(rq, p))
+ enqueue_pushable_dl_task(rq, p);
+ rq->dl.dl_nr_migratory++;
+ }
+
+ update_dl_migration(&rq->dl);
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_set_overload(rq);
+
+ cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
+ if (rq->dl.dl_nr_running > 0)
+ cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_dl(struct rq *rq)
+{
+ if (rq->dl.overloaded)
+ dl_clear_overload(rq);
+
+ cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
+}
+
+void init_sched_dl_class(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
+ GFP_KERNEL, cpu_to_node(i));
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
+ */
+static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
+{
+ struct hrtimer *dl_timer = &p->dl.dl_timer;
+
+ /* Nobody will change task's class if pi_lock is held */
+ lockdep_assert_held(&p->pi_lock);
+
+ if (hrtimer_active(dl_timer)) {
+ int ret = hrtimer_try_to_cancel(dl_timer);
+
+ if (unlikely(ret == -1)) {
+ /*
+ * Note, p may migrate OR new deadline tasks
+ * may appear in rq when we are unlocking it.
+ * A caller of us must be fine with that.
+ */
+ raw_spin_unlock(&rq->lock);
+ hrtimer_cancel(dl_timer);
+ raw_spin_lock(&rq->lock);
+ }
+ }
+}
+
+static void switched_from_dl(struct rq *rq, struct task_struct *p)
+{
+ /* XXX we should retain the bw until 0-lag */
+ cancel_dl_timer(rq, p);
+ __dl_clear_params(p);
+
+ /*
+ * Since this might be the only -deadline task on the rq,
+ * this is the right place to try to pull some other one
+ * from an overloaded cpu, if any.
+ */
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ if (pull_dl_task(rq))
+ resched_curr(rq);
+}
+
+/*
+ * When switching to -deadline, we may overload the rq, then
+ * we try to push someone off, if possible.
+ */
+static void switched_to_dl(struct rq *rq, struct task_struct *p)
+{
+ int check_resched = 1;
+
+ if (task_on_rq_queued(p) && rq->curr != p) {
+#ifdef CONFIG_SMP
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
+ push_dl_task(rq) && rq != task_rq(p))
+ /* Only reschedule if pushing failed */
+ check_resched = 0;
+#endif /* CONFIG_SMP */
+ if (check_resched) {
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ }
+ }
+}
+
+/*
+ * If the scheduling parameters of a -deadline task changed,
+ * a push or pull operation might be needed.
+ */
+static void prio_changed_dl(struct rq *rq, struct task_struct *p,
+ int oldprio)
+{
+ if (task_on_rq_queued(p) || rq->curr == p) {
+#ifdef CONFIG_SMP
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ pull_dl_task(rq);
+
+ /*
+ * If we now have a earlier deadline task than p,
+ * then reschedule, provided p is still on this
+ * runqueue.
+ */
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
+ rq->curr == p)
+ resched_curr(rq);
+#else
+ /*
+ * Again, we don't know if p has a earlier
+ * or later deadline, so let's blindly set a
+ * (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif /* CONFIG_SMP */
+ } else
+ switched_to_dl(rq, p);
+}
+
+const struct sched_class dl_sched_class = {
+ .next = &rt_sched_class,
+ .enqueue_task = enqueue_task_dl,
+ .dequeue_task = dequeue_task_dl,
+ .yield_task = yield_task_dl,
+
+ .check_preempt_curr = check_preempt_curr_dl,
+
+ .pick_next_task = pick_next_task_dl,
+ .put_prev_task = put_prev_task_dl,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_dl,
+ .set_cpus_allowed = set_cpus_allowed_dl,
+ .rq_online = rq_online_dl,
+ .rq_offline = rq_offline_dl,
+ .post_schedule = post_schedule_dl,
+ .task_woken = task_woken_dl,
+#endif
+
+ .set_curr_task = set_curr_task_dl,
+ .task_tick = task_tick_dl,
+ .task_fork = task_fork_dl,
+ .task_dead = task_dead_dl,
+
+ .prio_changed = prio_changed_dl,
+ .switched_from = switched_from_dl,
+ .switched_to = switched_to_dl,
+
+ .update_curr = update_curr_dl,
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+void print_dl_stats(struct seq_file *m, int cpu)
+{
+ print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
new file mode 100644
index 000000000..a245c1fc6
--- /dev/null
+++ b/kernel/sched/debug.c
@@ -0,0 +1,674 @@
+/*
+ * kernel/sched/debug.c
+ *
+ * Print the CFS rbtree
+ *
+ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+#include <linux/mempolicy.h>
+
+#include "sched.h"
+
+static DEFINE_SPINLOCK(sched_debug_lock);
+
+/*
+ * This allows printing both to /proc/sched_debug and
+ * to the console
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+/*
+ * Ease the printing of nsec fields:
+ */
+static long long nsec_high(unsigned long long nsec)
+{
+ if ((long long)nsec < 0) {
+ nsec = -nsec;
+ do_div(nsec, 1000000);
+ return -nsec;
+ }
+ do_div(nsec, 1000000);
+
+ return nsec;
+}
+
+static unsigned long nsec_low(unsigned long long nsec)
+{
+ if ((long long)nsec < 0)
+ nsec = -nsec;
+
+ return do_div(nsec, 1000000);
+}
+
+#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
+{
+ struct sched_entity *se = tg->se[cpu];
+
+#define P(F) \
+ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+
+ if (!se) {
+ struct sched_avg *avg = &cpu_rq(cpu)->avg;
+ P(avg->runnable_avg_sum);
+ P(avg->avg_period);
+ return;
+ }
+
+
+ PN(se->exec_start);
+ PN(se->vruntime);
+ PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+ PN(se->statistics.wait_start);
+ PN(se->statistics.sleep_start);
+ PN(se->statistics.block_start);
+ PN(se->statistics.sleep_max);
+ PN(se->statistics.block_max);
+ PN(se->statistics.exec_max);
+ PN(se->statistics.slice_max);
+ PN(se->statistics.wait_max);
+ PN(se->statistics.wait_sum);
+ P(se->statistics.wait_count);
+#endif
+ P(se->load.weight);
+#ifdef CONFIG_SMP
+ P(se->avg.runnable_avg_sum);
+ P(se->avg.running_avg_sum);
+ P(se->avg.avg_period);
+ P(se->avg.load_avg_contrib);
+ P(se->avg.utilization_avg_contrib);
+ P(se->avg.decay_count);
+#endif
+#undef PN
+#undef P
+}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+ if (autogroup_path(tg, group_path, PATH_MAX))
+ return group_path;
+
+ return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+}
+#endif
+
+static void
+print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+{
+ if (rq->curr == p)
+ SEQ_printf(m, "R");
+ else
+ SEQ_printf(m, " ");
+
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ p->comm, task_pid_nr(p),
+ SPLIT_NS(p->se.vruntime),
+ (long long)(p->nvcsw + p->nivcsw),
+ p->prio);
+#ifdef CONFIG_SCHEDSTATS
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+ SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+#else
+ SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
+ 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ SEQ_printf(m, " %d", task_node(p));
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ SEQ_printf(m, " %s", task_group_path(task_group(p)));
+#endif
+
+ SEQ_printf(m, "\n");
+}
+
+static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+{
+ struct task_struct *g, *p;
+
+ SEQ_printf(m,
+ "\nrunnable tasks:\n"
+ " task PID tree-key switches prio"
+ " exec-runtime sum-exec sum-sleep\n"
+ "------------------------------------------------------"
+ "----------------------------------------------------\n");
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != rq_cpu)
+ continue;
+
+ print_task(m, rq, p);
+ }
+ rcu_read_unlock();
+}
+
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+ s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
+ spread, rq0_min_vruntime, spread0;
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_entity *last;
+ unsigned long flags;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+#else
+ SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
+ SPLIT_NS(cfs_rq->exec_clock));
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (cfs_rq->rb_leftmost)
+ MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ last = __pick_last_entity(cfs_rq);
+ if (last)
+ max_vruntime = last->vruntime;
+ min_vruntime = cfs_rq->min_vruntime;
+ rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
+ SPLIT_NS(MIN_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
+ SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
+ SPLIT_NS(max_vruntime));
+ spread = max_vruntime - MIN_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
+ SPLIT_NS(spread));
+ spread0 = min_vruntime - rq0_min_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
+ SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
+ cfs_rq->nr_spread_over);
+ SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+#ifdef CONFIG_SMP
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ cfs_rq->runnable_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
+ cfs_rq->blocked_load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
+ cfs_rq->utilization_load_avg);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
+ cfs_rq->tg_load_contrib);
+ SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
+ cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
+ atomic_long_read(&cfs_rq->tg->load_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
+ atomic_read(&cfs_rq->tg->runnable_avg));
+#endif
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
+ cfs_rq->tg->cfs_bandwidth.timer_active);
+ SEQ_printf(m, " .%-30s: %d\n", "throttled",
+ cfs_rq->throttled);
+ SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
+ cfs_rq->throttle_count);
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ print_cfs_group_stats(m, cpu, cfs_rq->tg);
+#endif
+}
+
+void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+#else
+ SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
+
+#define P(x) \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
+
+ P(rt_nr_running);
+ P(rt_throttled);
+ PN(rt_time);
+ PN(rt_runtime);
+
+#undef PN
+#undef P
+}
+
+void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
+{
+ SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+}
+
+extern __read_mostly int sched_clock_running;
+
+static void print_cpu(struct seq_file *m, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+#ifdef CONFIG_X86
+ {
+ unsigned int freq = cpu_khz ? : 1;
+
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
+ cpu, freq / 1000, (freq % 1000));
+ }
+#else
+ SEQ_printf(m, "cpu#%d\n", cpu);
+#endif
+
+#define P(x) \
+do { \
+ if (sizeof(rq->x) == 4) \
+ SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ else \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
+} while (0)
+
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
+
+ P(nr_running);
+ SEQ_printf(m, " .%-30s: %lu\n", "load",
+ rq->load.weight);
+ P(nr_switches);
+ P(nr_load_updates);
+ P(nr_uninterruptible);
+ PN(next_balance);
+ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
+ PN(clock);
+ PN(clock_task);
+ P(cpu_load[0]);
+ P(cpu_load[1]);
+ P(cpu_load[2]);
+ P(cpu_load[3]);
+ P(cpu_load[4]);
+#undef P
+#undef PN
+
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
+#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
+
+ P(yld_count);
+
+ P(sched_count);
+ P(sched_goidle);
+#ifdef CONFIG_SMP
+ P64(avg_idle);
+ P64(max_idle_balance_cost);
+#endif
+
+ P(ttwu_count);
+ P(ttwu_local);
+
+#undef P
+#undef P64
+#endif
+ spin_lock_irqsave(&sched_debug_lock, flags);
+ print_cfs_stats(m, cpu);
+ print_rt_stats(m, cpu);
+ print_dl_stats(m, cpu);
+
+ print_rq(m, rq, cpu);
+ spin_unlock_irqrestore(&sched_debug_lock, flags);
+ SEQ_printf(m, "\n");
+}
+
+static const char *sched_tunable_scaling_names[] = {
+ "none",
+ "logaritmic",
+ "linear"
+};
+
+static void sched_debug_header(struct seq_file *m)
+{
+ u64 ktime, sched_clk, cpu_clk;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ktime = ktime_to_ns(ktime_get());
+ sched_clk = sched_clock();
+ cpu_clk = local_clock();
+ local_irq_restore(flags);
+
+ SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+#define P(x) \
+ SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(ktime);
+ PN(sched_clk);
+ PN(cpu_clk);
+ P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ P(sched_clock_stable());
+#endif
+#undef PN
+#undef P
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "sysctl_sched\n");
+
+#define P(x) \
+ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(sysctl_sched_latency);
+ PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_wakeup_granularity);
+ P(sysctl_sched_child_runs_first);
+ P(sysctl_sched_features);
+#undef PN
+#undef P
+
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
+ sysctl_sched_tunable_scaling,
+ sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
+
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
+
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
+
+ return 0;
+}
+
+void sysrq_sched_debug_show(void)
+{
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+
+ return 0;
+}
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+ int ret = 0;
+
+ ret = seq_open(filp, &sched_debug_sops);
+
+ return ret;
+}
+
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sched_debug_release,
+};
+
+static int __init init_sched_debug_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
+ if (!pe)
+ return -ENOMEM;
+ return 0;
+}
+
+__initcall(init_sched_debug_procfs);
+
+#define __P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+
+static void sched_show_numa(struct task_struct *p, struct seq_file *m)
+{
+#ifdef CONFIG_NUMA_BALANCING
+ struct mempolicy *pol;
+ int node, i;
+
+ if (p->mm)
+ P(mm->numa_scan_seq);
+
+ task_lock(p);
+ pol = p->mempolicy;
+ if (pol && !(pol->flags & MPOL_F_MORON))
+ pol = NULL;
+ mpol_get(pol);
+ task_unlock(p);
+
+ SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
+
+ for_each_online_node(node) {
+ for (i = 0; i < 2; i++) {
+ unsigned long nr_faults = -1;
+ int cpu_current, home_node;
+
+ if (p->numa_faults)
+ nr_faults = p->numa_faults[2*node + i];
+
+ cpu_current = !i ? (task_node(p) == node) :
+ (pol && node_isset(node, pol->v.nodes));
+
+ home_node = (p->numa_preferred_nid == node);
+
+ SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
+ i, node, cpu_current, home_node, nr_faults);
+ }
+ }
+
+ mpol_put(pol);
+#endif
+}
+
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+ unsigned long nr_switches;
+
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+ get_nr_threads(p));
+ SEQ_printf(m,
+ "---------------------------------------------------------"
+ "----------\n");
+#define __P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+ PN(se.exec_start);
+ PN(se.vruntime);
+ PN(se.sum_exec_runtime);
+
+ nr_switches = p->nvcsw + p->nivcsw;
+
+#ifdef CONFIG_SCHEDSTATS
+ PN(se.statistics.wait_start);
+ PN(se.statistics.sleep_start);
+ PN(se.statistics.block_start);
+ PN(se.statistics.sleep_max);
+ PN(se.statistics.block_max);
+ PN(se.statistics.exec_max);
+ PN(se.statistics.slice_max);
+ PN(se.statistics.wait_max);
+ PN(se.statistics.wait_sum);
+ P(se.statistics.wait_count);
+ PN(se.statistics.iowait_sum);
+ P(se.statistics.iowait_count);
+ P(se.nr_migrations);
+ P(se.statistics.nr_migrations_cold);
+ P(se.statistics.nr_failed_migrations_affine);
+ P(se.statistics.nr_failed_migrations_running);
+ P(se.statistics.nr_failed_migrations_hot);
+ P(se.statistics.nr_forced_migrations);
+ P(se.statistics.nr_wakeups);
+ P(se.statistics.nr_wakeups_sync);
+ P(se.statistics.nr_wakeups_migrate);
+ P(se.statistics.nr_wakeups_local);
+ P(se.statistics.nr_wakeups_remote);
+ P(se.statistics.nr_wakeups_affine);
+ P(se.statistics.nr_wakeups_affine_attempts);
+ P(se.statistics.nr_wakeups_passive);
+ P(se.statistics.nr_wakeups_idle);
+
+ {
+ u64 avg_atom, avg_per_cpu;
+
+ avg_atom = p->se.sum_exec_runtime;
+ if (nr_switches)
+ avg_atom = div64_ul(avg_atom, nr_switches);
+ else
+ avg_atom = -1LL;
+
+ avg_per_cpu = p->se.sum_exec_runtime;
+ if (p->se.nr_migrations) {
+ avg_per_cpu = div64_u64(avg_per_cpu,
+ p->se.nr_migrations);
+ } else {
+ avg_per_cpu = -1LL;
+ }
+
+ __PN(avg_atom);
+ __PN(avg_per_cpu);
+ }
+#endif
+ __P(nr_switches);
+ SEQ_printf(m, "%-45s:%21Ld\n",
+ "nr_voluntary_switches", (long long)p->nvcsw);
+ SEQ_printf(m, "%-45s:%21Ld\n",
+ "nr_involuntary_switches", (long long)p->nivcsw);
+
+ P(se.load.weight);
+#ifdef CONFIG_SMP
+ P(se.avg.runnable_avg_sum);
+ P(se.avg.running_avg_sum);
+ P(se.avg.avg_period);
+ P(se.avg.load_avg_contrib);
+ P(se.avg.utilization_avg_contrib);
+ P(se.avg.decay_count);
+#endif
+ P(policy);
+ P(prio);
+#undef PN
+#undef __PN
+#undef P
+#undef __P
+
+ {
+ unsigned int this_cpu = raw_smp_processor_id();
+ u64 t0, t1;
+
+ t0 = cpu_clock(this_cpu);
+ t1 = cpu_clock(this_cpu);
+ SEQ_printf(m, "%-45s:%21Ld\n",
+ "clock-delta", (long long)(t1-t0));
+ }
+
+ sched_show_numa(p, m);
+}
+
+void proc_sched_set_task(struct task_struct *p)
+{
+#ifdef CONFIG_SCHEDSTATS
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+#endif
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
new file mode 100644
index 000000000..936664319
--- /dev/null
+++ b/kernel/sched/fair.c
@@ -0,0 +1,8310 @@
+/*
+ * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Interactivity improvements by Mike Galbraith
+ * (C) 2007 Mike Galbraith <efault@gmx.de>
+ *
+ * Various enhancements by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
+ *
+ * Group scheduling enhancements by Srivatsa Vaddagiri
+ * Copyright IBM Corporation, 2007
+ * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *
+ * Scaled math optimizations by Thomas Gleixner
+ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/latencytop.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuidle.h>
+#include <linux/slab.h>
+#include <linux/profile.h>
+#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+
+/*
+ * Targeted preemption latency for CPU-bound tasks:
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
+ *
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length
+ * and have no persistent notion like in traditional, time-slice
+ * based scheduling concepts.
+ *
+ * (to see the precise effective timeslice length of your workload,
+ * run vmstat and monitor the context-switches (cs) field)
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_latency = 3000000ULL;
+unsigned int normalized_sysctl_sched_latency = 3000000ULL;
+#else
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#endif
+
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+ = SCHED_TUNABLESCALING_LOG;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_min_granularity = 300000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 300000ULL;
+#else
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+#endif
+
+/*
+ * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+static unsigned int sched_nr_latency = 10;
+#else
+static unsigned int sched_nr_latency = 8;
+#endif
+
+/*
+ * After fork, child runs first. If set to 0 (default) then
+ * parent will (try to) run first.
+ */
+unsigned int sysctl_sched_child_runs_first __read_mostly;
+
+/*
+ * SCHED_OTHER wake-up granularity.
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_wakeup_granularity = 500000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 250000UL;
+#else
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#endif
+
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we will always only issue the remaining available time.
+ *
+ * default: 5 msec, units: microseconds
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL;
+#else
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+#endif
+
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static int get_update_sysctl_factor(void)
+{
+ unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int factor;
+
+ switch (sysctl_sched_tunable_scaling) {
+ case SCHED_TUNABLESCALING_NONE:
+ factor = 1;
+ break;
+ case SCHED_TUNABLESCALING_LINEAR:
+ factor = cpus;
+ break;
+ case SCHED_TUNABLESCALING_LOG:
+ default:
+ factor = 1 + ilog2(cpus);
+ break;
+ }
+
+ return factor;
+}
+
+static void update_sysctl(void)
+{
+ unsigned int factor = get_update_sysctl_factor();
+
+#define SET_SYSCTL(name) \
+ (sysctl_##name = (factor) * normalized_sysctl_##name)
+ SET_SYSCTL(sched_min_granularity);
+ SET_SYSCTL(sched_latency);
+ SET_SYSCTL(sched_wakeup_granularity);
+#undef SET_SYSCTL
+}
+
+void sched_init_granularity(void)
+{
+ update_sysctl();
+}
+
+#define WMULT_CONST (~0U)
+#define WMULT_SHIFT 32
+
+static void __update_inv_weight(struct load_weight *lw)
+{
+ unsigned long w;
+
+ if (likely(lw->inv_weight))
+ return;
+
+ w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+}
+
+/*
+ * delta_exec * weight / lw.weight
+ * OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
+ */
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+ u64 fact = scale_load_down(weight);
+ int shift = WMULT_SHIFT;
+
+ __update_inv_weight(lw);
+
+ if (unlikely(fact >> 32)) {
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+ }
+
+ /* hint to use a 32x32->64 mul */
+ fact = (u64)(u32)fact * lw->inv_weight;
+
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+
+ return mul_u64_u32_shr(delta_exec, fact, shift);
+}
+
+
+const struct sched_class fair_sched_class;
+
+/**************************************************************
+ * CFS operations on generic schedulable entities:
+ */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se) (!se->my_q)
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!entity_is_task(se));
+#endif
+ return container_of(se, struct task_struct, se);
+}
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+ for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update);
+
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_rq->on_list) {
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases.
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ } else {
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ }
+
+ cfs_rq->on_list = 1;
+ /* We should have no load, but we need to update last_decay. */
+ update_cfs_rq_blocked_load(cfs_rq, 0);
+ }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->on_list) {
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ cfs_rq->on_list = 0;
+ }
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group ? */
+static inline struct cfs_rq *
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+ if (se->cfs_rq == pse->cfs_rq)
+ return se->cfs_rq;
+
+ return NULL;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return se->parent;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+ int se_depth, pse_depth;
+
+ /*
+ * preemption test can be made between sibling entities who are in the
+ * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+ * both tasks until we find their ancestors who are siblings of common
+ * parent.
+ */
+
+ /* First walk up until both entities are at same depth */
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;
+
+ while (se_depth > pse_depth) {
+ se_depth--;
+ *se = parent_entity(*se);
+ }
+
+ while (pse_depth > se_depth) {
+ pse_depth--;
+ *pse = parent_entity(*pse);
+ }
+
+ while (!is_same_group(*se, *pse)) {
+ *se = parent_entity(*se);
+ *pse = parent_entity(*pse);
+ }
+}
+
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ return container_of(se, struct task_struct, se);
+}
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+
+#define entity_is_task(se) 1
+
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+ return NULL;
+}
+
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+
+/**************************************************************
+ * Scheduling class tree data structure manipulation methods:
+ */
+
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - max_vruntime);
+ if (delta > 0)
+ max_vruntime = vruntime;
+
+ return max_vruntime;
+}
+
+static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta < 0)
+ min_vruntime = vruntime;
+
+ return min_vruntime;
+}
+
+static inline int entity_before(struct sched_entity *a,
+ struct sched_entity *b)
+{
+ return (s64)(a->vruntime - b->vruntime) < 0;
+}
+
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ if (cfs_rq->curr)
+ vruntime = cfs_rq->curr->vruntime;
+
+ if (cfs_rq->rb_leftmost) {
+ struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+ struct sched_entity,
+ run_node);
+
+ if (!cfs_rq->curr)
+ vruntime = se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, se->vruntime);
+ }
+
+ /* ensure we never gain time by being placed backwards. */
+ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+}
+
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_entity *entry;
+ int leftmost = 1;
+
+ /*
+ * Find the right place in the rbtree:
+ */
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_entity, run_node);
+ /*
+ * We dont care about collisions. Nodes with
+ * the same key stay together.
+ */
+ if (entity_before(se, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ /*
+ * Maintain a cache of leftmost tree entries (it is frequently
+ * used):
+ */
+ if (leftmost)
+ cfs_rq->rb_leftmost = &se->run_node;
+
+ rb_link_node(&se->run_node, parent, link);
+ rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->rb_leftmost == &se->run_node) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&se->run_node);
+ cfs_rq->rb_leftmost = next_node;
+ }
+
+ rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *left = cfs_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_entity, run_node);
+}
+
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+ struct rb_node *next = rb_next(&se->run_node);
+
+ if (!next)
+ return NULL;
+
+ return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+
+ if (!last)
+ return NULL;
+
+ return rb_entry(last, struct sched_entity, run_node);
+}
+
+/**************************************************************
+ * Scheduling class statistics methods:
+ */
+
+int sched_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ int factor = get_update_sysctl_factor();
+
+ if (ret || !write)
+ return ret;
+
+ sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+ sysctl_sched_min_granularity);
+
+#define WRT_SYSCTL(name) \
+ (normalized_sysctl_##name = sysctl_##name / (factor))
+ WRT_SYSCTL(sched_min_granularity);
+ WRT_SYSCTL(sched_latency);
+ WRT_SYSCTL(sched_wakeup_granularity);
+#undef WRT_SYSCTL
+
+ return 0;
+}
+#endif
+
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
+
+/*
+ * The idea is to set a period in which each task runs once.
+ *
+ * When there are too many tasks (sched_nr_latency) we have to stretch
+ * this period because otherwise the slices get too small.
+ *
+ * p = (nr <= nl) ? l : l*nr/nl
+ */
+static u64 __sched_period(unsigned long nr_running)
+{
+ u64 period = sysctl_sched_latency;
+ unsigned long nr_latency = sched_nr_latency;
+
+ if (unlikely(nr_running > nr_latency)) {
+ period = sysctl_sched_min_granularity;
+ period *= nr_running;
+ }
+
+ return period;
+}
+
+/*
+ * We calculate the wall-time slice from the period by taking a part
+ * proportional to the weight.
+ *
+ * s = p*P[w/rw]
+ */
+static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+
+ for_each_sched_entity(se) {
+ struct load_weight *load;
+ struct load_weight lw;
+
+ cfs_rq = cfs_rq_of(se);
+ load = &cfs_rq->load;
+
+ if (unlikely(!se->on_rq)) {
+ lw = cfs_rq->load;
+
+ update_load_add(&lw, se->load.weight);
+ load = &lw;
+ }
+ slice = __calc_delta(slice, se->load.weight, load);
+ }
+ return slice;
+}
+
+/*
+ * We calculate the vruntime slice of a to-be-inserted task.
+ *
+ * vs = s/w
+ */
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return calc_delta_fair(sched_slice(cfs_rq, se), se);
+}
+
+#ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
+static unsigned long task_h_load(struct task_struct *p);
+
+static inline void __update_task_entity_contrib(struct sched_entity *se);
+static inline void __update_task_entity_utilization(struct sched_entity *se);
+
+/* Give new task start runnable values to heavy its load in infant time */
+void init_task_runnable_average(struct task_struct *p)
+{
+ u32 slice;
+
+ slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
+ p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
+ p->se.avg.avg_period = slice;
+ __update_task_entity_contrib(&p->se);
+ __update_task_entity_utilization(&p->se);
+}
+#else
+void init_task_runnable_average(struct task_struct *p)
+{
+}
+#endif
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 now = rq_clock_task(rq_of(cfs_rq));
+ u64 delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ delta_exec = now - curr->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ curr->exec_start = now;
+
+ schedstat_set(curr->statistics.exec_max,
+ max(delta_exec, curr->statistics.exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(cfs_rq, exec_clock, delta_exec);
+
+ curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_min_vruntime(cfs_rq);
+
+ if (entity_is_task(curr)) {
+ struct task_struct *curtask = task_of(curr);
+
+ trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
+ cpuacct_charge(curtask, delta_exec);
+ account_group_exec_runtime(curtask, delta_exec);
+ }
+
+ account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static void update_curr_fair(struct rq *rq)
+{
+ update_curr(cfs_rq_of(&rq->curr->se));
+}
+
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+}
+
+/*
+ * Task is being enqueued - update stats:
+ */
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * Are we enqueueing a waiting task? (for current tasks
+ * a dequeue/enqueue event is a NOP)
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_start(cfs_rq, se);
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
+ schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+ schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
+#ifdef CONFIG_SCHEDSTATS
+ if (entity_is_task(se)) {
+ trace_sched_stat_wait(task_of(se),
+ rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
+ }
+#endif
+ schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * Mark the end of the wait period if dequeueing a
+ * waiting task:
+ */
+ if (se != cfs_rq->curr)
+ update_stats_wait_end(cfs_rq, se);
+}
+
+/*
+ * We are picking a new current task - update its stats:
+ */
+static inline void
+update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /*
+ * We are starting a new run period:
+ */
+ se->exec_start = rq_clock_task(rq_of(cfs_rq));
+}
+
+/**************************************************
+ * Scheduling class queueing methods:
+ */
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the tasks virtual memory size and
+ * numa_balancing_scan_size.
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+ unsigned long rss = 0;
+ unsigned long nr_scan_pages;
+
+ /*
+ * Calculations based on RSS as non-present and empty pages are skipped
+ * by the PTE scanner and NUMA hinting faults should be trapped based
+ * on resident pages
+ */
+ nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ rss = get_mm_rss(p->mm);
+ if (!rss)
+ rss = nr_scan_pages;
+
+ rss = round_up(rss, nr_scan_pages);
+ return rss / nr_scan_pages;
+}
+
+/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+ unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan, floor;
+ unsigned int windows = 1;
+
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
+ floor = 1000 / windows;
+
+ scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+ return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+ unsigned int smin = task_scan_min(p);
+ unsigned int smax;
+
+ /* Watch for min being lower than max due to floor calculations */
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+ return max(smin, smax);
+}
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+
+ struct rcu_head rcu;
+ nodemask_t active_nodes;
+ unsigned long total_faults;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+ return p->numa_group ? p->numa_group->gid : 0;
+}
+
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
+{
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_faults)
+ return 0;
+
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_group)
+ return 0;
+
+ return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
+{
+ unsigned long faults, total_faults;
+
+ if (!p->numa_faults)
+ return 0;
+
+ total_faults = p->total_numa_faults;
+
+ if (!total_faults)
+ return 0;
+
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
+{
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
+ return 0;
+
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
+ return 0;
+
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+ int src_nid, int dst_cpu)
+{
+ struct numa_group *ng = p->numa_group;
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+ this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By using
+ * a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+ * a task's usage of a particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and getting the
+ * same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely we
+ * act on an unlikely task<->page relation.
+ */
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) &&
+ cpupid_to_nid(last_cpupid) != dst_nid)
+ return false;
+
+ /* Always allow migrate on private faults */
+ if (cpupid_match_pid(p, last_cpupid))
+ return true;
+
+ /* A shared fault, but p->numa_group has not been set up yet. */
+ if (!ng)
+ return true;
+
+ /*
+ * Do not migrate if the destination is not a node that
+ * is actively used by this numa group.
+ */
+ if (!node_isset(dst_nid, ng->active_nodes))
+ return false;
+
+ /*
+ * Source is a node that is not actively used by this
+ * numa group, while the destination is. Migrate.
+ */
+ if (!node_isset(src_nid, ng->active_nodes))
+ return true;
+
+ /*
+ * Both source and destination are nodes in active
+ * use by this numa group. Maximize memory bandwidth
+ * by migrating from more heavily used groups, to less
+ * heavily used ones, spreading the load around.
+ * Use a 1/4 hysteresis to avoid spurious page movement.
+ */
+ return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long capacity_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+ unsigned long nr_running;
+ unsigned long load;
+
+ /* Total compute capacity of CPUs on a node */
+ unsigned long compute_capacity;
+
+ /* Approximate capacity in terms of runnable tasks on a node */
+ unsigned long task_capacity;
+ int has_free_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+ int smt, cpu, cpus = 0;
+ unsigned long capacity;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->nr_running += rq->nr_running;
+ ns->load += weighted_cpuload(cpu);
+ ns->compute_capacity += capacity_of(cpu);
+
+ cpus++;
+ }
+
+ /*
+ * If we raced with hotplug and there are no CPUs left in our mask
+ * the @ns structure is NULL'ed and task_numa_compare() will
+ * not find this node attractive.
+ *
+ * We'll either bail at !has_free_capacity, or we'll detect a huge
+ * imbalance and bail there.
+ */
+ if (!cpus)
+ return;
+
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+ capacity = cpus / smt; /* cores */
+
+ ns->task_capacity = min_t(unsigned, capacity,
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
+ ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
+}
+
+struct task_numa_env {
+ struct task_struct *p;
+
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
+
+ struct numa_stats src_stats, dst_stats;
+
+ int imbalance_pct;
+ int dist;
+
+ struct task_struct *best_task;
+ long best_imp;
+ int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
+{
+ if (env->best_task)
+ put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
+
+ env->best_task = p;
+ env->best_imp = imp;
+ env->best_cpu = env->dst_cpu;
+}
+
+static bool load_too_imbalanced(long src_load, long dst_load,
+ struct task_numa_env *env)
+{
+ long src_capacity, dst_capacity;
+ long orig_src_load;
+ long load_a, load_b;
+ long moved_load;
+ long imb;
+
+ /*
+ * The load is corrected for the CPU capacity available on each node.
+ *
+ * src_load dst_load
+ * ------------ vs ---------
+ * src_capacity dst_capacity
+ */
+ src_capacity = env->src_stats.compute_capacity;
+ dst_capacity = env->dst_stats.compute_capacity;
+
+ /* We care about the slope of the imbalance, not the direction. */
+ load_a = dst_load;
+ load_b = src_load;
+ if (load_a < load_b)
+ swap(load_a, load_b);
+
+ /* Is the difference below the threshold? */
+ imb = load_a * src_capacity * 100 -
+ load_b * dst_capacity * env->imbalance_pct;
+ if (imb <= 0)
+ return false;
+
+ /*
+ * The imbalance is above the allowed threshold.
+ * Allow a move that brings us closer to a balanced situation,
+ * without moving things past the point of balance.
+ */
+ orig_src_load = env->src_stats.load;
+
+ /*
+ * In a task swap, there will be one load moving from src to dst,
+ * and another moving back. This is the net sum of both moves.
+ * A simple task move will always have a positive value.
+ * Allow the move if it brings the system closer to a balanced
+ * situation, without crossing over the balance point.
+ */
+ moved_load = orig_src_load - src_load;
+
+ if (moved_load > 0)
+ /* Moving src -> dst. Did we overshoot balance? */
+ return src_load * dst_capacity < dst_load * src_capacity;
+ else
+ /* Moving dst -> src. Did we overshoot balance? */
+ return dst_load * src_capacity < src_load * dst_capacity;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source tasks was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ struct rq *src_rq = cpu_rq(env->src_cpu);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ struct task_struct *cur;
+ long src_load, dst_load;
+ long load;
+ long imp = env->p->numa_group ? groupimp : taskimp;
+ long moveimp = imp;
+ int dist = env->dist;
+
+ rcu_read_lock();
+
+ raw_spin_lock_irq(&dst_rq->lock);
+ cur = dst_rq->curr;
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ cur = NULL;
+ raw_spin_unlock_irq(&dst_rq->lock);
+
+ /*
+ * Because we have preemption enabled we can get migrated around and
+ * end try selecting ourselves (current == env->p) as a swap candidate.
+ */
+ if (cur == env->p)
+ goto unlock;
+
+ /*
+ * "imp" is the fault differential for the source task between the
+ * source and destination node. Calculate the total differential for
+ * the source task and potential destination task. The more negative
+ * the value is, the more rmeote accesses that would be expected to
+ * be incurred if the tasks were swapped.
+ */
+ if (cur) {
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ goto unlock;
+
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+ if (cur->numa_group)
+ imp -= imp/16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by
+ * itself (not part of a group), use the task weight
+ * instead.
+ */
+ if (cur->numa_group)
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
+ else
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
+ }
+ }
+
+ if (imp <= env->best_imp && moveimp <= env->best_imp)
+ goto unlock;
+
+ if (!cur) {
+ /* Is there capacity at our destination? */
+ if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
+ !env->dst_stats.has_free_capacity)
+ goto unlock;
+
+ goto balance;
+ }
+
+ /* Balance doesn't matter much if we're running a task per cpu */
+ if (imp > env->best_imp && src_rq->nr_running == 1 &&
+ dst_rq->nr_running == 1)
+ goto assign;
+
+ /*
+ * In the overloaded case, try and keep the load balanced.
+ */
+balance:
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ if (moveimp > imp && moveimp > env->best_imp) {
+ /*
+ * If the improvement from just moving env->p direction is
+ * better than swapping tasks around, check if a move is
+ * possible. Store a slightly smaller score than moveimp,
+ * so an actually idle CPU will win.
+ */
+ if (!load_too_imbalanced(src_load, dst_load, env)) {
+ imp = moveimp - 1;
+ cur = NULL;
+ goto assign;
+ }
+ }
+
+ if (imp <= env->best_imp)
+ goto unlock;
+
+ if (cur) {
+ load = task_h_load(cur);
+ dst_load -= load;
+ src_load += load;
+ }
+
+ if (load_too_imbalanced(src_load, dst_load, env))
+ goto unlock;
+
+ /*
+ * One idle CPU per node is evaluated for a task numa move.
+ * Call select_idle_sibling to maybe find a better one.
+ */
+ if (!cur)
+ env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
+assign:
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ continue;
+
+ env->dst_cpu = cpu;
+ task_numa_compare(env, taskimp, groupimp);
+ }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ struct task_numa_env env = {
+ .p = p,
+
+ .src_cpu = task_cpu(p),
+ .src_nid = task_node(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1
+ };
+ struct sched_domain *sd;
+ unsigned long taskweight, groupweight;
+ int nid, ret, dist;
+ long taskimp, groupimp;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * And we want to avoid any moving of tasks about, as that would create
+ * random movement of tasks -- counter the numa conditions we're trying
+ * to satisfy here.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ if (sd)
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ rcu_read_unlock();
+
+ /*
+ * Cpusets can break the scheduler domain tree into smaller
+ * balance domains, some of which do not cross NUMA boundaries.
+ * Tasks that are "trapped" in such domains cannot be migrated
+ * elsewhere, so there is no point in (re)trying.
+ */
+ if (unlikely(!sd)) {
+ p->numa_preferred_nid = task_node(p);
+ return -EINVAL;
+ }
+
+ env.dst_nid = p->numa_preferred_nid;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+
+ /* Try to find a spot on the preferred nid. */
+ task_numa_find_cpu(&env, taskimp, groupimp);
+
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ if (env.best_cpu == -1 || (p->numa_group &&
+ nodes_weight(p->numa_group->active_nodes) > 1)) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
+ /* Only consider nodes where both task and groups benefit */
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
+ continue;
+
+ env.dist = dist;
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ task_numa_find_cpu(&env, taskimp, groupimp);
+ }
+ }
+
+ /*
+ * If the task is part of a workload that spans multiple NUMA nodes,
+ * and is migrating into one of the workload's active nodes, remember
+ * this node as the task's preferred numa node, so the workload can
+ * settle down.
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+ if (p->numa_group) {
+ if (env.best_cpu == -1)
+ nid = env.src_nid;
+ else
+ nid = env.dst_nid;
+
+ if (node_isset(nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
+ /*
+ * Reset the scan period if the task is being rescheduled on an
+ * alternative node to recheck if the tasks is now properly placed.
+ */
+ p->numa_scan_period = task_scan_min(p);
+
+ if (env.best_task == NULL) {
+ ret = migrate_task_to(p, env.best_cpu);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ return ret;
+ }
+
+ ret = migrate_swap(p, env.best_task);
+ if (ret != 0)
+ trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ put_task_struct(env.best_task);
+ return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+ unsigned long interval = HZ;
+
+ /* This task has no NUMA fault statistics yet */
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ return;
+
+ /* Periodically retry migrating the task to the preferred node */
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ p->numa_migrate_retry = jiffies + interval;
+
+ /* Success if task is already running on preferred CPU */
+ if (task_node(p) == p->numa_preferred_nid)
+ return;
+
+ /* Otherwise, try migrate to a CPU on the preferred node */
+ task_numa_migrate(p);
+}
+
+/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations, To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+ unsigned long faults, max_faults = 0;
+ int nid;
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults > max_faults)
+ max_faults = faults;
+ }
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (!node_isset(nid, numa_group->active_nodes)) {
+ if (faults > max_faults * 6 / 16)
+ node_set(nid, numa_group->active_nodes);
+ } else if (faults < max_faults * 3 / 16)
+ node_clear(nid, numa_group->active_nodes);
+ }
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 7
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no record hinting faults then either the task is
+ * completely idle or all activity is areas that are not of interest
+ * to automatic numa balancing. Related to that, if there were failed
+ * migration then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, scan slower
+ */
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period << 1);
+
+ p->mm->numa_next_scan = jiffies +
+ msecs_to_jiffies(p->numa_scan_period);
+
+ return;
+ }
+
+ /*
+ * Prepare to scale scan period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+ */
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+ ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ if (ratio >= NUMA_PERIOD_THRESHOLD) {
+ int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+ /*
+ * Scale scan rate increases based on sharing. There is an
+ * inverse relationship between the degree of sharing and
+ * the adjustment made to the scanning period. Broadly
+ * speaking the intent is that there is little point
+ * scanning faster if shared accesses dominate as it may
+ * simply bounce migrations uselessly
+ */
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
+ diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ }
+
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
+ task_scan_min(p), task_scan_max(p));
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+ u64 runtime, delta, now;
+ /* Use the start of this time slice to avoid calculations. */
+ now = p->se.exec_start;
+ runtime = p->se.sum_exec_runtime;
+
+ if (p->last_task_numa_placement) {
+ delta = runtime - p->last_sum_exec_runtime;
+ *period = now - p->last_task_numa_placement;
+ } else {
+ delta = p->se.avg.runnable_avg_sum;
+ *period = p->se.avg.avg_period;
+ }
+
+ p->last_sum_exec_runtime = runtime;
+ p->last_task_numa_placement = now;
+
+ return delta;
+}
+
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group = NODE_MASK_NONE;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ if (!max_faults)
+ break;
+ nodes = max_group;
+ }
+ return nid;
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq, nid, max_nid = -1, max_group_nid = -1;
+ unsigned long max_faults = 0, max_group_faults = 0;
+ unsigned long fault_types[2] = { 0, 0 };
+ unsigned long total_faults;
+ u64 runtime, period;
+ spinlock_t *group_lock = NULL;
+
+ seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
+
+ total_faults = p->numa_faults_locality[0] +
+ p->numa_faults_locality[1];
+ runtime = numa_get_avg_runtime(p, &period);
+
+ /* If the task is part of a group prevent parallel updates to group stats */
+ if (p->numa_group) {
+ group_lock = &p->numa_group->lock;
+ spin_lock_irq(group_lock);
+ }
+
+ /* Find the node with the highest number of faults */
+ for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
+ unsigned long faults = 0, group_faults = 0;
+ int priv;
+
+ for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+ long diff, f_diff, f_weight;
+
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
+
+ /* Decay existing window, copy faults since last scan */
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;
+
+ /*
+ * Normalize the faults_from, so all tasks in a group
+ * count according to CPU use, instead of by the raw
+ * number of faults. Tasks with little runtime have
+ * little over-all impact on throughput, and thus their
+ * faults are less important.
+ */
+ f_weight = div64_u64(runtime << 16, period + 1);
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
+ (total_faults + 1);
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;
+
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
+ p->total_numa_faults += diff;
+ if (p->numa_group) {
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx represents the offset for a given
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+ p->numa_group->faults[mem_idx] += diff;
+ p->numa_group->faults_cpu[mem_idx] += f_diff;
+ p->numa_group->total_faults += diff;
+ group_faults += p->numa_group->faults[mem_idx];
+ }
+ }
+
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+ }
+
+ if (group_faults > max_group_faults) {
+ max_group_faults = group_faults;
+ max_group_nid = nid;
+ }
+ }
+
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
+
+ if (p->numa_group) {
+ update_numa_active_node_mask(p->numa_group);
+ spin_unlock_irq(group_lock);
+ max_nid = preferred_group_nid(p, max_group_nid);
+ }
+
+ if (max_faults) {
+ /* Set the new preferred node */
+ if (max_nid != p->numa_preferred_nid)
+ sched_setnuma(p, max_nid);
+
+ if (task_node(p) != p->numa_preferred_nid)
+ numa_migrate_preferred(p);
+ }
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+ return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+ if (atomic_dec_and_test(&grp->refcount))
+ kfree_rcu(grp, rcu);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int *priv)
+{
+ struct numa_group *grp, *my_grp;
+ struct task_struct *tsk;
+ bool join = false;
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+ if (unlikely(!p->numa_group)) {
+ unsigned int size = sizeof(struct numa_group) +
+ 4*nr_node_ids*sizeof(unsigned long);
+
+ grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!grp)
+ return;
+
+ atomic_set(&grp->refcount, 1);
+ spin_lock_init(&grp->lock);
+ grp->gid = p->pid;
+ /* Second half of the array tracks nids where faults happen */
+ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+ nr_node_ids;
+
+ node_set(task_node(current), grp->active_nodes);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] = p->numa_faults[i];
+
+ grp->total_faults = p->total_numa_faults;
+
+ grp->nr_tasks++;
+ rcu_assign_pointer(p->numa_group, grp);
+ }
+
+ rcu_read_lock();
+ tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+ if (!cpupid_match_pid(tsk, cpupid))
+ goto no_join;
+
+ grp = rcu_dereference(tsk->numa_group);
+ if (!grp)
+ goto no_join;
+
+ my_grp = p->numa_group;
+ if (grp == my_grp)
+ goto no_join;
+
+ /*
+ * Only join the other group if its bigger; if we're the bigger group,
+ * the other task will join us.
+ */
+ if (my_grp->nr_tasks > grp->nr_tasks)
+ goto no_join;
+
+ /*
+ * Tie-break on the grp address.
+ */
+ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+ goto no_join;
+
+ /* Always join threads in the same process. */
+ if (tsk->mm == current->mm)
+ join = true;
+
+ /* Simple filter to avoid false positives due to PID collisions */
+ if (flags & TNF_SHARED)
+ join = true;
+
+ /* Update priv based on whether false sharing was detected */
+ *priv = !join;
+
+ if (join && !get_numa_group(grp))
+ goto no_join;
+
+ rcu_read_unlock();
+
+ if (!join)
+ return;
+
+ BUG_ON(irqs_disabled());
+ double_lock_irq(&my_grp->lock, &grp->lock);
+
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
+ }
+ my_grp->total_faults -= p->total_numa_faults;
+ grp->total_faults += p->total_numa_faults;
+
+ my_grp->nr_tasks--;
+ grp->nr_tasks++;
+
+ spin_unlock(&my_grp->lock);
+ spin_unlock_irq(&grp->lock);
+
+ rcu_assign_pointer(p->numa_group, grp);
+
+ put_numa_group(my_grp);
+ return;
+
+no_join:
+ rcu_read_unlock();
+ return;
+}
+
+void task_numa_free(struct task_struct *p)
+{
+ struct numa_group *grp = p->numa_group;
+ void *numa_faults = p->numa_faults;
+ unsigned long flags;
+ int i;
+
+ if (grp) {
+ spin_lock_irqsave(&grp->lock, flags);
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] -= p->numa_faults[i];
+ grp->total_faults -= p->total_numa_faults;
+
+ grp->nr_tasks--;
+ spin_unlock_irqrestore(&grp->lock, flags);
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ put_numa_group(grp);
+ }
+
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
+{
+ struct task_struct *p = current;
+ bool migrated = flags & TNF_MIGRATED;
+ int cpu_node = task_node(current);
+ int local = !!(flags & TNF_FAULT_LOCAL);
+ int priv;
+
+ if (!numabalancing_enabled)
+ return;
+
+ /* for example, ksmd faulting in a user's mm */
+ if (!p->mm)
+ return;
+
+ /* Allocate buffer to track faults on a per-node basis */
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
+ NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
+
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
+ return;
+
+ p->total_numa_faults = 0;
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+ }
+
+ /*
+ * First accesses are treated as private, otherwise consider accesses
+ * to be private if the accessing pid has not changed
+ */
+ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+ priv = 1;
+ } else {
+ priv = cpupid_match_pid(p, last_cpupid);
+ if (!priv && !(flags & TNF_NO_GROUP))
+ task_numa_group(p, last_cpupid, flags, &priv);
+ }
+
+ /*
+ * If a workload spans multiple NUMA nodes, a shared fault that
+ * occurs wholly within the set of nodes that the workload is
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+ if (!priv && !local && p->numa_group &&
+ node_isset(cpu_node, p->numa_group->active_nodes) &&
+ node_isset(mem_node, p->numa_group->active_nodes))
+ local = 1;
+
+ task_numa_placement(p);
+
+ /*
+ * Retry task to preferred node migration periodically, in case it
+ * case it previously failed, or the scheduler moved us.
+ */
+ if (time_after(jiffies, p->numa_migrate_retry))
+ numa_migrate_preferred(p);
+
+ if (migrated)
+ p->numa_pages_migrated += pages;
+ if (flags & TNF_MIGRATE_FAIL)
+ p->numa_faults_locality[2] += pages;
+
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
+ p->numa_faults_locality[local] += pages;
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ unsigned long nr_pte_updates = 0;
+ long pages;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ if (!mm->numa_next_scan) {
+ mm->numa_next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ }
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0) {
+ p->numa_scan_period_max = task_scan_max(p);
+ p->numa_scan_period = task_scan_min(p);
+ }
+
+ next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ /*
+ * Delay this task enough that another task of this mm will likely win
+ * the next time around.
+ */
+ p->node_stamp += 2 * TICK_NSEC;
+
+ start = mm->numa_scan_offset;
+ pages = sysctl_numa_balancing_scan_size;
+ pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ if (!pages)
+ return;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ start = 0;
+ vma = mm->mmap;
+ }
+ for (; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+ is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+ continue;
+ }
+
+ /*
+ * Shared library pages mapped by multiple processes are not
+ * migrated as it is expected they are cache replicated. Avoid
+ * hinting faults in read-only file-backed mappings or the vdso
+ * as migrating the pages will be of marginal benefit.
+ */
+ if (!vma->vm_mm ||
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ continue;
+
+ /*
+ * Skip inaccessible VMAs to avoid any confusion between
+ * PROT_NONE and NUMA hinting ptes
+ */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ continue;
+
+ do {
+ start = max(start, vma->vm_start);
+ end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+ end = min(end, vma->vm_end);
+ nr_pte_updates += change_prot_numa(vma, start, end);
+
+ /*
+ * Scan sysctl_numa_balancing_scan_size but ensure that
+ * at least one PTE is updated so that unused virtual
+ * address space is quickly skipped.
+ */
+ if (nr_pte_updates)
+ pages -= (end - start) >> PAGE_SHIFT;
+
+ start = end;
+ if (pages <= 0)
+ goto out;
+
+ cond_resched();
+ } while (end != vma->vm_end);
+ }
+
+out:
+ /*
+ * It is possible to reach the end of the VMA list but the last few
+ * VMAs are not guaranteed to the vma_migratable. If they are not, we
+ * would find the !migratable VMA on the next scan but not reset the
+ * scanner to the start so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = start;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ if (!curr->node_stamp)
+ curr->numa_scan_period = task_scan_min(curr);
+ curr->node_stamp += period;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
+ }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_add(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
+ cfs_rq->nr_running++;
+}
+
+static void
+account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ update_load_sub(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
+ list_del_init(&se->group_node);
+ }
+ cfs_rq->nr_running--;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+ long tg_weight;
+
+ /*
+ * Use this CPU's actual weight instead of the last load_contribution
+ * to gain a more accurate current total weight. See
+ * update_cfs_rq_load_contribution().
+ */
+ tg_weight = atomic_long_read(&tg->load_avg);
+ tg_weight -= cfs_rq->tg_load_contrib;
+ tg_weight += cfs_rq->load.weight;
+
+ return tg_weight;
+}
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ long tg_weight, load, shares;
+
+ tg_weight = calc_tg_weight(tg, cfs_rq);
+ load = cfs_rq->load.weight;
+
+ shares = (tg->shares * load);
+ if (tg_weight)
+ shares /= tg_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ return shares;
+}
+# else /* CONFIG_SMP */
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+ return tg->shares;
+}
+# endif /* CONFIG_SMP */
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ }
+
+ update_load_set(&se->load, weight);
+
+ if (se->on_rq)
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg;
+ struct sched_entity *se;
+ long shares;
+
+ tg = cfs_rq->tg;
+ se = tg->se[cpu_of(rq_of(cfs_rq))];
+ if (!se || throttled_hierarchy(cfs_rq))
+ return;
+#ifndef CONFIG_SMP
+ if (likely(se->load.weight == tg->shares))
+ return;
+#endif
+ shares = calc_cfs_shares(cfs_rq, tg);
+
+ reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_SMP
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+ 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+ 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+ 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+ unsigned int local_n;
+
+ if (!n)
+ return val;
+ else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val *= runnable_avg_yN_inv[local_n];
+ /* We don't use SRR here since we always want to round down. */
+ return val >> 32;
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(u64 n)
+{
+ u32 contrib = 0;
+
+ if (likely(n <= LOAD_AVG_PERIOD))
+ return runnable_avg_yN_sum[n];
+ else if (unlikely(n >= LOAD_AVG_MAX_N))
+ return LOAD_AVG_MAX;
+
+ /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+ do {
+ contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+ contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+ n -= LOAD_AVG_PERIOD;
+ } while (n > LOAD_AVG_PERIOD);
+
+ contrib = decay_load(contrib, n);
+ return contrib + runnable_avg_yN_sum[n];
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
+ struct sched_avg *sa,
+ int runnable,
+ int running)
+{
+ u64 delta, periods;
+ u32 runnable_contrib;
+ int delta_w, decayed = 0;
+ unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+
+ delta = now - sa->last_runnable_update;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_runnable_update = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_runnable_update = now;
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->avg_period % 1024;
+ if (delta + delta_w >= 1024) {
+ /* period roll-over */
+ decayed = 1;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ if (runnable)
+ sa->runnable_avg_sum += delta_w;
+ if (running)
+ sa->running_avg_sum += delta_w * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta_w;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+ periods + 1);
+ sa->running_avg_sum = decay_load(sa->running_avg_sum,
+ periods + 1);
+ sa->avg_period = decay_load(sa->avg_period,
+ periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ runnable_contrib = __compute_runnable_contrib(periods);
+ if (runnable)
+ sa->runnable_avg_sum += runnable_contrib;
+ if (running)
+ sa->running_avg_sum += runnable_contrib * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += runnable_contrib;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ if (runnable)
+ sa->runnable_avg_sum += delta;
+ if (running)
+ sa->running_avg_sum += delta * scale_freq
+ >> SCHED_CAPACITY_SHIFT;
+ sa->avg_period += delta;
+
+ return decayed;
+}
+
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+ decays -= se->avg.decay_count;
+ se->avg.decay_count = 0;
+ if (!decays)
+ return 0;
+
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ se->avg.utilization_avg_contrib =
+ decay_load(se->avg.utilization_avg_contrib, decays);
+
+ return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long tg_contrib;
+
+ tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+ tg_contrib -= cfs_rq->tg_load_contrib;
+
+ if (!tg_contrib)
+ return;
+
+ if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+ atomic_long_add(tg_contrib, &tg->load_avg);
+ cfs_rq->tg_load_contrib += tg_contrib;
+ }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long contrib;
+
+ /* The fraction of a cpu used by this cfs_rq */
+ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
+ sa->avg_period + 1);
+ contrib -= cfs_rq->tg_runnable_contrib;
+
+ if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+ atomic_add(contrib, &tg->runnable_avg);
+ cfs_rq->tg_runnable_contrib += contrib;
+ }
+}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+ struct task_group *tg = cfs_rq->tg;
+ int runnable_avg;
+
+ u64 contrib;
+
+ contrib = cfs_rq->tg_load_contrib * tg->shares;
+ se->avg.load_avg_contrib = div_u64(contrib,
+ atomic_long_read(&tg->load_avg) + 1);
+
+ /*
+ * For group entities we need to compute a correction term in the case
+ * that they are consuming <1 cpu so that we would contribute the same
+ * load as a task of equal weight.
+ *
+ * Explicitly co-ordinating this measurement would be expensive, but
+ * fortunately the sum of each cpus contribution forms a usable
+ * lower-bound on the true value.
+ *
+ * Consider the aggregate of 2 contributions. Either they are disjoint
+ * (and the sum represents true value) or they are disjoint and we are
+ * understating by the aggregate of their overlap.
+ *
+ * Extending this to N cpus, for a given overlap, the maximum amount we
+ * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
+ * cpus that overlap for this interval and w_i is the interval width.
+ *
+ * On a small machine; the first term is well-bounded which bounds the
+ * total error since w_i is a subset of the period. Whereas on a
+ * larger machine, while this first term can be larger, if w_i is the
+ * of consequential size guaranteed to see n_i*w_i quickly converge to
+ * our upper bound of 1-cpu.
+ */
+ runnable_avg = atomic_read(&tg->runnable_avg);
+ if (runnable_avg < NICE_0_LOAD) {
+ se->avg.load_avg_contrib *= runnable_avg;
+ se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+ }
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
+ runnable, runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+ int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+ struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+ contrib /= (se->avg.avg_period + 1);
+ se->avg.load_avg_contrib = scale_load(contrib);
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.load_avg_contrib;
+
+ if (entity_is_task(se)) {
+ __update_task_entity_contrib(se);
+ } else {
+ __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+ __update_group_entity_contrib(se);
+ }
+
+ return se->avg.load_avg_contrib - old_contrib;
+}
+
+
+static inline void __update_task_entity_utilization(struct sched_entity *se)
+{
+ u32 contrib;
+
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+ contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
+ contrib /= (se->avg.avg_period + 1);
+ se->avg.utilization_avg_contrib = scale_load(contrib);
+}
+
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+{
+ long old_contrib = se->avg.utilization_avg_contrib;
+
+ if (entity_is_task(se))
+ __update_task_entity_utilization(se);
+ else
+ se->avg.utilization_avg_contrib =
+ group_cfs_rq(se)->utilization_load_avg;
+
+ return se->avg.utilization_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ long load_contrib)
+{
+ if (likely(load_contrib < cfs_rq->blocked_load_avg))
+ cfs_rq->blocked_load_avg -= load_contrib;
+ else
+ cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ long contrib_delta, utilization_delta;
+ int cpu = cpu_of(rq_of(cfs_rq));
+ u64 now;
+
+ /*
+ * For a group entity we need to use their owned cfs_rq_clock_task() in
+ * case they are the parent of a throttled hierarchy.
+ */
+ if (entity_is_task(se))
+ now = cfs_rq_clock_task(cfs_rq);
+ else
+ now = cfs_rq_clock_task(group_cfs_rq(se));
+
+ if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
+ cfs_rq->curr == se))
+ return;
+
+ contrib_delta = __update_entity_load_avg_contrib(se);
+ utilization_delta = __update_entity_utilization_avg_contrib(se);
+
+ if (!update_cfs_rq)
+ return;
+
+ if (se->on_rq) {
+ cfs_rq->runnable_load_avg += contrib_delta;
+ cfs_rq->utilization_load_avg += utilization_delta;
+ } else {
+ subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ }
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
+ u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+ u64 decays;
+
+ decays = now - cfs_rq->last_decay;
+ if (!decays && !force_update)
+ return;
+
+ if (atomic_long_read(&cfs_rq->removed_load)) {
+ unsigned long removed_load;
+ removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
+ subtract_blocked_load_contrib(cfs_rq, removed_load);
+ }
+
+ if (decays) {
+ cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+ atomic64_add(decays, &cfs_rq->decay_counter);
+ cfs_rq->last_decay = now;
+ }
+
+ __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup)
+{
+ /*
+ * We track migrations using entity decay_count <= 0, on a wake-up
+ * migration we use a negative decay count to track the remote decays
+ * accumulated while sleeping.
+ *
+ * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
+ * are seen by enqueue_entity_load_avg() as a migration with an already
+ * constructed load_avg_contrib.
+ */
+ if (unlikely(se->avg.decay_count <= 0)) {
+ se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
+ if (se->avg.decay_count) {
+ /*
+ * In a wake-up migration we have to approximate the
+ * time sleeping. This is because we can't synchronize
+ * clock_task between the two cpus, and it is not
+ * guaranteed to be read-safe. Instead, we can
+ * approximate this using our carried decays, which are
+ * explicitly atomically readable.
+ */
+ se->avg.last_runnable_update -= (-se->avg.decay_count)
+ << 20;
+ update_entity_load_avg(se, 0);
+ /* Indicate that we're now synchronized and on-rq */
+ se->avg.decay_count = 0;
+ }
+ wakeup = 0;
+ } else {
+ __synchronize_entity_decay(se);
+ }
+
+ /* migrated tasks did not contribute to our blocked load */
+ if (wakeup) {
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ update_entity_load_avg(se, 0);
+ }
+
+ cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep)
+{
+ update_entity_load_avg(se, 1);
+ /* we force update consideration on load-balancer moves */
+ update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+ cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
+ if (sleep) {
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
+static int idle_balance(struct rq *this_rq);
+
+#else /* CONFIG_SMP */
+
+static inline void update_entity_load_avg(struct sched_entity *se,
+ int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se,
+ int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+ int force_update) {}
+
+static inline int idle_balance(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHEDSTATS
+ struct task_struct *tsk = NULL;
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
+ if (se->statistics.sleep_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > se->statistics.sleep_max))
+ se->statistics.sleep_max = delta;
+
+ se->statistics.sleep_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
+
+ if (tsk) {
+ account_scheduler_latency(tsk, delta >> 10, 1);
+ trace_sched_stat_sleep(tsk, delta);
+ }
+ }
+ if (se->statistics.block_start) {
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > se->statistics.block_max))
+ se->statistics.block_max = delta;
+
+ se->statistics.block_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
+
+ if (tsk) {
+ if (tsk->in_iowait) {
+ se->statistics.iowait_sum += delta;
+ se->statistics.iowait_count++;
+ trace_sched_stat_iowait(tsk, delta);
+ }
+
+ trace_sched_stat_blocked(tsk, delta);
+
+ /*
+ * Blocking time is in units of nanosecs, so shift by
+ * 20 to get a milliseconds-range estimation of the
+ * amount of time that the task spent sleeping:
+ */
+ if (unlikely(prof_on == SLEEP_PROFILING)) {
+ profile_hits(SLEEP_PROFILING,
+ (void *)get_wchan(tsk),
+ delta >> 20);
+ }
+ account_scheduler_latency(tsk, delta >> 10, 0);
+ }
+ }
+#endif
+}
+
+static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ s64 d = se->vruntime - cfs_rq->min_vruntime;
+
+ if (d < 0)
+ d = -d;
+
+ if (d > 3*sysctl_sched_latency)
+ schedstat_inc(cfs_rq, nr_spread_over);
+#endif
+}
+
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+{
+ u64 vruntime = cfs_rq->min_vruntime;
+
+ /*
+ * The 'current' period is already promised to the current tasks,
+ * however the extra weight of the new task will slow them down a
+ * little, place the new task so that it fits in the slot that
+ * stays open at the end.
+ */
+ if (initial && sched_feat(START_DEBIT))
+ vruntime += sched_vslice(cfs_rq, se);
+
+ /* sleeps up to a single latency don't count. */
+ if (!initial) {
+ unsigned long thresh = sysctl_sched_latency;
+
+ /*
+ * Halve their sleep time's effect, to allow
+ * for a gentler effect of sleepers:
+ */
+ if (sched_feat(GENTLE_FAIR_SLEEPERS))
+ thresh >>= 1;
+
+ vruntime -= thresh;
+ }
+
+ /* ensure we never gain time by being placed backwards. */
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
+}
+
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+ /*
+ * Update the normalized vruntime before updating min_vruntime
+ * through calling update_curr().
+ */
+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+ se->vruntime += cfs_rq->min_vruntime;
+
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(cfs_rq);
+
+ if (flags & ENQUEUE_WAKEUP) {
+ place_entity(cfs_rq, se, 0);
+ enqueue_sleeper(cfs_rq, se);
+ }
+
+ update_stats_enqueue(cfs_rq, se);
+ check_spread(cfs_rq, se);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
+
+ if (cfs_rq->nr_running == 1) {
+ list_add_leaf_cfs_rq(cfs_rq);
+ check_enqueue_throttle(cfs_rq);
+ }
+}
+
+static void __clear_buddies_last(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->last != se)
+ break;
+
+ cfs_rq->last = NULL;
+ }
+}
+
+static void __clear_buddies_next(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->next != se)
+ break;
+
+ cfs_rq->next = NULL;
+ }
+}
+
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->skip != se)
+ break;
+
+ cfs_rq->skip = NULL;
+ }
+}
+
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->last == se)
+ __clear_buddies_last(se);
+
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+
+ if (cfs_rq->skip == se)
+ __clear_buddies_skip(se);
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
+
+ update_stats_dequeue(cfs_rq, se);
+ if (flags & DEQUEUE_SLEEP) {
+#ifdef CONFIG_SCHEDSTATS
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ }
+#endif
+ }
+
+ clear_buddies(cfs_rq, se);
+
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ account_entity_dequeue(cfs_rq, se);
+
+ /*
+ * Normalize the entity after updating the min_vruntime because the
+ * update can refer to the ->curr item and we need to reflect this
+ * movement in our normalized position.
+ */
+ if (!(flags & DEQUEUE_SLEEP))
+ se->vruntime -= cfs_rq->min_vruntime;
+
+ /* return excess runtime on last dequeue */
+ return_cfs_rq_runtime(cfs_rq);
+
+ update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq);
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void
+check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ unsigned long ideal_runtime, delta_exec;
+ struct sched_entity *se;
+ s64 delta;
+
+ ideal_runtime = sched_slice(cfs_rq, curr);
+ delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ if (delta_exec > ideal_runtime) {
+ resched_curr(rq_of(cfs_rq));
+ /*
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
+ */
+ clear_buddies(cfs_rq, curr);
+ return;
+ }
+
+ /*
+ * Ensure that a task that missed wakeup preemption by a
+ * narrow margin doesn't have to wait for a full slice.
+ * This also mitigates buddy induced latencies under load.
+ */
+ if (delta_exec < sysctl_sched_min_granularity)
+ return;
+
+ se = __pick_first_entity(cfs_rq);
+ delta = curr->vruntime - se->vruntime;
+
+ if (delta < 0)
+ return;
+
+ if (delta > ideal_runtime)
+ resched_curr(rq_of(cfs_rq));
+}
+
+static void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ /* 'current' is not kept within the tree. */
+ if (se->on_rq) {
+ /*
+ * Any task has to be enqueued before it get to execute on
+ * a CPU. So account for the time it spent waiting on the
+ * runqueue.
+ */
+ update_stats_wait_end(cfs_rq, se);
+ __dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
+ }
+
+ update_stats_curr_start(cfs_rq, se);
+ cfs_rq->curr = se;
+#ifdef CONFIG_SCHEDSTATS
+ /*
+ * Track our maximum slice length, if the CPU's load is at
+ * least twice that of our own weight (i.e. dont track it
+ * when there are only lesser-weight tasks around):
+ */
+ if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ se->statistics.slice_max = max(se->statistics.slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ }
+#endif
+ se->prev_sum_exec_runtime = se->sum_exec_runtime;
+}
+
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */
+
+ /*
+ * Avoid running the skip buddy, if running something else can
+ * be done without getting too unfair.
+ */
+ if (cfs_rq->skip == se) {
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
+ if (second && wakeup_preempt_entity(second, left) < 1)
+ se = second;
+ }
+
+ /*
+ * Prefer last buddy, try to return the CPU to a preempted task.
+ */
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+ se = cfs_rq->last;
+
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;
+
+ clear_buddies(cfs_rq, se);
+
+ return se;
+}
+
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+{
+ /*
+ * If still on the runqueue then deactivate_task()
+ * was not called and update_curr() has to be done:
+ */
+ if (prev->on_rq)
+ update_curr(cfs_rq);
+
+ /* throttle cfs_rqs exceeding runtime */
+ check_cfs_rq_runtime(cfs_rq);
+
+ check_spread(cfs_rq, prev);
+ if (prev->on_rq) {
+ update_stats_wait_start(cfs_rq, prev);
+ /* Put 'current' back into the tree. */
+ __enqueue_entity(cfs_rq, prev);
+ /* in !on_rq case, update occurred at dequeue */
+ update_entity_load_avg(prev, 1);
+ }
+ cfs_rq->curr = NULL;
+}
+
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+
+ /*
+ * Ensure that runnable average is periodically updated.
+ */
+ update_entity_load_avg(curr, 1);
+ update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+ /*
+ * queued ticks are scheduled to match the slice, so don't bother
+ * validating it and just reschedule.
+ */
+ if (queued) {
+ resched_curr(rq_of(cfs_rq));
+ return;
+ }
+ /*
+ * don't let the period tick interfere with the hrtick preemption
+ */
+ if (!sched_feat(DOUBLE_TICK) &&
+ hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+ return;
+#endif
+
+ if (cfs_rq->nr_running > 1)
+ check_preempt_tick(cfs_rq, curr);
+}
+
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef HAVE_JUMP_LABEL
+static struct static_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return static_key_false(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_inc(void)
+{
+ static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
+{
+ static_key_slow_dec(&__cfs_bandwidth_used);
+}
+#else /* HAVE_JUMP_LABEL */
+static bool cfs_bandwidth_used(void)
+{
+ return true;
+}
+
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
+#endif /* HAVE_JUMP_LABEL */
+
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+ return 100000000ULL;
+}
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+ return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
+
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+ u64 now;
+
+ if (cfs_b->quota == RUNTIME_INF)
+ return;
+
+ now = sched_clock_cpu(smp_processor_id());
+ cfs_b->runtime = cfs_b->quota;
+ cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return &tg->cfs_bandwidth;
+}
+
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task;
+
+ return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ u64 amount = 0, min_amount, expires;
+
+ /* note: this is a positive sum as runtime_remaining <= 0 */
+ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota == RUNTIME_INF)
+ amount = min_amount;
+ else {
+ /*
+ * If the bandwidth pool has become inactive, then at least one
+ * period must have elapsed since the last consumption.
+ * Refresh the global state and ensure bandwidth timer becomes
+ * active.
+ */
+ if (!cfs_b->timer_active) {
+ __refill_cfs_bandwidth_runtime(cfs_b);
+ __start_cfs_bandwidth(cfs_b, false);
+ }
+
+ if (cfs_b->runtime > 0) {
+ amount = min(cfs_b->runtime, min_amount);
+ cfs_b->runtime -= amount;
+ cfs_b->idle = 0;
+ }
+ }
+ expires = cfs_b->runtime_expires;
+ raw_spin_unlock(&cfs_b->lock);
+
+ cfs_rq->runtime_remaining += amount;
+ /*
+ * we may have advanced our local expiration to account for allowed
+ * spread between our sched_clock and the one on which runtime was
+ * issued.
+ */
+ if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+ cfs_rq->runtime_expires = expires;
+
+ return cfs_rq->runtime_remaining > 0;
+}
+
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+ /* if the deadline is ahead of our clock, nothing to do */
+ if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
+ return;
+
+ if (cfs_rq->runtime_remaining < 0)
+ return;
+
+ /*
+ * If the local deadline has passed we have to consider the
+ * possibility that our sched_clock is 'fast' and the global deadline
+ * has not truly expired.
+ *
+ * Fortunately we can check determine whether this the case by checking
+ * whether the global deadline has advanced. It is valid to compare
+ * cfs_b->runtime_expires without any locks since we only care about
+ * exact equality, so a partial write will still work.
+ */
+
+ if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+ /* extend local deadline, drift is bounded above by 2 ticks */
+ cfs_rq->runtime_expires += TICK_NSEC;
+ } else {
+ /* global deadline is ahead, expiration has passed */
+ cfs_rq->runtime_remaining = 0;
+ }
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+{
+ /* dock delta_exec before expiring quota (as it could span periods) */
+ cfs_rq->runtime_remaining -= delta_exec;
+ expire_cfs_rq_runtime(cfs_rq);
+
+ if (likely(cfs_rq->runtime_remaining > 0))
+ return;
+
+ /*
+ * if we're unable to extend our runtime we resched so that the active
+ * hierarchy can be throttled
+ */
+ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+ resched_curr(rq_of(cfs_rq));
+}
+
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+{
+ if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+ return;
+
+ __account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+ int src_cpu, int dest_cpu)
+{
+ struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+ src_cfs_rq = tg->cfs_rq[src_cpu];
+ dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+ return throttled_hierarchy(src_cfs_rq) ||
+ throttled_hierarchy(dest_cfs_rq);
+}
+
+/* updated child weight may affect parent so we have to do this bottom up */
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+ struct rq *rq = data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ cfs_rq->throttle_count--;
+#ifdef CONFIG_SMP
+ if (!cfs_rq->throttle_count) {
+ /* adjust cfs_rq_clock_task() */
+ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
+ cfs_rq->throttled_clock_task;
+ }
+#endif
+
+ return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+ struct rq *rq = data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ /* group is entering throttled state, stop time */
+ if (!cfs_rq->throttle_count)
+ cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ cfs_rq->throttle_count++;
+
+ return 0;
+}
+
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct sched_entity *se;
+ long task_delta, dequeue = 1;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ /* freeze hierarchy runnable averages while throttled */
+ rcu_read_lock();
+ walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+ rcu_read_unlock();
+
+ task_delta = cfs_rq->h_nr_running;
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ /* throttled entity or throttle-on-deactivate */
+ if (!se->on_rq)
+ break;
+
+ if (dequeue)
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ qcfs_rq->h_nr_running -= task_delta;
+
+ if (qcfs_rq->load.weight)
+ dequeue = 0;
+ }
+
+ if (!se)
+ sub_nr_running(rq, task_delta);
+
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ raw_spin_lock(&cfs_b->lock);
+ /*
+ * Add to the _head_ of the list, so that an already-started
+ * distribute_cfs_runtime will not see us
+ */
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b, false);
+ raw_spin_unlock(&cfs_b->lock);
+}
+
+void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct sched_entity *se;
+ int enqueue = 1;
+ long task_delta;
+
+ se = cfs_rq->tg->se[cpu_of(rq)];
+
+ cfs_rq->throttled = 0;
+
+ update_rq_clock(rq);
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ list_del_rcu(&cfs_rq->throttled_list);
+ raw_spin_unlock(&cfs_b->lock);
+
+ /* update hierarchical throttle state */
+ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
+ if (!cfs_rq->load.weight)
+ return;
+
+ task_delta = cfs_rq->h_nr_running;
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ enqueue = 0;
+
+ cfs_rq = cfs_rq_of(se);
+ if (enqueue)
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ cfs_rq->h_nr_running += task_delta;
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ if (!se)
+ add_nr_running(rq, task_delta);
+
+ /* determine whether we need to wake up potentially idle cpu */
+ if (rq->curr == rq->idle && rq->cfs.nr_running)
+ resched_curr(rq);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
+ u64 remaining, u64 expires)
+{
+ struct cfs_rq *cfs_rq;
+ u64 runtime;
+ u64 starting_runtime = remaining;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+ throttled_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ raw_spin_lock(&rq->lock);
+ if (!cfs_rq_throttled(cfs_rq))
+ goto next;
+
+ runtime = -cfs_rq->runtime_remaining + 1;
+ if (runtime > remaining)
+ runtime = remaining;
+ remaining -= runtime;
+
+ cfs_rq->runtime_remaining += runtime;
+ cfs_rq->runtime_expires = expires;
+
+ /* we check whether we're throttled above */
+ if (cfs_rq->runtime_remaining > 0)
+ unthrottle_cfs_rq(cfs_rq);
+
+next:
+ raw_spin_unlock(&rq->lock);
+
+ if (!remaining)
+ break;
+ }
+ rcu_read_unlock();
+
+ return starting_runtime - remaining;
+}
+
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+ u64 runtime, runtime_expires;
+ int throttled;
+
+ /* no need to continue the timer with no bandwidth constraint */
+ if (cfs_b->quota == RUNTIME_INF)
+ goto out_deactivate;
+
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+ cfs_b->nr_periods += overrun;
+
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
+
+ /*
+ * if we have relooped after returning idle once, we need to update our
+ * status as actually running, so that other cpus doing
+ * __start_cfs_bandwidth will stop trying to cancel us.
+ */
+ cfs_b->timer_active = 1;
+
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
+ if (!throttled) {
+ /* mark as potentially idle for the upcoming period */
+ cfs_b->idle = 1;
+ return 0;
+ }
+
+ /* account preceding periods in which throttling occurred */
+ cfs_b->nr_throttled += overrun;
+
+ runtime_expires = cfs_b->runtime_expires;
+
+ /*
+ * This check is repeated as we are holding onto the new bandwidth while
+ * we unthrottle. This can potentially race with an unthrottled group
+ * trying to acquire new bandwidth from the global pool. This can result
+ * in us over-using our runtime if it is all used during this loop, but
+ * only by limited amounts in that extreme case.
+ */
+ while (throttled && cfs_b->runtime > 0) {
+ runtime = cfs_b->runtime;
+ raw_spin_unlock(&cfs_b->lock);
+ /* we can't nest cfs_b->lock while distributing bandwidth */
+ runtime = distribute_cfs_runtime(cfs_b, runtime,
+ runtime_expires);
+ raw_spin_lock(&cfs_b->lock);
+
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ }
+
+ /*
+ * While we are ensured activity in the period following an
+ * unthrottle, this also covers the case in which the new bandwidth is
+ * insufficient to cover the existing bandwidth deficit. (Forcing the
+ * timer to remain active while there are any throttled entities.)
+ */
+ cfs_b->idle = 0;
+
+ return 0;
+
+out_deactivate:
+ cfs_b->timer_active = 0;
+ return 1;
+}
+
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+ struct hrtimer *refresh_timer = &cfs_b->period_timer;
+ u64 remaining;
+
+ /* if the call-back is running a quota refresh is already occurring */
+ if (hrtimer_callback_running(refresh_timer))
+ return 1;
+
+ /* is a quota refresh about to occur? */
+ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+ if (remaining < min_expire)
+ return 1;
+
+ return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+ /* if there's a quota refresh soon don't bother with slack */
+ if (runtime_refresh_within(cfs_b, min_left))
+ return;
+
+ start_bandwidth_timer(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+ if (slack_runtime <= 0)
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF &&
+ cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ cfs_b->runtime += slack_runtime;
+
+ /* we are under rq->lock, defer unthrottling using a timer */
+ if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+ !list_empty(&cfs_b->throttled_cfs_rq))
+ start_cfs_slack_bandwidth(cfs_b);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ /* even if it's not valid for return we don't want to try again */
+ cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ return;
+
+ __return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+ u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ u64 expires;
+
+ /* confirm we're still not at a refresh boundary */
+ raw_spin_lock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
+ return;
+ }
+
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
+ runtime = cfs_b->runtime;
+
+ expires = cfs_b->runtime_expires;
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!runtime)
+ return;
+
+ runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+ raw_spin_lock(&cfs_b->lock);
+ if (expires == cfs_b->runtime_expires)
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ raw_spin_unlock(&cfs_b->lock);
+}
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_bandwidth_used())
+ return;
+
+ /* an active group must be handled by the update_curr()->put() path */
+ if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+ return;
+
+ /* ensure the group is not already throttled */
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ /* update runtime allocation */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ if (cfs_rq->runtime_remaining <= 0)
+ throttle_cfs_rq(cfs_rq);
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ return false;
+
+ /*
+ * it's possible for a throttled entity to be forced into a running
+ * state (e.g. set_curr_task), in this case we're finished.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ return true;
+
+ throttle_cfs_rq(cfs_rq);
+ return true;
+}
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer);
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ raw_spin_lock(&cfs_b->lock);
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ raw_spin_lock_init(&cfs_b->lock);
+ cfs_b->runtime = 0;
+ cfs_b->quota = RUNTIME_INF;
+ cfs_b->period = ns_to_ktime(default_cfs_period());
+
+ INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->runtime_enabled = 0;
+ INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/* requires cfs_b->lock, may release to reprogram timer */
+void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+{
+ /*
+ * The timer may be active because we're trying to set a new bandwidth
+ * period or because we're racing with the tear-down path
+ * (timer_active==0 becomes visible before the hrtimer call-back
+ * terminates). In either case we ensure that it's re-programmed
+ */
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
+ raw_spin_unlock(&cfs_b->lock);
+ cpu_relax();
+ raw_spin_lock(&cfs_b->lock);
+ /* if someone else restarted the timer then we're done */
+ if (!force && cfs_b->timer_active)
+ return;
+ }
+
+ cfs_b->timer_active = 1;
+ start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ /* init_cfs_bandwidth() was not called */
+ if (!cfs_b->throttled_cfs_rq.next)
+ return;
+
+ hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
+}
+
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+ raw_spin_unlock(&cfs_b->lock);
+ }
+}
+
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ if (!cfs_rq->runtime_enabled)
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = 1;
+ /*
+ * Offline rq is schedulable till cpu is completely disabled
+ * in take_cpu_down(), so we prevent new cfs throttling here.
+ */
+ cfs_rq->runtime_enabled = 0;
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+ }
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_task(rq_of(cfs_rq));
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+ int src_cpu, int dest_cpu)
+{
+ return 0;
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+#endif
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return NULL;
+}
+static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/**************************************************
+ * CFS operations on tasks:
+ */
+
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ WARN_ON(task_rq(p) != rq);
+
+ if (cfs_rq->nr_running > 1) {
+ u64 slice = sched_slice(cfs_rq, se);
+ u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ s64 delta = slice - ran;
+
+ if (delta < 0) {
+ if (rq->curr == p)
+ resched_curr(rq);
+ return;
+ }
+ hrtick_start(rq, delta);
+ }
+}
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+
+ if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+ return;
+
+ if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+ hrtick_start_fair(rq, curr);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
+#endif
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ break;
+ cfs_rq = cfs_rq_of(se);
+ enqueue_entity(cfs_rq, se, flags);
+
+ /*
+ * end evaluation on encountering a throttled cfs_rq
+ *
+ * note: in the case of encountering a throttled cfs_rq we will
+ * post the final h_nr_running increment below.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ cfs_rq->h_nr_running++;
+
+ flags = ENQUEUE_WAKEUP;
+ }
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_running++;
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ }
+
+ if (!se) {
+ update_rq_runnable_avg(rq, rq->nr_running);
+ add_nr_running(rq, 1);
+ }
+ hrtick_update(rq);
+}
+
+static void set_next_buddy(struct sched_entity *se);
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ int task_sleep = flags & DEQUEUE_SLEEP;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dequeue_entity(cfs_rq, se, flags);
+
+ /*
+ * end evaluation on encountering a throttled cfs_rq
+ *
+ * note: in the case of encountering a throttled cfs_rq we will
+ * post the final h_nr_running decrement below.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ cfs_rq->h_nr_running--;
+
+ /* Don't dequeue parent if it has other entities besides us */
+ if (cfs_rq->load.weight) {
+ /*
+ * Bias pick_next to pick a task from this cfs_rq, as
+ * p is sleeping when it is within its sched_slice.
+ */
+ if (task_sleep && parent_entity(se))
+ set_next_buddy(parent_entity(se));
+
+ /* avoid re-evaluating load for this entity */
+ se = parent_entity(se);
+ break;
+ }
+ flags |= DEQUEUE_SLEEP;
+ }
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_running--;
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_cfs_shares(cfs_rq);
+ update_entity_load_avg(se, 1);
+ }
+
+ if (!se) {
+ sub_nr_running(rq, 1);
+ update_rq_runnable_avg(rq, 1);
+ }
+ hrtick_update(rq);
+}
+
+#ifdef CONFIG_SMP
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->cfs.runnable_load_avg;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return max(rq->cpu_load[type-1], total);
+}
+
+static unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
+}
+
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+ unsigned long load_avg = rq->cfs.runnable_load_avg;
+
+ if (nr_running)
+ return load_avg / nr_running;
+
+ return 0;
+}
+
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Rough decay (wiping) for cost saving, don't worry
+ * about the boundary, really active task won't care
+ * about the loss.
+ */
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
+
+static void task_waking_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+
+ do {
+ min_vruntime_copy = cfs_rq->min_vruntime_copy;
+ smp_rmb();
+ min_vruntime = cfs_rq->min_vruntime;
+ } while (min_vruntime != min_vruntime_copy);
+#else
+ min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+ se->vruntime -= min_vruntime;
+ record_wakee(p);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ * s_i = rw_i / \Sum rw_j (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ * rw_i = { 2, 4, 1, 0 }
+ * s_i = { 2/7, 4/7, 1/7, 0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ * rw'_i = { 3, 4, 1, 0 }
+ * s'_i = { 3/8, 4/8, 1/8, 0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ * dw_i = S * (s'_i - s_i) (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
+ */
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+{
+ struct sched_entity *se = tg->se[cpu];
+
+ if (!tg->parent) /* the trivial, non-cgroup case */
+ return wl;
+
+ for_each_sched_entity(se) {
+ long w, W;
+
+ tg = se->my_q->tg;
+
+ /*
+ * W = @wg + \Sum rw_j
+ */
+ W = wg + calc_tg_weight(tg, se->my_q);
+
+ /*
+ * w = rw_i + @wl
+ */
+ w = se->my_q->load.weight + wl;
+
+ /*
+ * wl = S * s'_i; see (2)
+ */
+ if (W > 0 && w < W)
+ wl = (w * (long)tg->shares) / W;
+ else
+ wl = tg->shares;
+
+ /*
+ * Per the above, wl is the new se->load.weight value; since
+ * those are clipped to [MIN_SHARES, ...) do so now. See
+ * calc_cfs_shares().
+ */
+ if (wl < MIN_SHARES)
+ wl = MIN_SHARES;
+
+ /*
+ * wl = dw_i = S * (s'_i - s_i); see (3)
+ */
+ wl -= se->load.weight;
+
+ /*
+ * Recursively apply this logic to all parent groups to compute
+ * the final effective load change on the root group. Since
+ * only the @tg group gets extra weight, all parent groups can
+ * only redistribute existing shares. @wl is the shift in shares
+ * resulting from this level per the above.
+ */
+ wg = 0;
+ }
+
+ return wl;
+}
+#else
+
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+{
+ return wl;
+}
+
+#endif
+
+static int wake_wide(struct task_struct *p)
+{
+ int factor = this_cpu_read(sd_llc_size);
+
+ /*
+ * Yeah, it's the switching-frequency, could means many wakee or
+ * rapidly switch, use factor here will just help to automatically
+ * adjust the loose-degree, so bigger node will lead to more pull.
+ */
+ if (p->wakee_flips > factor) {
+ /*
+ * wakee is somewhat hot, it needs certain amount of cpu
+ * resource, so if waker is far more hot, prefer to leave
+ * it alone.
+ */
+ if (current->wakee_flips > (factor * p->wakee_flips))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+{
+ s64 this_load, load;
+ s64 this_eff_load, prev_eff_load;
+ int idx, this_cpu, prev_cpu;
+ struct task_group *tg;
+ unsigned long weight;
+ int balanced;
+
+ /*
+ * If we wake multiple tasks be careful to not bounce
+ * ourselves around too much.
+ */
+ if (wake_wide(p))
+ return 0;
+
+ idx = sd->wake_idx;
+ this_cpu = smp_processor_id();
+ prev_cpu = task_cpu(p);
+ load = source_load(prev_cpu, idx);
+ this_load = target_load(this_cpu, idx);
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync) {
+ tg = task_group(current);
+ weight = current->se.load.weight;
+
+ this_load += effective_load(tg, this_cpu, -weight, -weight);
+ load += effective_load(tg, prev_cpu, 0, -weight);
+ }
+
+ tg = task_group(p);
+ weight = p->se.load.weight;
+
+ /*
+ * In low-load situations, where prev_cpu is idle and this_cpu is idle
+ * due to the sync cause above having dropped this_load to 0, we'll
+ * always have an imbalance, but there's really nothing you can do
+ * about that, so that's good too.
+ *
+ * Otherwise check if either cpus are near enough in load to allow this
+ * task to be woken on this_cpu.
+ */
+ this_eff_load = 100;
+ this_eff_load *= capacity_of(prev_cpu);
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
+
+ if (this_load > 0) {
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+ }
+
+ balanced = this_eff_load <= prev_eff_load;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+
+ if (!balanced)
+ return 0;
+
+ schedstat_inc(sd, ttwu_move_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+ return 1;
+}
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int sd_flag)
+{
+ struct sched_group *idlest = NULL, *group = sd->groups;
+ unsigned long min_load = ULONG_MAX, this_load = 0;
+ int load_idx = sd->forkexec_idx;
+ int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+ if (sd_flag & SD_BALANCE_WAKE)
+ load_idx = sd->wake_idx;
+
+ do {
+ unsigned long load, avg_load;
+ int local_group;
+ int i;
+
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpumask_intersects(sched_group_cpus(group),
+ tsk_cpus_allowed(p)))
+ continue;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
+
+ for_each_cpu(i, sched_group_cpus(group)) {
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = source_load(i, load_idx);
+ else
+ load = target_load(i, load_idx);
+
+ avg_load += load;
+ }
+
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+
+ if (local_group) {
+ this_load = avg_load;
+ } else if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+ } while (group = group->next, group != sd->groups);
+
+ if (!idlest || 100*this_load < imbalance*min_load)
+ return NULL;
+ return idlest;
+}
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+ unsigned long load, min_load = ULONG_MAX;
+ unsigned int min_exit_latency = UINT_MAX;
+ u64 latest_idle_timestamp = 0;
+ int least_loaded_cpu = this_cpu;
+ int shallowest_idle_cpu = -1;
+ int i;
+
+ /* Traverse only the allowed CPUs */
+ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
+ if (idle_cpu(i)) {
+ struct rq *rq = cpu_rq(i);
+ struct cpuidle_state *idle = idle_get_state(rq);
+ if (idle && idle->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU whose idle state
+ * has the smallest exit latency irrespective
+ * of any idle timestamp.
+ */
+ min_exit_latency = idle->exit_latency;
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ rq->idle_stamp > latest_idle_timestamp) {
+ /*
+ * If equal or no active idle state, then
+ * the most recently idled CPU might have
+ * a warmer cache.
+ */
+ latest_idle_timestamp = rq->idle_stamp;
+ shallowest_idle_cpu = i;
+ }
+ } else if (shallowest_idle_cpu == -1) {
+ load = weighted_cpuload(i);
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ least_loaded_cpu = i;
+ }
+ }
+ }
+
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+}
+
+/*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int select_idle_sibling(struct task_struct *p, int target)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int i = task_cpu(p);
+
+ if (idle_cpu(target))
+ return target;
+
+ /*
+ * If the prevous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
+
+ /*
+ * Otherwise, iterate the domains and find an elegible idle cpu.
+ */
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ for_each_lower_domain(sd) {
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
+
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ goto done;
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
+ }
+done:
+ return target;
+}
+/*
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the usage with the capacity of the CPU that is available for CFS
+ * task (ie cpu_capacity).
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * CPU. It represents the amount of utilization of a CPU in the range
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
+ * capacity of the CPU because it's about the running time on this CPU.
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in avg_period and running_load_avg or just
+ * after migrating tasks until the average stabilizes with the new running
+ * time. So we need to check that the usage stays into the range
+ * [0..cpu_capacity_orig] and cap if necessary.
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ */
+static int get_cpu_usage(int cpu)
+{
+ unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ if (usage >= SCHED_LOAD_SCALE)
+ return capacity;
+
+ return (usage * capacity) >> SCHED_LOAD_SHIFT;
+}
+
+/*
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
+ *
+ * Balances load by selecting the idlest cpu in the idlest group, or under
+ * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
+ *
+ * Returns the target cpu number.
+ *
+ * preempt must be disabled.
+ */
+static int
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+{
+ struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ int cpu = smp_processor_id();
+ int new_cpu = cpu;
+ int want_affine = 0;
+ int sync = wake_flags & WF_SYNC;
+
+ if (sd_flag & SD_BALANCE_WAKE)
+ want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+
+ rcu_read_lock();
+ for_each_domain(cpu, tmp) {
+ if (!(tmp->flags & SD_LOAD_BALANCE))
+ continue;
+
+ /*
+ * If both cpu and prev_cpu are part of this domain,
+ * cpu is a valid SD_WAKE_AFFINE target.
+ */
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+ affine_sd = tmp;
+ break;
+ }
+
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+
+ if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+
+ if (sd_flag & SD_BALANCE_WAKE) {
+ new_cpu = select_idle_sibling(p, prev_cpu);
+ goto unlock;
+ }
+
+ while (sd) {
+ struct sched_group *group;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_cpu(group, p, cpu);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+unlock:
+ rcu_read_unlock();
+
+ return new_cpu;
+}
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Load tracking: accumulate removed load so that it can be processed
+ * when we next update owning cfs_rq under rq->lock. Tasks contribute
+ * to blocked load iff they have a positive decay-count. It can never
+ * be negative here since on-rq tasks have decay-count == 0.
+ */
+ if (se->avg.decay_count) {
+ se->avg.decay_count = -__synchronize_entity_decay(se);
+ atomic_long_add(se->avg.load_avg_contrib,
+ &cfs_rq->removed_load);
+ }
+
+ /* We have migrated, no longer consider this task hot */
+ se->exec_start = 0;
+}
+#endif /* CONFIG_SMP */
+
+static unsigned long
+wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+{
+ unsigned long gran = sysctl_sched_wakeup_granularity;
+
+ /*
+ * Since its curr running now, convert the gran from real-time
+ * to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
+ */
+ return calc_delta_fair(gran, se);
+}
+
+/*
+ * Should 'se' preempt 'curr'.
+ *
+ * |s1
+ * |s2
+ * |s3
+ * g
+ * |<--->|c
+ *
+ * w(c, s1) = -1
+ * w(c, s2) = 0
+ * w(c, s3) = 1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+ if (vdiff <= 0)
+ return -1;
+
+ gran = wakeup_gran(curr, se);
+ if (vdiff > gran)
+ return 1;
+
+ return 0;
+}
+
+static void set_last_buddy(struct sched_entity *se)
+{
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+}
+
+static void set_skip_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->skip = se;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_entity *se = &curr->se, *pse = &p->se;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ int scale = cfs_rq->nr_running >= sched_nr_latency;
+ int next_buddy_marked = 0;
+
+ if (unlikely(se == pse))
+ return;
+
+ /*
+ * This is possible from callers such as attach_tasks(), in which we
+ * unconditionally check_prempt_curr() after an enqueue (which may have
+ * lead to a throttle). This both saves work and prevents false
+ * next-buddy nomination below.
+ */
+ if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ return;
+
+ if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ set_next_buddy(pse);
+ next_buddy_marked = 1;
+ }
+
+ /*
+ * We can come here with TIF_NEED_RESCHED already set from new task
+ * wake up path.
+ *
+ * Note: this also catches the edge-case of curr being in a throttled
+ * group (e.g. via set_curr_task), since update_curr() (in the
+ * enqueue of curr) will have resulted in resched being set. This
+ * prevents us from potentially nominating it as a false LAST_BUDDY
+ * below.
+ */
+ if (test_tsk_need_resched(curr))
+ return;
+
+ /* Idle tasks are by definition preempted by non-idle tasks. */
+ if (unlikely(curr->policy == SCHED_IDLE) &&
+ likely(p->policy != SCHED_IDLE))
+ goto preempt;
+
+ /*
+ * Batch and idle tasks do not preempt non-idle tasks (their preemption
+ * is driven by the tick):
+ */
+ if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+ return;
+
+ find_matching_se(&se, &pse);
+ update_curr(cfs_rq_of(se));
+ BUG_ON(!pse);
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ /*
+ * Bias pick_next to pick the sched entity that is
+ * triggering this preemption.
+ */
+ if (!next_buddy_marked)
+ set_next_buddy(pse);
+ goto preempt;
+ }
+
+ return;
+
+preempt:
+ resched_curr(rq);
+ /*
+ * Only set the backward buddy when the current task is still
+ * on the rq. This can happen when a wakeup gets interleaved
+ * with schedule on the ->pre_schedule() or idle_balance()
+ * point, either of which can * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class,
+ * for obvious reasons its a bad idea to schedule back to it.
+ */
+ if (unlikely(!se->on_rq || curr == rq->idle))
+ return;
+
+ if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+ set_last_buddy(se);
+}
+
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ struct sched_entity *se;
+ struct task_struct *p;
+ int new_tasks;
+
+again:
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+ if (prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ /*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ */
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr && curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ /*
+ * This call to check_cfs_rq_runtime() will do the throttle and
+ * dequeue its entity in the parent(s). Therefore the 'simple'
+ * nr_running test will indeed be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+simple:
+ cfs_rq = &rq->cfs;
+#endif
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+ put_prev_task(rq, prev);
+
+ do {
+ se = pick_next_entity(cfs_rq, NULL);
+ set_next_entity(cfs_rq, se);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+
+idle:
+ new_tasks = idle_balance(rq);
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
+
+ if (new_tasks > 0)
+ goto again;
+
+ return NULL;
+}
+
+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_entity *se = &prev->se;
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ put_prev_entity(cfs_rq, se);
+ }
+}
+
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(rq->nr_running == 1))
+ return;
+
+ clear_buddies(cfs_rq, se);
+
+ if (curr->policy != SCHED_BATCH) {
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq, true);
+ }
+
+ set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+ struct sched_entity *se = &p->se;
+
+ /* throttled hierarchies are not runnable */
+ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ return false;
+
+ /* Tell the scheduler that we'd really like pse to run next. */
+ set_next_buddy(se);
+
+ yield_task_fair(rq);
+
+ return true;
+}
+
+#ifdef CONFIG_SMP
+/**************************************************
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * time to each task. This is expressed in the following equation:
+ *
+ * W_i,n/P_i == W_j,n/P_j for all i,j (1)
+ *
+ * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ * W_i,0 = \Sum_j w_i,j (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * is derived from the nice value as per prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
+ *
+ * C_i is the compute capacity of cpu i, typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
+ *
+ * We them move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ * - infeasible weights;
+ * - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of cpus that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level inv. proportional to the number of cpus in
+ * the groups.
+ *
+ * This yields:
+ *
+ * log_2 n 1 n
+ * \Sum { --- * --- * 2^i } = O(n) (5)
+ * i = 0 2^i 2^i
+ * `- size of each group
+ * | | `- number of cpus doing load-balance
+ * | `- freq
+ * `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other cpu in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ * log_2 n
+ * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
+ * k = 0
+ *
+ * And you'll find that:
+ *
+ * A^(log_2 n)_i,j != 0 for all i,j (7)
+ *
+ * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ * O(nm log n), n := nr_cpus, m := nr_tasks (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2), instead of a simple sum we get:
+ *
+ * s_k,i
+ * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
+ * S_k
+ *
+ * Where
+ *
+ * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ *
+ * The big problem is S_k, its a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ * rewrite all of this once again.]
+ */
+
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+enum fbq_type { regular, remote, all };
+
+#define LBF_ALL_PINNED 0x01
+#define LBF_NEED_BREAK 0x02
+#define LBF_DST_PINNED 0x04
+#define LBF_SOME_PINNED 0x08
+
+struct lb_env {
+ struct sched_domain *sd;
+
+ struct rq *src_rq;
+ int src_cpu;
+
+ int dst_cpu;
+ struct rq *dst_rq;
+
+ struct cpumask *dst_grpmask;
+ int new_dst_cpu;
+ enum cpu_idle_type idle;
+ long imbalance;
+ /* The set of CPUs under consideration for load-balancing */
+ struct cpumask *cpus;
+
+ unsigned int flags;
+
+ unsigned int loop;
+ unsigned int loop_break;
+ unsigned int loop_max;
+
+ enum fbq_type fbq_type;
+ struct list_head tasks;
+};
+
+/*
+ * Is this task likely cache-hot:
+ */
+static int task_hot(struct task_struct *p, struct lb_env *env)
+{
+ s64 delta;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ if (p->sched_class != &fair_sched_class)
+ return 0;
+
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
+ return 1;
+
+ if (sysctl_sched_migration_cost == -1)
+ return 1;
+ if (sysctl_sched_migration_cost == 0)
+ return 0;
+
+ delta = rq_clock_task(env->src_rq) - p->se.exec_start;
+
+ return delta < (s64)sysctl_sched_migration_cost;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+ !(env->sd->flags & SD_NUMA)) {
+ return false;
+ }
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ if (numa_group) {
+ /* Task is already in the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ }
+
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return true;
+
+ return task_faults(p, dst_nid) > task_faults(p, src_nid);
+}
+
+
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+ return false;
+
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ return false;
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid)
+ return false;
+
+ if (numa_group) {
+ /* Task is moving within/into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving out of the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ }
+
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return true;
+
+ return task_faults(p, dst_nid) < task_faults(p, src_nid);
+}
+
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+
+static inline bool migrate_degrades_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+#endif
+
+/*
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static
+int can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) throttled_lb_pair, or
+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 3) running (obviously), or
+ * 4) are cache-hot on their current CPU.
+ */
+ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ return 0;
+
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ int cpu;
+
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+ env->flags |= LBF_SOME_PINNED;
+
+ /*
+ * Remember if this task can be migrated to any other cpu in
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+ * Also avoid computing new_dst_cpu if we have already computed
+ * one in current iteration.
+ */
+ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ return 0;
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
+ }
+
+ return 0;
+ }
+
+ /* Record that we found atleast one task that could run on dst_cpu */
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) destination numa is preferred
+ * 2) task is cache cold, or
+ * 3) too many balance attempts have failed.
+ */
+ tsk_cache_hot = task_hot(p, env);
+ if (!tsk_cache_hot)
+ tsk_cache_hot = migrate_degrades_locality(p, env);
+
+ if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+ return 1;
+ }
+
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ return 0;
+}
+
+/*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
+ * part of active balancing operations within "domain".
+ *
+ * Returns a task if successful and NULL otherwise.
+ */
+static struct task_struct *detach_one_task(struct lb_env *env)
+{
+ struct task_struct *p, *n;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (!can_migrate_task(p, env))
+ continue;
+
+ detach_task(p, env);
+
+ /*
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is detach_tasks)
+ * so we can safely collect stats here rather than
+ * inside detach_tasks().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return p;
+ }
+ return NULL;
+}
+
+static const unsigned int sched_nr_migrate_break = 32;
+
+/*
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
+ *
+ * Returns number of detached tasks if successful and 0 otherwise.
+ */
+static int detach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->src_rq->cfs_tasks;
+ struct task_struct *p;
+ unsigned long load;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ if (env->imbalance <= 0)
+ return 0;
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+
+ env->loop++;
+ /* We've more or less seen every task there is, call it quits */
+ if (env->loop > env->loop_max)
+ break;
+
+ /* take a breather every nr_migrate tasks */
+ if (env->loop > env->loop_break) {
+ env->loop_break += sched_nr_migrate_break;
+ env->flags |= LBF_NEED_BREAK;
+ break;
+ }
+
+ if (!can_migrate_task(p, env))
+ goto next;
+
+ load = task_h_load(p);
+
+ if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
+ goto next;
+
+ if ((load / 2) > env->imbalance)
+ goto next;
+
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
+ env->imbalance -= load;
+
+#ifdef CONFIG_PREEMPT
+ /*
+ * NEWIDLE balancing is a source of latency, so preemptible
+ * kernels will stop after the first task is detached to minimize
+ * the critical section.
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ break;
+#endif
+
+ /*
+ * We only want to steal up to the prescribed amount of
+ * weighted load.
+ */
+ if (env->imbalance <= 0)
+ break;
+
+ continue;
+next:
+ list_move_tail(&p->se.group_node, tasks);
+ }
+
+ /*
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
+ */
+ schedstat_add(env->sd, lb_gained[env->idle], detached);
+
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
+
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ raw_spin_lock(&rq->lock);
+ attach_task(rq, p);
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ raw_spin_lock(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ raw_spin_unlock(&env->dst_rq->lock);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
+{
+ struct sched_entity *se = tg->se[cpu];
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ return;
+
+ update_cfs_rq_blocked_load(cfs_rq, 1);
+
+ if (se) {
+ update_entity_load_avg(se, 1);
+ /*
+ * We pivot on our runnable average having decayed to zero for
+ * list removal. This generally implies that all our children
+ * have also been removed (modulo rounding error or bandwidth
+ * control); however, such cases are rare and we can fix these
+ * at enqueue.
+ *
+ * TODO: fix up out-of-order children on enqueue.
+ */
+ if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+ list_del_leaf_cfs_rq(cfs_rq);
+ } else {
+ struct rq *rq = rq_of(cfs_rq);
+ update_rq_runnable_avg(rq, rq->nr_running);
+ }
+}
+
+static void update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ /*
+ * Iterates the task_group tree in a bottom up fashion, see
+ * list_add_leaf_cfs_rq() for details.
+ */
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ /*
+ * Note: We may want to consider periodically releasing
+ * rq->lock about these updates so that creating many task
+ * groups does not result in continually extending hold time.
+ */
+ __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
+ }
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/*
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
+ */
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+ unsigned long now = jiffies;
+ unsigned long load;
+
+ if (cfs_rq->last_h_load_update == now)
+ return;
+
+ cfs_rq->h_load_next = NULL;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_load_next = se;
+ if (cfs_rq->last_h_load_update == now)
+ break;
+ }
+
+ if (!se) {
+ cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->last_h_load_update = now;
+ }
+
+ while ((se = cfs_rq->h_load_next) != NULL) {
+ load = cfs_rq->h_load;
+ load = div64_ul(load * se->avg.load_avg_contrib,
+ cfs_rq->runnable_load_avg + 1);
+ cfs_rq = group_cfs_rq(se);
+ cfs_rq->h_load = load;
+ cfs_rq->last_h_load_update = now;
+ }
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ update_cfs_rq_h_load(cfs_rq);
+ return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
+ cfs_rq->runnable_load_avg + 1);
+}
+#else
+static inline void update_blocked_averages(int cpu)
+{
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+ return p->se.avg.load_avg_contrib;
+}
+#endif
+
+/********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+ group_other = 0,
+ group_imbalanced,
+ group_overloaded,
+};
+
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+ unsigned long avg_load; /*Avg load across the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long load_per_task;
+ unsigned long group_capacity;
+ unsigned long group_usage; /* Total usage of the group */
+ unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int idle_cpus;
+ unsigned int group_weight;
+ enum group_type group_type;
+ int group_no_capacity;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
+};
+
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+ /*
+ * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+ * local_stat because update_sg_lb_stats() does a full clear/assignment.
+ * We must however clear busiest_stat::avg_load because
+ * update_sd_pick_busiest() reads this before assignment.
+ */
+ *sds = (struct sd_lb_stats){
+ .busiest = NULL,
+ .local = NULL,
+ .total_load = 0UL,
+ .total_capacity = 0UL,
+ .busiest_stat = {
+ .avg_load = 0UL,
+ .sum_nr_running = 0,
+ .group_type = group_other,
+ },
+ };
+}
+
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ *
+ * Return: The load index.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ int load_idx;
+
+ switch (idle) {
+ case CPU_NOT_IDLE:
+ load_idx = sd->busy_idx;
+ break;
+
+ case CPU_NEWLY_IDLE:
+ load_idx = sd->newidle_idx;
+ break;
+ default:
+ load_idx = sd->idle_idx;
+ break;
+ }
+
+ return load_idx;
+}
+
+static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+ return sd->smt_gain / sd->span_weight;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ return default_scale_cpu_capacity(sd, cpu);
+}
+
+static unsigned long scale_rt_capacity(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 total, used, age_stamp, avg;
+ s64 delta;
+
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
+ delta = __rq_clock_broken(rq) - age_stamp;
+
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
+
+ used = div_u64(avg, total);
+
+ if (likely(used < SCHED_CAPACITY_SCALE))
+ return SCHED_CAPACITY_SCALE - used;
+
+ return 1;
+}
+
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
+ struct sched_group *sdg = sd->groups;
+
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_cpu_capacity(sd, cpu);
+ else
+ capacity *= default_scale_cpu_capacity(sd, cpu);
+
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
+
+ capacity *= scale_rt_capacity(cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
+ if (!capacity)
+ capacity = 1;
+
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ sdg->sgc->capacity = capacity;
+}
+
+void update_group_capacity(struct sched_domain *sd, int cpu)
+{
+ struct sched_domain *child = sd->child;
+ struct sched_group *group, *sdg = sd->groups;
+ unsigned long capacity;
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+ sdg->sgc->next_update = jiffies + interval;
+
+ if (!child) {
+ update_cpu_capacity(sd, cpu);
+ return;
+ }
+
+ capacity = 0;
+
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
+
+ for_each_cpu(cpu, sched_group_cpus(sdg)) {
+ struct sched_group_capacity *sgc;
+ struct rq *rq = cpu_rq(cpu);
+
+ /*
+ * build_sched_domains() -> init_sched_groups_capacity()
+ * gets here before we've attached the domains to the
+ * runqueues.
+ *
+ * Use capacity_of(), which is set irrespective of domains
+ * in update_cpu_capacity().
+ *
+ * This avoids capacity from being 0 and
+ * causing divide-by-zero issues on boot.
+ */
+ if (unlikely(!rq->sd)) {
+ capacity += capacity_of(cpu);
+ continue;
+ }
+
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
+ }
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ capacity += group->sgc->capacity;
+ group = group->next;
+ } while (group != child->groups);
+ }
+
+ sdg->sgc->capacity = capacity;
+}
+
+/*
+ * Check whether the capacity of the rq has been noticeably reduced by side
+ * activity. The imbalance_pct is used for the threshold.
+ * Return true is the capacity is reduced
+ */
+static inline int
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
+ (rq->cpu_capacity_orig * 100));
+}
+
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+static inline int sg_imbalanced(struct sched_group *group)
+{
+ return group->sgc->imbalance;
+}
+
+/*
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks.
+ * We consider that a group has spare capacity if the * number of task is
+ * smaller than the number of CPUs or if the usage is lower than the available
+ * capacity for CFS tasks.
+ * For the latter, we use a threshold to stabilize the state, to take into
+ * account the variance of the tasks' load and to return true if the available
+ * capacity in meaningful for the load balancer.
+ * As an example, an available capacity of 1% can appear but it doesn't make
+ * any benefit for the load balance.
+ */
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
+
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle.
+ * group_is_overloaded is not equals to !group_has_capacity because a group
+ * with the exact right number of tasks, has no more spare capacity but is not
+ * overloaded so both group_has_capacity and group_is_overloaded return
+ * false.
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
+
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
+
+ return false;
+}
+
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
+{
+ if (sgs->group_no_capacity)
+ return group_overloaded;
+
+ if (sg_imbalanced(group))
+ return group_imbalanced;
+
+ return group_other;
+}
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @env: The load balancing environment.
+ * @group: sched_group whose statistics are to be updated.
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @local_group: Does group contain this_cpu.
+ * @sgs: variable to hold the statistics for this group.
+ * @overload: Indicate more than one runnable task for any CPU.
+ */
+static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sched_group *group, int load_idx,
+ int local_group, struct sg_lb_stats *sgs,
+ bool *overload)
+{
+ unsigned long load;
+ int i;
+
+ memset(sgs, 0, sizeof(*sgs));
+
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = target_load(i, load_idx);
+ else
+ load = source_load(i, load_idx);
+
+ sgs->group_load += load;
+ sgs->group_usage += get_cpu_usage(i);
+ sgs->sum_nr_running += rq->cfs.h_nr_running;
+
+ if (rq->nr_running > 1)
+ *overload = true;
+
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
+ sgs->sum_weighted_load += weighted_cpuload(i);
+ if (idle_cpu(i))
+ sgs->idle_cpus++;
+ }
+
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+
+ if (sgs->sum_nr_running)
+ sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @env: The load balancing environment.
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
+ */
+static bool update_sd_pick_busiest(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ struct sg_lb_stats *busiest = &sds->busiest_stat;
+
+ if (sgs->group_type > busiest->group_type)
+ return true;
+
+ if (sgs->group_type < busiest->group_type)
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ /* This is the busiest node in its class. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
+ return true;
+
+ /*
+ * ASYM_PACKING needs to move all the work to the lowest
+ * numbered CPUs in the group, therefore mark all groups
+ * higher than ourself as busy.
+ */
+ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
+ if (!sds->busiest)
+ return true;
+
+ if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @env: The load balancing environment.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ struct sched_domain *child = env->sd->child;
+ struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats tmp_sgs;
+ int load_idx, prefer_sibling = 0;
+ bool overload = false;
+
+ if (child && child->flags & SD_PREFER_SIBLING)
+ prefer_sibling = 1;
+
+ load_idx = get_sd_load_idx(env->sd, env->idle);
+
+ do {
+ struct sg_lb_stats *sgs = &tmp_sgs;
+ int local_group;
+
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ if (local_group) {
+ sds->local = sg;
+ sgs = &sds->local_stat;
+
+ if (env->idle != CPU_NEWLY_IDLE ||
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
+ }
+
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+ &overload);
+
+ if (local_group)
+ goto next_group;
+
+ /*
+ * In case the child domain prefers tasks go to siblings
+ * first, lower the sg capacity so that we'll try
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible with a large weight task outweighs
+ * the tasks on the system).
+ */
+ if (prefer_sibling && sds->local &&
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
+ }
+
+ if (update_sd_pick_busiest(env, sds, sg, sgs)) {
+ sds->busiest = sg;
+ sds->busiest_stat = *sgs;
+ }
+
+next_group:
+ /* Now, start updating sd_lb_stats */
+ sds->total_load += sgs->group_load;
+ sds->total_capacity += sgs->group_capacity;
+
+ sg = sg->next;
+ } while (sg != env->sd->groups);
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+ if (!env->sd->parent) {
+ /* update overload indicator if we are at root domain */
+ if (env->dst_rq->rd->overload != overload)
+ env->dst_rq->rd->overload = overload;
+ }
+
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ * sched doman.
+ *
+ * This is primarily intended to used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share less core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on. Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Return: 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in *imbalance.
+ *
+ * @env: The load balancing environment.
+ * @sds: Statistics of the sched_domain which is to be packed
+ */
+static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ int busiest_cpu;
+
+ if (!(env->sd->flags & SD_ASYM_PACKING))
+ return 0;
+
+ if (!sds->busiest)
+ return 0;
+
+ busiest_cpu = group_first_cpu(sds->busiest);
+ if (env->dst_cpu > busiest_cpu)
+ return 0;
+
+ env->imbalance = DIV_ROUND_CLOSEST(
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
+ SCHED_CAPACITY_SCALE);
+
+ return 1;
+}
+
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ * amongst the groups of a sched_domain, during
+ * load balancing.
+ * @env: The load balancing environment.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ */
+static inline
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ unsigned long tmp, capa_now = 0, capa_move = 0;
+ unsigned int imbn = 2;
+ unsigned long scaled_busy_load_per_task;
+ struct sg_lb_stats *local, *busiest;
+
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (!local->sum_nr_running)
+ local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+ else if (busiest->load_per_task > local->load_per_task)
+ imbn = 1;
+
+ scaled_busy_load_per_task =
+ (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ busiest->group_capacity;
+
+ if (busiest->avg_load + scaled_busy_load_per_task >=
+ local->avg_load + (scaled_busy_load_per_task * imbn)) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+
+ /*
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU capacity used by
+ * moving them.
+ */
+
+ capa_now += busiest->group_capacity *
+ min(busiest->load_per_task, busiest->avg_load);
+ capa_now += local->group_capacity *
+ min(local->load_per_task, local->avg_load);
+ capa_now /= SCHED_CAPACITY_SCALE;
+
+ /* Amount of load we'd subtract */
+ if (busiest->avg_load > scaled_busy_load_per_task) {
+ capa_move += busiest->group_capacity *
+ min(busiest->load_per_task,
+ busiest->avg_load - scaled_busy_load_per_task);
+ }
+
+ /* Amount of load we'd add */
+ if (busiest->avg_load * busiest->group_capacity <
+ busiest->load_per_task * SCHED_CAPACITY_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_capacity) /
+ local->group_capacity;
+ } else {
+ tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+ }
+ capa_move += local->group_capacity *
+ min(local->load_per_task, local->avg_load + tmp);
+ capa_move /= SCHED_CAPACITY_SCALE;
+
+ /* Move if we gain throughput */
+ if (capa_move > capa_now)
+ env->imbalance = busiest->load_per_task;
+}
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @env: load balance environment
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ */
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ unsigned long max_pull, load_above_capacity = ~0UL;
+ struct sg_lb_stats *local, *busiest;
+
+ local = &sds->local_stat;
+ busiest = &sds->busiest_stat;
+
+ if (busiest->group_type == group_imbalanced) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure cpu-load equilibrium, look at wider averages. XXX
+ */
+ busiest->load_per_task =
+ min(busiest->load_per_task, sds->avg_load);
+ }
+
+ /*
+ * In the presence of smp nice balancing, certain scenarios can have
+ * max load less than avg load(as we skip the groups at or below
+ * its cpu_capacity, while calculating max_load..)
+ */
+ if (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return fix_small_imbalance(env, sds);
+ }
+
+ /*
+ * If there aren't any idle cpus, avoid creating some.
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type == group_overloaded) {
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
+ }
+
+ /*
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load. At the same time,
+ * we also don't want to reduce the group load below the group capacity
+ * (so that we can implement power-savings policies etc). Thus we look
+ * for the minimum possible imbalance.
+ */
+ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
+
+ /* How much load to actually move to equalise the imbalance */
+ env->imbalance = min(
+ max_pull * busiest->group_capacity,
+ (sds->avg_load - local->avg_load) * local->group_capacity
+ ) / SCHED_CAPACITY_SCALE;
+
+ /*
+ * if *imbalance is less than the average load per runnable task
+ * there is no guarantee that any tasks will be moved so we'll have
+ * a think about bumping its value to force at least one task to be
+ * moved
+ */
+ if (env->imbalance < busiest->load_per_task)
+ return fix_small_imbalance(env, sds);
+}
+
+/******* find_busiest_group() helpers end here *********************/
+
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @env: The load balancing environment.
+ *
+ * Return: - The busiest group if imbalance exists.
+ * - If no imbalance and user has opted for power-savings balance,
+ * return the least loaded group whose CPUs can be
+ * put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *find_busiest_group(struct lb_env *env)
+{
+ struct sg_lb_stats *local, *busiest;
+ struct sd_lb_stats sds;
+
+ init_sd_lb_stats(&sds);
+
+ /*
+ * Compute the various statistics relavent for load balancing at
+ * this level.
+ */
+ update_sd_lb_stats(env, &sds);
+ local = &sds.local_stat;
+ busiest = &sds.busiest_stat;
+
+ /* ASYM feature bypasses nice load balance check */
+ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(env, &sds))
+ return sds.busiest;
+
+ /* There is no busy sibling group to pull tasks from */
+ if (!sds.busiest || busiest->sum_nr_running == 0)
+ goto out_balanced;
+
+ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+ / sds.total_capacity;
+
+ /*
+ * If the busiest group is imbalanced the below checks don't
+ * work because they assume all things are equal, which typically
+ * isn't true due to cpus_allowed constraints and the like.
+ */
+ if (busiest->group_type == group_imbalanced)
+ goto force_balance;
+
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
+ goto force_balance;
+
+ /*
+ * If the local group is busier than the selected busiest group
+ * don't try and pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load)
+ goto out_balanced;
+
+ /*
+ * Don't pull any tasks if this group is already above the domain
+ * average load.
+ */
+ if (local->avg_load >= sds.avg_load)
+ goto out_balanced;
+
+ if (env->idle == CPU_IDLE) {
+ /*
+ * This cpu is idle. If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest group
+ * wrt idle cpus, it is balanced. The imbalance becomes
+ * significant if the diff is greater than 1 otherwise we
+ * might end up to just move the imbalance on another group
+ */
+ if ((busiest->group_type != group_overloaded) &&
+ (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ goto out_balanced;
+ } else {
+ /*
+ * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+ * imbalance_pct to be conservative.
+ */
+ if (100 * busiest->avg_load <=
+ env->sd->imbalance_pct * local->avg_load)
+ goto out_balanced;
+ }
+
+force_balance:
+ /* Looks like there is an imbalance. Compute it */
+ calculate_imbalance(env, &sds);
+ return sds.busiest;
+
+out_balanced:
+ env->imbalance = 0;
+ return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static struct rq *find_busiest_queue(struct lb_env *env,
+ struct sched_group *group)
+{
+ struct rq *busiest = NULL, *rq;
+ unsigned long busiest_load = 0, busiest_capacity = 1;
+ int i;
+
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+ unsigned long capacity, wl;
+ enum fbq_type rt;
+
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
+
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there's better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
+ continue;
+
+ capacity = capacity_of(i);
+
+ wl = weighted_cpuload(i);
+
+ /*
+ * When comparing with imbalance, use weighted_cpuload()
+ * which is not scaled with the cpu capacity.
+ */
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
+ continue;
+
+ /*
+ * For the load comparisons with the other cpu's, consider
+ * the weighted_cpuload() scaled with the cpu capacity, so
+ * that the load can be moved away from the cpu that is
+ * potentially running at a lower capacity.
+ *
+ * Thus we're looking for max(wl_i / capacity_i), crosswise
+ * multiplication to rid ourselves of the division works out
+ * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * our previous maximum.
+ */
+ if (wl * busiest_capacity > busiest_load * capacity) {
+ busiest_load = wl;
+ busiest_capacity = capacity;
+ busiest = rq;
+ }
+ }
+
+ return busiest;
+}
+
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL 512
+
+/* Working cpumask for load_balance and load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+
+static int need_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ if (env->idle == CPU_NEWLY_IDLE) {
+
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * higher numbered CPUs in order to pack all tasks in the
+ * lowest numbered CPUs.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+ return 1;
+ }
+
+ /*
+ * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
+ * It's worth migrating the task if the src_cpu's capacity is reduced
+ * because of other sched_class or IRQs if more capacity stays
+ * available on dst_cpu.
+ */
+ if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->cfs.h_nr_running == 1)) {
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
+ return 1;
+ }
+
+ return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+}
+
+static int active_load_balance_cpu_stop(void *data);
+
+static int should_we_balance(struct lb_env *env)
+{
+ struct sched_group *sg = env->sd->groups;
+ struct cpumask *sg_cpus, *sg_mask;
+ int cpu, balance_cpu = -1;
+
+ /*
+ * In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ return 1;
+
+ sg_cpus = sched_group_cpus(sg);
+ sg_mask = sched_group_mask(sg);
+ /* Try to find first idle cpu */
+ for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ continue;
+
+ balance_cpu = cpu;
+ break;
+ }
+
+ if (balance_cpu == -1)
+ balance_cpu = group_balance_cpu(sg);
+
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above domains.
+ */
+ return balance_cpu == env->dst_cpu;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ */
+static int load_balance(int this_cpu, struct rq *this_rq,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *continue_balancing)
+{
+ int ld_moved, cur_ld_moved, active_balance = 0;
+ struct sched_domain *sd_parent = sd->parent;
+ struct sched_group *group;
+ struct rq *busiest;
+ unsigned long flags;
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ };
+
+ /*
+ * For NEWLY_IDLE load_balancing, we don't need to consider
+ * other cpus in our group
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ env.dst_grpmask = NULL;
+
+ cpumask_copy(cpus, cpu_active_mask);
+
+ schedstat_inc(sd, lb_count[idle]);
+
+redo:
+ if (!should_we_balance(&env)) {
+ *continue_balancing = 0;
+ goto out_balanced;
+ }
+
+ group = find_busiest_group(&env);
+ if (!group) {
+ schedstat_inc(sd, lb_nobusyg[idle]);
+ goto out_balanced;
+ }
+
+ busiest = find_busiest_queue(&env, group);
+ if (!busiest) {
+ schedstat_inc(sd, lb_nobusyq[idle]);
+ goto out_balanced;
+ }
+
+ BUG_ON(busiest == env.dst_rq);
+
+ schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+
+ ld_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If find_busiest_group has found
+ * an imbalance but busiest->nr_running <= 1, the group is
+ * still unbalanced. ld_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ env.flags |= LBF_ALL_PINNED;
+ env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+
+more_balance:
+ raw_spin_lock_irqsave(&busiest->lock, flags);
+
+ /*
+ * cur_ld_moved - load moved in current iteration
+ * ld_moved - cumulative load moved across iterations
+ */
+ cur_ld_moved = detach_tasks(&env);
+
+ /*
+ * We've detached some tasks from busiest_rq. Every
+ * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+ * unlock busiest->lock, and we are able to be sure
+ * that nobody can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
+ */
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
+ local_irq_restore(flags);
+
+ if (env.flags & LBF_NEED_BREAK) {
+ env.flags &= ~LBF_NEED_BREAK;
+ goto more_balance;
+ }
+
+ /*
+ * Revisit (affine) tasks on src_cpu that couldn't be moved to
+ * us and move them to an alternate dst_cpu in our sched_group
+ * where they can run. The upper limit on how many times we
+ * iterate on same src_cpu is dependent on number of cpus in our
+ * sched_group.
+ *
+ * This changes load balance semantics a bit on who can move
+ * load to a given_cpu. In addition to the given_cpu itself
+ * (or a ilb_cpu acting on its behalf where given_cpu is
+ * nohz-idle), we now have balance_cpu in a position to move
+ * load to given_cpu. In rare situations, this may cause
+ * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+ * _independently_ and at _same_ time to move some load to
+ * given_cpu) causing exceess load to be moved to given_cpu.
+ * This however should not happen so much in practice and
+ * moreover subsequent load balance cycles should correct the
+ * excess load moved.
+ */
+ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
+ env.dst_rq = cpu_rq(env.new_dst_cpu);
+ env.dst_cpu = env.new_dst_cpu;
+ env.flags &= ~LBF_DST_PINNED;
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+
+ /*
+ * Go back to "more_balance" rather than "redo" since we
+ * need to continue with same src_cpu.
+ */
+ goto more_balance;
+ }
+
+ /*
+ * We failed to reach balance because of affinity.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
+ *group_imbalance = 1;
+ }
+
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(env.flags & LBF_ALL_PINNED)) {
+ cpumask_clear_cpu(cpu_of(busiest), cpus);
+ if (!cpumask_empty(cpus)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+ goto redo;
+ }
+ goto out_all_pinned;
+ }
+ }
+
+ if (!ld_moved) {
+ schedstat_inc(sd, lb_failed[idle]);
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;
+
+ if (need_active_balance(&env)) {
+ raw_spin_lock_irqsave(&busiest->lock, flags);
+
+ /* don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest cpu can't be
+ * moved to this_cpu
+ */
+ if (!cpumask_test_cpu(this_cpu,
+ tsk_cpus_allowed(busiest->curr))) {
+ raw_spin_unlock_irqrestore(&busiest->lock,
+ flags);
+ env.flags |= LBF_ALL_PINNED;
+ goto out_one_pinned;
+ }
+
+ /*
+ * ->active_balance synchronizes accesses to
+ * ->active_balance_work. Once set, it's cleared
+ * only after active load balance is finished.
+ */
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ active_balance = 1;
+ }
+ raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
+ if (active_balance) {
+ stop_one_cpu_nowait(cpu_of(busiest),
+ active_load_balance_cpu_stop, busiest,
+ &busiest->active_balance_work);
+ }
+
+ /*
+ * We've kicked active balancing, reset the failure
+ * counter.
+ */
+ sd->nr_balance_failed = sd->cache_nice_tries+1;
+ }
+ } else
+ sd->nr_balance_failed = 0;
+
+ if (likely(!active_balance)) {
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+ } else {
+ /*
+ * If we've begun active balancing, start to back off. This
+ * case may not be covered by the all_pinned logic if there
+ * is only 1 task on the busy runqueue (because we don't call
+ * detach_tasks).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+ }
+
+ goto out;
+
+out_balanced:
+ /*
+ * We reach balance although we may have faced some affinity
+ * constraints. Clear the imbalance flag if it was set.
+ */
+ if (sd_parent) {
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+ if (*group_imbalance)
+ *group_imbalance = 0;
+ }
+
+out_all_pinned:
+ /*
+ * We reach balance because all tasks are pinned at this level so
+ * we can't migrate them. Let the imbalance flag set so parent level
+ * can try to migrate them.
+ */
+ schedstat_inc(sd, lb_balanced[idle]);
+
+ sd->nr_balance_failed = 0;
+
+out_one_pinned:
+ /* tune up the balancing interval */
+ if (((env.flags & LBF_ALL_PINNED) &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ (sd->balance_interval < sd->max_interval))
+ sd->balance_interval *= 2;
+
+ ld_moved = 0;
+out:
+ return ld_moved;
+}
+
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+ unsigned long interval = sd->balance_interval;
+
+ if (cpu_busy)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+
+ return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+ unsigned long interval, next;
+
+ interval = get_sd_balance_interval(sd, cpu_busy);
+ next = sd->last_balance + interval;
+
+ if (time_after(*next_balance, next))
+ *next_balance = next;
+}
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static int idle_balance(struct rq *this_rq)
+{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
+ struct sched_domain *sd;
+ int pulled_task = 0;
+ u64 curr_cost = 0;
+
+ idle_enter_fair(this_rq);
+
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ this_rq->idle_stamp = rq_clock(this_rq);
+
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, 0, &next_balance);
+ rcu_read_unlock();
+
+ goto out;
+ }
+
+ /*
+ * Drop the rq->lock, but keep IRQ/preempt disabled.
+ */
+ raw_spin_unlock(&this_rq->lock);
+
+ update_blocked_averages(this_cpu);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ int continue_balancing = 1;
+ u64 t0, domain_cost;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, 0, &next_balance);
+ break;
+ }
+
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ t0 = sched_clock_cpu(this_cpu);
+
+ pulled_task = load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ domain_cost = sched_clock_cpu(this_cpu) - t0;
+ if (domain_cost > sd->max_newidle_lb_cost)
+ sd->max_newidle_lb_cost = domain_cost;
+
+ curr_cost += domain_cost;
+ }
+
+ update_next_balance(sd, 0, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
+ break;
+ }
+ rcu_read_unlock();
+
+ raw_spin_lock(&this_rq->lock);
+
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task)
+ pulled_task = 1;
+
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ pulled_task = -1;
+
+ if (pulled_task) {
+ idle_exit_fair(this_rq);
+ this_rq->idle_stamp = 0;
+ }
+
+ return pulled_task;
+}
+
+/*
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
+ */
+static int active_load_balance_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+ struct task_struct *p = NULL;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance))
+ goto out_unlock;
+
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+ cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ p = detach_one_task(&env);
+ if (p)
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (p)
+ attach_one_task(target_rq, p);
+
+ local_irq_enable();
+
+ return 0;
+}
+
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing details
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline int find_new_ilb(void)
+{
+ int ilb = cpumask_first(nohz.idle_cpus_mask);
+
+ if (ilb < nr_cpu_ids && idle_cpu(ilb))
+ return ilb;
+
+ return nr_cpu_ids;
+}
+
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(void)
+{
+ int ilb_cpu;
+
+ nohz.next_balance++;
+
+ ilb_cpu = find_new_ilb();
+
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+
+ if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+ return;
+ /*
+ * Use smp_send_reschedule() instead of resched_cpu().
+ * This way we generate a sched IPI on the target cpu which
+ * is idle. And the softirq performing nohz idle load balance
+ * will be run before returning from the IPI.
+ */
+ smp_send_reschedule(ilb_cpu);
+ return;
+}
+
+static inline void nohz_balance_exit_idle(int cpu)
+{
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+ /*
+ * Completely isolated CPUs don't ever set, so we must test.
+ */
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
+ clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+ }
+}
+
+static inline void set_cpu_sd_state_busy(void)
+{
+ struct sched_domain *sd;
+ int cpu = smp_processor_id();
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+void set_cpu_sd_state_idle(void)
+{
+ struct sched_domain *sd;
+ int cpu = smp_processor_id();
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * This routine will record that the cpu is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
+ */
+void nohz_balance_enter_idle(int cpu)
+{
+ /*
+ * If this cpu is going down, then nothing needs to be done.
+ */
+ if (!cpu_active(cpu))
+ return;
+
+ if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
+ return;
+
+ /*
+ * If we're a completely isolated CPU, we don't play.
+ */
+ if (on_null_domain(cpu_rq(cpu)))
+ return;
+
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+}
+
+static int sched_ilb_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DYING:
+ nohz_balance_exit_idle(smp_processor_id());
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+void update_max_interval(void)
+{
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in init_sched_domains.
+ */
+static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+{
+ int continue_balancing = 1;
+ int cpu = rq->cpu;
+ unsigned long interval;
+ struct sched_domain *sd;
+ /* Earliest time when we have to do rebalance again */
+ unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
+ int need_serialize, need_decay = 0;
+ u64 max_cost = 0;
+
+ update_blocked_averages(cpu);
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ /*
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains. Decay ~1% per second.
+ */
+ if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+ sd->max_newidle_lb_cost =
+ (sd->max_newidle_lb_cost * 253) / 256;
+ sd->next_decay_max_lb_cost = jiffies + HZ;
+ need_decay = 1;
+ }
+ max_cost += sd->max_newidle_lb_cost;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!continue_balancing) {
+ if (need_decay)
+ continue;
+ break;
+ }
+
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+
+ need_serialize = sd->flags & SD_SERIALIZE;
+ if (need_serialize) {
+ if (!spin_trylock(&balancing))
+ goto out;
+ }
+
+ if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ /*
+ * The LBF_DST_PINNED logic could have changed
+ * env->dst_cpu, so we can't know our idle
+ * state even if we migrated tasks. Update it.
+ */
+ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
+ }
+ sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ }
+ if (need_serialize)
+ spin_unlock(&balancing);
+out:
+ if (time_after(next_balance, sd->last_balance + interval)) {
+ next_balance = sd->last_balance + interval;
+ update_next_balance = 1;
+ }
+ }
+ if (need_decay) {
+ /*
+ * Ensure the rq-wide value also decays but keep it at a
+ * reasonable floor to avoid funnies with rq->avg_idle.
+ */
+ rq->max_idle_balance_cost =
+ max((u64)sysctl_sched_migration_cost, max_cost);
+ }
+ rcu_read_unlock();
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the cpu is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ rq->next_balance = next_balance;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ int this_cpu = this_rq->cpu;
+ struct rq *rq;
+ int balance_cpu;
+
+ if (idle != CPU_IDLE ||
+ !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
+ goto end;
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+ continue;
+
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched())
+ break;
+
+ rq = cpu_rq(balance_cpu);
+
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ rebalance_domains(rq, CPU_IDLE);
+ }
+
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ nohz.next_balance = this_rq->next_balance;
+end:
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu in the system.
+ * - This rq has more than one task.
+ * - This rq has at least one CFS task and the capacity of the CPU is
+ * significantly reduced because of RT tasks or IRQs.
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ * multiple busy cpu.
+ * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ * domain span are idle.
+ */
+static inline bool nohz_kick_needed(struct rq *rq)
+{
+ unsigned long now = jiffies;
+ struct sched_domain *sd;
+ struct sched_group_capacity *sgc;
+ int nr_busy, cpu = rq->cpu;
+ bool kick = false;
+
+ if (unlikely(rq->idle_balance))
+ return false;
+
+ /*
+ * We may be recently in ticked or tickless idle mode. At the first
+ * busy tick after returning from idle, we will update the busy stats.
+ */
+ set_cpu_sd_state_busy();
+ nohz_balance_exit_idle(cpu);
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return false;
+
+ if (time_before(now, nohz.next_balance))
+ return false;
+
+ if (rq->nr_running >= 2)
+ return true;
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ if (sd) {
+ sgc = sd->groups->sgc;
+ nr_busy = atomic_read(&sgc->nr_busy_cpus);
+
+ if (nr_busy > 1) {
+ kick = true;
+ goto unlock;
+ }
+
+ }
+
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
+ if ((rq->cfs.h_nr_running >= 1) &&
+ check_cpu_capacity(rq, sd)) {
+ kick = true;
+ goto unlock;
+ }
+ }
+
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu)) {
+ kick = true;
+ goto unlock;
+ }
+
+unlock:
+ rcu_read_unlock();
+ return kick;
+}
+#else
+static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ struct rq *this_rq = this_rq();
+ enum cpu_idle_type idle = this_rq->idle_balance ?
+ CPU_IDLE : CPU_NOT_IDLE;
+
+ /*
+ * If this cpu has a pending nohz_balance_kick, then do the
+ * balancing on behalf of the other idle cpus whose ticks are
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * give the idle cpus a chance to load balance. Else we may
+ * load balance only within the local sched_domain hierarchy
+ * and abort nohz_idle_balance altogether if we pull some load.
+ */
+ nohz_idle_balance(this_rq, idle);
+ rebalance_domains(this_rq, idle);
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ */
+void trigger_load_balance(struct rq *rq)
+{
+ /* Don't need to rebalance while attached to NULL domain */
+ if (unlikely(on_null_domain(rq)))
+ return;
+
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ_COMMON
+ if (nohz_kick_needed(rq))
+ nohz_balancer_kick();
+#endif
+}
+
+static void rq_online_fair(struct rq *rq)
+{
+ update_sysctl();
+
+ update_runtime_enabled(rq);
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+ update_sysctl();
+
+ /* Ensure any throttled groups are reachable by pick_next_task */
+ unthrottle_offline_cfs_rqs(rq);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * scheduler tick hitting a task of our scheduling class:
+ */
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &curr->se;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ entity_tick(cfs_rq, se, queued);
+ }
+
+ if (numabalancing_enabled)
+ task_tick_numa(rq, curr);
+
+ update_rq_runnable_avg(rq, 1);
+}
+
+/*
+ * called on fork with the child task as argument from the parent's context
+ * - child not yet on the tasklist
+ * - preemption disabled
+ */
+static void task_fork_fair(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se, *curr;
+ int this_cpu = smp_processor_id();
+ struct rq *rq = this_rq();
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ update_rq_clock(rq);
+
+ cfs_rq = task_cfs_rq(current);
+ curr = cfs_rq->curr;
+
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
+
+ update_curr(cfs_rq);
+
+ if (curr)
+ se->vruntime = curr->vruntime;
+ place_entity(cfs_rq, se, 1);
+
+ if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
+ /*
+ * Upon rescheduling, sched_class::put_prev_task() will place
+ * 'current' within the tree based on its new key value.
+ */
+ swap(curr->vruntime, se->vruntime);
+ resched_curr(rq);
+ }
+
+ se->vruntime -= cfs_rq->min_vruntime;
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ */
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ if (!task_on_rq_queued(p))
+ return;
+
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (rq->curr == p) {
+ if (p->prio > oldprio)
+ resched_curr(rq);
+ } else
+ check_preempt_curr(rq, p, 0);
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Ensure the task's vruntime is normalized, so that when it's
+ * switched back to the fair class the enqueue_entity(.flags=0) will
+ * do the right thing.
+ *
+ * If it's queued, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !queued, then only when
+ * the task is sleeping will it still have non-normalized vruntime.
+ */
+ if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * Remove our load from contribution when we leave sched_fair
+ * and ensure we don't carry in an old decay_count if we
+ * switch back.
+ */
+ if (se->avg.decay_count) {
+ __synchronize_entity_decay(se);
+ subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+ }
+#endif
+}
+
+/*
+ * We switched to the sched_fair class.
+ */
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *se = &p->se;
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+ if (!task_on_rq_queued(p))
+ return;
+
+ /*
+ * We were most likely switched from sched_rt, so
+ * kick off the schedule if running, otherwise just see
+ * if we can still preempt the current task.
+ */
+ if (rq->curr == p)
+ resched_curr(rq);
+ else
+ check_preempt_curr(rq, p, 0);
+}
+
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+ struct sched_entity *se = &rq->curr->se;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ set_next_entity(cfs_rq, se);
+ /* ensure bandwidth has been allocated on our new cfs_rq */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ }
+}
+
+void init_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->tasks_timeline = RB_ROOT;
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+#ifdef CONFIG_SMP
+ atomic64_set(&cfs_rq->decay_counter, 1);
+ atomic_long_set(&cfs_rq->removed_load, 0);
+#endif
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_move_group_fair(struct task_struct *p, int queued)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ /*
+ * When !queued, vruntime of the task has usually NOT been normalized.
+ * But there are some cases where it has already been normalized:
+ *
+ * - Moving a forked child which is waiting for being woken up by
+ * wake_up_new_task().
+ * - Moving a task which has been woken up by try_to_wake_up() and
+ * waiting for actually being woken up by sched_ttwu_pending().
+ *
+ * To prevent boost or penalty in the new cfs_rq caused by delta
+ * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
+ */
+ if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+ queued = 1;
+
+ if (!queued)
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+ if (!queued) {
+ cfs_rq = cfs_rq_of(se);
+ se->vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+ /*
+ * migrate_task_rq_fair() will have removed our previous
+ * contribution, but we must synchronize for ongoing future
+ * decay.
+ */
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+#endif
+ }
+}
+
+void free_fair_sched_group(struct task_group *tg)
+{
+ int i;
+
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq[i]);
+ if (tg->se)
+ kfree(tg->se[i]);
+ }
+
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+}
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ int i;
+
+ tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->se)
+ goto err;
+
+ tg->shares = NICE_0_LOAD;
+
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
+
+ se = kzalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!se)
+ goto err_free_rq;
+
+ init_cfs_rq(cfs_rq);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ }
+
+ return 1;
+
+err_free_rq:
+ kfree(cfs_rq);
+err:
+ return 0;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu,
+ struct sched_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+ init_cfs_rq_runtime(cfs_rq);
+
+ tg->cfs_rq[cpu] = cfs_rq;
+ tg->se[cpu] = se;
+
+ /* se could be NULL for root_task_group */
+ if (!se)
+ return;
+
+ if (!parent) {
+ se->cfs_rq = &rq->cfs;
+ se->depth = 0;
+ } else {
+ se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }
+
+ se->my_q = cfs_rq;
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
+ se->parent = parent;
+}
+
+static DEFINE_MUTEX(shares_mutex);
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int i;
+ unsigned long flags;
+
+ /*
+ * We can't change the weight of the root cgroup.
+ */
+ if (!tg->se[0])
+ return -EINVAL;
+
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+
+ mutex_lock(&shares_mutex);
+ if (tg->shares == shares)
+ goto done;
+
+ tg->shares = shares;
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se;
+
+ se = tg->se[i];
+ /* Propagate contribution to hierarchy */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /* Possible calls to update_curr() need rq clock */
+ update_rq_clock(rq);
+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+done:
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+void free_fair_sched_group(struct task_group *tg) { }
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+
+static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
+{
+ struct sched_entity *se = &task->se;
+ unsigned int rr_interval = 0;
+
+ /*
+ * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
+ * idle runqueue:
+ */
+ if (rq->cfs.load.weight)
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+
+ return rr_interval;
+}
+
+/*
+ * All the scheduling class methods:
+ */
+const struct sched_class fair_sched_class = {
+ .next = &idle_sched_class,
+ .enqueue_task = enqueue_task_fair,
+ .dequeue_task = dequeue_task_fair,
+ .yield_task = yield_task_fair,
+ .yield_to_task = yield_to_task_fair,
+
+ .check_preempt_curr = check_preempt_wakeup,
+
+ .pick_next_task = pick_next_task_fair,
+ .put_prev_task = put_prev_task_fair,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_fair,
+ .migrate_task_rq = migrate_task_rq_fair,
+
+ .rq_online = rq_online_fair,
+ .rq_offline = rq_offline_fair,
+
+ .task_waking = task_waking_fair,
+#endif
+
+ .set_curr_task = set_curr_task_fair,
+ .task_tick = task_tick_fair,
+ .task_fork = task_fork_fair,
+
+ .prio_changed = prio_changed_fair,
+ .switched_from = switched_from_fair,
+ .switched_to = switched_to_fair,
+
+ .get_rr_interval = get_rr_interval_fair,
+
+ .update_curr = update_curr_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .task_move_group = task_move_group_fair,
+#endif
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+void print_cfs_stats(struct seq_file *m, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ print_cfs_rq(m, cpu, cfs_rq);
+ rcu_read_unlock();
+}
+#endif
+
+__init void init_sched_fair_class(void)
+{
+#ifdef CONFIG_SMP
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+
+#ifdef CONFIG_NO_HZ_COMMON
+ nohz.next_balance = jiffies;
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ cpu_notifier(sched_ilb_notifier, 0);
+#endif
+#endif /* SMP */
+
+}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
new file mode 100644
index 000000000..91e33cd48
--- /dev/null
+++ b/kernel/sched/features.h
@@ -0,0 +1,98 @@
+/*
+ * Only give sleepers 50% of their service deficit. This allows
+ * them to run sooner, but does not allow tons of sleepers to
+ * rip the spread apart.
+ */
+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
+
+/*
+ * Place new tasks ahead so that they do not starve already running
+ * tasks
+ */
+SCHED_FEAT(START_DEBIT, true)
+
+/*
+ * Prefer to schedule the task we woke last (assuming it failed
+ * wakeup-preemption), since its likely going to consume data we
+ * touched, increases cache locality.
+ */
+SCHED_FEAT(NEXT_BUDDY, false)
+
+/*
+ * Prefer to schedule the task that ran last (when we did
+ * wake-preempt) as that likely will touch the same data, increases
+ * cache locality.
+ */
+SCHED_FEAT(LAST_BUDDY, true)
+
+/*
+ * Consider buddies to be cache hot, decreases the likelyness of a
+ * cache buddy being migrated away, increases cache locality.
+ */
+SCHED_FEAT(CACHE_HOT_BUDDY, true)
+
+/*
+ * Allow wakeup-time preemption of the current task:
+ */
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
+
+/*
+ * Use arch dependent cpu capacity functions
+ */
+SCHED_FEAT(ARCH_CAPACITY, true)
+
+SCHED_FEAT(HRTICK, false)
+SCHED_FEAT(DOUBLE_TICK, false)
+SCHED_FEAT(LB_BIAS, true)
+
+/*
+ * Decrement CPU capacity based on time not spent running tasks
+ */
+SCHED_FEAT(NONTASK_CAPACITY, true)
+
+/*
+ * Queue remote wakeups on the target CPU and process them
+ * using the scheduler IPI. Reduces rq->lock contention/bounces.
+ */
+SCHED_FEAT(TTWU_QUEUE, true)
+
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * In order to avoid a thundering herd attack of CPUs that are
+ * lowering their priorities at the same time, and there being
+ * a single CPU that has an RT task that can migrate and is waiting
+ * to run, where the other CPUs will try to take that CPUs
+ * rq lock and possibly create a large contention, sending an
+ * IPI to that CPU and let that CPU push the RT task to where
+ * it should go may be a better scenario.
+ */
+SCHED_FEAT(RT_PUSH_IPI, true)
+#endif
+
+SCHED_FEAT(FORCE_SD_OVERLAP, false)
+SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy. Enabled automatically
+ * at runtime if running on a NUMA machine. Can be controlled via
+ * numa_balancing=
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, false)
+
+/*
+ * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
+ * higher number of hinting faults are recorded during active load
+ * balancing.
+ */
+SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
+
+/*
+ * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
+ * lower number of hinting faults have been recorded. As this has
+ * the potential to prevent a task ever migrating to a new node
+ * due to CPU overload it is disabled by default.
+ */
+SCHED_FEAT(NUMA_RESIST_LOWER, false)
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000..70e698d02
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,302 @@
+/*
+ * Generic entry point for the idle threads
+ */
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/stackprotector.h>
+#include <linux/suspend.h>
+
+#include <asm/tlb.h>
+
+#include <trace/events/power.h>
+
+#ifdef CONFIG_SCHED_BFS
+#include "bfs_sched.h"
+#else
+#include "sched.h"
+#endif
+
+static int __read_mostly cpu_idle_force_poll;
+
+void cpu_idle_poll_ctrl(bool enable)
+{
+ if (enable) {
+ cpu_idle_force_poll++;
+ } else {
+ cpu_idle_force_poll--;
+ WARN_ON_ONCE(cpu_idle_force_poll < 0);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
+static int __init cpu_idle_poll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 1;
+ return 1;
+}
+__setup("nohlt", cpu_idle_poll_setup);
+
+static int __init cpu_idle_nopoll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 0;
+ return 1;
+}
+__setup("hlt", cpu_idle_nopoll_setup);
+#endif
+
+static inline int cpu_idle_poll(void)
+{
+ rcu_idle_enter();
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
+ local_irq_enable();
+ while (!tif_need_resched() &&
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ cpu_relax();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ rcu_idle_exit();
+ return 1;
+}
+
+/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_prepare(void) { }
+void __weak arch_cpu_idle_enter(void) { }
+void __weak arch_cpu_idle_exit(void) { }
+void __weak arch_cpu_idle_dead(void) { }
+void __weak arch_cpu_idle(void)
+{
+ cpu_idle_force_poll = 1;
+ local_irq_enable();
+}
+
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
+ */
+static void cpuidle_idle_call(void)
+{
+ struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int next_state, entered_state;
+ bool reflect;
+
+ /*
+ * Check if the idle task must be rescheduled. If it is the
+ * case, exit the function after re-enabling the local irq.
+ */
+ if (need_resched()) {
+ local_irq_enable();
+ return;
+ }
+
+ /*
+ * During the idle period, stop measuring the disabled irqs
+ * critical sections latencies
+ */
+ stop_critical_timings();
+
+ /*
+ * Tell the RCU framework we are entering an idle section,
+ * so no more rcu read side critical sections and one more
+ * step to the grace period
+ */
+ rcu_idle_enter();
+
+ if (cpuidle_not_available(drv, dev))
+ goto use_default;
+
+ /*
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in iterrupts (if any). In that case bypass
+ * the cpuidle governor and go stratight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
+ */
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state >= 0) {
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ reflect = false;
+ next_state = cpuidle_find_deepest_state(drv, dev);
+ } else {
+ reflect = true;
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ */
+ next_state = cpuidle_select(drv, dev);
+ }
+ /* Fall back to the default arch idle method on errors. */
+ if (next_state < 0)
+ goto use_default;
+
+ /*
+ * The idle task must be scheduled, it is pointless to
+ * go to idle, just update no idle residency and get
+ * out of this function
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ entered_state = next_state;
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ /* Take note of the planned idle state. */
+ idle_set_state(this_rq(), &drv->states[next_state]);
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ entered_state = cpuidle_enter(drv, dev, next_state);
+
+ /* The cpu is no longer idle or about to enter idle. */
+ idle_set_state(this_rq(), NULL);
+
+ if (entered_state == -EBUSY)
+ goto use_default;
+
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ if (reflect)
+ cpuidle_reflect(dev, entered_state);
+
+exit_idle:
+ __current_set_polling();
+
+ /*
+ * It is up to the idle functions to reenable local interrupts
+ */
+ if (WARN_ON_ONCE(irqs_disabled()))
+ local_irq_enable();
+
+ rcu_idle_exit();
+ start_critical_timings();
+ return;
+
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
+}
+
+DEFINE_PER_CPU(bool, cpu_dead_idle);
+
+/*
+ * Generic idle loop implementation
+ *
+ * Called with polling cleared.
+ */
+static void cpu_idle_loop(void)
+{
+ while (1) {
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if
+ * rq->curr != rq->idle). This means that, if rq->idle has
+ * the polling bit set, then setting need_resched is
+ * guaranteed to cause the cpu to reschedule.
+ */
+
+ __current_set_polling();
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id())) {
+ rcu_cpu_notify(NULL, CPU_DYING_IDLE,
+ (void *)(long)smp_processor_id());
+ smp_mb(); /* all activity before dead. */
+ this_cpu_write(cpu_dead_idle, true);
+ arch_cpu_idle_dead();
+ }
+
+ local_irq_disable();
+ arch_cpu_idle_enter();
+
+ /*
+ * In poll mode we reenable interrupts and spin.
+ *
+ * Also if we detected in the wakeup from idle
+ * path that the tick broadcast device expired
+ * for us, we don't want to go deep idle as we
+ * know that the IPI is going to arrive right
+ * away
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+
+ arch_cpu_idle_exit();
+ }
+
+ /*
+ * Since we fell out of the loop above, we know
+ * TIF_NEED_RESCHED must be set, propagate it into
+ * PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will
+ * not have had an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending and reschedule
+ * if need_resched is set while polling is set. That
+ * means that clearing polling needs to be visible
+ * before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
+ }
+}
+
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ /*
+ * This #ifdef needs to die, but it's too late in the cycle to
+ * make this generic (arm and sh have never invoked the canary
+ * init for the non boot cpus!). Will be fixed in 3.11
+ */
+#ifdef CONFIG_X86
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. The boot CPU already has it initialized but no harm
+ * in doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+#endif
+ arch_cpu_idle_prepare();
+ cpu_idle_loop();
+}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
new file mode 100644
index 000000000..c65dac8c9
--- /dev/null
+++ b/kernel/sched/idle_task.c
@@ -0,0 +1,109 @@
+#include "sched.h"
+
+/*
+ * idle-task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE tasks which are
+ * handled in sched/fair.c)
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ return task_cpu(p); /* IDLE tasks as never migrated */
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Idle tasks are unconditionally rescheduled:
+ */
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+ resched_curr(rq);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
+{
+ put_prev_task(rq, prev);
+
+ schedstat_inc(rq, sched_goidle);
+ return rq->idle;
+}
+
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static void
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+ raw_spin_unlock_irq(&rq->lock);
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ raw_spin_lock_irq(&rq->lock);
+}
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
+}
+
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
+{
+ BUG();
+}
+
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ BUG();
+}
+
+static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
+{
+ return 0;
+}
+
+static void update_curr_idle(struct rq *rq)
+{
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+const struct sched_class idle_sched_class = {
+ /* .next is NULL */
+ /* no enqueue/yield_task for idle tasks */
+
+ /* dequeue is not valid, we print a debug message there: */
+ .dequeue_task = dequeue_task_idle,
+
+ .check_preempt_curr = check_preempt_curr_idle,
+
+ .pick_next_task = pick_next_task_idle,
+ .put_prev_task = put_prev_task_idle,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_idle,
+#endif
+
+ .set_curr_task = set_curr_task_idle,
+ .task_tick = task_tick_idle,
+
+ .get_rr_interval = get_rr_interval_idle,
+
+ .prio_changed = prio_changed_idle,
+ .switched_to = switched_to_idle,
+ .update_curr = update_curr_idle,
+};
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 000000000..8ecd552fe
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,584 @@
+/*
+ * kernel/sched/proc.c
+ *
+ * Kernel load calculations, forked from sched/core.c
+ */
+
+#include <linux/export.h>
+
+#include "sched.h"
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
+ long delta;
+
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
+ delta = calc_load_fold_active(this_rq);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
+}
+
+void calc_load_exit_idle(void)
+{
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the sample window, we're done.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+
+ return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+ long delta, active, n;
+
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(unsigned long ticks)
+{
+ long active, delta;
+
+ if (time_before(jiffies, calc_load_update + 10))
+ return;
+
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update += LOAD_FREQ;
+
+ /*
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+ */
+ calc_global_nohz();
+}
+
+/*
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long delta;
+
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ delta = calc_load_fold_active(this_rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * End of global load-average stuff
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
+{
+ int i, scale;
+
+ this_rq->nr_load_updates++;
+
+ /* Update our load: */
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ unsigned long old_load, new_load;
+
+ /* scale is effectively 1 << i now, and >> i divides by scale */
+
+ old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ new_load = this_load;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+ }
+
+ sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_SMP
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->cfs.runnable_load_avg;
+}
+#else
+static inline unsigned long get_rq_runnable_load(struct rq *rq)
+{
+ return rq->load.weight;
+}
+#endif
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long load = get_rq_runnable_load(this_rq);
+ unsigned long pending_updates;
+
+ /*
+ * bail if there's load or we're actually up-to-date.
+ */
+ if (load || curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
+ __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+ struct rq *this_rq = this_rq();
+ unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+ unsigned long pending_updates;
+
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&this_rq->lock);
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ if (pending_updates) {
+ this_rq->last_load_update_tick = curr_jiffies;
+ /*
+ * We were idle, this means load 0, the current load might be
+ * !0 due to remote wakeups and the sort.
+ */
+ __update_cpu_load(this_rq, 0, pending_updates);
+ }
+ raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+ unsigned long load = get_rq_runnable_load(this_rq);
+ /*
+ * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+ */
+ this_rq->last_load_update_tick = jiffies;
+ __update_cpu_load(this_rq, load, 1);
+
+ calc_load_account_active(this_rq);
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
new file mode 100644
index 000000000..575da76a3
--- /dev/null
+++ b/kernel/sched/rt.c
@@ -0,0 +1,2346 @@
+/*
+ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
+ * policies)
+ */
+
+#include "sched.h"
+
+#include <linux/slab.h>
+#include <linux/irq_work.h>
+
+int sched_rr_timeslice = RR_TIMESLICE;
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
+struct rt_bandwidth def_rt_bandwidth;
+
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+ struct rt_bandwidth *rt_b =
+ container_of(timer, struct rt_bandwidth, rt_period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_rt_period_timer(rt_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
+{
+ rt_b->rt_period = ns_to_ktime(period);
+ rt_b->rt_runtime = runtime;
+
+ raw_spin_lock_init(&rt_b->rt_runtime_lock);
+
+ hrtimer_init(&rt_b->rt_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_b->rt_period_timer.function = sched_rt_period_timer;
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ if (hrtimer_active(&rt_b->rt_period_timer))
+ return;
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+}
+
+#ifdef CONFIG_SMP
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array;
+ int i;
+
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
+ rt_rq->rt_nr_migratory = 0;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+ rt_rq->push_flags = 0;
+ rt_rq->push_cpu = nr_cpu_ids;
+ raw_spin_lock_init(&rt_rq->push_lock);
+ init_irq_work(&rt_rq->push_work, push_irq_work_func);
+#endif
+#endif /* CONFIG_SMP */
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ hrtimer_cancel(&rt_b->rt_period_timer);
+}
+
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ return rt_se->rt_rq;
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
+ if (tg->rt_se)
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
+
+ for_each_possible_cpu(i) {
+ if (tg->rt_rq)
+ kfree(tg->rt_rq[i]);
+ if (tg->rt_se)
+ kfree(tg->rt_se[i]);
+ }
+
+ kfree(tg->rt_rq);
+ kfree(tg->rt_se);
+}
+
+void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+ struct sched_rt_entity *rt_se, int cpu,
+ struct sched_rt_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
+ rt_rq->tg = tg;
+
+ tg->rt_rq[cpu] = rt_rq;
+ tg->rt_se[cpu] = rt_se;
+
+ if (!rt_se)
+ return;
+
+ if (!parent)
+ rt_se->rt_rq = &rq->rt;
+ else
+ rt_se->rt_rq = parent->my_q;
+
+ rt_se->my_q = rt_rq;
+ rt_se->parent = parent;
+ INIT_LIST_HEAD(&rt_se->run_list);
+}
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct rt_rq *rt_rq;
+ struct sched_rt_entity *rt_se;
+ int i;
+
+ tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->rt_rq)
+ goto err;
+ tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->rt_se)
+ goto err;
+
+ init_rt_bandwidth(&tg->rt_bandwidth,
+ ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+
+ for_each_possible_cpu(i) {
+ rt_rq = kzalloc_node(sizeof(struct rt_rq),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!rt_rq)
+ goto err;
+
+ rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!rt_se)
+ goto err_free_rq;
+
+ init_rt_rq(rt_rq);
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+ }
+
+ return 1;
+
+err_free_rq:
+ kfree(rt_rq);
+err:
+ return 0;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+#define rt_entity_is_task(rt_se) (1)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+ return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p = rt_task_of(rt_se);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
+ return &rq->rt;
+}
+
+void free_rt_sched_group(struct task_group *tg) { }
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SMP
+
+static int pull_rt_task(struct rq *this_rq);
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ return rq->rt.highest_prio.curr > prev->prio;
+}
+
+static inline int rt_overloaded(struct rq *rq)
+{
+ return atomic_read(&rq->rd->rto_count);
+}
+
+static inline void rt_set_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
+ /*
+ * Make sure the mask is visible before we set
+ * the overload count. That is checked to determine
+ * if we should look at the mask. It would be a shame
+ * if we looked at the mask, but the mask was not
+ * updated yet.
+ *
+ * Matched by the barrier in pull_rt_task().
+ */
+ smp_wmb();
+ atomic_inc(&rq->rd->rto_count);
+}
+
+static inline void rt_clear_overload(struct rq *rq)
+{
+ if (!rq->online)
+ return;
+
+ /* the order here really doesn't matter */
+ atomic_dec(&rq->rd->rto_count);
+ cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
+}
+
+static void update_rt_migration(struct rt_rq *rt_rq)
+{
+ if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
+ if (!rt_rq->overloaded) {
+ rt_set_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 1;
+ }
+ } else if (rt_rq->overloaded) {
+ rt_clear_overload(rq_of_rt_rq(rt_rq));
+ rt_rq->overloaded = 0;
+ }
+}
+
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ p = rt_task_of(rt_se);
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total++;
+ if (p->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory++;
+
+ update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ return;
+
+ p = rt_task_of(rt_se);
+ rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+ rt_rq->rt_nr_total--;
+ if (p->nr_cpus_allowed > 1)
+ rt_rq->rt_nr_migratory--;
+
+ update_rt_migration(rt_rq);
+}
+
+static inline int has_pushable_tasks(struct rq *rq)
+{
+ return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+ /*
+ * We detect this state here so that we can avoid taking the RQ
+ * lock again later if there is no need to push
+ */
+ rq->post_schedule = has_pushable_tasks(rq);
+}
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_node_init(&p->pushable_tasks, p->prio);
+ plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+ /* Update the highest prio pushable task */
+ if (p->prio < rq->rt.highest_prio.next)
+ rq->rt.highest_prio.next = p->prio;
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+ plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+ /* Update the new highest prio pushable task */
+ if (has_pushable_tasks(rq)) {
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+ rq->rt.highest_prio.next = p->prio;
+ } else
+ rq->rt.highest_prio.next = MAX_RT_PRIO;
+}
+
+#else
+
+static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_rt_task(struct rq *this_rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
+static inline int on_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return !list_empty(&rt_se->run_list);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+ if (!rt_rq->tg)
+ return RUNTIME_INF;
+
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
+}
+
+typedef struct task_group *rt_rq_iter_t;
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ do {
+ tg = list_entry_rcu(tg->list.next,
+ typeof(struct task_group), list);
+ } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
+
+ if (&tg->list == &task_groups)
+ tg = NULL;
+
+ return tg;
+}
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for (iter = container_of(&task_groups, typeof(*iter), list); \
+ (iter = next_task_group(iter)) && \
+ (rt_rq = iter->rt_rq[cpu_of(rq)]);)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = rt_se->parent)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return rt_se->my_q;
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct sched_rt_entity *rt_se;
+
+ int cpu = cpu_of(rq);
+
+ rt_se = rt_rq->tg->rt_se[cpu];
+
+ if (rt_rq->rt_nr_running) {
+ if (!rt_se)
+ enqueue_top_rt_rq(rt_rq);
+ else if (!on_rt_rq(rt_se))
+ enqueue_rt_entity(rt_se, false);
+
+ if (rt_rq->highest_prio.curr < curr->prio)
+ resched_curr(rq);
+ }
+}
+
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ struct sched_rt_entity *rt_se;
+ int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+
+ rt_se = rt_rq->tg->rt_se[cpu];
+
+ if (!rt_se)
+ dequeue_top_rt_rq(rt_rq);
+ else if (on_rt_rq(rt_se))
+ dequeue_rt_entity(rt_se);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+ struct task_struct *p;
+
+ if (rt_rq)
+ return !!rt_rq->rt_nr_boosted;
+
+ p = rt_task_of(rt_se);
+ return p->prio != p->normal_prio;
+}
+
+#ifdef CONFIG_SMP
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return this_rq()->rd->span;
+}
+#else
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+#endif
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &rt_rq->tg->rt_bandwidth;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+ return ktime_to_ns(def_rt_bandwidth.rt_period);
+}
+
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_curr(rq);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ dequeue_top_rt_rq(rt_rq);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
+
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+ return &def_rt_bandwidth;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
+static int do_balance_runtime(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
+ int i, weight, more = 0;
+ u64 rt_period;
+
+ weight = cpumask_weight(rd->span);
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ rt_period = ktime_to_ns(rt_b->rt_period);
+ for_each_cpu(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ if (iter == rt_rq)
+ continue;
+
+ raw_spin_lock(&iter->rt_runtime_lock);
+ /*
+ * Either all rqs have inf runtime and there's nothing to steal
+ * or __disable_runtime() below sets a specific rq to inf to
+ * indicate its been disabled and disalow stealing.
+ */
+ if (iter->rt_runtime == RUNTIME_INF)
+ goto next;
+
+ /*
+ * From runqueues with spare time, take 1/n part of their
+ * spare time, but no more than our period.
+ */
+ diff = iter->rt_runtime - iter->rt_time;
+ if (diff > 0) {
+ diff = div_u64((u64)diff, weight);
+ if (rt_rq->rt_runtime + diff > rt_period)
+ diff = rt_period - rt_rq->rt_runtime;
+ iter->rt_runtime -= diff;
+ rt_rq->rt_runtime += diff;
+ more = 1;
+ if (rt_rq->rt_runtime == rt_period) {
+ raw_spin_unlock(&iter->rt_runtime_lock);
+ break;
+ }
+ }
+next:
+ raw_spin_unlock(&iter->rt_runtime_lock);
+ }
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+ return more;
+}
+
+/*
+ * Ensure this RQ takes back all the runtime it lend to its neighbours.
+ */
+static void __disable_runtime(struct rq *rq)
+{
+ struct root_domain *rd = rq->rd;
+ rt_rq_iter_t iter;
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ for_each_rt_rq(rt_rq, iter, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+ s64 want;
+ int i;
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * Either we're all inf and nobody needs to borrow, or we're
+ * already disabled and thus have nothing to do, or we have
+ * exactly the right amount of runtime to take out.
+ */
+ if (rt_rq->rt_runtime == RUNTIME_INF ||
+ rt_rq->rt_runtime == rt_b->rt_runtime)
+ goto balanced;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+
+ /*
+ * Calculate the difference between what we started out with
+ * and what we current have, that's the amount of runtime
+ * we lend and now have to reclaim.
+ */
+ want = rt_b->rt_runtime - rt_rq->rt_runtime;
+
+ /*
+ * Greedy reclaim, take back as much as we can.
+ */
+ for_each_cpu(i, rd->span) {
+ struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+ s64 diff;
+
+ /*
+ * Can't reclaim from ourselves or disabled runqueues.
+ */
+ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
+ continue;
+
+ raw_spin_lock(&iter->rt_runtime_lock);
+ if (want > 0) {
+ diff = min_t(s64, iter->rt_runtime, want);
+ iter->rt_runtime -= diff;
+ want -= diff;
+ } else {
+ iter->rt_runtime -= want;
+ want -= want;
+ }
+ raw_spin_unlock(&iter->rt_runtime_lock);
+
+ if (!want)
+ break;
+ }
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We cannot be left wanting - that would mean some runtime
+ * leaked out of the system.
+ */
+ BUG_ON(want);
+balanced:
+ /*
+ * Disable all the borrow logic by pretending we have inf
+ * runtime - in which case borrowing doesn't make sense.
+ */
+ rt_rq->rt_runtime = RUNTIME_INF;
+ rt_rq->rt_throttled = 0;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+ /* Make rt_rq available for pick_next_task() */
+ sched_rt_rq_enqueue(rt_rq);
+ }
+}
+
+static void __enable_runtime(struct rq *rq)
+{
+ rt_rq_iter_t iter;
+ struct rt_rq *rt_rq;
+
+ if (unlikely(!scheduler_running))
+ return;
+
+ /*
+ * Reset each runqueue's bandwidth settings
+ */
+ for_each_rt_rq(rt_rq, iter, rq) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_b->rt_runtime;
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+ }
+}
+
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+ int more = 0;
+
+ if (!sched_feat(RT_RUNTIME_SHARE))
+ return more;
+
+ if (rt_rq->rt_time > rt_rq->rt_runtime) {
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ more = do_balance_runtime(rt_rq);
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ }
+
+ return more;
+}
+#else /* !CONFIG_SMP */
+static inline int balance_runtime(struct rt_rq *rt_rq)
+{
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+{
+ int i, idle = 1, throttled = 0;
+ const struct cpumask *span;
+
+ span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * FIXME: isolated CPUs should really leave the root task group,
+ * whether they are isolcpus or were isolated via cpusets, lest
+ * the timer run on a CPU which does not service all runqueues,
+ * potentially leaving other CPUs indefinitely throttled. If
+ * isolation is really required, the user will turn the throttle
+ * off to kill the perturbations it causes anyway. Meanwhile,
+ * this maintains functionality for boot and/or troubleshooting.
+ */
+ if (rt_b == &root_task_group.rt_bandwidth)
+ span = cpu_online_mask;
+#endif
+ for_each_cpu(i, span) {
+ int enqueue = 0;
+ struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ raw_spin_lock(&rq->lock);
+ if (rt_rq->rt_time) {
+ u64 runtime;
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ if (rt_rq->rt_throttled)
+ balance_runtime(rt_rq);
+ runtime = rt_rq->rt_runtime;
+ rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+ if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+ rt_rq->rt_throttled = 0;
+ enqueue = 1;
+
+ /*
+ * When we're idle and a woken (rt) task is
+ * throttled check_preempt_curr() will set
+ * skip_update and the time between the wakeup
+ * and this unthrottle will get accounted as
+ * 'runtime'.
+ */
+ if (rt_rq->rt_nr_running && rq->curr == rq->idle)
+ rq_clock_skip_update(rq, false);
+ }
+ if (rt_rq->rt_time || rt_rq->rt_nr_running)
+ idle = 0;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ } else if (rt_rq->rt_nr_running) {
+ idle = 0;
+ if (!rt_rq_throttled(rt_rq))
+ enqueue = 1;
+ }
+ if (rt_rq->rt_throttled)
+ throttled = 1;
+
+ if (enqueue)
+ sched_rt_rq_enqueue(rt_rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
+ return 1;
+
+ return idle;
+}
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return rt_rq->highest_prio.curr;
+#endif
+
+ return rt_task_of(rt_se)->prio;
+}
+
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
+{
+ u64 runtime = sched_rt_runtime(rt_rq);
+
+ if (rt_rq->rt_throttled)
+ return rt_rq_throttled(rt_rq);
+
+ if (runtime >= sched_rt_period(rt_rq))
+ return 0;
+
+ balance_runtime(rt_rq);
+ runtime = sched_rt_runtime(rt_rq);
+ if (runtime == RUNTIME_INF)
+ return 0;
+
+ if (rt_rq->rt_time > runtime) {
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ /*
+ * Don't actually throttle groups that have no runtime assigned
+ * but accrue some time due to boosting.
+ */
+ if (likely(rt_b->rt_runtime)) {
+ rt_rq->rt_throttled = 1;
+ printk_deferred_once("sched: RT throttling activated\n");
+ } else {
+ /*
+ * In case we did anyway, make it go away,
+ * replenishment is a joke, since it will replenish us
+ * with exactly 0 ns.
+ */
+ rt_rq->rt_time = 0;
+ }
+
+ if (rt_rq_throttled(rt_rq)) {
+ sched_rt_rq_dequeue(rt_rq);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static void update_curr_rt(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_rt_entity *rt_se = &curr->rt;
+ u64 delta_exec;
+
+ if (curr->sched_class != &rt_sched_class)
+ return;
+
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cpuacct_charge(curr, delta_exec);
+
+ sched_rt_avg_update(rq, delta_exec);
+
+ if (!rt_bandwidth_enabled())
+ return;
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+ if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_time += delta_exec;
+ if (sched_rt_runtime_exceeded(rt_rq))
+ resched_curr(rq);
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ }
+}
+
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (!rt_rq->rt_queued)
+ return;
+
+ BUG_ON(!rq->nr_running);
+
+ sub_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (rt_rq->rt_queued)
+ return;
+ if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+ return;
+
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+}
+
+#if defined CONFIG_SMP
+
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
+ if (rq->online && prio < prev_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
+}
+
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
+ if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
+}
+
+#else /* CONFIG_SMP */
+
+static inline
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
+
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
+ if (prio < prev_prio)
+ rt_rq->highest_prio.curr = prio;
+
+ inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+ int prev_prio = rt_rq->highest_prio.curr;
+
+ if (rt_rq->rt_nr_running) {
+
+ WARN_ON(prio < prev_prio);
+
+ /*
+ * This may have been our highest task, and therefore
+ * we may have some recomputation to do
+ */
+ if (prio == prev_prio) {
+ struct rt_prio_array *array = &rt_rq->active;
+
+ rt_rq->highest_prio.curr =
+ sched_find_first_bit(array->bitmap);
+ }
+
+ } else
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+
+ dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+
+ if (rt_rq->tg)
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted--;
+
+ WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+ if (group_rq)
+ return group_rq->rt_nr_running;
+ else
+ return 1;
+}
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ int prio = rt_se_prio(rt_se);
+
+ WARN_ON(!rt_prio(prio));
+ rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
+
+ inc_rt_prio(rt_rq, prio);
+ inc_rt_migration(rt_se, rt_rq);
+ inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ WARN_ON(!rt_rq->rt_nr_running);
+ rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
+
+ dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ dec_rt_migration(rt_se, rt_rq);
+ dec_rt_group(rt_se, rt_rq);
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rt_prio_array *array = &rt_rq->active;
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+ struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+ /*
+ * Don't enqueue the group if its throttled, or when empty.
+ * The latter is a consequence of the former when a child group
+ * get throttled and the current group doesn't have any other
+ * active members.
+ */
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ return;
+
+ if (head)
+ list_add(&rt_se->run_list, queue);
+ else
+ list_add_tail(&rt_se->run_list, queue);
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+
+ inc_rt_tasks(rt_se, rt_rq);
+}
+
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rt_prio_array *array = &rt_rq->active;
+
+ list_del_init(&rt_se->run_list);
+ if (list_empty(array->queue + rt_se_prio(rt_se)))
+ __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+ dec_rt_tasks(rt_se, rt_rq);
+}
+
+/*
+ * Because the prio of an upper entry depends on the lower
+ * entries, we must remove entries top - down.
+ */
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+{
+ struct sched_rt_entity *back = NULL;
+
+ for_each_sched_rt_entity(rt_se) {
+ rt_se->back = back;
+ back = rt_se;
+ }
+
+ dequeue_top_rt_rq(rt_rq_of_se(back));
+
+ for (rt_se = back; rt_se; rt_se = rt_se->back) {
+ if (on_rt_rq(rt_se))
+ __dequeue_rt_entity(rt_se);
+ }
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
+ dequeue_rt_stack(rt_se);
+ for_each_sched_rt_entity(rt_se)
+ __enqueue_rt_entity(rt_se, head);
+ enqueue_top_rt_rq(&rq->rt);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
+ dequeue_rt_stack(rt_se);
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq && rt_rq->rt_nr_running)
+ __enqueue_rt_entity(rt_se, false);
+ }
+ enqueue_top_rt_rq(&rq->rt);
+}
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ if (flags & ENQUEUE_WAKEUP)
+ rt_se->timeout = 0;
+
+ enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
+}
+
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ update_curr_rt(rq);
+ dequeue_rt_entity(rt_se);
+
+ dequeue_pushable_task(rq, p);
+}
+
+/*
+ * Put task to the head or the end of the run list without the overhead of
+ * dequeue followed by enqueue.
+ */
+static void
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
+{
+ if (on_rt_rq(rt_se)) {
+ struct rt_prio_array *array = &rt_rq->active;
+ struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+ if (head)
+ list_move(&rt_se->run_list, queue);
+ else
+ list_move_tail(&rt_se->run_list, queue);
+ }
+}
+
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq;
+
+ for_each_sched_rt_entity(rt_se) {
+ rt_rq = rt_rq_of_se(rt_se);
+ requeue_rt_entity(rt_rq, rt_se, head);
+ }
+}
+
+static void yield_task_rt(struct rq *rq)
+{
+ requeue_task_rt(rq, rq->curr, 0);
+}
+
+#ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
+static int
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ struct task_struct *curr;
+ struct rq *rq;
+
+ /* For anything but wake ups, just return the task_cpu */
+ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ goto out;
+
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+
+ /*
+ * If the current task on @p's runqueue is an RT task, then
+ * try to see if we can wake this RT task up on another
+ * runqueue. Otherwise simply start this RT task
+ * on its current runqueue.
+ *
+ * We want to avoid overloading runqueues. If the woken
+ * task is a higher priority, then it will stay on this CPU
+ * and the lower prio task should be moved to another CPU.
+ * Even though this will probably make the lower prio task
+ * lose its cache, we do not want to bounce a higher task
+ * around just because it gave up its CPU, perhaps for a
+ * lock?
+ *
+ * For equal prio tasks, we just let the scheduler sort it out.
+ *
+ * Otherwise, just let it ride on the affined RQ and the
+ * post-schedule router will push the preempted task away
+ *
+ * This test is optimistic, if we get it wrong the load-balancer
+ * will have to sort it out.
+ */
+ if (curr && unlikely(rt_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ curr->prio <= p->prio)) {
+ int target = find_lowest_rq(p);
+
+ /*
+ * Don't bother moving it if the destination CPU is
+ * not running a lower priority task.
+ */
+ if (target != -1 &&
+ p->prio < cpu_rq(target)->rt.highest_prio.curr)
+ cpu = target;
+ }
+ rcu_read_unlock();
+
+out:
+ return cpu;
+}
+
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ return;
+
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
+ if (p->nr_cpus_allowed != 1
+ && cpupri_find(&rq->rd->cpupri, p, NULL))
+ return;
+
+ /*
+ * There appears to be other cpus that can accept
+ * current and none to run 'p', so lets reschedule
+ * to try and push current away:
+ */
+ requeue_task_rt(rq, p, 1);
+ resched_curr(rq);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (p->prio < rq->curr->prio) {
+ resched_curr(rq);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * If:
+ *
+ * - the newly woken task is of equal priority to the current task
+ * - the newly woken task is non-migratable while current is migratable
+ * - current will be preempted on the next reschedule
+ *
+ * we should check to see if current can readily move to a different
+ * cpu. If so, we will reschedule to allow the push logic to try
+ * to move current somewhere else, making room for our non-migratable
+ * task.
+ */
+ if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
+ check_preempt_equal_prio(rq, p);
+#endif
+}
+
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+ struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *next = NULL;
+ struct list_head *queue;
+ int idx;
+
+ idx = sched_find_first_bit(array->bitmap);
+ BUG_ON(idx >= MAX_RT_PRIO);
+
+ queue = array->queue + idx;
+ next = list_entry(queue->next, struct sched_rt_entity, run_list);
+
+ return next;
+}
+
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
+{
+ struct sched_rt_entity *rt_se;
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ do {
+ rt_se = pick_next_rt_entity(rq, rt_rq);
+ BUG_ON(!rt_se);
+ rt_rq = group_rt_rq(rt_se);
+ } while (rt_rq);
+
+ p = rt_task_of(rt_se);
+ p->se.exec_start = rq_clock_task(rq);
+
+ return p;
+}
+
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
+{
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (need_pull_rt_task(rq, prev)) {
+ pull_rt_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl or stop task can slip in, in which case we need
+ * to re-start task selection.
+ */
+ if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
+ rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }
+
+ /*
+ * We may dequeue prev's rt_rq in put_prev_task().
+ * So, we update time before rt_nr_running check.
+ */
+ if (prev->sched_class == &rt_sched_class)
+ update_curr_rt(rq);
+
+ if (!rt_rq->rt_queued)
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
+
+ set_post_schedule(rq);
+
+ return p;
+}
+
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+{
+ update_curr_rt(rq);
+
+ /*
+ * The previous task needs to be made eligible for pushing
+ * if it is still active
+ */
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_running(rq, p) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ return 1;
+ return 0;
+}
+
+/*
+ * Return the highest pushable rq's task, which is suitable to be executed
+ * on the cpu, NULL otherwise
+ */
+static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
+{
+ struct plist_head *head = &rq->rt.pushable_tasks;
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ plist_for_each_entry(p, head, pushable_tasks) {
+ if (pick_rt_task(rq, p, cpu))
+ return p;
+ }
+
+ return NULL;
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+
+static int find_lowest_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
+ int this_cpu = smp_processor_id();
+ int cpu = task_cpu(task);
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return -1;
+
+ if (task->nr_cpus_allowed == 1)
+ return -1; /* No other targets possible */
+
+ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ return -1; /* No targets found */
+
+ /*
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ *
+ * We prioritize the last cpu that the task executed on since
+ * it is most likely cache-hot in that location.
+ */
+ if (cpumask_test_cpu(cpu, lowest_mask))
+ return cpu;
+
+ /*
+ * Otherwise, we consult the sched_domains span maps to figure
+ * out which cpu is logically closest to our hot cache data.
+ */
+ if (!cpumask_test_cpu(this_cpu, lowest_mask))
+ this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
+
+ /*
+ * "this_cpu" is cheaper to preempt than a
+ * remote processor.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+
+ best_cpu = cpumask_first_and(lowest_mask,
+ sched_domain_span(sd));
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * And finally, if there were no matches within the domains
+ * just give the caller *something* to work with from the compatible
+ * locations.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
+
+ cpu = cpumask_any(lowest_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ return -1;
+}
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *lowest_rq = NULL;
+ int tries;
+ int cpu;
+
+ for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+ cpu = find_lowest_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
+ break;
+
+ lowest_rq = cpu_rq(cpu);
+
+ if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+ /*
+ * Target rq has tasks of equal or higher priority,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ lowest_rq = NULL;
+ break;
+ }
+
+ /* if the prio of this runqueue changed, try again */
+ if (double_lock_balance(rq, lowest_rq)) {
+ /*
+ * We had to unlock the run queue. In
+ * the mean time, task could have
+ * migrated already or had its affinity changed.
+ * Also make sure that it wasn't scheduled on its rq.
+ */
+ if (unlikely(task_rq(task) != rq ||
+ !cpumask_test_cpu(lowest_rq->cpu,
+ tsk_cpus_allowed(task)) ||
+ task_running(rq, task) ||
+ !task_on_rq_queued(task))) {
+
+ double_unlock_balance(rq, lowest_rq);
+ lowest_rq = NULL;
+ break;
+ }
+ }
+
+ /* If this rq is still suitable use it. */
+ if (lowest_rq->rt.highest_prio.curr > task->prio)
+ break;
+
+ /* try again */
+ double_unlock_balance(rq, lowest_rq);
+ lowest_rq = NULL;
+ }
+
+ return lowest_rq;
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *rq)
+{
+ struct task_struct *next_task;
+ struct rq *lowest_rq;
+ int ret = 0;
+
+ if (!rq->rt.overloaded)
+ return 0;
+
+ next_task = pick_next_pushable_task(rq);
+ if (!next_task)
+ return 0;
+
+retry:
+ if (unlikely(next_task == rq->curr)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ /*
+ * It's possible that the next_task slipped in of
+ * higher priority than current. If that's the case
+ * just reschedule current.
+ */
+ if (unlikely(next_task->prio < rq->curr->prio)) {
+ resched_curr(rq);
+ return 0;
+ }
+
+ /* We might release rq lock */
+ get_task_struct(next_task);
+
+ /* find_lock_lowest_rq locks the rq if found */
+ lowest_rq = find_lock_lowest_rq(next_task, rq);
+ if (!lowest_rq) {
+ struct task_struct *task;
+ /*
+ * find_lock_lowest_rq releases rq->lock
+ * so it is possible that next_task has migrated.
+ *
+ * We need to make sure that the task is still on the same
+ * run-queue and is also still the next task eligible for
+ * pushing.
+ */
+ task = pick_next_pushable_task(rq);
+ if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ /*
+ * The task hasn't migrated, and is still the next
+ * eligible task, but we failed to find a run-queue
+ * to push it to. Do not retry in this case, since
+ * other cpus will pull from us when ready.
+ */
+ goto out;
+ }
+
+ if (!task)
+ /* No more tasks, just exit */
+ goto out;
+
+ /*
+ * Something has shifted, try again.
+ */
+ put_task_struct(next_task);
+ next_task = task;
+ goto retry;
+ }
+
+ deactivate_task(rq, next_task, 0);
+ set_task_cpu(next_task, lowest_rq->cpu);
+ activate_task(lowest_rq, next_task, 0);
+ ret = 1;
+
+ resched_curr(lowest_rq);
+
+ double_unlock_balance(rq, lowest_rq);
+
+out:
+ put_task_struct(next_task);
+
+ return ret;
+}
+
+static void push_rt_tasks(struct rq *rq)
+{
+ /* push_rt_task will return true if it moved an RT */
+ while (push_rt_task(rq))
+ ;
+}
+
+#ifdef HAVE_RT_PUSH_IPI
+/*
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
+ *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
+ */
+static int rto_next_cpu(struct rq *rq)
+{
+ int prev_cpu = rq->rt.push_cpu;
+ int cpu;
+
+ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
+ /*
+ * If the previous cpu is less than the rq's CPU, then it already
+ * passed the end of the mask, and has started from the beginning.
+ * We end if the next CPU is greater or equal to rq's CPU.
+ */
+ if (prev_cpu < rq->cpu) {
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+
+ } else if (cpu >= nr_cpu_ids) {
+ /*
+ * We passed the end of the mask, start at the beginning.
+ * If the result is greater or equal to the rq's CPU, then
+ * the loop is finished.
+ */
+ cpu = cpumask_first(rq->rd->rto_mask);
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
+ }
+ rq->rt.push_cpu = cpu;
+
+ /* Return cpu to let the caller know if the loop is finished or not */
+ return cpu;
+}
+
+static int find_next_push_cpu(struct rq *rq)
+{
+ struct rq *next_rq;
+ int cpu;
+
+ while (1) {
+ cpu = rto_next_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ break;
+ next_rq = cpu_rq(cpu);
+
+ /* Make sure the next rq can push to this rq */
+ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ break;
+ }
+
+ return cpu;
+}
+
+#define RT_PUSH_IPI_EXECUTING 1
+#define RT_PUSH_IPI_RESTART 2
+
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu;
+
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ raw_spin_lock(&rq->rt.push_lock);
+ /* Make sure it's still executing */
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ /*
+ * Tell the IPI to restart the loop as things have
+ * changed since it started.
+ */
+ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+ raw_spin_unlock(&rq->rt.push_lock);
+ return;
+ }
+ raw_spin_unlock(&rq->rt.push_lock);
+ }
+
+ /* When here, there's no IPI going around */
+
+ rq->rt.push_cpu = rq->cpu;
+ cpu = find_next_push_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+ irq_work_queue_on(&rq->rt.push_work, cpu);
+}
+
+/* Called from hardirq context */
+static void try_to_push_tasks(void *arg)
+{
+ struct rt_rq *rt_rq = arg;
+ struct rq *rq, *src_rq;
+ int this_cpu;
+ int cpu;
+
+ this_cpu = rt_rq->push_cpu;
+
+ /* Paranoid check */
+ BUG_ON(this_cpu != smp_processor_id());
+
+ rq = cpu_rq(this_cpu);
+ src_rq = rq_of_rt_rq(rt_rq);
+
+again:
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+ push_rt_task(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+ /* Pass the IPI to the next rt overloaded queue */
+ raw_spin_lock(&rt_rq->push_lock);
+ /*
+ * If the source queue changed since the IPI went out,
+ * we need to restart the search from that CPU again.
+ */
+ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+ rt_rq->push_cpu = src_rq->cpu;
+ }
+
+ cpu = find_next_push_cpu(src_rq);
+
+ if (cpu >= nr_cpu_ids)
+ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+ raw_spin_unlock(&rt_rq->push_lock);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * It is possible that a restart caused this CPU to be
+ * chosen again. Don't bother with an IPI, just see if we
+ * have more to push.
+ */
+ if (unlikely(cpu == rq->cpu))
+ goto again;
+
+ /* Try the next RT overloaded CPU */
+ irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+ try_to_push_tasks(rt_rq);
+}
+#endif /* HAVE_RT_PUSH_IPI */
+
+static int pull_rt_task(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu, ret = 0, cpu;
+ struct task_struct *p;
+ struct rq *src_rq;
+
+ if (likely(!rt_overloaded(this_rq)))
+ return 0;
+
+ /*
+ * Match the barrier from rt_set_overloaded; this guarantees that if we
+ * see overloaded we must also see the rto_mask bit.
+ */
+ smp_rmb();
+
+#ifdef HAVE_RT_PUSH_IPI
+ if (sched_feat(RT_PUSH_IPI)) {
+ tell_cpu_to_push(this_rq);
+ return 0;
+ }
+#endif
+
+ for_each_cpu(cpu, this_rq->rd->rto_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ src_rq = cpu_rq(cpu);
+
+ /*
+ * Don't bother taking the src_rq->lock if the next highest
+ * task is known to be lower-priority than our current task.
+ * This may look racy, but if this value is about to go
+ * logically higher, the src_rq will push this task away.
+ * And if its going logically lower, we do not care
+ */
+ if (src_rq->rt.highest_prio.next >=
+ this_rq->rt.highest_prio.curr)
+ continue;
+
+ /*
+ * We can potentially drop this_rq's lock in
+ * double_lock_balance, and another CPU could
+ * alter this_rq
+ */
+ double_lock_balance(this_rq, src_rq);
+
+ /*
+ * We can pull only a task, which is pushable
+ * on its rq, and no others.
+ */
+ p = pick_highest_pushable_task(src_rq, this_cpu);
+
+ /*
+ * Do we have an RT task that preempts
+ * the to-be-scheduled task?
+ */
+ if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
+ WARN_ON(p == src_rq->curr);
+ WARN_ON(!task_on_rq_queued(p));
+
+ /*
+ * There's a chance that p is higher in priority
+ * than what's currently running on its cpu.
+ * This is just that p is wakeing up and hasn't
+ * had a chance to schedule. We only pull
+ * p if it is lower in priority than the
+ * current task on the run queue
+ */
+ if (p->prio < src_rq->curr->prio)
+ goto skip;
+
+ ret = 1;
+
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, this_cpu);
+ activate_task(this_rq, p, 0);
+ /*
+ * We continue with the search, just in
+ * case there's an even higher prio task
+ * in another runqueue. (low likelihood
+ * but possible)
+ */
+ }
+skip:
+ double_unlock_balance(this_rq, src_rq);
+ }
+
+ return ret;
+}
+
+static void post_schedule_rt(struct rq *rq)
+{
+ push_rt_tasks(rq);
+}
+
+/*
+ * If we are not running and we are not going to reschedule soon, we should
+ * try to push tasks away now
+ */
+static void task_woken_rt(struct rq *rq, struct task_struct *p)
+{
+ if (!task_running(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ has_pushable_tasks(rq) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->curr->prio <= p->prio))
+ push_rt_tasks(rq);
+}
+
+static void set_cpus_allowed_rt(struct task_struct *p,
+ const struct cpumask *new_mask)
+{
+ struct rq *rq;
+ int weight;
+
+ BUG_ON(!rt_task(p));
+
+ if (!task_on_rq_queued(p))
+ return;
+
+ weight = cpumask_weight(new_mask);
+
+ /*
+ * Only update if the process changes its state from whether it
+ * can migrate or not.
+ */
+ if ((p->nr_cpus_allowed > 1) == (weight > 1))
+ return;
+
+ rq = task_rq(p);
+
+ /*
+ * The process used to be able to migrate OR it can now migrate
+ */
+ if (weight <= 1) {
+ if (!task_current(rq, p))
+ dequeue_pushable_task(rq, p);
+ BUG_ON(!rq->rt.rt_nr_migratory);
+ rq->rt.rt_nr_migratory--;
+ } else {
+ if (!task_current(rq, p))
+ enqueue_pushable_task(rq, p);
+ rq->rt.rt_nr_migratory++;
+ }
+
+ update_rt_migration(&rq->rt);
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_set_overload(rq);
+
+ __enable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_clear_overload(rq);
+
+ __disable_runtime(rq);
+
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
+{
+ /*
+ * If there are other RT tasks then we will reschedule
+ * and the scheduling of the other RT tasks will handle
+ * the balancing. But if we are the last RT task
+ * we may need to handle the pulling of RT tasks
+ * now.
+ */
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+ return;
+
+ if (pull_rt_task(rq))
+ resched_curr(rq);
+}
+
+void __init init_sched_rt_class(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
+ GFP_KERNEL, cpu_to_node(i));
+ }
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
+{
+ int check_resched = 1;
+
+ /*
+ * If we are already running, then there's nothing
+ * that needs to be done. But if we are not running
+ * we may need to preempt the current running task.
+ * If that current running task is also an RT task
+ * then see if we can move to another run queue.
+ */
+ if (task_on_rq_queued(p) && rq->curr != p) {
+#ifdef CONFIG_SMP
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
+ /* Don't resched if we changed runqueues */
+ push_rt_task(rq) && rq != task_rq(p))
+ check_resched = 0;
+#endif /* CONFIG_SMP */
+ if (check_resched && p->prio < rq->curr->prio)
+ resched_curr(rq);
+ }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ if (!task_on_rq_queued(p))
+ return;
+
+ if (rq->curr == p) {
+#ifdef CONFIG_SMP
+ /*
+ * If our priority decreases while running, we
+ * may need to pull tasks to this runqueue.
+ */
+ if (oldprio < p->prio)
+ pull_rt_task(rq);
+ /*
+ * If there's a higher priority task waiting to run
+ * then reschedule. Note, the above pull_rt_task
+ * can release the rq lock and p could migrate.
+ * Only reschedule if p is still on the same runqueue.
+ */
+ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+ resched_curr(rq);
+#else
+ /* For UP simply resched on drop of prio */
+ if (oldprio < p->prio)
+ resched_curr(rq);
+#endif /* CONFIG_SMP */
+ } else {
+ /*
+ * This task is not running, but if it is
+ * greater than the current running task
+ * then reschedule.
+ */
+ if (p->prio < rq->curr->prio)
+ resched_curr(rq);
+ }
+}
+
+static void watchdog(struct rq *rq, struct task_struct *p)
+{
+ unsigned long soft, hard;
+
+ /* max may change after cur was read, this will be fixed next tick */
+ soft = task_rlimit(p, RLIMIT_RTTIME);
+ hard = task_rlimit_max(p, RLIMIT_RTTIME);
+
+ if (soft != RLIM_INFINITY) {
+ unsigned long next;
+
+ if (p->rt.watchdog_stamp != jiffies) {
+ p->rt.timeout++;
+ p->rt.watchdog_stamp = jiffies;
+ }
+
+ next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
+ if (p->rt.timeout > next)
+ p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ }
+}
+
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ update_curr_rt(rq);
+
+ watchdog(rq, p);
+
+ /*
+ * RR tasks need a special form of timeslice management.
+ * FIFO tasks have no timeslices.
+ */
+ if (p->policy != SCHED_RR)
+ return;
+
+ if (--p->rt.time_slice)
+ return;
+
+ p->rt.time_slice = sched_rr_timeslice;
+
+ /*
+ * Requeue to the end of queue if we (and all of our ancestors) are not
+ * the only element on the queue
+ */
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ resched_curr(rq);
+ return;
+ }
+ }
+}
+
+static void set_curr_task_rt(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
+}
+
+static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
+{
+ /*
+ * Time slice is 0 for SCHED_FIFO tasks
+ */
+ if (task->policy == SCHED_RR)
+ return sched_rr_timeslice;
+ else
+ return 0;
+}
+
+const struct sched_class rt_sched_class = {
+ .next = &fair_sched_class,
+ .enqueue_task = enqueue_task_rt,
+ .dequeue_task = dequeue_task_rt,
+ .yield_task = yield_task_rt,
+
+ .check_preempt_curr = check_preempt_curr_rt,
+
+ .pick_next_task = pick_next_task_rt,
+ .put_prev_task = put_prev_task_rt,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_rt,
+
+ .set_cpus_allowed = set_cpus_allowed_rt,
+ .rq_online = rq_online_rt,
+ .rq_offline = rq_offline_rt,
+ .post_schedule = post_schedule_rt,
+ .task_woken = task_woken_rt,
+ .switched_from = switched_from_rt,
+#endif
+
+ .set_curr_task = set_curr_task_rt,
+ .task_tick = task_tick_rt,
+
+ .get_rr_interval = get_rr_interval_rt,
+
+ .prio_changed = prio_changed_rt,
+ .switched_to = switched_to_rt,
+
+ .update_curr = update_curr_rt,
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+
+void print_rt_stats(struct seq_file *m, int cpu)
+{
+ rt_rq_iter_t iter;
+ struct rt_rq *rt_rq;
+
+ rcu_read_lock();
+ for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
+ print_rt_rq(m, cpu, rt_rq);
+ rcu_read_unlock();
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000..e0e129993
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1736 @@
+
+#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/stop_machine.h>
+#include <linux/irq_work.h>
+#include <linux/tick.h>
+#include <linux/slab.h>
+
+#include "cpupri.h"
+#include "cpudeadline.h"
+#include "cpuacct.h"
+
+struct rq;
+struct cpuidle_state;
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
+
+extern __read_mostly int scheduler_running;
+
+extern unsigned long calc_load_update;
+extern atomic_long_t calc_load_tasks;
+
+extern long calc_load_fold_active(struct rq *this_rq);
+extern void update_cpu_load_active(struct rq *this_rq);
+
+/*
+ * Helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+
+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
+# define SCHED_LOAD_RESOLUTION 10
+# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION 0
+# define scale_load(w) (w)
+# define scale_load_down(w) (w)
+#endif
+
+#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+
+/*
+ * Single value that decides SCHED_DEADLINE internal math precision.
+ * 10 -> just above 1us
+ * 9 -> just above 0.5us
+ */
+#define DL_SCALE (10)
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ */
+
+/*
+ * single value that denotes runtime == period, ie unlimited time.
+ */
+#define RUNTIME_INF ((u64)~0ULL)
+
+static inline int fair_policy(int policy)
+{
+ return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
+static inline int rt_policy(int policy)
+{
+ return policy == SCHED_FIFO || policy == SCHED_RR;
+}
+
+static inline int dl_policy(int policy)
+{
+ return policy == SCHED_DEADLINE;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+ return rt_policy(p->policy);
+}
+
+static inline int task_has_dl_policy(struct task_struct *p)
+{
+ return dl_policy(p->policy);
+}
+
+static inline bool dl_time_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+/*
+ * Tells if entity @a should preempt entity @b.
+ */
+static inline bool
+dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+{
+ return dl_time_before(a->deadline, b->deadline);
+}
+
+/*
+ * This is the priority-queue data structure of the RT scheduling class:
+ */
+struct rt_prio_array {
+ DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+ struct list_head queue[MAX_RT_PRIO];
+};
+
+struct rt_bandwidth {
+ /* nests inside the rq lock: */
+ raw_spinlock_t rt_runtime_lock;
+ ktime_t rt_period;
+ u64 rt_runtime;
+ struct hrtimer rt_period_timer;
+};
+
+void __dl_clear_params(struct task_struct *p);
+
+/*
+ * To keep the bandwidth of -deadline tasks and groups under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of the system (the group);
+ * - cache the fraction of that bandwidth that is currently allocated.
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * meaning that:
+ * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
+ * - dl_total_bw array contains, in the i-eth element, the currently
+ * allocated bandwidth on the i-eth CPU.
+ * Moreover, groups consume bandwidth on each CPU, while tasks only
+ * consume bandwidth on the CPU they're running on.
+ * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
+ * that will be shown the next time the proc or cgroup controls will
+ * be red. It on its turn can be changed by writing on its own
+ * control.
+ */
+struct dl_bandwidth {
+ raw_spinlock_t dl_runtime_lock;
+ u64 dl_runtime;
+ u64 dl_period;
+};
+
+static inline int dl_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+extern struct dl_bw *dl_bw_of(int i);
+
+struct dl_bw {
+ raw_spinlock_t lock;
+ u64 bw, total_bw;
+};
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+extern struct mutex sched_domains_mutex;
+
+#ifdef CONFIG_CGROUP_SCHED
+
+#include <linux/cgroup.h>
+
+struct cfs_rq;
+struct rt_rq;
+
+extern struct list_head task_groups;
+
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+ raw_spinlock_t lock;
+ ktime_t period;
+ u64 quota, runtime;
+ s64 hierarchical_quota;
+ u64 runtime_expires;
+
+ int idle, timer_active;
+ struct hrtimer period_timer, slack_timer;
+ struct list_head throttled_cfs_rq;
+
+ /* statistics */
+ int nr_periods, nr_throttled;
+ u64 throttled_time;
+#endif
+};
+
+/* task group related information */
+struct task_group {
+ struct cgroup_subsys_state css;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* schedulable entities of this group on each cpu */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each cpu */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+
+#ifdef CONFIG_SMP
+ atomic_long_t load_avg;
+ atomic_t runnable_avg;
+#endif
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
+
+ struct rt_bandwidth rt_bandwidth;
+#endif
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
+
+ struct cfs_bandwidth cfs_bandwidth;
+};
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
+
+/*
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * A weight of a cfs_rq is the sum of weights of which entities
+ * are queued on this cfs_rq, so a weight of a entity should not be
+ * too large, so as the shares value of a task group.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
+#define MIN_SHARES (1UL << 1)
+#define MAX_SHARES (1UL << 18)
+#endif
+
+typedef int (*tg_visitor)(struct task_group *, void *);
+
+extern int walk_tg_tree_from(struct task_group *from,
+ tg_visitor down, tg_visitor up, void *data);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+ return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
+extern int tg_nop(struct task_group *tg, void *data);
+
+extern void free_fair_sched_group(struct task_group *tg);
+extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu,
+ struct sched_entity *parent);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+
+extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
+extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+
+extern void free_rt_sched_group(struct task_group *tg);
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+ struct sched_rt_entity *rt_se, int cpu,
+ struct sched_rt_entity *parent);
+
+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
+
+extern void sched_move_task(struct task_struct *tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+#endif
+
+#else /* CONFIG_CGROUP_SCHED */
+
+struct cfs_bandwidth { };
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+ struct load_weight load;
+ unsigned int nr_running, h_nr_running;
+
+ u64 exec_clock;
+ u64 min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+#endif
+
+ struct rb_root tasks_timeline;
+ struct rb_node *rb_leftmost;
+
+ /*
+ * 'curr' points to currently running entity on this cfs_rq.
+ * It is set to NULL otherwise (i.e when none are currently running).
+ */
+ struct sched_entity *curr, *next, *last, *skip;
+
+#ifdef CONFIG_SCHED_DEBUG
+ unsigned int nr_spread_over;
+#endif
+
+#ifdef CONFIG_SMP
+ /*
+ * CFS Load tracking
+ * Under CFS, load is tracked on a per-entity basis and aggregated up.
+ * This allows for the description of both thread and group usage (in
+ * the FAIR_GROUP_SCHED case).
+ * runnable_load_avg is the sum of the load_avg_contrib of the
+ * sched_entities on the rq.
+ * blocked_load_avg is similar to runnable_load_avg except that its
+ * the blocked sched_entities on the rq.
+ * utilization_load_avg is the sum of the average running time of the
+ * sched_entities on the rq.
+ */
+ unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
+ atomic64_t decay_counter;
+ u64 last_decay;
+ atomic_long_t removed_load;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* Required to track per-cpu representation of a task_group */
+ u32 tg_runnable_contrib;
+ unsigned long tg_load_contrib;
+
+ /*
+ * h_load = weight * f(tg)
+ *
+ * Where f(tg) is the recursive weight fraction assigned to
+ * this group.
+ */
+ unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+
+ /*
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
+ *
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
+ */
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ int runtime_enabled;
+ u64 runtime_expires;
+ s64 runtime_remaining;
+
+ u64 throttled_clock, throttled_clock_task;
+ u64 throttled_clock_task_time;
+ int throttled, throttle_count;
+ struct list_head throttled_list;
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+};
+
+static inline int rt_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+/* RT IPI pull logic requires IRQ_WORK */
+#ifdef CONFIG_IRQ_WORK
+# define HAVE_RT_PUSH_IPI
+#endif
+
+/* Real-Time classes' related field in a runqueue: */
+struct rt_rq {
+ struct rt_prio_array active;
+ unsigned int rt_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ struct {
+ int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+ int next; /* next highest */
+#endif
+ } highest_prio;
+#endif
+#ifdef CONFIG_SMP
+ unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
+ int overloaded;
+ struct plist_head pushable_tasks;
+#ifdef HAVE_RT_PUSH_IPI
+ int push_flags;
+ int push_cpu;
+ struct irq_work push_work;
+ raw_spinlock_t push_lock;
+#endif
+#endif /* CONFIG_SMP */
+ int rt_queued;
+
+ int rt_throttled;
+ u64 rt_time;
+ u64 rt_runtime;
+ /* Nests inside the rq lock: */
+ raw_spinlock_t rt_runtime_lock;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
+ struct rq *rq;
+ struct task_group *tg;
+#endif
+};
+
+/* Deadline class' related fields in a runqueue */
+struct dl_rq {
+ /* runqueue is an rbtree, ordered by deadline */
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+
+ unsigned long dl_nr_running;
+
+#ifdef CONFIG_SMP
+ /*
+ * Deadline values of the currently executing and the
+ * earliest ready task on this rq. Caching these facilitates
+ * the decision wether or not a ready but not running task
+ * should migrate somewhere else.
+ */
+ struct {
+ u64 curr;
+ u64 next;
+ } earliest_dl;
+
+ unsigned long dl_nr_migratory;
+ int overloaded;
+
+ /*
+ * Tasks on this rq that can be pushed away. They are kept in
+ * an rb-tree, ordered by tasks' deadlines, with caching
+ * of the leftmost (earliest deadline) element.
+ */
+ struct rb_root pushable_dl_tasks_root;
+ struct rb_node *pushable_dl_tasks_leftmost;
+#else
+ struct dl_bw dl_bw;
+#endif
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
+
+ /* Indicate more than one runnable task for any CPU */
+ bool overload;
+
+ /*
+ * The bit corresponding to a CPU gets set here if such CPU has more
+ * than one runnable -deadline task (as it is below for RT tasks).
+ */
+ cpumask_var_t dlo_mask;
+ atomic_t dlo_count;
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
+
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_var_t rto_mask;
+ struct cpupri cpupri;
+};
+
+extern struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: those places that want to lock multiple runqueues
+ * (such as the load balancing or the thread migration code), lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
+struct rq {
+ /* runqueue lock: */
+ raw_spinlock_t lock;
+
+ /*
+ * nr_running and cpu_load should be in the same cacheline because
+ * remote CPUs use both these fields when doing load calculation.
+ */
+ unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long last_load_update_tick;
+#ifdef CONFIG_NO_HZ_COMMON
+ u64 nohz_stamp;
+ unsigned long nohz_flags;
+#endif
+#ifdef CONFIG_NO_HZ_FULL
+ unsigned long last_sched_tick;
+#endif
+ /* capture load from *all* tasks on this cpu: */
+ struct load_weight load;
+ unsigned long nr_load_updates;
+ u64 nr_switches;
+
+ struct cfs_rq cfs;
+ struct rt_rq rt;
+ struct dl_rq dl;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* list of leaf cfs_rq on this cpu: */
+ struct list_head leaf_cfs_rq_list;
+
+ struct sched_avg avg;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ /*
+ * This is part of a global counter where only the total sum
+ * over all CPUs matters. A task can increase this counter on
+ * one CPU and if it got migrated afterwards it may decrease
+ * it on another CPU. Always updated under the runqueue lock:
+ */
+ unsigned long nr_uninterruptible;
+
+ struct task_struct *curr, *idle, *stop;
+ unsigned long next_balance;
+ struct mm_struct *prev_mm;
+
+ unsigned int clock_skip_update;
+ u64 clock;
+ u64 clock_task;
+
+ atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+ struct root_domain *rd;
+ struct sched_domain *sd;
+
+ unsigned long cpu_capacity;
+ unsigned long cpu_capacity_orig;
+
+ unsigned char idle_balance;
+ /* For active balancing */
+ int post_schedule;
+ int active_balance;
+ int push_cpu;
+ struct cpu_stop_work active_balance_work;
+ /* cpu of this runqueue: */
+ int cpu;
+ int online;
+
+ struct list_head cfs_tasks;
+
+ u64 rt_avg;
+ u64 age_stamp;
+ u64 idle_stamp;
+ u64 avg_idle;
+
+ /* This is used to determine avg_idle's max value */
+ u64 max_idle_balance_cost;
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif
+
+ /* calc_load related fields */
+ unsigned long calc_load_update;
+ long calc_load_active;
+
+#ifdef CONFIG_SCHED_HRTICK
+#ifdef CONFIG_SMP
+ int hrtick_csd_pending;
+ struct call_single_data hrtick_csd;
+#endif
+ struct hrtimer hrtick_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+ /* latency stats */
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+ /* sys_sched_yield() stats */
+ unsigned int yld_count;
+
+ /* schedule() stats */
+ unsigned int sched_count;
+ unsigned int sched_goidle;
+
+ /* try_to_wake_up() stats */
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+#endif
+
+#ifdef CONFIG_SMP
+ struct llist_head wake_list;
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+ /* Must be inspected within a rcu lock section */
+ struct cpuidle_state *idle_state;
+#endif
+};
+
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq->cpu;
+#else
+ return 0;
+#endif
+}
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() this_cpu_ptr(&runqueues)
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return ACCESS_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ return rq->clock_task;
+}
+
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+ lockdep_assert_held(&rq->lock);
+ if (skip)
+ rq->clock_skip_update |= RQCF_REQ_SKIP;
+ else
+ rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+ NUMA_MEM = 0,
+ NUMA_CPU,
+ NUMA_MEMBUF,
+ NUMA_CPUBUF
+};
+extern void sched_setnuma(struct task_struct *p, int node);
+extern int migrate_task_to(struct task_struct *p, int cpu);
+extern int migrate_swap(struct task_struct *, struct task_struct *);
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_SMP
+
+extern void sched_ttwu_pending(void);
+
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ __sd; __sd = __sd->parent)
+
+#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
+
+/**
+ * highest_flag_domain - Return highest sched_domain containing flag.
+ * @cpu: The cpu whose highest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the highest sched_domain
+ * for the given cpu.
+ *
+ * Returns the highest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd, *hsd = NULL;
+
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & flag))
+ break;
+ hsd = sd;
+ }
+
+ return hsd;
+}
+
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & flag)
+ break;
+ }
+
+ return sd;
+}
+
+DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
+DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+
+struct sched_group_capacity {
+ atomic_t ref;
+ /*
+ * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * for a single CPU.
+ */
+ unsigned int capacity;
+ unsigned long next_update;
+ int imbalance; /* XXX unrelated to capacity but shared group state */
+ /*
+ * Number of busy cpus in this group.
+ */
+ atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
+ unsigned int group_weight;
+ struct sched_group_capacity *sgc;
+
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ */
+ unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgc->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_cpus(group));
+}
+
+extern int group_balance_cpu(struct sched_group *sg);
+
+#else
+
+static inline void sched_ttwu_pending(void) { }
+
+#endif /* CONFIG_SMP */
+
+#include "stats.h"
+#include "auto_group.h"
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this tasks belongs.
+ *
+ * We cannot use task_css() and friends because the cgroup subsystem
+ * changes that value before the cgroup_subsys::attach() method is called,
+ * therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return p->sched_task_group;
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
+ struct task_group *tg = task_group(p);
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = tg->cfs_rq[cpu];
+ p->se.parent = tg->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ p->rt.rt_rq = tg->rt_rq[cpu];
+ p->rt.parent = tg->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+ /*
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+ * successfuly executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+ task_thread_info(p)->cpu = cpu;
+ p->wake_cpu = cpu;
+#endif
+}
+
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/static_key.h>
+# define const_debug __read_mostly
+#else
+# define const_debug const
+#endif
+
+extern const_debug unsigned int sysctl_sched_features;
+
+#define SCHED_FEAT(name, enabled) \
+ __SCHED_FEAT_##name ,
+
+enum {
+#include "features.h"
+ __SCHED_FEAT_NR,
+};
+
+#undef SCHED_FEAT
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+#define SCHED_FEAT(name, enabled) \
+static __always_inline bool static_branch_##name(struct static_key *key) \
+{ \
+ return static_key_##enabled(key); \
+}
+
+#include "features.h"
+
+#undef SCHED_FEAT
+
+extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
+#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#ifdef CONFIG_SCHED_DEBUG
+#define numabalancing_enabled sched_feat_numa(NUMA)
+#else
+extern bool numabalancing_enabled;
+#endif /* CONFIG_SCHED_DEBUG */
+#else
+#define sched_feat_numa(x) (0)
+#define numabalancing_enabled (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline u64 global_rt_period(void)
+{
+ return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_rt_runtime(void)
+{
+ if (sysctl_sched_rt_runtime < 0)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+}
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+ return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
+ return task_current(rq, p);
+#endif
+}
+
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_MIGRATING;
+}
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
+#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+#define WEIGHT_IDLEPRIO 3
+#define WMULT_IDLEPRIO 1431655765
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+static const int prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
+static const u32 prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
+
+#define ENQUEUE_WAKEUP 1
+#define ENQUEUE_HEAD 2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING 0
+#endif
+#define ENQUEUE_REPLENISH 8
+
+#define DEQUEUE_SLEEP 1
+
+#define RETRY_TASK ((void *)-1UL)
+
+struct sched_class {
+ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * It is the responsibility of the pick_next_task() method that will
+ * return the next task to call put_prev_task() on the @prev task or
+ * something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
+ */
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+
+ void (*post_schedule) (struct rq *this_rq);
+ void (*task_waking) (struct task_struct *task);
+ void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+
+ void (*set_cpus_allowed)(struct task_struct *p,
+ const struct cpumask *newmask);
+
+ void (*rq_online)(struct rq *rq);
+ void (*rq_offline)(struct rq *rq);
+#endif
+
+ void (*set_curr_task) (struct rq *rq);
+ void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+ void (*task_fork) (struct task_struct *p);
+ void (*task_dead) (struct task_struct *p);
+
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+ * cannot assume the switched_from/switched_to pair is serliazed by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+ int oldprio);
+
+ unsigned int (*get_rr_interval) (struct rq *rq,
+ struct task_struct *task);
+
+ void (*update_curr) (struct rq *rq);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ void (*task_move_group) (struct task_struct *p, int on_rq);
+#endif
+};
+
+static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ prev->sched_class->put_prev_task(rq, prev);
+}
+
+#define sched_class_highest (&stop_sched_class)
+#define for_each_class(class) \
+ for (class = sched_class_highest; class; class = class->next)
+
+extern const struct sched_class stop_sched_class;
+extern const struct sched_class dl_sched_class;
+extern const struct sched_class rt_sched_class;
+extern const struct sched_class fair_sched_class;
+extern const struct sched_class idle_sched_class;
+
+
+#ifdef CONFIG_SMP
+
+extern void update_group_capacity(struct sched_domain *sd, int cpu);
+
+extern void trigger_load_balance(struct rq *rq);
+
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+
+#else
+
+static inline void idle_enter_fair(struct rq *rq) { }
+static inline void idle_exit_fair(struct rq *rq) { }
+
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+ rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+ struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+ return NULL;
+}
+#endif
+
+extern void sysrq_sched_debug_show(void);
+extern void sched_init_granularity(void);
+extern void update_max_interval(void);
+
+extern void init_sched_dl_class(void);
+extern void init_sched_rt_class(void);
+extern void init_sched_fair_class(void);
+extern void init_sched_dl_class(void);
+
+extern void resched_curr(struct rq *rq);
+extern void resched_cpu(int cpu);
+
+extern struct rt_bandwidth def_rt_bandwidth;
+extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+
+extern struct dl_bandwidth def_dl_bandwidth;
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
+extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
+
+unsigned long to_ratio(u64 period, u64 runtime);
+
+extern void update_idle_cpu_load(struct rq *this_rq);
+
+extern void init_task_runnable_average(struct task_struct *p);
+
+static inline void add_nr_running(struct rq *rq, unsigned count)
+{
+ unsigned prev_nr = rq->nr_running;
+
+ rq->nr_running = prev_nr + count;
+
+ if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+ if (!rq->rd->overload)
+ rq->rd->overload = true;
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(rq->cpu)) {
+ /*
+ * Tick is needed if more than one task runs on a CPU.
+ * Send the target an IPI to kick it out of nohz mode.
+ *
+ * We assume that IPI implies full memory barrier and the
+ * new value of rq->nr_running is visible on reception
+ * from the target.
+ */
+ tick_nohz_full_kick_cpu(rq->cpu);
+ }
+#endif
+ }
+}
+
+static inline void sub_nr_running(struct rq *rq, unsigned count)
+{
+ rq->nr_running -= count;
+}
+
+static inline void rq_last_tick_reset(struct rq *rq)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = jiffies;
+#endif
+}
+
+extern void update_rq_clock(struct rq *rq);
+
+extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
+extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+
+extern const_debug unsigned int sysctl_sched_time_avg;
+extern const_debug unsigned int sysctl_sched_nr_migrate;
+extern const_debug unsigned int sysctl_sched_migration_cost;
+
+static inline u64 sched_avg_period(void)
+{
+ return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+
+/*
+ * Use hrtick when:
+ * - enabled by features
+ * - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ if (!cpu_active(cpu_of(rq)))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+void hrtick_start(struct rq *rq, u64 delay);
+
+#else
+
+static inline int hrtick_enabled(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HRTICK */
+
+#ifdef CONFIG_SMP
+extern void sched_avg_update(struct rq *rq);
+
+#ifndef arch_scale_freq_capacity
+static __always_inline
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+ rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
+ sched_avg_update(rq);
+}
+#else
+static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
+static inline void sched_avg_update(struct rq *rq) { }
+#endif
+
+extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT
+
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+/*
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ raw_spin_unlock(&this_rq->lock);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ int ret = 0;
+
+ if (unlikely(!raw_spin_trylock(&busiest->lock))) {
+ if (busiest < this_rq) {
+ raw_spin_unlock(&this_rq->lock);
+ raw_spin_lock(&busiest->lock);
+ raw_spin_lock_nested(&this_rq->lock,
+ SINGLE_DEPTH_NESTING);
+ ret = 1;
+ } else
+ raw_spin_lock_nested(&busiest->lock,
+ SINGLE_DEPTH_NESTING);
+ }
+ return ret;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ raw_spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+
+ return _double_lock_balance(this_rq, busiest);
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ raw_spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
+static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock_irq(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ raw_spin_lock(l1);
+ raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1 == rq2) {
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else {
+ if (rq1 < rq2) {
+ raw_spin_lock(&rq1->lock);
+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ raw_spin_lock(&rq2->lock);
+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ }
+ }
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ raw_spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ raw_spin_unlock(&rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
+#endif
+
+extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
+extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+extern void print_cfs_stats(struct seq_file *m, int cpu);
+extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
+
+extern void init_cfs_rq(struct cfs_rq *cfs_rq);
+extern void init_rt_rq(struct rt_rq *rt_rq);
+extern void init_dl_rq(struct dl_rq *dl_rq);
+
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
+
+#ifdef CONFIG_NO_HZ_COMMON
+enum rq_nohz_flag_bits {
+ NOHZ_TICK_STOPPED,
+ NOHZ_BALANCE_KICK,
+};
+
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+DECLARE_PER_CPU(u64, cpu_hardirq_time);
+DECLARE_PER_CPU(u64, cpu_softirq_time);
+
+#ifndef CONFIG_64BIT
+DECLARE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000..7466a0bb2
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,142 @@
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#ifndef CONFIG_SCHED_BFS
+#include "sched.h"
+#else
+#include "bfs_sched.h"
+#endif
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 15
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+ int cpu;
+
+ if (v == (void *)1) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ } else {
+ struct rq *rq;
+#ifdef CONFIG_SMP
+ struct sched_domain *sd;
+ int dcount = 0;
+#endif
+ cpu = (unsigned long)(v - 2);
+ rq = cpu_rq(cpu);
+
+ /* runqueue-specific stats */
+ seq_printf(seq,
+ "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_count,
+ rq->sched_count, rq->sched_goidle,
+ rq->ttwu_count, rq->ttwu_local,
+ rq->rq_cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+
+ seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+ /* domain-specific stats */
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ enum cpu_idle_type itype;
+
+ seq_printf(seq, "domain%d %*pb", dcount++,
+ cpumask_pr_args(sched_domain_span(sd)));
+ for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+ itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ sd->lb_count[itype],
+ sd->lb_balanced[itype],
+ sd->lb_failed[itype],
+ sd->lb_imbalance[itype],
+ sd->lb_gained[itype],
+ sd->lb_hot_gained[itype],
+ sd->lb_nobusyq[itype],
+ sd->lb_nobusyg[itype]);
+ }
+ seq_printf(seq,
+ " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+ sd->alb_count, sd->alb_failed, sd->alb_pushed,
+ sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
+ }
+ rcu_read_unlock();
+#endif
+ }
+ return 0;
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &schedstat_sops);
+}
+
+static const struct file_operations proc_schedstat_operations = {
+ .open = schedstat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init proc_schedstat_init(void)
+{
+ proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ return 0;
+}
+subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
new file mode 100644
index 000000000..4ab704339
--- /dev/null
+++ b/kernel/sched/stats.h
@@ -0,0 +1,267 @@
+
+#ifdef CONFIG_SCHEDSTATS
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{
+ if (rq) {
+ rq->rq_sched_info.run_delay += delta;
+ rq->rq_sched_info.pcount++;
+ }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_cpu_time += delta;
+}
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+ if (rq)
+ rq->rq_sched_info.run_delay += delta;
+}
+# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
+# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val) do { var = (val); } while (0)
+#else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{}
+# define schedstat_inc(rq, field) do { } while (0)
+# define schedstat_add(rq, field, amt) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
+#endif
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+ t->sched_info.last_queued = 0;
+}
+
+/*
+ * We are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a cpu, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across cpus. The
+ * delta taken on each cpu would annul the skew.
+ */
+static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+{
+ unsigned long long now = rq_clock(rq), delta = 0;
+
+ if (unlikely(sched_info_on()))
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_reset_dequeued(t);
+ t->sched_info.run_delay += delta;
+
+ rq_sched_info_dequeued(rq, delta);
+}
+
+/*
+ * Called when a task finally hits the cpu. We can now calculate how
+ * long it was waiting to run. We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static void sched_info_arrive(struct rq *rq, struct task_struct *t)
+{
+ unsigned long long now = rq_clock(rq), delta = 0;
+
+ if (t->sched_info.last_queued)
+ delta = now - t->sched_info.last_queued;
+ sched_info_reset_dequeued(t);
+ t->sched_info.run_delay += delta;
+ t->sched_info.last_arrival = now;
+ t->sched_info.pcount++;
+
+ rq_sched_info_arrive(rq, delta);
+}
+
+/*
+ * This function is only called from enqueue_task(), but also only updates
+ * the timestamp if it is already not set. It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+{
+ if (unlikely(sched_info_on()))
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = rq_clock(rq);
+}
+
+/*
+ * Called when a process ceases being the active-running process involuntarily
+ * due, typically, to expiring its time slice (this may also be called when
+ * switching to the idle task). Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
+ */
+static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
+{
+ unsigned long long delta = rq_clock(rq) -
+ t->sched_info.last_arrival;
+
+ rq_sched_info_depart(rq, delta);
+
+ if (t->state == TASK_RUNNING)
+ sched_info_queued(rq, t);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice. (This may also be called when switching to or from
+ * the idle task.) We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
+{
+ /*
+ * prev now departs the cpu. It's not interesting to record
+ * stats about how efficient we were at scheduling the idle
+ * process, however.
+ */
+ if (prev != rq->idle)
+ sched_info_depart(rq, prev);
+
+ if (next != rq->idle)
+ sched_info_arrive(rq, next);
+}
+static inline void
+sched_info_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
+{
+ if (unlikely(sched_info_on()))
+ __sched_info_switch(rq, prev, next);
+}
+#else
+#define sched_info_queued(rq, t) do { } while (0)
+#define sched_info_reset_dequeued(t) do { } while (0)
+#define sched_info_dequeued(rq, t) do { } while (0)
+#define sched_info_depart(rq, t) do { } while (0)
+#define sched_info_arrive(rq, next) do { } while (0)
+#define sched_info_switch(rq, t, next) do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick. None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * cputimer_running - return true if cputimer is running
+ *
+ * @tsk: Pointer to target task.
+ */
+static inline bool cputimer_running(struct task_struct *tsk)
+
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer->running)
+ return false;
+
+ /*
+ * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
+ * in __exit_signal(), we won't account to the signal struct further
+ * cputime consumed by that task, even though the task can still be
+ * ticking after __exit_signal().
+ *
+ * In order to keep a consistent behaviour between thread group cputime
+ * and thread group cputimer accounting, lets also ignore the cputime
+ * elapsing after __exit_signal() in any thread group timer running.
+ *
+ * This makes sure that POSIX CPU clocks and timers are synchronized, so
+ * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
+ * clock delta is behind the expiring timer value.
+ */
+ if (unlikely(!tsk->sighand))
+ return false;
+
+ return true;
+}
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the utime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer_running(tsk))
+ return;
+
+ raw_spin_lock(&cputimer->lock);
+ cputimer->cputime.utime += cputime;
+ raw_spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the stime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+ cputime_t cputime)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer_running(tsk))
+ return;
+
+ raw_spin_lock(&cputimer->lock);
+ cputimer->cputime.stime += cputime;
+ raw_spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @ns: Time value by which to increment the sum_exec_runtime field
+ * of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+ unsigned long long ns)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+ if (!cputimer_running(tsk))
+ return;
+
+ raw_spin_lock(&cputimer->lock);
+ cputimer->cputime.sum_exec_runtime += ns;
+ raw_spin_unlock(&cputimer->lock);
+}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
new file mode 100644
index 000000000..79ffec45a
--- /dev/null
+++ b/kernel/sched/stop_task.c
@@ -0,0 +1,136 @@
+#include "sched.h"
+
+/*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system, it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ return task_cpu(p); /* stop tasks as never migrate */
+}
+#endif /* CONFIG_SMP */
+
+static void
+check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+ /* we're never preempted */
+}
+
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
+{
+ struct task_struct *stop = rq->stop;
+
+ if (!stop || !task_on_rq_queued(stop))
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ stop->se.exec_start = rq_clock_task(rq);
+
+ return stop;
+}
+
+static void
+enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+ add_nr_running(rq, 1);
+}
+
+static void
+dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+ sub_nr_running(rq, 1);
+}
+
+static void yield_task_stop(struct rq *rq)
+{
+ BUG(); /* the stop task should never yield, its pointless. */
+}
+
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq_clock_task(rq);
+ cpuacct_charge(curr, delta_exec);
+}
+
+static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_stop(struct rq *rq)
+{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq_clock_task(rq);
+}
+
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
+{
+ BUG(); /* its impossible to change to this class */
+}
+
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ BUG(); /* how!?, what priority? */
+}
+
+static unsigned int
+get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+{
+ return 0;
+}
+
+static void update_curr_stop(struct rq *rq)
+{
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+const struct sched_class stop_sched_class = {
+ .next = &dl_sched_class,
+
+ .enqueue_task = enqueue_task_stop,
+ .dequeue_task = dequeue_task_stop,
+ .yield_task = yield_task_stop,
+
+ .check_preempt_curr = check_preempt_curr_stop,
+
+ .pick_next_task = pick_next_task_stop,
+ .put_prev_task = put_prev_task_stop,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_stop,
+#endif
+
+ .set_curr_task = set_curr_task_stop,
+ .task_tick = task_tick_stop,
+
+ .get_rr_interval = get_rr_interval_stop,
+
+ .prio_changed = prio_changed_stop,
+ .switched_to = switched_to_stop,
+ .update_curr = update_curr_stop,
+};
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
new file mode 100644
index 000000000..852143a79
--- /dev/null
+++ b/kernel/sched/wait.c
@@ -0,0 +1,624 @@
+/*
+ * Generic waiting primitives.
+ *
+ * (C) 2004 Nadia Yvette Chambers, Oracle
+ */
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+
+void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
+{
+ spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+
+EXPORT_SYMBOL(__init_waitqueue_head);
+
+void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue);
+
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue_tail(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(add_wait_queue_exclusive);
+
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __remove_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
+
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ wait_queue_t *curr, *next;
+
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ unsigned flags = curr->flags;
+
+ if (curr->func(curr, mode, wake_flags, key) &&
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ break;
+ }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+{
+ __wake_up_common(q, mode, nr, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+ __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+ int wake_flags = 1; /* XXX WF_SYNC */
+
+ if (unlikely(!q))
+ return;
+
+ if (unlikely(nr_exclusive != 1))
+ wake_flags = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+ __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+
+/*
+ * Note: we use "set_current_state()" _after_ the wait-queue add,
+ * because we need a memory barrier there on SMP, so that any
+ * wake-function that tests for the wait-queue being active
+ * will be guaranteed to see waitqueue addition _or_ subsequent
+ * tests in this thread will see the wakeup having taken place.
+ *
+ * The spin_unlock() itself is semi-permeable and only protects
+ * one way (it only protects stuff inside the critical region and
+ * stops them from bleeding out - it would still allow subsequent
+ * loads to move into the critical region).
+ */
+void
+prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait);
+
+void
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ wait->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list))
+ __add_wait_queue_tail(q, wait);
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_wait_exclusive);
+
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+{
+ unsigned long flags;
+
+ if (signal_pending_state(state, current))
+ return -ERESTARTSYS;
+
+ wait->private = current;
+ wait->func = autoremove_wake_function;
+
+ spin_lock_irqsave(&q->lock, flags);
+ if (list_empty(&wait->task_list)) {
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_tail(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ }
+ set_current_state(state);
+ spin_unlock_irqrestore(&q->lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
+
+/**
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ /*
+ * We can check for list emptiness outside the lock
+ * IFF:
+ * - we use the "careful" check that verifies both
+ * the next and prev pointers, so that there cannot
+ * be any half-pending updates in progress on other
+ * CPU's that we haven't seen yet (and that might
+ * still change the stack area.
+ * and
+ * - all other users take the lock (ie we can only
+ * have _one_ other CPU that looks at or modifies
+ * the list).
+ */
+ if (!list_empty_careful(&wait->task_list)) {
+ spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_wait);
+
+/**
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @mode: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and no one wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_locked_key(q, mode, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
+int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ int ret = default_wake_function(wait, mode, sync, key);
+
+ if (ret)
+ list_del_init(&wait->task_list);
+ return ret;
+}
+EXPORT_SYMBOL(autoremove_wake_function);
+
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_func);
+ *
+ * add_wait_queue(&wq, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * p->state = mode; condition = true;
+ * smp_mb(); // A smp_wmb(); // C
+ * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
+ * schedule() try_to_wake_up();
+ * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
+ * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * smp_mb() // B smp_wmb(); // C
+ * wait->flags |= WQ_FLAG_WOKEN;
+ * }
+ * remove_wait_queue(&wq, &wait);
+ *
+ */
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+{
+ set_current_state(mode); /* A */
+ /*
+ * The above implies an smp_mb(), which matches with the smp_wmb() from
+ * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
+ * also observe all state before the wakeup.
+ */
+ if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below implies an smp_mb(), it too pairs with the smp_wmb() from
+ * woken_wake_function() such that we must either observe the wait
+ * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
+ * an event.
+ */
+ set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+ * doesn't imply write barrier and the users expects write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with set_mb() in wait_woken().
+ */
+ smp_wmb(); /* C */
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
+
+int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+ else
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched
+__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ if (test_bit(q->key.bit_nr, q->key.flags))
+ ret = (*action)(&q->key);
+ } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched out_of_line_wait_on_bit_timeout(
+ void *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ wait.key.timeout = jiffies + timeout;
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
+int __sched
+__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ wait_bit_action_f *action, unsigned mode)
+{
+ do {
+ int ret;
+
+ prepare_to_wait_exclusive(wq, &q->wait, mode);
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(&q->key);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
+ } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
+ finish_wait(wq, &q->wait);
+ return 0;
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ return __wait_on_bit_lock(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ const struct zone *zone = page_zone(virt_to_page(word));
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+ if (BITS_PER_LONG == 64) {
+ unsigned long q = (unsigned long)p;
+ return bit_waitqueue((void *)(q & ~1), q & 1);
+ }
+ return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
+ void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue *wait_bit
+ = container_of(wait, struct wait_bit_queue, wait);
+ atomic_t *val = key->flags;
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ atomic_read(val) != 0)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
+ * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
+ * return codes halt waiting and return.
+ */
+static __sched
+int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
+ int (*action)(atomic_t *), unsigned mode)
+{
+ atomic_t *val;
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq, &q->wait, mode);
+ val = q->key.flags;
+ if (atomic_read(val) == 0)
+ break;
+ ret = (*action)(val);
+ } while (!ret && atomic_read(val) != 0);
+ finish_wait(wq, &q->wait);
+ return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p) \
+ struct wait_bit_queue name = { \
+ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
+ .wait = { \
+ .private = current, \
+ .func = wake_atomic_t_function, \
+ .task_list = \
+ LIST_HEAD_INIT((name).wait.task_list), \
+ }, \
+ }
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+ unsigned mode)
+{
+ wait_queue_head_t *wq = atomic_t_waitqueue(p);
+ DEFINE_WAIT_ATOMIC_T(wait, p);
+
+ return __wait_on_atomic_t(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on a atomic_t
+ * @p: The atomic_t being waited on, a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not the by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+ __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);
+
+__sched int bit_wait(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ io_schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ io_schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
new file mode 100644
index 000000000..4f4402894
--- /dev/null
+++ b/kernel/seccomp.c
@@ -0,0 +1,904 @@
+/*
+ * linux/kernel/seccomp.c
+ *
+ * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
+ *
+ * Copyright (C) 2012 Google, Inc.
+ * Will Drewry <wad@chromium.org>
+ *
+ * This defines a simple but solid secure-computing facility.
+ *
+ * Mode 1 uses a fixed list of allowed system calls.
+ * Mode 2 allows user-defined system call filters in the form
+ * of Berkeley Packet Filters/Linux Socket Filters.
+ */
+
+#include <linux/atomic.h>
+#include <linux/audit.h>
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/seccomp.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
+
+#ifdef CONFIG_SECCOMP_FILTER
+#include <linux/filter.h>
+#include <linux/pid.h>
+#include <linux/ptrace.h>
+#include <linux/security.h>
+#include <linux/tracehook.h>
+#include <linux/uaccess.h>
+
+/**
+ * struct seccomp_filter - container for seccomp BPF programs
+ *
+ * @usage: reference count to manage the object lifetime.
+ * get/put helpers should be used when accessing an instance
+ * outside of a lifetime-guarded section. In general, this
+ * is only needed for handling filters shared across tasks.
+ * @prev: points to a previously installed, or inherited, filter
+ * @len: the number of instructions in the program
+ * @insnsi: the BPF program instructions to evaluate
+ *
+ * seccomp_filter objects are organized in a tree linked via the @prev
+ * pointer. For any task, it appears to be a singly-linked list starting
+ * with current->seccomp.filter, the most recently attached or inherited filter.
+ * However, multiple filters may share a @prev node, by way of fork(), which
+ * results in a unidirectional tree existing in memory. This is similar to
+ * how namespaces work.
+ *
+ * seccomp_filter objects should never be modified after being attached
+ * to a task_struct (other than @usage).
+ */
+struct seccomp_filter {
+ atomic_t usage;
+ struct seccomp_filter *prev;
+ struct bpf_prog *prog;
+};
+
+/* Limit any path through the tree to 256KB worth of instructions. */
+#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+/*
+ * Endianness is explicitly ignored and left for BPF program authors to manage
+ * as per the specific architecture.
+ */
+static void populate_seccomp_data(struct seccomp_data *sd)
+{
+ struct task_struct *task = current;
+ struct pt_regs *regs = task_pt_regs(task);
+ unsigned long args[6];
+
+ sd->nr = syscall_get_nr(task, regs);
+ sd->arch = syscall_get_arch();
+ syscall_get_arguments(task, regs, 0, 6, args);
+ sd->args[0] = args[0];
+ sd->args[1] = args[1];
+ sd->args[2] = args[2];
+ sd->args[3] = args[3];
+ sd->args[4] = args[4];
+ sd->args[5] = args[5];
+ sd->instruction_pointer = KSTK_EIP(task);
+}
+
+/**
+ * seccomp_check_filter - verify seccomp filter code
+ * @filter: filter to verify
+ * @flen: length of filter
+ *
+ * Takes a previously checked filter (by bpf_check_classic) and
+ * redirects all filter code that loads struct sk_buff data
+ * and related data through seccomp_bpf_load. It also
+ * enforces length and alignment checking of those loads.
+ *
+ * Returns 0 if the rule set is legal or -EINVAL if not.
+ */
+static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+{
+ int pc;
+ for (pc = 0; pc < flen; pc++) {
+ struct sock_filter *ftest = &filter[pc];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ ftest->code = BPF_LDX | BPF_W | BPF_ABS;
+ /* 32-bit aligned and not out of bounds. */
+ if (k >= sizeof(struct seccomp_data) || k & 3)
+ return -EINVAL;
+ continue;
+ case BPF_LD | BPF_W | BPF_LEN:
+ ftest->code = BPF_LD | BPF_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ case BPF_LDX | BPF_W | BPF_LEN:
+ ftest->code = BPF_LDX | BPF_IMM;
+ ftest->k = sizeof(struct seccomp_data);
+ continue;
+ /* Explicitly include allowed calls. */
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ case BPF_MISC | BPF_TAX:
+ case BPF_MISC | BPF_TXA:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ case BPF_JMP | BPF_JA:
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ continue;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/**
+ * seccomp_run_filters - evaluates all seccomp filters against @syscall
+ * @syscall: number of the current system call
+ *
+ * Returns valid seccomp BPF response codes.
+ */
+static u32 seccomp_run_filters(struct seccomp_data *sd)
+{
+ struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
+ struct seccomp_data sd_local;
+ u32 ret = SECCOMP_RET_ALLOW;
+
+ /* Ensure unexpected behavior doesn't result in failing open. */
+ if (unlikely(WARN_ON(f == NULL)))
+ return SECCOMP_RET_KILL;
+
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ smp_read_barrier_depends();
+
+ if (!sd) {
+ populate_seccomp_data(&sd_local);
+ sd = &sd_local;
+ }
+
+ /*
+ * All filters in the list are evaluated and the lowest BPF return
+ * value always takes priority (ignoring the DATA).
+ */
+ for (; f; f = f->prev) {
+ u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
+
+ if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ ret = cur_ret;
+ }
+ return ret;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+
+static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
+{
+ assert_spin_locked(&current->sighand->siglock);
+
+ if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
+ return false;
+
+ return true;
+}
+
+static inline void seccomp_assign_mode(struct task_struct *task,
+ unsigned long seccomp_mode)
+{
+ assert_spin_locked(&task->sighand->siglock);
+
+ task->seccomp.mode = seccomp_mode;
+ /*
+ * Make sure TIF_SECCOMP cannot be set before the mode (and
+ * filter) is set.
+ */
+ smp_mb__before_atomic();
+ set_tsk_thread_flag(task, TIF_SECCOMP);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/* Returns 1 if the parent is an ancestor of the child. */
+static int is_ancestor(struct seccomp_filter *parent,
+ struct seccomp_filter *child)
+{
+ /* NULL is the root ancestor. */
+ if (parent == NULL)
+ return 1;
+ for (; child; child = child->prev)
+ if (child == parent)
+ return 1;
+ return 0;
+}
+
+/**
+ * seccomp_can_sync_threads: checks if all threads can be synchronized
+ *
+ * Expects sighand and cred_guard_mutex locks to be held.
+ *
+ * Returns 0 on success, -ve on error, or the pid of a thread which was
+ * either not in the correct seccomp mode or it did not have an ancestral
+ * seccomp filter.
+ */
+static inline pid_t seccomp_can_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Validate all threads being eligible for synchronization. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ pid_t failed;
+
+ /* Skip current, since it is initiating the sync. */
+ if (thread == caller)
+ continue;
+
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
+ (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
+ is_ancestor(thread->seccomp.filter,
+ caller->seccomp.filter)))
+ continue;
+
+ /* Return the first thread that cannot be synchronized. */
+ failed = task_pid_vnr(thread);
+ /* If the pid cannot be resolved, then return -ESRCH */
+ if (unlikely(WARN_ON(failed == 0)))
+ failed = -ESRCH;
+ return failed;
+ }
+
+ return 0;
+}
+
+/**
+ * seccomp_sync_threads: sets all threads to use current's filter
+ *
+ * Expects sighand and cred_guard_mutex locks to be held, and for
+ * seccomp_can_sync_threads() to have returned success already
+ * without dropping the locks.
+ *
+ */
+static inline void seccomp_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Synchronize all threads. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ /* Skip current, since it needs no changes. */
+ if (thread == caller)
+ continue;
+
+ /* Get a task reference for the new leaf node. */
+ get_seccomp_filter(caller);
+ /*
+ * Drop the task reference to the shared ancestor since
+ * current's path will hold a reference. (This also
+ * allows a put before the assignment.)
+ */
+ put_seccomp_filter(thread);
+ smp_store_release(&thread->seccomp.filter,
+ caller->seccomp.filter);
+ /*
+ * Opt the other thread into seccomp if needed.
+ * As threads are considered to be trust-realm
+ * equivalent (see ptrace_may_access), it is safe to
+ * allow one thread to transition the other.
+ */
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ }
+ }
+}
+
+/**
+ * seccomp_prepare_filter: Prepares a seccomp filter for use.
+ * @fprog: BPF program to install
+ *
+ * Returns filter on success or an ERR_PTR on failure.
+ */
+static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
+{
+ struct seccomp_filter *filter;
+ unsigned long fp_size;
+ struct sock_filter *fp;
+ int new_len;
+ long ret;
+
+ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
+ return ERR_PTR(-EINVAL);
+ BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
+ fp_size = fprog->len * sizeof(struct sock_filter);
+
+ /*
+ * Installing a seccomp filter requires that the task has
+ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+ * This avoids scenarios where unprivileged tasks can affect the
+ * behavior of privileged children.
+ */
+ if (!task_no_new_privs(current) &&
+ security_capable_noaudit(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN) != 0)
+ return ERR_PTR(-EACCES);
+
+ fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
+ if (!fp)
+ return ERR_PTR(-ENOMEM);
+
+ /* Copy the instructions from fprog. */
+ ret = -EFAULT;
+ if (copy_from_user(fp, fprog->filter, fp_size))
+ goto free_prog;
+
+ /* Check and rewrite the fprog via the skb checker */
+ ret = bpf_check_classic(fp, fprog->len);
+ if (ret)
+ goto free_prog;
+
+ /* Check and rewrite the fprog for seccomp use */
+ ret = seccomp_check_filter(fp, fprog->len);
+ if (ret)
+ goto free_prog;
+
+ /* Convert 'sock_filter' insns to 'bpf_insn' insns */
+ ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
+ if (ret)
+ goto free_prog;
+
+ /* Allocate a new seccomp_filter */
+ ret = -ENOMEM;
+ filter = kzalloc(sizeof(struct seccomp_filter),
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter)
+ goto free_prog;
+
+ filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
+ if (!filter->prog)
+ goto free_filter;
+
+ ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
+ if (ret)
+ goto free_filter_prog;
+
+ kfree(fp);
+ atomic_set(&filter->usage, 1);
+ filter->prog->len = new_len;
+
+ bpf_prog_select_runtime(filter->prog);
+
+ return filter;
+
+free_filter_prog:
+ __bpf_prog_free(filter->prog);
+free_filter:
+ kfree(filter);
+free_prog:
+ kfree(fp);
+ return ERR_PTR(ret);
+}
+
+/**
+ * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
+ * @user_filter: pointer to the user data containing a sock_fprog.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
+{
+ struct sock_fprog fprog;
+ struct seccomp_filter *filter = ERR_PTR(-EFAULT);
+
+#ifdef CONFIG_COMPAT
+ if (is_compat_task()) {
+ struct compat_sock_fprog fprog32;
+ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
+ goto out;
+ fprog.len = fprog32.len;
+ fprog.filter = compat_ptr(fprog32.filter);
+ } else /* falls through to the if below. */
+#endif
+ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
+ goto out;
+ filter = seccomp_prepare_filter(&fprog);
+out:
+ return filter;
+}
+
+/**
+ * seccomp_attach_filter: validate and attach filter
+ * @flags: flags to change filter behavior
+ * @filter: seccomp filter to add to the current process
+ *
+ * Caller must be holding current->sighand->siglock lock.
+ *
+ * Returns 0 on success, -ve on error.
+ */
+static long seccomp_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
+{
+ unsigned long total_insns;
+ struct seccomp_filter *walker;
+
+ assert_spin_locked(&current->sighand->siglock);
+
+ /* Validate resulting filter length. */
+ total_insns = filter->prog->len;
+ for (walker = current->seccomp.filter; walker; walker = walker->prev)
+ total_insns += walker->prog->len + 4; /* 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /* If thread sync has been requested, check that it is possible. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ int ret;
+
+ ret = seccomp_can_sync_threads();
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+
+ /* Now that the new filter is in place, synchronize to all threads. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ seccomp_sync_threads();
+
+ return 0;
+}
+
+/* get_seccomp_filter - increments the reference count of the filter on @tsk */
+void get_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ if (!orig)
+ return;
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&orig->usage);
+}
+
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_free(filter->prog);
+ kfree(filter);
+ }
+}
+
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig = tsk->seccomp.filter;
+ /* Clean up single-reference branches iteratively. */
+ while (orig && atomic_dec_and_test(&orig->usage)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ seccomp_filter_free(freeme);
+ }
+}
+
+/**
+ * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
+ * @syscall: syscall number to send to userland
+ * @reason: filter-supplied reason code to send to userland (via si_errno)
+ *
+ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
+ */
+static void seccomp_send_sigsys(int syscall, int reason)
+{
+ struct siginfo info;
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_SECCOMP;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = reason;
+ info.si_arch = syscall_get_arch();
+ info.si_syscall = syscall;
+ force_sig_info(SIGSYS, &info, current);
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+
+/*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+ * To be fully secure this must be combined with rlimit
+ * to limit the stack allocations too.
+ */
+static int mode1_syscalls[] = {
+ __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+ 0, /* null terminated */
+};
+
+#ifdef CONFIG_COMPAT
+static int mode1_syscalls_32[] = {
+ __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
+ 0, /* null terminated */
+};
+#endif
+
+static void __secure_computing_strict(int this_syscall)
+{
+ int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ syscall_whitelist = mode1_syscalls_32;
+#endif
+ do {
+ if (*syscall_whitelist == this_syscall)
+ return;
+ } while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+ dump_stack();
+#endif
+ audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+ do_exit(SIGKILL);
+}
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
+{
+ int mode = current->seccomp.mode;
+
+ if (mode == 0)
+ return;
+ else if (mode == SECCOMP_MODE_STRICT)
+ __secure_computing_strict(this_syscall);
+ else
+ BUG();
+}
+#else
+int __secure_computing(void)
+{
+ u32 phase1_result = seccomp_phase1(NULL);
+
+ if (likely(phase1_result == SECCOMP_PHASE1_OK))
+ return 0;
+ else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+ return -1;
+ else
+ return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+{
+ u32 filter_ret, action;
+ int data;
+
+ /*
+ * Make sure that any changes to mode from another thread have
+ * been seen after TIF_SECCOMP was seen.
+ */
+ rmb();
+
+ filter_ret = seccomp_run_filters(sd);
+ data = filter_ret & SECCOMP_RET_DATA;
+ action = filter_ret & SECCOMP_RET_ACTION;
+
+ switch (action) {
+ case SECCOMP_RET_ERRNO:
+ /* Set low-order bits as an errno, capped at MAX_ERRNO. */
+ if (data > MAX_ERRNO)
+ data = MAX_ERRNO;
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
+
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+
+ case SECCOMP_RET_TRACE:
+ return filter_ret; /* Save the rest for phase 2. */
+
+ case SECCOMP_RET_ALLOW:
+ return SECCOMP_PHASE1_OK;
+
+ case SECCOMP_RET_KILL:
+ default:
+ audit_seccomp(this_syscall, SIGSYS, action);
+ do_exit(SIGSYS);
+ }
+
+ unreachable();
+
+skip:
+ audit_seccomp(this_syscall, 0, action);
+ return SECCOMP_PHASE1_SKIP;
+}
+#endif
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @arg sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers. The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked. In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+ int mode = current->seccomp.mode;
+ int this_syscall = sd ? sd->nr :
+ syscall_get_nr(current, task_pt_regs(current));
+
+ switch (mode) {
+ case SECCOMP_MODE_STRICT:
+ __secure_computing_strict(this_syscall); /* may call do_exit */
+ return SECCOMP_PHASE1_OK;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ return __seccomp_phase1_filter(this_syscall, sd);
+#endif
+ default:
+ BUG();
+ }
+}
+
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ u32 action = phase1_result & SECCOMP_RET_ACTION;
+ int data = phase1_result & SECCOMP_RET_DATA;
+
+ BUG_ON(action != SECCOMP_RET_TRACE);
+
+ audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current, regs,
+ -ENOSYS, 0);
+ return -1;
+ }
+
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ do_exit(SIGSYS);
+ if (syscall_get_nr(current, regs) < 0)
+ return -1; /* Explicit request to skip. */
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
+
+long prctl_get_seccomp(void)
+{
+ return current->seccomp.mode;
+}
+
+/**
+ * seccomp_set_mode_strict: internal function for setting strict seccomp
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+static long seccomp_set_mode_strict(void)
+{
+ const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
+ long ret = -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+#ifdef TIF_NOTSC
+ disable_TSC();
+#endif
+ seccomp_assign_mode(current, seccomp_mode);
+ ret = 0;
+
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return ret;
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/**
+ * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @flags: flags to change filter behavior
+ * @filter: struct sock_fprog containing filter
+ *
+ * This function may be called repeatedly to install additional filters.
+ * Every filter successfully installed will be evaluated (in reverse order)
+ * for each system call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+static long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
+{
+ const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+ struct seccomp_filter *prepared = NULL;
+ long ret = -EINVAL;
+
+ /* Validate flags. */
+ if (flags & ~SECCOMP_FILTER_FLAG_MASK)
+ return -EINVAL;
+
+ /* Prepare the new filter before holding any locks. */
+ prepared = seccomp_prepare_user_filter(filter);
+ if (IS_ERR(prepared))
+ return PTR_ERR(prepared);
+
+ /*
+ * Make sure we cannot change seccomp or nnp state via TSYNC
+ * while another thread is in the middle of calling exec.
+ */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
+ mutex_lock_killable(&current->signal->cred_guard_mutex))
+ goto out_free;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+ ret = seccomp_attach_filter(flags, prepared);
+ if (ret)
+ goto out;
+ /* Do not free the successfully attached filter. */
+ prepared = NULL;
+
+ seccomp_assign_mode(current, seccomp_mode);
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ mutex_unlock(&current->signal->cred_guard_mutex);
+out_free:
+ seccomp_filter_free(prepared);
+ return ret;
+}
+#else
+static inline long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
+{
+ return -EINVAL;
+}
+#endif
+
+/* Common entry point for both prctl and syscall. */
+static long do_seccomp(unsigned int op, unsigned int flags,
+ const char __user *uargs)
+{
+ switch (op) {
+ case SECCOMP_SET_MODE_STRICT:
+ if (flags != 0 || uargs != NULL)
+ return -EINVAL;
+ return seccomp_set_mode_strict();
+ case SECCOMP_SET_MODE_FILTER:
+ return seccomp_set_mode_filter(flags, uargs);
+ default:
+ return -EINVAL;
+ }
+}
+
+SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
+ const char __user *, uargs)
+{
+ return do_seccomp(op, flags, uargs);
+}
+
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+{
+ unsigned int op;
+ char __user *uargs;
+
+ switch (seccomp_mode) {
+ case SECCOMP_MODE_STRICT:
+ op = SECCOMP_SET_MODE_STRICT;
+ /*
+ * Setting strict mode through prctl always ignored filter,
+ * so make sure it is always NULL here to pass the internal
+ * check in do_seccomp().
+ */
+ uargs = NULL;
+ break;
+ case SECCOMP_MODE_FILTER:
+ op = SECCOMP_SET_MODE_FILTER;
+ uargs = filter;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* prctl interface doesn't have flags, so they are always zero. */
+ return do_seccomp(op, 0, uargs);
+}
diff --git a/kernel/signal.c b/kernel/signal.c
new file mode 100644
index 000000000..d51c5ddd8
--- /dev/null
+++ b/kernel/signal.c
@@ -0,0 +1,3674 @@
+/*
+ * linux/kernel/signal.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson
+ *
+ * 2003-06-02 Jim Houston - Concurrent Computer Corp.
+ * Changes to use preallocated sigqueue structures
+ * to allow signals to be sent reliably.
+ */
+
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/tty.h>
+#include <linux/binfmts.h>
+#include <linux/coredump.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/ptrace.h>
+#include <linux/signal.h>
+#include <linux/signalfd.h>
+#include <linux/ratelimit.h>
+#include <linux/tracehook.h>
+#include <linux/capability.h>
+#include <linux/freezer.h>
+#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
+#include <linux/uprobes.h>
+#include <linux/compat.h>
+#include <linux/cn_proc.h>
+#include <linux/compiler.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/signal.h>
+
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/siginfo.h>
+#include <asm/cacheflush.h>
+#include "audit.h" /* audit_signal_info() */
+
+/*
+ * SLAB caches for signal bits.
+ */
+
+static struct kmem_cache *sigqueue_cachep;
+
+int print_fatal_signals __read_mostly;
+
+static void __user *sig_handler(struct task_struct *t, int sig)
+{
+ return t->sighand->action[sig - 1].sa.sa_handler;
+}
+
+static int sig_handler_ignored(void __user *handler, int sig)
+{
+ /* Is it explicitly or implicitly ignored? */
+ return handler == SIG_IGN ||
+ (handler == SIG_DFL && sig_kernel_ignore(sig));
+}
+
+static int sig_task_ignored(struct task_struct *t, int sig, bool force)
+{
+ void __user *handler;
+
+ handler = sig_handler(t, sig);
+
+ if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
+ handler == SIG_DFL && !force)
+ return 1;
+
+ return sig_handler_ignored(handler, sig);
+}
+
+static int sig_ignored(struct task_struct *t, int sig, bool force)
+{
+ /*
+ * Blocked signals are never ignored, since the
+ * signal handler may change by the time it is
+ * unblocked.
+ */
+ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
+ return 0;
+
+ if (!sig_task_ignored(t, sig, force))
+ return 0;
+
+ /*
+ * Tracers may want to know about even ignored signals.
+ */
+ return !t->ptrace;
+}
+
+/*
+ * Re-calculate pending state from the set of locally pending
+ * signals, globally pending signals, and blocked signals.
+ */
+static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+{
+ unsigned long ready;
+ long i;
+
+ switch (_NSIG_WORDS) {
+ default:
+ for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+ ready |= signal->sig[i] &~ blocked->sig[i];
+ break;
+
+ case 4: ready = signal->sig[3] &~ blocked->sig[3];
+ ready |= signal->sig[2] &~ blocked->sig[2];
+ ready |= signal->sig[1] &~ blocked->sig[1];
+ ready |= signal->sig[0] &~ blocked->sig[0];
+ break;
+
+ case 2: ready = signal->sig[1] &~ blocked->sig[1];
+ ready |= signal->sig[0] &~ blocked->sig[0];
+ break;
+
+ case 1: ready = signal->sig[0] &~ blocked->sig[0];
+ }
+ return ready != 0;
+}
+
+#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
+
+static int recalc_sigpending_tsk(struct task_struct *t)
+{
+ if ((t->jobctl & JOBCTL_PENDING_MASK) ||
+ PENDING(&t->pending, &t->blocked) ||
+ PENDING(&t->signal->shared_pending, &t->blocked)) {
+ set_tsk_thread_flag(t, TIF_SIGPENDING);
+ return 1;
+ }
+ /*
+ * We must never clear the flag in another thread, or in current
+ * when it's possible the current syscall is returning -ERESTART*.
+ * So we don't clear it here, and only callers who know they should do.
+ */
+ return 0;
+}
+
+/*
+ * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
+ * This is superfluous when called on current, the wakeup is a harmless no-op.
+ */
+void recalc_sigpending_and_wake(struct task_struct *t)
+{
+ if (recalc_sigpending_tsk(t))
+ signal_wake_up(t, 0);
+}
+
+void recalc_sigpending(void)
+{
+ if (!recalc_sigpending_tsk(current) && !freezing(current))
+ clear_thread_flag(TIF_SIGPENDING);
+
+}
+
+/* Given the mask, find the first available signal that should be serviced. */
+
+#define SYNCHRONOUS_MASK \
+ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
+ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
+
+int next_signal(struct sigpending *pending, sigset_t *mask)
+{
+ unsigned long i, *s, *m, x;
+ int sig = 0;
+
+ s = pending->signal.sig;
+ m = mask->sig;
+
+ /*
+ * Handle the first word specially: it contains the
+ * synchronous signals that need to be dequeued first.
+ */
+ x = *s &~ *m;
+ if (x) {
+ if (x & SYNCHRONOUS_MASK)
+ x &= SYNCHRONOUS_MASK;
+ sig = ffz(~x) + 1;
+ return sig;
+ }
+
+ switch (_NSIG_WORDS) {
+ default:
+ for (i = 1; i < _NSIG_WORDS; ++i) {
+ x = *++s &~ *++m;
+ if (!x)
+ continue;
+ sig = ffz(~x) + i*_NSIG_BPW + 1;
+ break;
+ }
+ break;
+
+ case 2:
+ x = s[1] &~ m[1];
+ if (!x)
+ break;
+ sig = ffz(~x) + _NSIG_BPW + 1;
+ break;
+
+ case 1:
+ /* Nothing to do */
+ break;
+ }
+
+ return sig;
+}
+
+static inline void print_dropped_signal(int sig)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!print_fatal_signals)
+ return;
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
+ printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
+ current->comm, current->pid, sig);
+}
+
+/**
+ * task_set_jobctl_pending - set jobctl pending bits
+ * @task: target task
+ * @mask: pending bits to set
+ *
+ * Clear @mask from @task->jobctl. @mask must be subset of
+ * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
+ * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
+ * cleared. If @task is already being killed or exiting, this function
+ * becomes noop.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if @mask is set, %false if made noop because @task was dying.
+ */
+bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+{
+ BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
+ JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
+ BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
+
+ if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+ return false;
+
+ if (mask & JOBCTL_STOP_SIGMASK)
+ task->jobctl &= ~JOBCTL_STOP_SIGMASK;
+
+ task->jobctl |= mask;
+ return true;
+}
+
+/**
+ * task_clear_jobctl_trapping - clear jobctl trapping bit
+ * @task: target task
+ *
+ * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
+ * Clear it and wake up the ptracer. Note that we don't need any further
+ * locking. @task->siglock guarantees that @task->parent points to the
+ * ptracer.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+void task_clear_jobctl_trapping(struct task_struct *task)
+{
+ if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_TRAPPING;
+ smp_mb(); /* advised by wake_up_bit() */
+ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
+ }
+}
+
+/**
+ * task_clear_jobctl_pending - clear jobctl pending bits
+ * @task: target task
+ * @mask: pending bits to clear
+ *
+ * Clear @mask from @task->jobctl. @mask must be subset of
+ * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
+ * STOP bits are cleared together.
+ *
+ * If clearing of @mask leaves no stop or trap pending, this function calls
+ * task_clear_jobctl_trapping().
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+{
+ BUG_ON(mask & ~JOBCTL_PENDING_MASK);
+
+ if (mask & JOBCTL_STOP_PENDING)
+ mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
+
+ task->jobctl &= ~mask;
+
+ if (!(task->jobctl & JOBCTL_PENDING_MASK))
+ task_clear_jobctl_trapping(task);
+}
+
+/**
+ * task_participate_group_stop - participate in a group stop
+ * @task: task participating in a group stop
+ *
+ * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
+ * Group stop states are cleared and the group stop count is consumed if
+ * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
+ * stop, the appropriate %SIGNAL_* flags are set.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if group stop completion should be notified to the parent, %false
+ * otherwise.
+ */
+static bool task_participate_group_stop(struct task_struct *task)
+{
+ struct signal_struct *sig = task->signal;
+ bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
+
+ WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
+
+ task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
+
+ if (!consume)
+ return false;
+
+ if (!WARN_ON_ONCE(sig->group_stop_count == 0))
+ sig->group_stop_count--;
+
+ /*
+ * Tell the caller to notify completion iff we are entering into a
+ * fresh group stop. Read comment in do_signal_stop() for details.
+ */
+ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
+ sig->flags = SIGNAL_STOP_STOPPED;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * allocate a new signal queue record
+ * - this may be called without locks if and only if t == current, otherwise an
+ * appropriate lock must be held to stop the target task from exiting
+ */
+static struct sigqueue *
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+{
+ struct sigqueue *q = NULL;
+ struct user_struct *user;
+
+ /*
+ * Protect access to @t credentials. This can go away when all
+ * callers hold rcu read lock.
+ */
+ rcu_read_lock();
+ user = get_uid(__task_cred(t)->user);
+ atomic_inc(&user->sigpending);
+ rcu_read_unlock();
+
+ if (override_rlimit ||
+ atomic_read(&user->sigpending) <=
+ task_rlimit(t, RLIMIT_SIGPENDING)) {
+ q = kmem_cache_alloc(sigqueue_cachep, flags);
+ } else {
+ print_dropped_signal(sig);
+ }
+
+ if (unlikely(q == NULL)) {
+ atomic_dec(&user->sigpending);
+ free_uid(user);
+ } else {
+ INIT_LIST_HEAD(&q->list);
+ q->flags = 0;
+ q->user = user;
+ }
+
+ return q;
+}
+
+static void __sigqueue_free(struct sigqueue *q)
+{
+ if (q->flags & SIGQUEUE_PREALLOC)
+ return;
+ atomic_dec(&q->user->sigpending);
+ free_uid(q->user);
+ kmem_cache_free(sigqueue_cachep, q);
+}
+
+void flush_sigqueue(struct sigpending *queue)
+{
+ struct sigqueue *q;
+
+ sigemptyset(&queue->signal);
+ while (!list_empty(&queue->list)) {
+ q = list_entry(queue->list.next, struct sigqueue , list);
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+}
+
+/*
+ * Flush all pending signals for a task.
+ */
+void __flush_signals(struct task_struct *t)
+{
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ flush_sigqueue(&t->pending);
+ flush_sigqueue(&t->signal->shared_pending);
+}
+
+void flush_signals(struct task_struct *t)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&t->sighand->siglock, flags);
+ __flush_signals(t);
+ spin_unlock_irqrestore(&t->sighand->siglock, flags);
+}
+
+static void __flush_itimer_signals(struct sigpending *pending)
+{
+ sigset_t signal, retain;
+ struct sigqueue *q, *n;
+
+ signal = pending->signal;
+ sigemptyset(&retain);
+
+ list_for_each_entry_safe(q, n, &pending->list, list) {
+ int sig = q->info.si_signo;
+
+ if (likely(q->info.si_code != SI_TIMER)) {
+ sigaddset(&retain, sig);
+ } else {
+ sigdelset(&signal, sig);
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+ }
+
+ sigorsets(&pending->signal, &signal, &retain);
+}
+
+void flush_itimer_signals(void)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->sighand->siglock, flags);
+ __flush_itimer_signals(&tsk->pending);
+ __flush_itimer_signals(&tsk->signal->shared_pending);
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+}
+
+void ignore_signals(struct task_struct *t)
+{
+ int i;
+
+ for (i = 0; i < _NSIG; ++i)
+ t->sighand->action[i].sa.sa_handler = SIG_IGN;
+
+ flush_signals(t);
+}
+
+/*
+ * Flush all handlers for a task.
+ */
+
+void
+flush_signal_handlers(struct task_struct *t, int force_default)
+{
+ int i;
+ struct k_sigaction *ka = &t->sighand->action[0];
+ for (i = _NSIG ; i != 0 ; i--) {
+ if (force_default || ka->sa.sa_handler != SIG_IGN)
+ ka->sa.sa_handler = SIG_DFL;
+ ka->sa.sa_flags = 0;
+#ifdef __ARCH_HAS_SA_RESTORER
+ ka->sa.sa_restorer = NULL;
+#endif
+ sigemptyset(&ka->sa.sa_mask);
+ ka++;
+ }
+}
+
+int unhandled_signal(struct task_struct *tsk, int sig)
+{
+ void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
+ if (is_global_init(tsk))
+ return 1;
+ if (handler != SIG_IGN && handler != SIG_DFL)
+ return 0;
+ /* if ptraced, let the tracer determine */
+ return !tsk->ptrace;
+}
+
+/*
+ * Notify the system that a driver wants to block all signals for this
+ * process, and wants to be notified if any signals at all were to be
+ * sent/acted upon. If the notifier routine returns non-zero, then the
+ * signal will be acted upon after all. If the notifier routine returns 0,
+ * then then signal will be blocked. Only one block per process is
+ * allowed. priv is a pointer to private data that the notifier routine
+ * can use to determine if the signal should be blocked or not.
+ */
+void
+block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ current->notifier_mask = mask;
+ current->notifier_data = priv;
+ current->notifier = notifier;
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+/* Notify the system that blocking has ended. */
+
+void
+unblock_all_signals(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ current->notifier = NULL;
+ current->notifier_data = NULL;
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+}
+
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+{
+ struct sigqueue *q, *first = NULL;
+
+ /*
+ * Collect the siginfo appropriate to this signal. Check if
+ * there is another siginfo for the same signal.
+ */
+ list_for_each_entry(q, &list->list, list) {
+ if (q->info.si_signo == sig) {
+ if (first)
+ goto still_pending;
+ first = q;
+ }
+ }
+
+ sigdelset(&list->signal, sig);
+
+ if (first) {
+still_pending:
+ list_del_init(&first->list);
+ copy_siginfo(info, &first->info);
+ __sigqueue_free(first);
+ } else {
+ /*
+ * Ok, it wasn't in the queue. This must be
+ * a fast-pathed signal or we must have been
+ * out of queue space. So zero out the info.
+ */
+ info->si_signo = sig;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+ info->si_pid = 0;
+ info->si_uid = 0;
+ }
+}
+
+static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
+ siginfo_t *info)
+{
+ int sig = next_signal(pending, mask);
+
+ if (sig) {
+ if (current->notifier) {
+ if (sigismember(current->notifier_mask, sig)) {
+ if (!(current->notifier)(current->notifier_data)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ return 0;
+ }
+ }
+ }
+
+ collect_signal(sig, pending, info);
+ }
+
+ return sig;
+}
+
+/*
+ * Dequeue a signal and return the element to the caller, which is
+ * expected to free it.
+ *
+ * All callers have to hold the siglock.
+ */
+int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+{
+ int signr;
+
+ /* We only dequeue private signals from ourselves, we don't let
+ * signalfd steal them
+ */
+ signr = __dequeue_signal(&tsk->pending, mask, info);
+ if (!signr) {
+ signr = __dequeue_signal(&tsk->signal->shared_pending,
+ mask, info);
+ /*
+ * itimer signal ?
+ *
+ * itimers are process shared and we restart periodic
+ * itimers in the signal delivery path to prevent DoS
+ * attacks in the high resolution timer case. This is
+ * compliant with the old way of self-restarting
+ * itimers, as the SIGALRM is a legacy signal and only
+ * queued once. Changing the restart behaviour to
+ * restart the timer in the signal dequeue path is
+ * reducing the timer noise on heavy loaded !highres
+ * systems too.
+ */
+ if (unlikely(signr == SIGALRM)) {
+ struct hrtimer *tmr = &tsk->signal->real_timer;
+
+ if (!hrtimer_is_queued(tmr) &&
+ tsk->signal->it_real_incr.tv64 != 0) {
+ hrtimer_forward(tmr, tmr->base->get_time(),
+ tsk->signal->it_real_incr);
+ hrtimer_restart(tmr);
+ }
+ }
+ }
+
+ recalc_sigpending();
+ if (!signr)
+ return 0;
+
+ if (unlikely(sig_kernel_stop(signr))) {
+ /*
+ * Set a marker that we have dequeued a stop signal. Our
+ * caller might release the siglock and then the pending
+ * stop signal it is about to process is no longer in the
+ * pending bitmasks, but must still be cleared by a SIGCONT
+ * (and overruled by a SIGKILL). So those cases clear this
+ * shared flag after we've set it. Note that this flag may
+ * remain set after the signal we return is ignored or
+ * handled. That doesn't matter because its only purpose
+ * is to alert stop-signal processing code when another
+ * processor has come along and cleared the flag.
+ */
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
+ }
+ if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ /*
+ * Release the siglock to ensure proper locking order
+ * of timer locks outside of siglocks. Note, we leave
+ * irqs disabled here, since the posix-timers code is
+ * about to disable them again anyway.
+ */
+ spin_unlock(&tsk->sighand->siglock);
+ do_schedule_next_timer(info);
+ spin_lock(&tsk->sighand->siglock);
+ }
+ return signr;
+}
+
+/*
+ * Tell a process that it has a new active signal..
+ *
+ * NOTE! we rely on the previous spin_lock to
+ * lock interrupts for us! We can only be called with
+ * "siglock" held, and the local interrupt must
+ * have been disabled when that got acquired!
+ *
+ * No need to set need_resched since signal event passing
+ * goes through ->blocked
+ */
+void signal_wake_up_state(struct task_struct *t, unsigned int state)
+{
+ set_tsk_thread_flag(t, TIF_SIGPENDING);
+ /*
+ * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
+ * case. We don't check t->state here because there is a race with it
+ * executing another processor and just now entering stopped state.
+ * By using wake_up_state, we ensure the process will wake up and
+ * handle its death signal.
+ */
+ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
+ kick_process(t);
+}
+
+/*
+ * Remove signals in mask from the pending set and queue.
+ * Returns 1 if any signals were found.
+ *
+ * All callers must be holding the siglock.
+ */
+static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
+{
+ struct sigqueue *q, *n;
+ sigset_t m;
+
+ sigandsets(&m, mask, &s->signal);
+ if (sigisemptyset(&m))
+ return 0;
+
+ sigandnsets(&s->signal, &s->signal, mask);
+ list_for_each_entry_safe(q, n, &s->list, list) {
+ if (sigismember(mask, q->info.si_signo)) {
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+ }
+ return 1;
+}
+
+static inline int is_si_special(const struct siginfo *info)
+{
+ return info <= SEND_SIG_FORCED;
+}
+
+static inline bool si_fromuser(const struct siginfo *info)
+{
+ return info == SEND_SIG_NOINFO ||
+ (!is_si_special(info) && SI_FROMUSER(info));
+}
+
+/*
+ * called with RCU read lock from check_kill_permission()
+ */
+static int kill_ok_by_cred(struct task_struct *t)
+{
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = __task_cred(t);
+
+ if (uid_eq(cred->euid, tcred->suid) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->uid, tcred->suid) ||
+ uid_eq(cred->uid, tcred->uid))
+ return 1;
+
+ if (ns_capable(tcred->user_ns, CAP_KILL))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Bad permissions for sending the signal
+ * - the caller must hold the RCU read lock
+ */
+static int check_kill_permission(int sig, struct siginfo *info,
+ struct task_struct *t)
+{
+ struct pid *sid;
+ int error;
+
+ if (!valid_signal(sig))
+ return -EINVAL;
+
+ if (!si_fromuser(info))
+ return 0;
+
+ error = audit_signal_info(sig, t); /* Let audit system see the signal */
+ if (error)
+ return error;
+
+ if (!same_thread_group(current, t) &&
+ !kill_ok_by_cred(t)) {
+ switch (sig) {
+ case SIGCONT:
+ sid = task_session(t);
+ /*
+ * We don't return the error if sid == NULL. The
+ * task was unhashed, the caller must notice this.
+ */
+ if (!sid || sid == task_session(current))
+ break;
+ default:
+ return -EPERM;
+ }
+ }
+
+ return security_task_kill(t, info, sig, 0);
+}
+
+/**
+ * ptrace_trap_notify - schedule trap to notify ptracer
+ * @t: tracee wanting to notify tracer
+ *
+ * This function schedules sticky ptrace trap which is cleared on the next
+ * TRAP_STOP to notify ptracer of an event. @t must have been seized by
+ * ptracer.
+ *
+ * If @t is running, STOP trap will be taken. If trapped for STOP and
+ * ptracer is listening for events, tracee is woken up so that it can
+ * re-trap for the new event. If trapped otherwise, STOP trap will be
+ * eventually taken without returning to userland after the existing traps
+ * are finished by PTRACE_CONT.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+static void ptrace_trap_notify(struct task_struct *t)
+{
+ WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
+ assert_spin_locked(&t->sighand->siglock);
+
+ task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
+ ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+}
+
+/*
+ * Handle magic process-wide effects of stop/continue signals. Unlike
+ * the signal actions, these happen immediately at signal-generation
+ * time regardless of blocking, ignoring, or handling. This does the
+ * actual continuing for SIGCONT, but not the actual stopping for stop
+ * signals. The process stop is done as a signal action for SIG_DFL.
+ *
+ * Returns true if the signal should be actually delivered, otherwise
+ * it should be dropped.
+ */
+static bool prepare_signal(int sig, struct task_struct *p, bool force)
+{
+ struct signal_struct *signal = p->signal;
+ struct task_struct *t;
+ sigset_t flush;
+
+ if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
+ if (signal->flags & SIGNAL_GROUP_COREDUMP)
+ return sig == SIGKILL;
+ /*
+ * The process is in the middle of dying, nothing to do.
+ */
+ } else if (sig_kernel_stop(sig)) {
+ /*
+ * This is a stop signal. Remove SIGCONT from all queues.
+ */
+ siginitset(&flush, sigmask(SIGCONT));
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&flush, &t->pending);
+ } else if (sig == SIGCONT) {
+ unsigned int why;
+ /*
+ * Remove all stop signals from all queues, wake all threads.
+ */
+ siginitset(&flush, SIG_KERNEL_STOP_MASK);
+ flush_sigqueue_mask(&flush, &signal->shared_pending);
+ for_each_thread(p, t) {
+ flush_sigqueue_mask(&flush, &t->pending);
+ task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ wake_up_state(t, __TASK_STOPPED);
+ else
+ ptrace_trap_notify(t);
+ }
+
+ /*
+ * Notify the parent with CLD_CONTINUED if we were stopped.
+ *
+ * If we were in the middle of a group stop, we pretend it
+ * was already finished, and then continued. Since SIGCHLD
+ * doesn't queue we report only CLD_STOPPED, as if the next
+ * CLD_CONTINUED was dropped.
+ */
+ why = 0;
+ if (signal->flags & SIGNAL_STOP_STOPPED)
+ why |= SIGNAL_CLD_CONTINUED;
+ else if (signal->group_stop_count)
+ why |= SIGNAL_CLD_STOPPED;
+
+ if (why) {
+ /*
+ * The first thread which returns from do_signal_stop()
+ * will take ->siglock, notice SIGNAL_CLD_MASK, and
+ * notify its parent. See get_signal_to_deliver().
+ */
+ signal->flags = why | SIGNAL_STOP_CONTINUED;
+ signal->group_stop_count = 0;
+ signal->group_exit_code = 0;
+ }
+ }
+
+ return !sig_ignored(p, sig, force);
+}
+
+/*
+ * Test if P wants to take SIG. After we've checked all threads with this,
+ * it's equivalent to finding no threads not blocking SIG. Any threads not
+ * blocking SIG were ruled out because they are not running and already
+ * have pending signals. Such threads will dequeue from the shared queue
+ * as soon as they're available, so putting the signal on the shared queue
+ * will be equivalent to sending it to one such thread.
+ */
+static inline int wants_signal(int sig, struct task_struct *p)
+{
+ if (sigismember(&p->blocked, sig))
+ return 0;
+ if (p->flags & PF_EXITING)
+ return 0;
+ if (sig == SIGKILL)
+ return 1;
+ if (task_is_stopped_or_traced(p))
+ return 0;
+ return task_curr(p) || !signal_pending(p);
+}
+
+static void complete_signal(int sig, struct task_struct *p, int group)
+{
+ struct signal_struct *signal = p->signal;
+ struct task_struct *t;
+
+ /*
+ * Now find a thread we can wake up to take the signal off the queue.
+ *
+ * If the main thread wants the signal, it gets first crack.
+ * Probably the least surprising to the average bear.
+ */
+ if (wants_signal(sig, p))
+ t = p;
+ else if (!group || thread_group_empty(p))
+ /*
+ * There is just one thread and it does not need to be woken.
+ * It will dequeue unblocked signals before it runs again.
+ */
+ return;
+ else {
+ /*
+ * Otherwise try to find a suitable thread.
+ */
+ t = signal->curr_target;
+ while (!wants_signal(sig, t)) {
+ t = next_thread(t);
+ if (t == signal->curr_target)
+ /*
+ * No thread needs to be woken.
+ * Any eligible threads will see
+ * the signal in the queue soon.
+ */
+ return;
+ }
+ signal->curr_target = t;
+ }
+
+ /*
+ * Found a killable thread. If the signal will be fatal,
+ * then start taking the whole group down immediately.
+ */
+ if (sig_fatal(p, sig) &&
+ !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+ !sigismember(&t->real_blocked, sig) &&
+ (sig == SIGKILL || !t->ptrace)) {
+ /*
+ * This signal will be fatal to the whole group.
+ */
+ if (!sig_kernel_coredump(sig)) {
+ /*
+ * Start a group exit and wake everybody up.
+ * This way we don't have other threads
+ * running and doing things after a slower
+ * thread has the fatal signal pending.
+ */
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = sig;
+ signal->group_stop_count = 0;
+ t = p;
+ do {
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+ } while_each_thread(p, t);
+ return;
+ }
+ }
+
+ /*
+ * The signal is already in the shared-pending queue.
+ * Tell the chosen thread to wake up and dequeue it.
+ */
+ signal_wake_up(t, sig == SIGKILL);
+ return;
+}
+
+static inline int legacy_queue(struct sigpending *signals, int sig)
+{
+ return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
+}
+
+#ifdef CONFIG_USER_NS
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+ if (current_user_ns() == task_cred_xxx(t, user_ns))
+ return;
+
+ if (SI_FROMKERNEL(info))
+ return;
+
+ rcu_read_lock();
+ info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
+ make_kuid(current_user_ns(), info->si_uid));
+ rcu_read_unlock();
+}
+#else
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+ return;
+}
+#endif
+
+static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
+ int group, int from_ancestor_ns)
+{
+ struct sigpending *pending;
+ struct sigqueue *q;
+ int override_rlimit;
+ int ret = 0, result;
+
+ assert_spin_locked(&t->sighand->siglock);
+
+ result = TRACE_SIGNAL_IGNORED;
+ if (!prepare_signal(sig, t,
+ from_ancestor_ns || (info == SEND_SIG_FORCED)))
+ goto ret;
+
+ pending = group ? &t->signal->shared_pending : &t->pending;
+ /*
+ * Short-circuit ignored signals and support queuing
+ * exactly one non-rt signal, so that we can get more
+ * detailed information about the cause of the signal.
+ */
+ result = TRACE_SIGNAL_ALREADY_PENDING;
+ if (legacy_queue(pending, sig))
+ goto ret;
+
+ result = TRACE_SIGNAL_DELIVERED;
+ /*
+ * fast-pathed signals for kernel-internal things like SIGSTOP
+ * or SIGKILL.
+ */
+ if (info == SEND_SIG_FORCED)
+ goto out_set;
+
+ /*
+ * Real-time signals must be queued if sent by sigqueue, or
+ * some other real-time mechanism. It is implementation
+ * defined whether kill() does so. We attempt to do so, on
+ * the principle of least surprise, but since kill is not
+ * allowed to fail with EAGAIN when low on memory we just
+ * make sure at least one signal gets delivered and don't
+ * pass on the info struct.
+ */
+ if (sig < SIGRTMIN)
+ override_rlimit = (is_si_special(info) || info->si_code >= 0);
+ else
+ override_rlimit = 0;
+
+ q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+ override_rlimit);
+ if (q) {
+ list_add_tail(&q->list, &pending->list);
+ switch ((unsigned long) info) {
+ case (unsigned long) SEND_SIG_NOINFO:
+ q->info.si_signo = sig;
+ q->info.si_errno = 0;
+ q->info.si_code = SI_USER;
+ q->info.si_pid = task_tgid_nr_ns(current,
+ task_active_pid_ns(t));
+ q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ break;
+ case (unsigned long) SEND_SIG_PRIV:
+ q->info.si_signo = sig;
+ q->info.si_errno = 0;
+ q->info.si_code = SI_KERNEL;
+ q->info.si_pid = 0;
+ q->info.si_uid = 0;
+ break;
+ default:
+ copy_siginfo(&q->info, info);
+ if (from_ancestor_ns)
+ q->info.si_pid = 0;
+ break;
+ }
+
+ userns_fixup_signal_uid(&q->info, t);
+
+ } else if (!is_si_special(info)) {
+ if (sig >= SIGRTMIN && info->si_code != SI_USER) {
+ /*
+ * Queue overflow, abort. We may abort if the
+ * signal was rt and sent by user using something
+ * other than kill().
+ */
+ result = TRACE_SIGNAL_OVERFLOW_FAIL;
+ ret = -EAGAIN;
+ goto ret;
+ } else {
+ /*
+ * This is a silent loss of information. We still
+ * send the signal, but the *info bits are lost.
+ */
+ result = TRACE_SIGNAL_LOSE_INFO;
+ }
+ }
+
+out_set:
+ signalfd_notify(t, sig);
+ sigaddset(&pending->signal, sig);
+ complete_signal(sig, t, group);
+ret:
+ trace_signal_generate(sig, info, t, group, result);
+ return ret;
+}
+
+static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+ int group)
+{
+ int from_ancestor_ns = 0;
+
+#ifdef CONFIG_PID_NS
+ from_ancestor_ns = si_fromuser(info) &&
+ !task_pid_nr_ns(current, task_active_pid_ns(t));
+#endif
+
+ return __send_signal(sig, info, t, group, from_ancestor_ns);
+}
+
+static void print_fatal_signal(int signr)
+{
+ struct pt_regs *regs = signal_pt_regs();
+ printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
+
+#if defined(__i386__) && !defined(__arch_um__)
+ printk(KERN_INFO "code at %08lx: ", regs->ip);
+ {
+ int i;
+ for (i = 0; i < 16; i++) {
+ unsigned char insn;
+
+ if (get_user(insn, (unsigned char *)(regs->ip + i)))
+ break;
+ printk(KERN_CONT "%02x ", insn);
+ }
+ }
+ printk(KERN_CONT "\n");
+#endif
+ preempt_disable();
+ show_regs(regs);
+ preempt_enable();
+}
+
+static int __init setup_print_fatal_signals(char *str)
+{
+ get_option (&str, &print_fatal_signals);
+
+ return 1;
+}
+
+__setup("print-fatal-signals=", setup_print_fatal_signals);
+
+int
+__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+ return send_signal(sig, info, p, 1);
+}
+
+static int
+specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+ return send_signal(sig, info, t, 0);
+}
+
+int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+ bool group)
+{
+ unsigned long flags;
+ int ret = -ESRCH;
+
+ if (lock_task_sighand(p, &flags)) {
+ ret = send_signal(sig, info, p, group);
+ unlock_task_sighand(p, &flags);
+ }
+
+ return ret;
+}
+
+/*
+ * Force a signal that the process can't ignore: if necessary
+ * we unblock the signal and change any SIG_IGN to SIG_DFL.
+ *
+ * Note: If we unblock the signal, we always reset it to SIG_DFL,
+ * since we do not want to have a signal handler that was blocked
+ * be invoked when user space had explicitly blocked it.
+ *
+ * We don't want to have recursive SIGSEGV's etc, for example,
+ * that is why we also clear SIGNAL_UNKILLABLE.
+ */
+int
+force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+ unsigned long int flags;
+ int ret, blocked, ignored;
+ struct k_sigaction *action;
+
+ spin_lock_irqsave(&t->sighand->siglock, flags);
+ action = &t->sighand->action[sig-1];
+ ignored = action->sa.sa_handler == SIG_IGN;
+ blocked = sigismember(&t->blocked, sig);
+ if (blocked || ignored) {
+ action->sa.sa_handler = SIG_DFL;
+ if (blocked) {
+ sigdelset(&t->blocked, sig);
+ recalc_sigpending_and_wake(t);
+ }
+ }
+ if (action->sa.sa_handler == SIG_DFL)
+ t->signal->flags &= ~SIGNAL_UNKILLABLE;
+ ret = specific_send_sig_info(sig, info, t);
+ spin_unlock_irqrestore(&t->sighand->siglock, flags);
+
+ return ret;
+}
+
+/*
+ * Nuke all other threads in the group.
+ */
+int zap_other_threads(struct task_struct *p)
+{
+ struct task_struct *t = p;
+ int count = 0;
+
+ p->signal->group_stop_count = 0;
+
+ while_each_thread(p, t) {
+ task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
+ count++;
+
+ /* Don't bother with already dead threads */
+ if (t->exit_state)
+ continue;
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
+ }
+
+ return count;
+}
+
+struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+ unsigned long *flags)
+{
+ struct sighand_struct *sighand;
+
+ for (;;) {
+ /*
+ * Disable interrupts early to avoid deadlocks.
+ * See rcu_read_unlock() comment header for details.
+ */
+ local_irq_save(*flags);
+ rcu_read_lock();
+ sighand = rcu_dereference(tsk->sighand);
+ if (unlikely(sighand == NULL)) {
+ rcu_read_unlock();
+ local_irq_restore(*flags);
+ break;
+ }
+ /*
+ * This sighand can be already freed and even reused, but
+ * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+ * initializes ->siglock: this slab can't go away, it has
+ * the same object type, ->siglock can't be reinitialized.
+ *
+ * We need to ensure that tsk->sighand is still the same
+ * after we take the lock, we can race with de_thread() or
+ * __exit_signal(). In the latter case the next iteration
+ * must see ->sighand == NULL.
+ */
+ spin_lock(&sighand->siglock);
+ if (likely(sighand == tsk->sighand)) {
+ rcu_read_unlock();
+ break;
+ }
+ spin_unlock(&sighand->siglock);
+ rcu_read_unlock();
+ local_irq_restore(*flags);
+ }
+
+ return sighand;
+}
+
+/*
+ * send signal info to all the members of a group
+ */
+int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = check_kill_permission(sig, info, p);
+ rcu_read_unlock();
+
+ if (!ret && sig)
+ ret = do_send_sig_info(sig, info, p, true);
+
+ return ret;
+}
+
+/*
+ * __kill_pgrp_info() sends a signal to a process group: this is what the tty
+ * control characters do (^C, ^Z etc)
+ * - the caller must hold at least a readlock on tasklist_lock
+ */
+int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
+{
+ struct task_struct *p = NULL;
+ int retval, success;
+
+ success = 0;
+ retval = -ESRCH;
+ do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ int err = group_send_sig_info(sig, info, p);
+ success |= !err;
+ retval = err;
+ } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+ return success ? 0 : retval;
+}
+
+int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
+{
+ int error = -ESRCH;
+ struct task_struct *p;
+
+ for (;;) {
+ rcu_read_lock();
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p)
+ error = group_send_sig_info(sig, info, p);
+ rcu_read_unlock();
+ if (likely(!p || error != -ESRCH))
+ return error;
+
+ /*
+ * The task was unhashed in between, try again. If it
+ * is dead, pid_task() will return NULL, if we race with
+ * de_thread() it will find the new leader.
+ */
+ }
+}
+
+int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+ int error;
+ rcu_read_lock();
+ error = kill_pid_info(sig, info, find_vpid(pid));
+ rcu_read_unlock();
+ return error;
+}
+
+static int kill_as_cred_perm(const struct cred *cred,
+ struct task_struct *target)
+{
+ const struct cred *pcred = __task_cred(target);
+ if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
+ !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
+ return 0;
+ return 1;
+}
+
+/* like kill_pid_info(), but doesn't use uid/euid of "current" */
+int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
+ const struct cred *cred, u32 secid)
+{
+ int ret = -EINVAL;
+ struct task_struct *p;
+ unsigned long flags;
+
+ if (!valid_signal(sig))
+ return ret;
+
+ rcu_read_lock();
+ p = pid_task(pid, PIDTYPE_PID);
+ if (!p) {
+ ret = -ESRCH;
+ goto out_unlock;
+ }
+ if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = security_task_kill(p, info, sig, secid);
+ if (ret)
+ goto out_unlock;
+
+ if (sig) {
+ if (lock_task_sighand(p, &flags)) {
+ ret = __send_signal(sig, info, p, 1, 0);
+ unlock_task_sighand(p, &flags);
+ } else
+ ret = -ESRCH;
+ }
+out_unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
+
+/*
+ * kill_something_info() interprets pid in interesting ways just like kill(2).
+ *
+ * POSIX specifies that kill(-1,sig) is unspecified, but what we have
+ * is probably wrong. Should make it like BSD or SYSV.
+ */
+
+static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
+{
+ int ret;
+
+ if (pid > 0) {
+ rcu_read_lock();
+ ret = kill_pid_info(sig, info, find_vpid(pid));
+ rcu_read_unlock();
+ return ret;
+ }
+
+ read_lock(&tasklist_lock);
+ if (pid != -1) {
+ ret = __kill_pgrp_info(sig, info,
+ pid ? find_vpid(-pid) : task_pgrp(current));
+ } else {
+ int retval = 0, count = 0;
+ struct task_struct * p;
+
+ for_each_process(p) {
+ if (task_pid_vnr(p) > 1 &&
+ !same_thread_group(p, current)) {
+ int err = group_send_sig_info(sig, info, p);
+ ++count;
+ if (err != -EPERM)
+ retval = err;
+ }
+ }
+ ret = count ? retval : -ESRCH;
+ }
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+
+/*
+ * These are for backward compatibility with the rest of the kernel source.
+ */
+
+int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+ /*
+ * Make sure legacy kernel users don't send in bad values
+ * (normal paths check this in check_kill_permission).
+ */
+ if (!valid_signal(sig))
+ return -EINVAL;
+
+ return do_send_sig_info(sig, info, p, false);
+}
+
+#define __si_special(priv) \
+ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
+
+int
+send_sig(int sig, struct task_struct *p, int priv)
+{
+ return send_sig_info(sig, __si_special(priv), p);
+}
+
+void
+force_sig(int sig, struct task_struct *p)
+{
+ force_sig_info(sig, SEND_SIG_PRIV, p);
+}
+
+/*
+ * When things go south during signal handling, we
+ * will force a SIGSEGV. And if the signal that caused
+ * the problem was already a SIGSEGV, we'll want to
+ * make sure we don't even try to deliver the signal..
+ */
+int
+force_sigsegv(int sig, struct task_struct *p)
+{
+ if (sig == SIGSEGV) {
+ unsigned long flags;
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+ force_sig(SIGSEGV, p);
+ return 0;
+}
+
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+ int ret;
+
+ read_lock(&tasklist_lock);
+ ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+ read_unlock(&tasklist_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(kill_pgrp);
+
+int kill_pid(struct pid *pid, int sig, int priv)
+{
+ return kill_pid_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pid);
+
+/*
+ * These functions support sending signals using preallocated sigqueue
+ * structures. This is needed "because realtime applications cannot
+ * afford to lose notifications of asynchronous events, like timer
+ * expirations or I/O completions". In the case of POSIX Timers
+ * we allocate the sigqueue structure from the timer_create. If this
+ * allocation fails we are able to report the failure to the application
+ * with an EAGAIN error.
+ */
+struct sigqueue *sigqueue_alloc(void)
+{
+ struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
+
+ if (q)
+ q->flags |= SIGQUEUE_PREALLOC;
+
+ return q;
+}
+
+void sigqueue_free(struct sigqueue *q)
+{
+ unsigned long flags;
+ spinlock_t *lock = &current->sighand->siglock;
+
+ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ /*
+ * We must hold ->siglock while testing q->list
+ * to serialize with collect_signal() or with
+ * __exit_signal()->flush_sigqueue().
+ */
+ spin_lock_irqsave(lock, flags);
+ q->flags &= ~SIGQUEUE_PREALLOC;
+ /*
+ * If it is queued it will be freed when dequeued,
+ * like the "regular" sigqueue.
+ */
+ if (!list_empty(&q->list))
+ q = NULL;
+ spin_unlock_irqrestore(lock, flags);
+
+ if (q)
+ __sigqueue_free(q);
+}
+
+int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
+{
+ int sig = q->info.si_signo;
+ struct sigpending *pending;
+ unsigned long flags;
+ int ret, result;
+
+ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+
+ ret = -1;
+ if (!likely(lock_task_sighand(t, &flags)))
+ goto ret;
+
+ ret = 1; /* the signal is ignored */
+ result = TRACE_SIGNAL_IGNORED;
+ if (!prepare_signal(sig, t, false))
+ goto out;
+
+ ret = 0;
+ if (unlikely(!list_empty(&q->list))) {
+ /*
+ * If an SI_TIMER entry is already queue just increment
+ * the overrun count.
+ */
+ BUG_ON(q->info.si_code != SI_TIMER);
+ q->info.si_overrun++;
+ result = TRACE_SIGNAL_ALREADY_PENDING;
+ goto out;
+ }
+ q->info.si_overrun = 0;
+
+ signalfd_notify(t, sig);
+ pending = group ? &t->signal->shared_pending : &t->pending;
+ list_add_tail(&q->list, &pending->list);
+ sigaddset(&pending->signal, sig);
+ complete_signal(sig, t, group);
+ result = TRACE_SIGNAL_DELIVERED;
+out:
+ trace_signal_generate(sig, &q->info, t, group, result);
+ unlock_task_sighand(t, &flags);
+ret:
+ return ret;
+}
+
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns true if our parent ignored us and so we've switched to
+ * self-reaping.
+ */
+bool do_notify_parent(struct task_struct *tsk, int sig)
+{
+ struct siginfo info;
+ unsigned long flags;
+ struct sighand_struct *psig;
+ bool autoreap = false;
+ cputime_t utime, stime;
+
+ BUG_ON(sig == -1);
+
+ /* do_notify_parent_cldstop should have been called instead. */
+ BUG_ON(task_is_stopped_or_traced(tsk));
+
+ BUG_ON(!tsk->ptrace &&
+ (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+ if (sig != SIGCHLD) {
+ /*
+ * This is only possible if parent == real_parent.
+ * Check if it has changed security domain.
+ */
+ if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+ sig = SIGCHLD;
+ }
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ /*
+ * We are under tasklist_lock here so our parent is tied to
+ * us and cannot change.
+ *
+ * task_active_pid_ns will always return the same pid namespace
+ * until a task passes through release_task.
+ *
+ * write_lock() currently calls preempt_disable() which is the
+ * same as rcu_read_lock(), but according to Oleg, this is not
+ * correct to rely on this
+ */
+ rcu_read_lock();
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
+ info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
+ task_uid(tsk));
+ rcu_read_unlock();
+
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
+ info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
+
+ info.si_status = tsk->exit_code & 0x7f;
+ if (tsk->exit_code & 0x80)
+ info.si_code = CLD_DUMPED;
+ else if (tsk->exit_code & 0x7f)
+ info.si_code = CLD_KILLED;
+ else {
+ info.si_code = CLD_EXITED;
+ info.si_status = tsk->exit_code >> 8;
+ }
+
+ psig = tsk->parent->sighand;
+ spin_lock_irqsave(&psig->siglock, flags);
+ if (!tsk->ptrace && sig == SIGCHLD &&
+ (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+ (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
+ /*
+ * We are exiting and our parent doesn't care. POSIX.1
+ * defines special semantics for setting SIGCHLD to SIG_IGN
+ * or setting the SA_NOCLDWAIT flag: we should be reaped
+ * automatically and not left for our parent's wait4 call.
+ * Rather than having the parent do it as a magic kind of
+ * signal handler, we just set this to tell do_exit that we
+ * can be cleaned up without becoming a zombie. Note that
+ * we still call __wake_up_parent in this case, because a
+ * blocked sys_wait4 might now return -ECHILD.
+ *
+ * Whether we send SIGCHLD or not for SA_NOCLDWAIT
+ * is implementation-defined: we do (if you don't want
+ * it, just use SIG_IGN instead).
+ */
+ autoreap = true;
+ if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
+ sig = 0;
+ }
+ if (valid_signal(sig) && sig)
+ __group_send_sig_info(sig, &info, tsk->parent);
+ __wake_up_parent(tsk, tsk->parent);
+ spin_unlock_irqrestore(&psig->siglock, flags);
+
+ return autoreap;
+}
+
+/**
+ * do_notify_parent_cldstop - notify parent of stopped/continued state change
+ * @tsk: task reporting the state change
+ * @for_ptracer: the notification is for ptracer
+ * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
+ *
+ * Notify @tsk's parent that the stopped/continued state has changed. If
+ * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
+ * If %true, @tsk reports to @tsk->parent which should be the ptracer.
+ *
+ * CONTEXT:
+ * Must be called with tasklist_lock at least read locked.
+ */
+static void do_notify_parent_cldstop(struct task_struct *tsk,
+ bool for_ptracer, int why)
+{
+ struct siginfo info;
+ unsigned long flags;
+ struct task_struct *parent;
+ struct sighand_struct *sighand;
+ cputime_t utime, stime;
+
+ if (for_ptracer) {
+ parent = tsk->parent;
+ } else {
+ tsk = tsk->group_leader;
+ parent = tsk->real_parent;
+ }
+
+ info.si_signo = SIGCHLD;
+ info.si_errno = 0;
+ /*
+ * see comment in do_notify_parent() about the following 4 lines
+ */
+ rcu_read_lock();
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
+ info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
+ rcu_read_unlock();
+
+ task_cputime(tsk, &utime, &stime);
+ info.si_utime = cputime_to_clock_t(utime);
+ info.si_stime = cputime_to_clock_t(stime);
+
+ info.si_code = why;
+ switch (why) {
+ case CLD_CONTINUED:
+ info.si_status = SIGCONT;
+ break;
+ case CLD_STOPPED:
+ info.si_status = tsk->signal->group_exit_code & 0x7f;
+ break;
+ case CLD_TRAPPED:
+ info.si_status = tsk->exit_code & 0x7f;
+ break;
+ default:
+ BUG();
+ }
+
+ sighand = parent->sighand;
+ spin_lock_irqsave(&sighand->siglock, flags);
+ if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
+ !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
+ __group_send_sig_info(SIGCHLD, &info, parent);
+ /*
+ * Even if SIGCHLD is not generated, we must wake up wait4 calls.
+ */
+ __wake_up_parent(tsk, parent);
+ spin_unlock_irqrestore(&sighand->siglock, flags);
+}
+
+static inline int may_ptrace_stop(void)
+{
+ if (!likely(current->ptrace))
+ return 0;
+ /*
+ * Are we in the middle of do_coredump?
+ * If so and our tracer is also part of the coredump stopping
+ * is a deadlock situation, and pointless because our tracer
+ * is dead so don't allow us to stop.
+ * If SIGKILL was already sent before the caller unlocked
+ * ->siglock we must see ->core_state != NULL. Otherwise it
+ * is safe to enter schedule().
+ *
+ * This is almost outdated, a task with the pending SIGKILL can't
+ * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
+ * after SIGKILL was already dequeued.
+ */
+ if (unlikely(current->mm->core_state) &&
+ unlikely(current->mm == current->parent->mm))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Return non-zero if there is a SIGKILL that should be waking us up.
+ * Called with the siglock held.
+ */
+static int sigkill_pending(struct task_struct *tsk)
+{
+ return sigismember(&tsk->pending.signal, SIGKILL) ||
+ sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
+}
+
+/*
+ * This must be called with current->sighand->siglock held.
+ *
+ * This should be the path for all ptrace stops.
+ * We always set current->last_siginfo while stopped here.
+ * That makes it a way to test a stopped process for
+ * being ptrace-stopped vs being job-control-stopped.
+ *
+ * If we actually decide not to stop at all because the tracer
+ * is gone, we keep current->exit_code unless clear_code.
+ */
+static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
+ __releases(&current->sighand->siglock)
+ __acquires(&current->sighand->siglock)
+{
+ bool gstop_done = false;
+
+ if (arch_ptrace_stop_needed(exit_code, info)) {
+ /*
+ * The arch code has something special to do before a
+ * ptrace stop. This is allowed to block, e.g. for faults
+ * on user stack pages. We can't keep the siglock while
+ * calling arch_ptrace_stop, so we must release it now.
+ * To preserve proper semantics, we must do this before
+ * any signal bookkeeping like checking group_stop_count.
+ * Meanwhile, a SIGKILL could come in before we retake the
+ * siglock. That must prevent us from sleeping in TASK_TRACED.
+ * So after regaining the lock, we must check for SIGKILL.
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+ arch_ptrace_stop(exit_code, info);
+ spin_lock_irq(&current->sighand->siglock);
+ if (sigkill_pending(current))
+ return;
+ }
+
+ /*
+ * We're committing to trapping. TRACED should be visible before
+ * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
+ * Also, transition to TRACED and updates to ->jobctl should be
+ * atomic with respect to siglock and should be done after the arch
+ * hook as siglock is released and regrabbed across it.
+ */
+ set_current_state(TASK_TRACED);
+
+ current->last_siginfo = info;
+ current->exit_code = exit_code;
+
+ /*
+ * If @why is CLD_STOPPED, we're trapping to participate in a group
+ * stop. Do the bookkeeping. Note that if SIGCONT was delievered
+ * across siglock relocks since INTERRUPT was scheduled, PENDING
+ * could be clear now. We act as if SIGCONT is received after
+ * TASK_TRACED is entered - ignore it.
+ */
+ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
+ gstop_done = task_participate_group_stop(current);
+
+ /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
+ task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
+
+ /* entering a trap, clear TRAPPING */
+ task_clear_jobctl_trapping(current);
+
+ spin_unlock_irq(&current->sighand->siglock);
+ read_lock(&tasklist_lock);
+ if (may_ptrace_stop()) {
+ /*
+ * Notify parents of the stop.
+ *
+ * While ptraced, there are two parents - the ptracer and
+ * the real_parent of the group_leader. The ptracer should
+ * know about every stop while the real parent is only
+ * interested in the completion of group stop. The states
+ * for the two don't interact with each other. Notify
+ * separately unless they're gonna be duplicates.
+ */
+ do_notify_parent_cldstop(current, true, why);
+ if (gstop_done && ptrace_reparented(current))
+ do_notify_parent_cldstop(current, false, why);
+
+ /*
+ * Don't want to allow preemption here, because
+ * sys_ptrace() needs this task to be inactive.
+ *
+ * XXX: implement read_unlock_no_resched().
+ */
+ preempt_disable();
+ read_unlock(&tasklist_lock);
+ preempt_enable_no_resched();
+ freezable_schedule();
+ } else {
+ /*
+ * By the time we got the lock, our tracer went away.
+ * Don't drop the lock yet, another tracer may come.
+ *
+ * If @gstop_done, the ptracer went away between group stop
+ * completion and here. During detach, it would have set
+ * JOBCTL_STOP_PENDING on us and we'll re-enter
+ * TASK_STOPPED in do_signal_stop() on return, so notifying
+ * the real parent of the group stop completion is enough.
+ */
+ if (gstop_done)
+ do_notify_parent_cldstop(current, false, why);
+
+ /* tasklist protects us from ptrace_freeze_traced() */
+ __set_current_state(TASK_RUNNING);
+ if (clear_code)
+ current->exit_code = 0;
+ read_unlock(&tasklist_lock);
+ }
+
+ /*
+ * We are back. Now reacquire the siglock before touching
+ * last_siginfo, so that we are sure to have synchronized with
+ * any signal-sending on another CPU that wants to examine it.
+ */
+ spin_lock_irq(&current->sighand->siglock);
+ current->last_siginfo = NULL;
+
+ /* LISTENING can be set only during STOP traps, clear it */
+ current->jobctl &= ~JOBCTL_LISTENING;
+
+ /*
+ * Queued signals ignored us while we were stopped for tracing.
+ * So check for any that we should take before resuming user mode.
+ * This sets TIF_SIGPENDING, but never clears it.
+ */
+ recalc_sigpending_tsk(current);
+}
+
+static void ptrace_do_notify(int signr, int exit_code, int why)
+{
+ siginfo_t info;
+
+ memset(&info, 0, sizeof info);
+ info.si_signo = signr;
+ info.si_code = exit_code;
+ info.si_pid = task_pid_vnr(current);
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+
+ /* Let the debugger run. */
+ ptrace_stop(exit_code, why, 1, &info);
+}
+
+void ptrace_notify(int exit_code)
+{
+ BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+ if (unlikely(current->task_works))
+ task_work_run();
+
+ spin_lock_irq(&current->sighand->siglock);
+ ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
+ * do_signal_stop - handle group stop for SIGSTOP and other stop signals
+ * @signr: signr causing group stop if initiating
+ *
+ * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
+ * and participate in it. If already set, participate in the existing
+ * group stop. If participated in a group stop (and thus slept), %true is
+ * returned with siglock released.
+ *
+ * If ptraced, this function doesn't handle stop itself. Instead,
+ * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
+ * untouched. The caller must ensure that INTERRUPT trap handling takes
+ * places afterwards.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which is released
+ * on %true return.
+ *
+ * RETURNS:
+ * %false if group stop is already cancelled or ptrace trap is scheduled.
+ * %true if participated in group stop.
+ */
+static bool do_signal_stop(int signr)
+ __releases(&current->sighand->siglock)
+{
+ struct signal_struct *sig = current->signal;
+
+ if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
+ unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+ struct task_struct *t;
+
+ /* signr will be recorded in task->jobctl for retries */
+ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
+
+ if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
+ unlikely(signal_group_exit(sig)))
+ return false;
+ /*
+ * There is no group stop already in progress. We must
+ * initiate one now.
+ *
+ * While ptraced, a task may be resumed while group stop is
+ * still in effect and then receive a stop signal and
+ * initiate another group stop. This deviates from the
+ * usual behavior as two consecutive stop signals can't
+ * cause two group stops when !ptraced. That is why we
+ * also check !task_is_stopped(t) below.
+ *
+ * The condition can be distinguished by testing whether
+ * SIGNAL_STOP_STOPPED is already set. Don't generate
+ * group_exit_code in such case.
+ *
+ * This is not necessary for SIGNAL_STOP_CONTINUED because
+ * an intervening stop signal is required to cause two
+ * continued events regardless of ptrace.
+ */
+ if (!(sig->flags & SIGNAL_STOP_STOPPED))
+ sig->group_exit_code = signr;
+
+ sig->group_stop_count = 0;
+
+ if (task_set_jobctl_pending(current, signr | gstop))
+ sig->group_stop_count++;
+
+ t = current;
+ while_each_thread(current, t) {
+ /*
+ * Setting state to TASK_STOPPED for a group
+ * stop is always done with the siglock held,
+ * so this check has no races.
+ */
+ if (!task_is_stopped(t) &&
+ task_set_jobctl_pending(t, signr | gstop)) {
+ sig->group_stop_count++;
+ if (likely(!(t->ptrace & PT_SEIZED)))
+ signal_wake_up(t, 0);
+ else
+ ptrace_trap_notify(t);
+ }
+ }
+ }
+
+ if (likely(!current->ptrace)) {
+ int notify = 0;
+
+ /*
+ * If there are no other threads in the group, or if there
+ * is a group stop in progress and we are the last to stop,
+ * report to the parent.
+ */
+ if (task_participate_group_stop(current))
+ notify = CLD_STOPPED;
+
+ __set_current_state(TASK_STOPPED);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Notify the parent of the group stop completion. Because
+ * we're not holding either the siglock or tasklist_lock
+ * here, ptracer may attach inbetween; however, this is for
+ * group stop and should always be delivered to the real
+ * parent of the group leader. The new ptracer will get
+ * its notification when this task transitions into
+ * TASK_TRACED.
+ */
+ if (notify) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, false, notify);
+ read_unlock(&tasklist_lock);
+ }
+
+ /* Now we don't run again until woken by SIGCONT or SIGKILL */
+ freezable_schedule();
+ return true;
+ } else {
+ /*
+ * While ptraced, group stop is handled by STOP trap.
+ * Schedule it and let the caller deal with it.
+ */
+ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
+ return false;
+ }
+}
+
+/**
+ * do_jobctl_trap - take care of ptrace jobctl traps
+ *
+ * When PT_SEIZED, it's used for both group stop and explicit
+ * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
+ * accompanying siginfo. If stopped, lower eight bits of exit_code contain
+ * the stop signal; otherwise, %SIGTRAP.
+ *
+ * When !PT_SEIZED, it's used only for group stop trap with stop signal
+ * number as exit_code and no siginfo.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held, which may be
+ * released and re-acquired before returning with intervening sleep.
+ */
+static void do_jobctl_trap(void)
+{
+ struct signal_struct *signal = current->signal;
+ int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
+
+ if (current->ptrace & PT_SEIZED) {
+ if (!signal->group_stop_count &&
+ !(signal->flags & SIGNAL_STOP_STOPPED))
+ signr = SIGTRAP;
+ WARN_ON_ONCE(!signr);
+ ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
+ CLD_STOPPED);
+ } else {
+ WARN_ON_ONCE(!signr);
+ ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+ current->exit_code = 0;
+ }
+}
+
+static int ptrace_signal(int signr, siginfo_t *info)
+{
+ ptrace_signal_deliver();
+ /*
+ * We do not check sig_kernel_stop(signr) but set this marker
+ * unconditionally because we do not know whether debugger will
+ * change signr. This flag has no meaning unless we are going
+ * to stop after return from ptrace_stop(). In this case it will
+ * be checked in do_signal_stop(), we should only stop if it was
+ * not cleared by SIGCONT while we were sleeping. See also the
+ * comment in dequeue_signal().
+ */
+ current->jobctl |= JOBCTL_STOP_DEQUEUED;
+ ptrace_stop(signr, CLD_TRAPPED, 0, info);
+
+ /* We're back. Did the debugger cancel the sig? */
+ signr = current->exit_code;
+ if (signr == 0)
+ return signr;
+
+ current->exit_code = 0;
+
+ /*
+ * Update the siginfo structure if the signal has
+ * changed. If the debugger wanted something
+ * specific in the siginfo structure then it should
+ * have updated *info via PTRACE_SETSIGINFO.
+ */
+ if (signr != info->si_signo) {
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+ rcu_read_lock();
+ info->si_pid = task_pid_vnr(current->parent);
+ info->si_uid = from_kuid_munged(current_user_ns(),
+ task_uid(current->parent));
+ rcu_read_unlock();
+ }
+
+ /* If the (new) signal is now blocked, requeue it. */
+ if (sigismember(&current->blocked, signr)) {
+ specific_send_sig_info(signr, info, current);
+ signr = 0;
+ }
+
+ return signr;
+}
+
+int get_signal(struct ksignal *ksig)
+{
+ struct sighand_struct *sighand = current->sighand;
+ struct signal_struct *signal = current->signal;
+ int signr;
+
+ if (unlikely(current->task_works))
+ task_work_run();
+
+ if (unlikely(uprobe_deny_signal()))
+ return 0;
+
+ /*
+ * Do this once, we can't return to user-mode if freezing() == T.
+ * do_signal_stop() and ptrace_stop() do freezable_schedule() and
+ * thus do not need another check after return.
+ */
+ try_to_freeze();
+
+relock:
+ spin_lock_irq(&sighand->siglock);
+ /*
+ * Every stopped thread goes here after wakeup. Check to see if
+ * we should notify the parent, prepare_signal(SIGCONT) encodes
+ * the CLD_ si_code into SIGNAL_CLD_MASK bits.
+ */
+ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
+ int why;
+
+ if (signal->flags & SIGNAL_CLD_CONTINUED)
+ why = CLD_CONTINUED;
+ else
+ why = CLD_STOPPED;
+
+ signal->flags &= ~SIGNAL_CLD_MASK;
+
+ spin_unlock_irq(&sighand->siglock);
+
+ /*
+ * Notify the parent that we're continuing. This event is
+ * always per-process and doesn't make whole lot of sense
+ * for ptracers, who shouldn't consume the state via
+ * wait(2) either, but, for backward compatibility, notify
+ * the ptracer of the group leader too unless it's gonna be
+ * a duplicate.
+ */
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, false, why);
+
+ if (ptrace_reparented(current->group_leader))
+ do_notify_parent_cldstop(current->group_leader,
+ true, why);
+ read_unlock(&tasklist_lock);
+
+ goto relock;
+ }
+
+ for (;;) {
+ struct k_sigaction *ka;
+
+ if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
+ do_signal_stop(0))
+ goto relock;
+
+ if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
+ do_jobctl_trap();
+ spin_unlock_irq(&sighand->siglock);
+ goto relock;
+ }
+
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
+
+ if (!signr)
+ break; /* will return 0 */
+
+ if (unlikely(current->ptrace) && signr != SIGKILL) {
+ signr = ptrace_signal(signr, &ksig->info);
+ if (!signr)
+ continue;
+ }
+
+ ka = &sighand->action[signr-1];
+
+ /* Trace actually delivered signals. */
+ trace_signal_deliver(signr, &ksig->info, ka);
+
+ if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
+ continue;
+ if (ka->sa.sa_handler != SIG_DFL) {
+ /* Run the handler. */
+ ksig->ka = *ka;
+
+ if (ka->sa.sa_flags & SA_ONESHOT)
+ ka->sa.sa_handler = SIG_DFL;
+
+ break; /* will return non-zero "signr" value */
+ }
+
+ /*
+ * Now we are doing the default action for this signal.
+ */
+ if (sig_kernel_ignore(signr)) /* Default is nothing. */
+ continue;
+
+ /*
+ * Global init gets no signals it doesn't want.
+ * Container-init gets no signals it doesn't want from same
+ * container.
+ *
+ * Note that if global/container-init sees a sig_kernel_only()
+ * signal here, the signal must have been generated internally
+ * or must have come from an ancestor namespace. In either
+ * case, the signal cannot be dropped.
+ */
+ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
+ !sig_kernel_only(signr))
+ continue;
+
+ if (sig_kernel_stop(signr)) {
+ /*
+ * The default action is to stop all threads in
+ * the thread group. The job control signals
+ * do nothing in an orphaned pgrp, but SIGSTOP
+ * always works. Note that siglock needs to be
+ * dropped during the call to is_orphaned_pgrp()
+ * because of lock ordering with tasklist_lock.
+ * This allows an intervening SIGCONT to be posted.
+ * We need to check for that and bail out if necessary.
+ */
+ if (signr != SIGSTOP) {
+ spin_unlock_irq(&sighand->siglock);
+
+ /* signals can be posted during this window */
+
+ if (is_current_pgrp_orphaned())
+ goto relock;
+
+ spin_lock_irq(&sighand->siglock);
+ }
+
+ if (likely(do_signal_stop(ksig->info.si_signo))) {
+ /* It released the siglock. */
+ goto relock;
+ }
+
+ /*
+ * We didn't actually stop, due to a race
+ * with SIGCONT or something like that.
+ */
+ continue;
+ }
+
+ spin_unlock_irq(&sighand->siglock);
+
+ /*
+ * Anything else is fatal, maybe with a core dump.
+ */
+ current->flags |= PF_SIGNALED;
+
+ if (sig_kernel_coredump(signr)) {
+ if (print_fatal_signals)
+ print_fatal_signal(ksig->info.si_signo);
+ proc_coredump_connector(current);
+ /*
+ * If it was able to dump core, this kills all
+ * other threads in the group and synchronizes with
+ * their demise. If we lost the race with another
+ * thread getting here, it set group_exit_code
+ * first and our do_group_exit call below will use
+ * that value and ignore the one we pass it.
+ */
+ do_coredump(&ksig->info);
+ }
+
+ /*
+ * Death signals, no core dump.
+ */
+ do_group_exit(ksig->info.si_signo);
+ /* NOTREACHED */
+ }
+ spin_unlock_irq(&sighand->siglock);
+
+ ksig->sig = signr;
+ return ksig->sig > 0;
+}
+
+/**
+ * signal_delivered -
+ * @ksig: kernel signal struct
+ * @stepping: nonzero if debugger single-step or block-step in use
+ *
+ * This function should be called when a signal has successfully been
+ * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
+ * is always blocked, and the signal itself is blocked unless %SA_NODEFER
+ * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
+ */
+static void signal_delivered(struct ksignal *ksig, int stepping)
+{
+ sigset_t blocked;
+
+ /* A signal was successfully delivered, and the
+ saved sigmask was stored on the signal frame,
+ and will be restored by sigreturn. So we can
+ simply clear the restore sigmask flag. */
+ clear_restore_sigmask();
+
+ sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
+ if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
+ sigaddset(&blocked, ksig->sig);
+ set_current_blocked(&blocked);
+ tracehook_signal_handler(stepping);
+}
+
+void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
+{
+ if (failed)
+ force_sigsegv(ksig->sig, current);
+ else
+ signal_delivered(ksig, stepping);
+}
+
+/*
+ * It could be that complete_signal() picked us to notify about the
+ * group-wide signal. Other threads should be notified now to take
+ * the shared signals in @which since we will not.
+ */
+static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
+{
+ sigset_t retarget;
+ struct task_struct *t;
+
+ sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
+ if (sigisemptyset(&retarget))
+ return;
+
+ t = tsk;
+ while_each_thread(tsk, t) {
+ if (t->flags & PF_EXITING)
+ continue;
+
+ if (!has_pending_signals(&retarget, &t->blocked))
+ continue;
+ /* Remove the signals this thread can handle. */
+ sigandsets(&retarget, &retarget, &t->blocked);
+
+ if (!signal_pending(t))
+ signal_wake_up(t, 0);
+
+ if (sigisemptyset(&retarget))
+ break;
+ }
+}
+
+void exit_signals(struct task_struct *tsk)
+{
+ int group_stop = 0;
+ sigset_t unblocked;
+
+ /*
+ * @tsk is about to have PF_EXITING set - lock out users which
+ * expect stable threadgroup.
+ */
+ threadgroup_change_begin(tsk);
+
+ if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
+ tsk->flags |= PF_EXITING;
+ threadgroup_change_end(tsk);
+ return;
+ }
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ /*
+ * From now this task is not visible for group-wide signals,
+ * see wants_signal(), do_signal_stop().
+ */
+ tsk->flags |= PF_EXITING;
+
+ threadgroup_change_end(tsk);
+
+ if (!signal_pending(tsk))
+ goto out;
+
+ unblocked = tsk->blocked;
+ signotset(&unblocked);
+ retarget_shared_pending(tsk, &unblocked);
+
+ if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
+ task_participate_group_stop(tsk))
+ group_stop = CLD_STOPPED;
+out:
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ /*
+ * If group stop has completed, deliver the notification. This
+ * should always go to the real parent of the group leader.
+ */
+ if (unlikely(group_stop)) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(tsk, false, group_stop);
+ read_unlock(&tasklist_lock);
+ }
+}
+
+EXPORT_SYMBOL(recalc_sigpending);
+EXPORT_SYMBOL_GPL(dequeue_signal);
+EXPORT_SYMBOL(flush_signals);
+EXPORT_SYMBOL(force_sig);
+EXPORT_SYMBOL(send_sig);
+EXPORT_SYMBOL(send_sig_info);
+EXPORT_SYMBOL(sigprocmask);
+EXPORT_SYMBOL(block_all_signals);
+EXPORT_SYMBOL(unblock_all_signals);
+
+
+/*
+ * System call entry points.
+ */
+
+/**
+ * sys_restart_syscall - restart a system call
+ */
+SYSCALL_DEFINE0(restart_syscall)
+{
+ struct restart_block *restart = &current->restart_block;
+ return restart->fn(restart);
+}
+
+long do_no_restart_syscall(struct restart_block *param)
+{
+ return -EINTR;
+}
+
+static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
+{
+ if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ sigset_t newblocked;
+ /* A set of now blocked but previously unblocked signals. */
+ sigandnsets(&newblocked, newset, &current->blocked);
+ retarget_shared_pending(tsk, &newblocked);
+ }
+ tsk->blocked = *newset;
+ recalc_sigpending();
+}
+
+/**
+ * set_current_blocked - change current->blocked mask
+ * @newset: new mask
+ *
+ * It is wrong to change ->blocked directly, this helper should be used
+ * to ensure the process can't miss a shared signal we are going to block.
+ */
+void set_current_blocked(sigset_t *newset)
+{
+ sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ __set_current_blocked(newset);
+}
+
+void __set_current_blocked(const sigset_t *newset)
+{
+ struct task_struct *tsk = current;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, newset);
+ spin_unlock_irq(&tsk->sighand->siglock);
+}
+
+/*
+ * This is also useful for kernel threads that want to temporarily
+ * (or permanently) block certain signals.
+ *
+ * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
+ * interface happily blocks "unblockable" signals like SIGKILL
+ * and friends.
+ */
+int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
+{
+ struct task_struct *tsk = current;
+ sigset_t newset;
+
+ /* Lockless, only current can change ->blocked, never from irq */
+ if (oldset)
+ *oldset = tsk->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigorsets(&newset, &tsk->blocked, set);
+ break;
+ case SIG_UNBLOCK:
+ sigandnsets(&newset, &tsk->blocked, set);
+ break;
+ case SIG_SETMASK:
+ newset = *set;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ __set_current_blocked(&newset);
+ return 0;
+}
+
+/**
+ * sys_rt_sigprocmask - change the list of currently blocked signals
+ * @how: whether to add, remove, or set signals
+ * @nset: stores pending signals
+ * @oset: previous value of signal mask if non-null
+ * @sigsetsize: size of sigset_t type
+ */
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
+ sigset_t __user *, oset, size_t, sigsetsize)
+{
+ sigset_t old_set, new_set;
+ int error;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ old_set = current->blocked;
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
+ return -EFAULT;
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ error = sigprocmask(how, &new_set, NULL);
+ if (error)
+ return error;
+ }
+
+ if (oset) {
+ if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
+ compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t old_set = current->blocked;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (nset) {
+ compat_sigset_t new32;
+ sigset_t new_set;
+ int error;
+ if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+
+ sigset_from_compat(&new_set, &new32);
+ sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
+
+ error = sigprocmask(how, &new_set, NULL);
+ if (error)
+ return error;
+ }
+ if (oset) {
+ compat_sigset_t old32;
+ sigset_to_compat(&old32, &old_set);
+ if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ }
+ return 0;
+#else
+ return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
+ (sigset_t __user *)oset, sigsetsize);
+#endif
+}
+#endif
+
+static int do_sigpending(void *set, unsigned long sigsetsize)
+{
+ if (sigsetsize > sizeof(sigset_t))
+ return -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigorsets(set, &current->pending.signal,
+ &current->signal->shared_pending.signal);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /* Outside the lock because only this thread touches it. */
+ sigandsets(set, &current->blocked, set);
+ return 0;
+}
+
+/**
+ * sys_rt_sigpending - examine a pending signal that has been raised
+ * while blocked
+ * @uset: stores pending signals
+ * @sigsetsize: size of sigset_t type or larger
+ */
+SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
+{
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err && copy_to_user(uset, &set, sigsetsize))
+ err = -EFAULT;
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
+ compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t set;
+ int err = do_sigpending(&set, sigsetsize);
+ if (!err) {
+ compat_sigset_t set32;
+ sigset_to_compat(&set32, &set);
+ /* we can get here only if sigsetsize <= sizeof(set) */
+ if (copy_to_user(uset, &set32, sigsetsize))
+ err = -EFAULT;
+ }
+ return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
+#endif
+}
+#endif
+
+#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
+
+int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
+{
+ int err;
+
+ if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
+ return -EFAULT;
+ if (from->si_code < 0)
+ return __copy_to_user(to, from, sizeof(siginfo_t))
+ ? -EFAULT : 0;
+ /*
+ * If you change siginfo_t structure, please be sure
+ * this code is fixed accordingly.
+ * Please remember to update the signalfd_copyinfo() function
+ * inside fs/signalfd.c too, in case siginfo_t changes.
+ * It should never copy any pad contained in the structure
+ * to avoid security leaks, but must copy the generic
+ * 3 ints plus the relevant union member.
+ */
+ err = __put_user(from->si_signo, &to->si_signo);
+ err |= __put_user(from->si_errno, &to->si_errno);
+ err |= __put_user((short)from->si_code, &to->si_code);
+ switch (from->si_code & __SI_MASK) {
+ case __SI_KILL:
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ break;
+ case __SI_TIMER:
+ err |= __put_user(from->si_tid, &to->si_tid);
+ err |= __put_user(from->si_overrun, &to->si_overrun);
+ err |= __put_user(from->si_ptr, &to->si_ptr);
+ break;
+ case __SI_POLL:
+ err |= __put_user(from->si_band, &to->si_band);
+ err |= __put_user(from->si_fd, &to->si_fd);
+ break;
+ case __SI_FAULT:
+ err |= __put_user(from->si_addr, &to->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ err |= __put_user(from->si_trapno, &to->si_trapno);
+#endif
+#ifdef BUS_MCEERR_AO
+ /*
+ * Other callers might not initialize the si_lsb field,
+ * so check explicitly for the right codes here.
+ */
+ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
+ err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
+#endif
+#ifdef SEGV_BNDERR
+ err |= __put_user(from->si_lower, &to->si_lower);
+ err |= __put_user(from->si_upper, &to->si_upper);
+#endif
+ break;
+ case __SI_CHLD:
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ err |= __put_user(from->si_status, &to->si_status);
+ err |= __put_user(from->si_utime, &to->si_utime);
+ err |= __put_user(from->si_stime, &to->si_stime);
+ break;
+ case __SI_RT: /* This is not generated by the kernel as of now. */
+ case __SI_MESGQ: /* But this is */
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ err |= __put_user(from->si_ptr, &to->si_ptr);
+ break;
+#ifdef __ARCH_SIGSYS
+ case __SI_SYS:
+ err |= __put_user(from->si_call_addr, &to->si_call_addr);
+ err |= __put_user(from->si_syscall, &to->si_syscall);
+ err |= __put_user(from->si_arch, &to->si_arch);
+ break;
+#endif
+ default: /* this is just in case for now ... */
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ break;
+ }
+ return err;
+}
+
+#endif
+
+/**
+ * do_sigtimedwait - wait for queued signals specified in @which
+ * @which: queued signals to wait for
+ * @info: if non-null, the signal's siginfo is returned here
+ * @ts: upper bound on process time suspension
+ */
+int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+ const struct timespec *ts)
+{
+ struct task_struct *tsk = current;
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ sigset_t mask = *which;
+ int sig;
+
+ if (ts) {
+ if (!timespec_valid(ts))
+ return -EINVAL;
+ timeout = timespec_to_jiffies(ts);
+ /*
+ * We can be close to the next tick, add another one
+ * to ensure we will wait at least the time asked for.
+ */
+ if (ts->tv_sec || ts->tv_nsec)
+ timeout++;
+ }
+
+ /*
+ * Invert the set of allowed signals to get those we want to block.
+ */
+ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ signotset(&mask);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ sig = dequeue_signal(tsk, &mask, info);
+ if (!sig && timeout) {
+ /*
+ * None ready, temporarily unblock those we're interested
+ * while we are sleeping in so that we'll be awakened when
+ * they arrive. Unblocking is always fine, we can avoid
+ * set_current_blocked().
+ */
+ tsk->real_blocked = tsk->blocked;
+ sigandsets(&tsk->blocked, &tsk->blocked, &mask);
+ recalc_sigpending();
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ timeout = freezable_schedule_timeout_interruptible(timeout);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, &tsk->real_blocked);
+ sigemptyset(&tsk->real_blocked);
+ sig = dequeue_signal(tsk, &mask, info);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (sig)
+ return sig;
+ return timeout ? -EINTR : -EAGAIN;
+}
+
+/**
+ * sys_rt_sigtimedwait - synchronously wait for queued signals specified
+ * in @uthese
+ * @uthese: queued signals to wait for
+ * @uinfo: if non-null, the signal's siginfo is returned here
+ * @uts: upper bound on process time suspension
+ * @sigsetsize: size of sigset_t type
+ */
+SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
+ siginfo_t __user *, uinfo, const struct timespec __user *, uts,
+ size_t, sigsetsize)
+{
+ sigset_t these;
+ struct timespec ts;
+ siginfo_t info;
+ int ret;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&these, uthese, sizeof(these)))
+ return -EFAULT;
+
+ if (uts) {
+ if (copy_from_user(&ts, uts, sizeof(ts)))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+/**
+ * sys_kill - send a signal to a process
+ * @pid: the PID of the process
+ * @sig: signal to be sent
+ */
+SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
+{
+ struct siginfo info;
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+ info.si_pid = task_tgid_vnr(current);
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+
+ return kill_something_info(sig, &info, pid);
+}
+
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
+{
+ struct task_struct *p;
+ int error = -ESRCH;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
+ error = check_kill_permission(sig, info, p);
+ /*
+ * The null signal is a permissions and process existence
+ * probe. No signal is actually delivered.
+ */
+ if (!error && sig) {
+ error = do_send_sig_info(sig, info, p, false);
+ /*
+ * If lock_task_sighand() failed we pretend the task
+ * dies after receiving the signal. The window is tiny,
+ * and the signal is private anyway.
+ */
+ if (unlikely(error == -ESRCH))
+ error = 0;
+ }
+ }
+ rcu_read_unlock();
+
+ return error;
+}
+
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+ struct siginfo info = {};
+
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_TKILL;
+ info.si_pid = task_tgid_vnr(current);
+ info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+
+ return do_send_specific(tgid, pid, sig, &info);
+}
+
+/**
+ * sys_tgkill - send signal to one specific thread
+ * @tgid: the thread group ID of the thread
+ * @pid: the PID of the thread
+ * @sig: signal to be sent
+ *
+ * This syscall also checks the @tgid and returns -ESRCH even if the PID
+ * exists but it's not belonging to the target process anymore. This
+ * method solves the problem of threads exiting and PIDs getting reused.
+ */
+SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
+{
+ /* This is only valid for single tasks */
+ if (pid <= 0 || tgid <= 0)
+ return -EINVAL;
+
+ return do_tkill(tgid, pid, sig);
+}
+
+/**
+ * sys_tkill - send signal to one specific task
+ * @pid: the PID of the task
+ * @sig: signal to be sent
+ *
+ * Send a signal to only one task, even if it's a CLONE_THREAD task.
+ */
+SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
+{
+ /* This is only valid for single tasks */
+ if (pid <= 0)
+ return -EINVAL;
+
+ return do_tkill(0, pid, sig);
+}
+
+static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
+{
+ /* Not even root can pretend to send signals from the kernel.
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid))
+ return -EPERM;
+
+ info->si_signo = sig;
+
+ /* POSIX.1b doesn't mention process groups. */
+ return kill_proc_info(sig, info, pid);
+}
+
+/**
+ * sys_rt_sigqueueinfo - send signal information to a signal
+ * @pid: the PID of the thread
+ * @sig: signal to be sent
+ * @uinfo: signal info to be sent
+ */
+SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
+ siginfo_t __user *, uinfo)
+{
+ siginfo_t info;
+ if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+ return -EFAULT;
+ return do_rt_sigqueueinfo(pid, sig, &info);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+ int ret = copy_siginfo_from_user32(&info, uinfo);
+ if (unlikely(ret))
+ return ret;
+ return do_rt_sigqueueinfo(pid, sig, &info);
+}
+#endif
+
+static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+ /* This is only valid for single tasks */
+ if (pid <= 0 || tgid <= 0)
+ return -EINVAL;
+
+ /* Not even root can pretend to send signals from the kernel.
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+ (task_pid_vnr(current) != pid))
+ return -EPERM;
+
+ info->si_signo = sig;
+
+ return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+ siginfo_t __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+ return -EFAULT;
+
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
+ compat_pid_t, tgid,
+ compat_pid_t, pid,
+ int, sig,
+ struct compat_siginfo __user *, uinfo)
+{
+ siginfo_t info;
+
+ if (copy_siginfo_from_user32(&info, uinfo))
+ return -EFAULT;
+ return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+#endif
+
+/*
+ * For kthreads only, must not be used if cloned with CLONE_SIGHAND
+ */
+void kernel_sigaction(int sig, __sighandler_t action)
+{
+ spin_lock_irq(&current->sighand->siglock);
+ current->sighand->action[sig - 1].sa.sa_handler = action;
+ if (action == SIG_IGN) {
+ sigset_t mask;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, sig);
+
+ flush_sigqueue_mask(&mask, &current->signal->shared_pending);
+ flush_sigqueue_mask(&mask, &current->pending);
+ recalc_sigpending();
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+}
+EXPORT_SYMBOL(kernel_sigaction);
+
+int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
+{
+ struct task_struct *p = current, *t;
+ struct k_sigaction *k;
+ sigset_t mask;
+
+ if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
+ return -EINVAL;
+
+ k = &p->sighand->action[sig-1];
+
+ spin_lock_irq(&p->sighand->siglock);
+ if (oact)
+ *oact = *k;
+
+ if (act) {
+ sigdelsetmask(&act->sa.sa_mask,
+ sigmask(SIGKILL) | sigmask(SIGSTOP));
+ *k = *act;
+ /*
+ * POSIX 3.3.1.3:
+ * "Setting a signal action to SIG_IGN for a signal that is
+ * pending shall cause the pending signal to be discarded,
+ * whether or not it is blocked."
+ *
+ * "Setting a signal action to SIG_DFL for a signal that is
+ * pending and whose default action is to ignore the signal
+ * (for example, SIGCHLD), shall cause the pending signal to
+ * be discarded, whether or not it is blocked"
+ */
+ if (sig_handler_ignored(sig_handler(p, sig), sig)) {
+ sigemptyset(&mask);
+ sigaddset(&mask, sig);
+ flush_sigqueue_mask(&mask, &p->signal->shared_pending);
+ for_each_thread(p, t)
+ flush_sigqueue_mask(&mask, &t->pending);
+ }
+ }
+
+ spin_unlock_irq(&p->sighand->siglock);
+ return 0;
+}
+
+static int
+do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
+{
+ stack_t oss;
+ int error;
+
+ oss.ss_sp = (void __user *) current->sas_ss_sp;
+ oss.ss_size = current->sas_ss_size;
+ oss.ss_flags = sas_ss_flags(sp);
+
+ if (uss) {
+ void __user *ss_sp;
+ size_t ss_size;
+ int ss_flags;
+
+ error = -EFAULT;
+ if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
+ goto out;
+ error = __get_user(ss_sp, &uss->ss_sp) |
+ __get_user(ss_flags, &uss->ss_flags) |
+ __get_user(ss_size, &uss->ss_size);
+ if (error)
+ goto out;
+
+ error = -EPERM;
+ if (on_sig_stack(sp))
+ goto out;
+
+ error = -EINVAL;
+ /*
+ * Note - this code used to test ss_flags incorrectly:
+ * old code may have been written using ss_flags==0
+ * to mean ss_flags==SS_ONSTACK (as this was the only
+ * way that worked) - this fix preserves that older
+ * mechanism.
+ */
+ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
+ goto out;
+
+ if (ss_flags == SS_DISABLE) {
+ ss_size = 0;
+ ss_sp = NULL;
+ } else {
+ error = -ENOMEM;
+ if (ss_size < MINSIGSTKSZ)
+ goto out;
+ }
+
+ current->sas_ss_sp = (unsigned long) ss_sp;
+ current->sas_ss_size = ss_size;
+ }
+
+ error = 0;
+ if (uoss) {
+ error = -EFAULT;
+ if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
+ goto out;
+ error = __put_user(oss.ss_sp, &uoss->ss_sp) |
+ __put_user(oss.ss_size, &uoss->ss_size) |
+ __put_user(oss.ss_flags, &uoss->ss_flags);
+ }
+
+out:
+ return error;
+}
+SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
+{
+ return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+}
+
+int restore_altstack(const stack_t __user *uss)
+{
+ int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+ /* squash all but EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __save_altstack(stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sigaltstack,
+ const compat_stack_t __user *, uss_ptr,
+ compat_stack_t __user *, uoss_ptr)
+{
+ stack_t uss, uoss;
+ int ret;
+ mm_segment_t seg;
+
+ if (uss_ptr) {
+ compat_stack_t uss32;
+
+ memset(&uss, 0, sizeof(stack_t));
+ if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
+ return -EFAULT;
+ uss.ss_sp = compat_ptr(uss32.ss_sp);
+ uss.ss_flags = uss32.ss_flags;
+ uss.ss_size = uss32.ss_size;
+ }
+ seg = get_fs();
+ set_fs(KERNEL_DS);
+ ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
+ (stack_t __force __user *) &uoss,
+ compat_user_stack_pointer());
+ set_fs(seg);
+ if (ret >= 0 && uoss_ptr) {
+ if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
+ __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
+ __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
+ __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+int compat_restore_altstack(const compat_stack_t __user *uss)
+{
+ int err = compat_sys_sigaltstack(uss, NULL);
+ /* squash all but -EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
+/**
+ * sys_sigpending - examine pending signals
+ * @set: where mask of pending signal is returned
+ */
+SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
+{
+ return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
+}
+
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+/**
+ * sys_sigprocmask - examine and change blocked signals
+ * @how: whether to add, remove, or set signals
+ * @nset: signals to add or remove (if non-null)
+ * @oset: previous value of signal mask if non-null
+ *
+ * Some platforms have their own version with special arguments;
+ * others support only sys_rt_sigprocmask.
+ */
+
+SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
+ old_sigset_t __user *, oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(*nset)))
+ return -EFAULT;
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ new_blocked.sig[0] = new_set;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (copy_to_user(oset, &old_set, sizeof(*oset)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
+
+#ifndef CONFIG_ODD_RT_SIGACTION
+/**
+ * sys_rt_sigaction - alter an action taken by a process
+ * @sig: signal to be sent
+ * @act: new sigaction
+ * @oact: used to save the previous sigaction
+ * @sigsetsize: size of sigset_t type
+ */
+SYSCALL_DEFINE4(rt_sigaction, int, sig,
+ const struct sigaction __user *, act,
+ struct sigaction __user *, oact,
+ size_t, sigsetsize)
+{
+ struct k_sigaction new_sa, old_sa;
+ int ret = -EINVAL;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ goto out;
+
+ if (act) {
+ if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+ return -EFAULT;
+ }
+
+ ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+
+ if (!ret && oact) {
+ if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+ return -EFAULT;
+ }
+out:
+ return ret;
+}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
+ const struct compat_sigaction __user *, act,
+ struct compat_sigaction __user *, oact,
+ compat_size_t, sigsetsize)
+{
+ struct k_sigaction new_ka, old_ka;
+ compat_sigset_t mask;
+#ifdef __ARCH_HAS_SA_RESTORER
+ compat_uptr_t restorer;
+#endif
+ int ret;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+
+ if (act) {
+ compat_uptr_t handler;
+ ret = get_user(handler, &act->sa_handler);
+ new_ka.sa.sa_handler = compat_ptr(handler);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= get_user(restorer, &act->sa_restorer);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+#endif
+ ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+ ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
+ if (ret)
+ return -EFAULT;
+ sigset_from_compat(&new_ka.sa.sa_mask, &mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+ if (!ret && oact) {
+ sigset_to_compat(&mask, &old_ka.sa.sa_mask);
+ ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler);
+ ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+ ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+#ifdef __ARCH_HAS_SA_RESTORER
+ ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer);
+#endif
+ }
+ return ret;
+}
+#endif
+#endif /* !CONFIG_ODD_RT_SIGACTION */
+
+#ifdef CONFIG_OLD_SIGACTION
+SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct old_sigaction __user *, act,
+ struct old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ old_sigset_t mask;
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+#ifdef CONFIG_COMPAT_OLD_SIGACTION
+COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
+ const struct compat_old_sigaction __user *, act,
+ struct compat_old_sigaction __user *, oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+ compat_old_sigset_t mask;
+ compat_uptr_t handler, restorer;
+
+ if (act) {
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(handler, &act->sa_handler) ||
+ __get_user(restorer, &act->sa_restorer) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+
+#ifdef __ARCH_HAS_KA_RESTORER
+ new_ka.ka_restorer = NULL;
+#endif
+ new_ka.sa.sa_handler = compat_ptr(handler);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_handler),
+ &oact->sa_handler) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
+ &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_SGETMASK_SYSCALL
+
+/*
+ * For backwards compatibility. Functionality superseded by sigprocmask.
+ */
+SYSCALL_DEFINE0(sgetmask)
+{
+ /* SMP safe */
+ return current->blocked.sig[0];
+}
+
+SYSCALL_DEFINE1(ssetmask, int, newmask)
+{
+ int old = current->blocked.sig[0];
+ sigset_t newset;
+
+ siginitset(&newset, newmask);
+ set_current_blocked(&newset);
+
+ return old;
+}
+#endif /* CONFIG_SGETMASK_SYSCALL */
+
+#ifdef __ARCH_WANT_SYS_SIGNAL
+/*
+ * For backwards compatibility. Functionality superseded by sigaction.
+ */
+SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
+{
+ struct k_sigaction new_sa, old_sa;
+ int ret;
+
+ new_sa.sa.sa_handler = handler;
+ new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
+ sigemptyset(&new_sa.sa.sa_mask);
+
+ ret = do_sigaction(sig, &new_sa, &old_sa);
+
+ return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
+}
+#endif /* __ARCH_WANT_SYS_SIGNAL */
+
+#ifdef __ARCH_WANT_SYS_PAUSE
+
+SYSCALL_DEFINE0(pause)
+{
+ while (!signal_pending(current)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+ return -ERESTARTNOHAND;
+}
+
+#endif
+
+int sigsuspend(sigset_t *set)
+{
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(set);
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ set_restore_sigmask();
+ return -ERESTARTNOHAND;
+}
+
+/**
+ * sys_rt_sigsuspend - replace the signal mask for a value with the
+ * @unewset value until a signal is received
+ * @unewset: new signal mask value
+ * @sigsetsize: size of sigset_t type
+ */
+SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
+{
+ sigset_t newset;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset, unewset, sizeof(newset)))
+ return -EFAULT;
+ return sigsuspend(&newset);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t newset;
+ compat_sigset_t newset32;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&newset, &newset32);
+ return sigsuspend(&newset);
+#else
+ /* on little-endian bitmaps don't care about granularity */
+ return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
+#endif
+}
+#endif
+
+#ifdef CONFIG_OLD_SIGSUSPEND
+SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
+#ifdef CONFIG_OLD_SIGSUSPEND3
+SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
+{
+ sigset_t blocked;
+ siginitset(&blocked, mask);
+ return sigsuspend(&blocked);
+}
+#endif
+
+__weak const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+
+void __init signals_init(void)
+{
+ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
+}
+
+#ifdef CONFIG_KGDB_KDB
+#include <linux/kdb.h>
+/*
+ * kdb_send_sig_info - Allows kdb to send signals without exposing
+ * signal internals. This function checks if the required locks are
+ * available before calling the main signal code, to avoid kdb
+ * deadlocks.
+ */
+void
+kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
+{
+ static struct task_struct *kdb_prev_t;
+ int sig, new_t;
+ if (!spin_trylock(&t->sighand->siglock)) {
+ kdb_printf("Can't do kill command now.\n"
+ "The sigmask lock is held somewhere else in "
+ "kernel, try again later\n");
+ return;
+ }
+ spin_unlock(&t->sighand->siglock);
+ new_t = kdb_prev_t != t;
+ kdb_prev_t = t;
+ if (t->state != TASK_RUNNING && new_t) {
+ kdb_printf("Process is not RUNNING, sending a signal from "
+ "kdb risks deadlock\n"
+ "on the run queue locks. "
+ "The signal has _not_ been sent.\n"
+ "Reissue the kill command if you want to risk "
+ "the deadlock.\n");
+ return;
+ }
+ sig = info->si_signo;
+ if (send_sig_info(sig, info, t))
+ kdb_printf("Fail to deliver Signal %d to process %d.\n",
+ sig, t->pid);
+ else
+ kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
+}
+#endif /* CONFIG_KGDB_KDB */
diff --git a/kernel/smp.c b/kernel/smp.c
new file mode 100644
index 000000000..07854477c
--- /dev/null
+++ b/kernel/smp.c
@@ -0,0 +1,741 @@
+/*
+ * Generic helpers for smp ipi calls
+ *
+ * (C) Jens Axboe <jens.axboe@oracle.com> 2008
+ */
+#include <linux/irq_work.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+
+#include "smpboot.h"
+
+enum {
+ CSD_FLAG_LOCK = 0x01,
+ CSD_FLAG_SYNCHRONOUS = 0x02,
+};
+
+struct call_function_data {
+ struct call_single_data __percpu *csd;
+ cpumask_var_t cpumask;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
+
+static int
+hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return notifier_from_errno(-ENOMEM);
+ cfd->csd = alloc_percpu(struct call_single_data);
+ if (!cfd->csd) {
+ free_cpumask_var(cfd->cpumask);
+ return notifier_from_errno(-ENOMEM);
+ }
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ /* Fall-through to the CPU_DEAD[_FROZEN] case. */
+
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ free_cpumask_var(cfd->cpumask);
+ free_percpu(cfd->csd);
+ break;
+
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ /*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+ flush_smp_call_function_queue(false);
+ break;
+#endif
+ };
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block hotplug_cfd_notifier = {
+ .notifier_call = hotplug_cfd,
+};
+
+void __init call_function_init(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+ int i;
+
+ for_each_possible_cpu(i)
+ init_llist_head(&per_cpu(call_single_queue, i));
+
+ hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
+ register_cpu_notifier(&hotplug_cfd_notifier);
+}
+
+/*
+ * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
+ *
+ * For non-synchronous ipi calls the csd can still be in use by the
+ * previous function call. For multi-cpu calls its even more interesting
+ * as we'll have to ensure no other cpu is observing our csd.
+ */
+static void csd_lock_wait(struct call_single_data *csd)
+{
+ while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
+ cpu_relax();
+}
+
+static void csd_lock(struct call_single_data *csd)
+{
+ csd_lock_wait(csd);
+ csd->flags |= CSD_FLAG_LOCK;
+
+ /*
+ * prevent CPU from reordering the above assignment
+ * to ->flags with any subsequent assignments to other
+ * fields of the specified call_single_data structure:
+ */
+ smp_wmb();
+}
+
+static void csd_unlock(struct call_single_data *csd)
+{
+ WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
+
+ /*
+ * ensure we're all done before releasing data:
+ */
+ smp_store_release(&csd->flags, 0);
+}
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+
+/*
+ * Insert a previously allocated call_single_data element
+ * for execution on the given CPU. data must already have
+ * ->func, ->info, and ->flags set.
+ */
+static int generic_exec_single(int cpu, struct call_single_data *csd,
+ smp_call_func_t func, void *info)
+{
+ if (cpu == smp_processor_id()) {
+ unsigned long flags;
+
+ /*
+ * We can unlock early even for the synchronous on-stack case,
+ * since we're doing this from the same CPU..
+ */
+ csd_unlock(csd);
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ return 0;
+ }
+
+
+ if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ csd_unlock(csd);
+ return -ENXIO;
+ }
+
+ csd->func = func;
+ csd->info = info;
+
+ /*
+ * The list addition should be visible before sending the IPI
+ * handler locks the list to pull the entry off it because of
+ * normal cache coherency rules implied by spinlocks.
+ *
+ * If IPIs can go out of order to the cache coherency protocol
+ * in an architecture, sufficient synchronisation should be added
+ * to arch code to make it appear to obey cache coherency WRT
+ * locking and barrier primitives. Generic code isn't really
+ * equipped to do the right thing...
+ */
+ if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+ arch_send_call_function_single_ipi(cpu);
+
+ return 0;
+}
+
+/**
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
+ */
+void generic_smp_call_function_single_interrupt(void)
+{
+ flush_smp_call_function_queue(true);
+}
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ * offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
+ */
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
+{
+ struct llist_head *head;
+ struct llist_node *entry;
+ struct call_single_data *csd, *csd_next;
+ static bool warned;
+
+ WARN_ON(!irqs_disabled());
+
+ head = this_cpu_ptr(&call_single_queue);
+ entry = llist_del_all(head);
+ entry = llist_reverse_order(entry);
+
+ /* There shouldn't be any pending callbacks on an offline CPU. */
+ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
+ !warned && !llist_empty(head))) {
+ warned = true;
+ WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
+
+ /*
+ * We don't have to use the _safe() variant here
+ * because we are not invoking the IPI handlers yet.
+ */
+ llist_for_each_entry(csd, entry, llist)
+ pr_warn("IPI callback %pS sent to offline CPU\n",
+ csd->func);
+ }
+
+ llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+ smp_call_func_t func = csd->func;
+ void *info = csd->info;
+
+ /* Do we wait until *after* callback? */
+ if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+ func(info);
+ csd_unlock(csd);
+ } else {
+ csd_unlock(csd);
+ func(info);
+ }
+ }
+
+ /*
+ * Handle irq works queued remotely by irq_work_queue_on().
+ * Smp functions above are typically synchronous so they
+ * better run first since some other CPUs may be busy waiting
+ * for them.
+ */
+ irq_work_run();
+}
+
+/*
+ * smp_call_function_single - Run a function on a specific CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ */
+int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
+ int wait)
+{
+ struct call_single_data *csd;
+ struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
+ int this_cpu;
+ int err;
+
+ /*
+ * prevent preemption and reschedule on another processor,
+ * as well as CPU removal
+ */
+ this_cpu = get_cpu();
+
+ /*
+ * Can deadlock when called with interrupts disabled.
+ * We allow cpu's that are not yet online though, as no one else can
+ * send smp call function interrupt to this cpu and as such deadlocks
+ * can't happen.
+ */
+ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
+ && !oops_in_progress);
+
+ csd = &csd_stack;
+ if (!wait) {
+ csd = this_cpu_ptr(&csd_data);
+ csd_lock(csd);
+ }
+
+ err = generic_exec_single(cpu, csd, func, info);
+
+ if (wait)
+ csd_lock_wait(csd);
+
+ put_cpu();
+
+ return err;
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
+/**
+ * smp_call_function_single_async(): Run an asynchronous function on a
+ * specific CPU.
+ * @cpu: The CPU to run on.
+ * @csd: Pre-allocated and setup data structure
+ *
+ * Like smp_call_function_single(), but the call is asynchonous and
+ * can thus be done from contexts with disabled interrupts.
+ *
+ * The caller passes his own pre-allocated data structure
+ * (ie: embedded in an object) and is responsible for synchronizing it
+ * such that the IPIs performed on the @csd are strictly serialized.
+ *
+ * NOTE: Be careful, there is unfortunately no current debugging facility to
+ * validate the correctness of this serialization.
+ */
+int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+{
+ int err = 0;
+
+ preempt_disable();
+
+ /* We could deadlock if we have to wait here with interrupts disabled! */
+ if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK))
+ csd_lock_wait(csd);
+
+ csd->flags = CSD_FLAG_LOCK;
+ smp_wmb();
+
+ err = generic_exec_single(cpu, csd, csd->func, csd->info);
+ preempt_enable();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(smp_call_function_single_async);
+
+/*
+ * smp_call_function_any - Run a function on any of the given cpus
+ * @mask: The mask of cpus it can run on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until function has completed.
+ *
+ * Returns 0 on success, else a negative status code (if no cpus were online).
+ *
+ * Selection preference:
+ * 1) current cpu if in @mask
+ * 2) any cpu of current node if in @mask
+ * 3) any other online cpu in @mask
+ */
+int smp_call_function_any(const struct cpumask *mask,
+ smp_call_func_t func, void *info, int wait)
+{
+ unsigned int cpu;
+ const struct cpumask *nodemask;
+ int ret;
+
+ /* Try for same CPU (cheapest) */
+ cpu = get_cpu();
+ if (cpumask_test_cpu(cpu, mask))
+ goto call;
+
+ /* Try for same node. */
+ nodemask = cpumask_of_node(cpu_to_node(cpu));
+ for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
+ cpu = cpumask_next_and(cpu, nodemask, mask)) {
+ if (cpu_online(cpu))
+ goto call;
+ }
+
+ /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
+ cpu = cpumask_any_and(mask, cpu_online_mask);
+call:
+ ret = smp_call_function_single(cpu, func, info, wait);
+ put_cpu();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_function_any);
+
+/**
+ * smp_call_function_many(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+void smp_call_function_many(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ struct call_function_data *cfd;
+ int cpu, next_cpu, this_cpu = smp_processor_id();
+
+ /*
+ * Can deadlock when called with interrupts disabled.
+ * We allow cpu's that are not yet online though, as no one else can
+ * send smp call function interrupt to this cpu and as such deadlocks
+ * can't happen.
+ */
+ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
+ && !oops_in_progress && !early_boot_irqs_disabled);
+
+ /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
+ cpu = cpumask_first_and(mask, cpu_online_mask);
+ if (cpu == this_cpu)
+ cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+
+ /* No online cpus? We're done. */
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /* Do we have another CPU which isn't us? */
+ next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+ if (next_cpu == this_cpu)
+ next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
+
+ /* Fastpath: do that cpu by itself. */
+ if (next_cpu >= nr_cpu_ids) {
+ smp_call_function_single(cpu, func, info, wait);
+ return;
+ }
+
+ cfd = this_cpu_ptr(&cfd_data);
+
+ cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, cfd->cpumask);
+
+ /* Some callers race with other cpus changing the passed mask */
+ if (unlikely(!cpumask_weight(cfd->cpumask)))
+ return;
+
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+
+ csd_lock(csd);
+ if (wait)
+ csd->flags |= CSD_FLAG_SYNCHRONOUS;
+ csd->func = func;
+ csd->info = info;
+ llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+ }
+
+ /* Send a message to all CPUs in the map */
+ arch_send_call_function_ipi_mask(cfd->cpumask);
+
+ if (wait) {
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd;
+
+ csd = per_cpu_ptr(cfd->csd, cpu);
+ csd_lock_wait(csd);
+ }
+ }
+}
+EXPORT_SYMBOL(smp_call_function_many);
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * Returns 0.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(smp_call_func_t func, void *info, int wait)
+{
+ preempt_disable();
+ smp_call_function_many(cpu_online_mask, func, info, wait);
+ preempt_enable();
+
+ return 0;
+}
+EXPORT_SYMBOL(smp_call_function);
+
+/* Setup configured maximum number of CPUs to activate */
+unsigned int setup_max_cpus = NR_CPUS;
+EXPORT_SYMBOL(setup_max_cpus);
+
+
+/*
+ * Setup routine for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely (the MPS table probe still happens, though).
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+
+void __weak arch_disable_smp_support(void) { }
+
+static int __init nosmp(char *str)
+{
+ setup_max_cpus = 0;
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("nosmp", nosmp);
+
+/* this is hard limit */
+static int __init nrcpus(char *str)
+{
+ int nr_cpus;
+
+ get_option(&str, &nr_cpus);
+ if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+ nr_cpu_ids = nr_cpus;
+
+ return 0;
+}
+
+early_param("nr_cpus", nrcpus);
+
+static int __init maxcpus(char *str)
+{
+ get_option(&str, &setup_max_cpus);
+ if (setup_max_cpus == 0)
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("maxcpus", maxcpus);
+
+/* Setup number of possible processor ids */
+int nr_cpu_ids __read_mostly = NR_CPUS;
+EXPORT_SYMBOL(nr_cpu_ids);
+
+/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
+void __init setup_nr_cpu_ids(void)
+{
+ nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
+}
+
+void __weak smp_announce(void)
+{
+ printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
+}
+
+/* Called by boot processor to activate the rest. */
+void __init smp_init(void)
+{
+ unsigned int cpu;
+
+ idle_threads_init();
+
+ /* FIXME: This should be done in userspace --RR */
+ for_each_present_cpu(cpu) {
+ if (num_online_cpus() >= setup_max_cpus)
+ break;
+ if (!cpu_online(cpu))
+ cpu_up(cpu);
+ }
+
+ /* Any cleanup work */
+ smp_announce();
+ smp_cpus_done(setup_max_cpus);
+}
+
+/*
+ * Call a function on all processors. May be used during early boot while
+ * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
+ * of local_irq_disable/enable().
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ preempt_disable();
+ ret = smp_call_function(func, info, wait);
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
+
+/**
+ * on_each_cpu_mask(): Run a function on processors specified by
+ * cpumask, which may include the local processor.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. The
+ * exception is that it may be used during early boot while
+ * early_boot_irqs_disabled is set.
+ */
+void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
+ void *info, bool wait)
+{
+ int cpu = get_cpu();
+
+ smp_call_function_many(mask, func, info, wait);
+ if (cpumask_test_cpu(cpu, mask)) {
+ unsigned long flags;
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ }
+ put_cpu();
+}
+EXPORT_SYMBOL(on_each_cpu_mask);
+
+/*
+ * on_each_cpu_cond(): Call a function on each processor for which
+ * the supplied function cond_func returns true, optionally waiting
+ * for all the required CPUs to finish. This may include the local
+ * processor.
+ * @cond_func: A callback function that is passed a cpu id and
+ * the the info parameter. The function is called
+ * with preemption disabled. The function should
+ * return a blooean value indicating whether to IPI
+ * the specified CPU.
+ * @func: The function to run on all applicable CPUs.
+ * This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to both functions.
+ * @wait: If true, wait (atomically) until function has
+ * completed on other CPUs.
+ * @gfp_flags: GFP flags to use when allocating the cpumask
+ * used internally by the function.
+ *
+ * The function might sleep if the GFP flags indicates a non
+ * atomic allocation is allowed.
+ *
+ * Preemption is disabled to protect against CPUs going offline but not online.
+ * CPUs going online during the call will not be seen or sent an IPI.
+ *
+ * You must not call this function with disabled interrupts or
+ * from a hardware interrupt handler or from a bottom half handler.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+ smp_call_func_t func, void *info, bool wait,
+ gfp_t gfp_flags)
+{
+ cpumask_var_t cpus;
+ int cpu, ret;
+
+ might_sleep_if(gfp_flags & __GFP_WAIT);
+
+ if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ if (cond_func(cpu, info))
+ cpumask_set_cpu(cpu, cpus);
+ on_each_cpu_mask(cpus, func, info, wait);
+ preempt_enable();
+ free_cpumask_var(cpus);
+ } else {
+ /*
+ * No free cpumask, bother. No matter, we'll
+ * just have to IPI them one by one.
+ */
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ if (cond_func(cpu, info)) {
+ ret = smp_call_function_single(cpu, func,
+ info, wait);
+ WARN_ON_ONCE(ret);
+ }
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(on_each_cpu_cond);
+
+static void do_nothing(void *unused)
+{
+}
+
+/**
+ * kick_all_cpus_sync - Force all cpus out of idle
+ *
+ * Used to synchronize the update of pm_idle function pointer. It's
+ * called after the pointer is updated and returns after the dummy
+ * callback function has been executed on all cpus. The execution of
+ * the function can only happen on the remote cpus after they have
+ * left the idle function which had been called via pm_idle function
+ * pointer. So it's guaranteed that nothing uses the previous pointer
+ * anymore.
+ */
+void kick_all_cpus_sync(void)
+{
+ /* Make sure the change is visible before we kick the cpus */
+ smp_mb();
+ smp_call_function(do_nothing, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
+
+/**
+ * wake_up_all_idle_cpus - break all cpus out of idle
+ * wake_up_all_idle_cpus try to break all cpus which is in idle state even
+ * including idle polling cpus, for non-idle cpus, we will do nothing
+ * for them.
+ */
+void wake_up_all_idle_cpus(void)
+{
+ int cpu;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu == smp_processor_id())
+ continue;
+
+ wake_up_if_idle(cpu);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
new file mode 100644
index 000000000..bdcc6c018
--- /dev/null
+++ b/kernel/smpboot.c
@@ -0,0 +1,472 @@
+/*
+ * Common SMP CPU bringup/teardown functions
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>
+
+#include "smpboot.h"
+
+#ifdef CONFIG_SMP
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+/*
+ * For the hotplug case we keep the task structs around and reuse
+ * them.
+ */
+static DEFINE_PER_CPU(struct task_struct *, idle_threads);
+
+struct task_struct *idle_thread_get(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk)
+ return ERR_PTR(-ENOMEM);
+ init_idle(tsk, cpu);
+ return tsk;
+}
+
+void __init idle_thread_set_boot_cpu(void)
+{
+ per_cpu(idle_threads, smp_processor_id()) = current;
+}
+
+/**
+ * idle_init - Initialize the idle thread for a cpu
+ * @cpu: The cpu for which the idle thread should be initialized
+ *
+ * Creates the thread if it does not exist.
+ */
+static inline void idle_init(unsigned int cpu)
+{
+ struct task_struct *tsk = per_cpu(idle_threads, cpu);
+
+ if (!tsk) {
+ tsk = fork_idle(cpu);
+ if (IS_ERR(tsk))
+ pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
+ else
+ per_cpu(idle_threads, cpu) = tsk;
+ }
+}
+
+/**
+ * idle_threads_init - Initialize idle threads for all cpus
+ */
+void __init idle_threads_init(void)
+{
+ unsigned int cpu, boot_cpu;
+
+ boot_cpu = smp_processor_id();
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != boot_cpu)
+ idle_init(cpu);
+ }
+}
+#endif
+
+#endif /* #ifdef CONFIG_SMP */
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+struct smpboot_thread_data {
+ unsigned int cpu;
+ unsigned int status;
+ struct smp_hotplug_thread *ht;
+};
+
+enum {
+ HP_THREAD_NONE = 0,
+ HP_THREAD_ACTIVE,
+ HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_fn - percpu hotplug thread loop function
+ * @data: thread data pointer
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Returns 1 when the thread should exit, 0 otherwise.
+ */
+static int smpboot_thread_fn(void *data)
+{
+ struct smpboot_thread_data *td = data;
+ struct smp_hotplug_thread *ht = td->ht;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ preempt_disable();
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->cleanup)
+ ht->cleanup(td->cpu, cpu_online(td->cpu));
+ kfree(td);
+ return 0;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->park && td->status == HP_THREAD_ACTIVE) {
+ BUG_ON(td->cpu != smp_processor_id());
+ ht->park(td->cpu);
+ td->status = HP_THREAD_PARKED;
+ }
+ kthread_parkme();
+ /* We might have been woken for stop */
+ continue;
+ }
+
+ BUG_ON(td->cpu != smp_processor_id());
+
+ /* Check for state change setup */
+ switch (td->status) {
+ case HP_THREAD_NONE:
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->setup)
+ ht->setup(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ continue;
+
+ case HP_THREAD_PARKED:
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->unpark)
+ ht->unpark(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ continue;
+ }
+
+ if (!ht->thread_should_run(td->cpu)) {
+ preempt_enable_no_resched();
+ schedule();
+ } else {
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ ht->thread_fn(td->cpu);
+ }
+ }
+}
+
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ struct smpboot_thread_data *td;
+
+ if (tsk)
+ return 0;
+
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu));
+ if (!td)
+ return -ENOMEM;
+ td->cpu = cpu;
+ td->ht = ht;
+
+ tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
+ ht->thread_comm);
+ if (IS_ERR(tsk)) {
+ kfree(td);
+ return PTR_ERR(tsk);
+ }
+ get_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = tsk;
+ if (ht->create) {
+ /*
+ * Make sure that the task has actually scheduled out
+ * into park position, before calling the create
+ * callback. At least the migration thread callback
+ * requires that the task is off the runqueue.
+ */
+ if (!wait_task_inactive(tsk, TASK_PARKED))
+ WARN_ON(1);
+ else
+ ht->create(cpu);
+ }
+ return 0;
+}
+
+int smpboot_create_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list) {
+ ret = __smpboot_create_thread(cur, cpu);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (ht->pre_unpark)
+ ht->pre_unpark(cpu);
+ kthread_unpark(tsk);
+}
+
+void smpboot_unpark_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list)
+ smpboot_unpark_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk && !ht->selfparking)
+ kthread_park(tsk);
+}
+
+void smpboot_park_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry_reverse(cur, &hotplug_threads, list)
+ smpboot_park_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+ unsigned int cpu;
+
+ /* We need to destroy also the parked threads of offline cpus */
+ for_each_possible_cpu(cpu) {
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk) {
+ kthread_stop(tsk);
+ put_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = NULL;
+ }
+ }
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ unsigned int cpu;
+ int ret = 0;
+
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+ for_each_online_cpu(cpu) {
+ ret = __smpboot_create_thread(plug_thread, cpu);
+ if (ret) {
+ smpboot_destroy_threads(plug_thread);
+ goto out;
+ }
+ smpboot_unpark_thread(plug_thread, cpu);
+ }
+ list_add(&plug_thread->list, &hotplug_threads);
+out:
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+ list_del(&plug_thread->list);
+ smpboot_destroy_threads(plug_thread);
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+
+static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
+
+/*
+ * Called to poll specified CPU's state, for example, when waiting for
+ * a CPU to come online.
+ */
+int cpu_report_state(int cpu)
+{
+ return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+}
+
+/*
+ * If CPU has died properly, set its state to CPU_UP_PREPARE and
+ * return success. Otherwise, return -EBUSY if the CPU died after
+ * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
+ * if cpu_wait_death() timed out and the CPU still hasn't gotten around
+ * to dying. In the latter two cases, the CPU might not be set up
+ * properly, but it is up to the arch-specific code to decide.
+ * Finally, -EIO indicates an unanticipated problem.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+int cpu_check_up_prepare(int cpu)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+ }
+
+ switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
+
+ case CPU_POST_DEAD:
+
+ /* The CPU died properly, so just start it up again. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+ return 0;
+
+ case CPU_DEAD_FROZEN:
+
+ /*
+ * Timeout during CPU death, so let caller know.
+ * The outgoing CPU completed its processing, but after
+ * cpu_wait_death() timed out and reported the error. The
+ * caller is free to proceed, in which case the state
+ * will be reset properly by cpu_set_state_online().
+ * Proceeding despite this -EBUSY return makes sense
+ * for systems where the outgoing CPUs take themselves
+ * offline, with no post-death manipulation required from
+ * a surviving CPU.
+ */
+ return -EBUSY;
+
+ case CPU_BROKEN:
+
+ /*
+ * The most likely reason we got here is that there was
+ * a timeout during CPU death, and the outgoing CPU never
+ * did complete its processing. This could happen on
+ * a virtualized system if the outgoing VCPU gets preempted
+ * for more than five seconds, and the user attempts to
+ * immediately online that same CPU. Trying again later
+ * might return -EBUSY above, hence -EAGAIN.
+ */
+ return -EAGAIN;
+
+ default:
+
+ /* Should not happen. Famous last words. */
+ return -EIO;
+ }
+}
+
+/*
+ * Mark the specified CPU online.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+void cpu_set_state_online(int cpu)
+{
+ (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Wait for the specified CPU to exit the idle loop and die.
+ */
+bool cpu_wait_death(unsigned int cpu, int seconds)
+{
+ int jf_left = seconds * HZ;
+ int oldstate;
+ bool ret = true;
+ int sleep_jf = 1;
+
+ might_sleep();
+
+ /* The outgoing CPU will normally get done quite quickly. */
+ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
+ goto update_state;
+ udelay(5);
+
+ /* But if the outgoing CPU dawdles, wait increasingly long times. */
+ while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
+ schedule_timeout_uninterruptible(sleep_jf);
+ jf_left -= sleep_jf;
+ if (jf_left <= 0)
+ break;
+ sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
+ }
+update_state:
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate == CPU_DEAD) {
+ /* Outgoing CPU died normally, update state. */
+ smp_mb(); /* atomic_read() before update. */
+ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
+ } else {
+ /* Outgoing CPU still hasn't died, set state accordingly. */
+ if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, CPU_BROKEN) != oldstate)
+ goto update_state;
+ ret = false;
+ }
+ return ret;
+}
+
+/*
+ * Called by the outgoing CPU to report its successful death. Return
+ * false if this report follows the surviving CPU's timing out.
+ *
+ * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
+ * timed out. This approach allows architectures to omit calls to
+ * cpu_check_up_prepare() and cpu_set_state_online() without defeating
+ * the next cpu_wait_death()'s polling loop.
+ */
+bool cpu_report_death(void)
+{
+ int oldstate;
+ int newstate;
+ int cpu = smp_processor_id();
+
+ do {
+ oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+ if (oldstate != CPU_BROKEN)
+ newstate = CPU_DEAD;
+ else
+ newstate = CPU_DEAD_FROZEN;
+ } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+ oldstate, newstate) != oldstate);
+ return newstate == CPU_DEAD;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
new file mode 100644
index 000000000..72415a0eb
--- /dev/null
+++ b/kernel/smpboot.h
@@ -0,0 +1,20 @@
+#ifndef SMPBOOT_H
+#define SMPBOOT_H
+
+struct task_struct;
+
+#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
+struct task_struct *idle_thread_get(unsigned int cpu);
+void idle_thread_set_boot_cpu(void);
+void idle_threads_init(void);
+#else
+static inline struct task_struct *idle_thread_get(unsigned int cpu) { return NULL; }
+static inline void idle_thread_set_boot_cpu(void) { }
+static inline void idle_threads_init(void) { }
+#endif
+
+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
+#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
new file mode 100644
index 000000000..479e4436f
--- /dev/null
+++ b/kernel/softirq.c
@@ -0,0 +1,787 @@
+/*
+ * linux/kernel/softirq.c
+ *
+ * Copyright (C) 1992 Linus Torvalds
+ *
+ * Distribute under GPLv2.
+ *
+ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/rcupdate.h>
+#include <linux/ftrace.h>
+#include <linux/smp.h>
+#include <linux/smpboot.h>
+#include <linux/tick.h>
+#include <linux/irq.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq.h>
+
+/*
+ - No shared variables, all the data are CPU local.
+ - If a softirq needs serialization, let it serialize itself
+ by its own spinlocks.
+ - Even if softirq is serialized, only local cpu is marked for
+ execution. Hence, we get something sort of weak cpu binding.
+ Though it is still not clear, will it result in better locality
+ or will not.
+
+ Examples:
+ - NET RX softirq. It is multithreaded and does not require
+ any global serialization.
+ - NET TX softirq. It kicks software netdevice queues, hence
+ it is logically serialized per device, but this serialization
+ is invisible to common code.
+ - Tasklets: serialized wrt itself.
+ */
+
+#ifndef __ARCH_IRQ_STAT
+irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
+EXPORT_SYMBOL(irq_stat);
+#endif
+
+static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
+
+DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+
+const char * const softirq_to_name[NR_SOFTIRQS] = {
+ "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
+ "TASKLET", "SCHED", "HRTIMER", "RCU"
+};
+
+/*
+ * we cannot loop indefinitely here to avoid userspace starvation,
+ * but we also don't want to introduce a worst case 1/HZ latency
+ * to the pending events, so lets the scheduler to balance
+ * the softirq load for us.
+ */
+static void wakeup_softirqd(void)
+{
+ /* Interrupts are disabled: no need to stop preemption */
+ struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+
+ if (tsk && tsk->state != TASK_RUNNING)
+ wake_up_process(tsk);
+}
+
+/*
+ * preempt_count and SOFTIRQ_OFFSET usage:
+ * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
+ * softirq processing.
+ * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+
+/*
+ * This one is for softirq.c-internal use,
+ * where hardirqs are disabled legitimately:
+ */
+#ifdef CONFIG_TRACE_IRQFLAGS
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+
+ raw_local_irq_save(flags);
+ /*
+ * The preempt tracer hooks into preempt_count_add and will break
+ * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
+ * is set and before current->softirq_enabled is cleared.
+ * We must manually increment preempt_count here and manually
+ * call the trace_preempt_off later.
+ */
+ __preempt_count_add(cnt);
+ /*
+ * Were softirqs turned off above:
+ */
+ if (softirq_count() == (cnt & SOFTIRQ_MASK))
+ trace_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+
+ if (preempt_count() == cnt) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
+#endif
+ trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ }
+}
+EXPORT_SYMBOL(__local_bh_disable_ip);
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+static void __local_bh_enable(unsigned int cnt)
+{
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (softirq_count() == (cnt & SOFTIRQ_MASK))
+ trace_softirqs_on(_RET_IP_);
+ preempt_count_sub(cnt);
+}
+
+/*
+ * Special-case - softirqs can safely be enabled in
+ * cond_resched_softirq(), or by __do_softirq(),
+ * without processing still-pending softirqs:
+ */
+void _local_bh_enable(void)
+{
+ WARN_ON_ONCE(in_irq());
+ __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
+}
+EXPORT_SYMBOL(_local_bh_enable);
+
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+ WARN_ON_ONCE(in_irq() || irqs_disabled());
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_disable();
+#endif
+ /*
+ * Are softirqs going to be turned on now:
+ */
+ if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
+ trace_softirqs_on(ip);
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ preempt_count_sub(cnt - 1);
+
+ if (unlikely(!in_interrupt() && local_softirq_pending())) {
+ /*
+ * Run softirq if any pending. And do it in its own stack
+ * as we may be calling this deep in a task call stack already.
+ */
+ do_softirq();
+ }
+
+ preempt_count_dec();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_enable();
+#endif
+ preempt_check_resched();
+}
+EXPORT_SYMBOL(__local_bh_enable_ip);
+
+/*
+ * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
+ * but break the loop if need_resched() is set or after 2 ms.
+ * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
+ * certain cases, such as stop_machine(), jiffies may cease to
+ * increment and so we need the MAX_SOFTIRQ_RESTART limit as
+ * well to make sure we eventually return from this method.
+ *
+ * These limits have been established via experimentation.
+ * The two things to balance is latency against fairness -
+ * we want to handle softirqs as soon as possible, but they
+ * should not be able to lock up the box.
+ */
+#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
+#define MAX_SOFTIRQ_RESTART 10
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+/*
+ * When we run softirqs from irq_exit() and thus on the hardirq stack we need
+ * to keep the lockdep irq context tracking as tight as possible in order to
+ * not miss-qualify lock contexts and miss possible deadlocks.
+ */
+
+static inline bool lockdep_softirq_start(void)
+{
+ bool in_hardirq = false;
+
+ if (trace_hardirq_context(current)) {
+ in_hardirq = true;
+ trace_hardirq_exit();
+ }
+
+ lockdep_softirq_enter();
+
+ return in_hardirq;
+}
+
+static inline void lockdep_softirq_end(bool in_hardirq)
+{
+ lockdep_softirq_exit();
+
+ if (in_hardirq)
+ trace_hardirq_enter();
+}
+#else
+static inline bool lockdep_softirq_start(void) { return false; }
+static inline void lockdep_softirq_end(bool in_hardirq) { }
+#endif
+
+asmlinkage __visible void __do_softirq(void)
+{
+ unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
+ unsigned long old_flags = current->flags;
+ int max_restart = MAX_SOFTIRQ_RESTART;
+ struct softirq_action *h;
+ bool in_hardirq;
+ __u32 pending;
+ int softirq_bit;
+
+ /*
+ * Mask out PF_MEMALLOC s current task context is borrowed for the
+ * softirq. A softirq handled such as network RX might set PF_MEMALLOC
+ * again if the socket is related to swap
+ */
+ current->flags &= ~PF_MEMALLOC;
+
+ pending = local_softirq_pending();
+ account_irq_enter_time(current);
+
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ in_hardirq = lockdep_softirq_start();
+
+restart:
+ /* Reset the pending bitmask before enabling irqs */
+ set_softirq_pending(0);
+
+ local_irq_enable();
+
+ h = softirq_vec;
+
+ while ((softirq_bit = ffs(pending))) {
+ unsigned int vec_nr;
+ int prev_count;
+
+ h += softirq_bit - 1;
+
+ vec_nr = h - softirq_vec;
+ prev_count = preempt_count();
+
+ kstat_incr_softirqs_this_cpu(vec_nr);
+
+ trace_softirq_entry(vec_nr);
+ h->action(h);
+ trace_softirq_exit(vec_nr);
+ if (unlikely(prev_count != preempt_count())) {
+ pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
+ vec_nr, softirq_to_name[vec_nr], h->action,
+ prev_count, preempt_count());
+ preempt_count_set(prev_count);
+ }
+ h++;
+ pending >>= softirq_bit;
+ }
+
+ rcu_bh_qs();
+ local_irq_disable();
+
+ pending = local_softirq_pending();
+ if (pending) {
+ if (time_before(jiffies, end) && !need_resched() &&
+ --max_restart)
+ goto restart;
+
+ wakeup_softirqd();
+ }
+
+ lockdep_softirq_end(in_hardirq);
+ account_irq_exit_time(current);
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ WARN_ON_ONCE(in_interrupt());
+ tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+}
+
+asmlinkage __visible void do_softirq(void)
+{
+ __u32 pending;
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+
+ pending = local_softirq_pending();
+
+ if (pending)
+ do_softirq_own_stack();
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Enter an interrupt context.
+ */
+void irq_enter(void)
+{
+ rcu_irq_enter();
+ if (is_idle_task(current) && !in_interrupt()) {
+ /*
+ * Prevent raise_softirq from needlessly waking up ksoftirqd
+ * here, as softirq will be serviced on return from interrupt.
+ */
+ local_bh_disable();
+ tick_irq_enter();
+ _local_bh_enable();
+ }
+
+ __irq_enter();
+}
+
+static inline void invoke_softirq(void)
+{
+ if (!force_irqthreads) {
+#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
+ /*
+ * We can safely execute softirq on the current stack if
+ * it is the irq stack, because it should be near empty
+ * at this stage.
+ */
+ __do_softirq();
+#else
+ /*
+ * Otherwise, irq_exit() is called on the task stack that can
+ * be potentially deep already. So call softirq in its own stack
+ * to prevent from any overrun.
+ */
+ do_softirq_own_stack();
+#endif
+ } else {
+ wakeup_softirqd();
+ }
+}
+
+static inline void tick_irq_exit(void)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ int cpu = smp_processor_id();
+
+ /* Make sure that timer wheel updates are propagated */
+ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ if (!in_interrupt())
+ tick_nohz_irq_exit();
+ }
+#endif
+}
+
+/*
+ * Exit an interrupt context. Process softirqs if needed and possible:
+ */
+void irq_exit(void)
+{
+#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
+ local_irq_disable();
+#else
+ WARN_ON_ONCE(!irqs_disabled());
+#endif
+
+ account_irq_exit_time(current);
+ preempt_count_sub(HARDIRQ_OFFSET);
+ if (!in_interrupt() && local_softirq_pending())
+ invoke_softirq();
+
+ tick_irq_exit();
+ rcu_irq_exit();
+ trace_hardirq_exit(); /* must be last! */
+}
+
+/*
+ * This function must run with irqs disabled!
+ */
+inline void raise_softirq_irqoff(unsigned int nr)
+{
+ __raise_softirq_irqoff(nr);
+
+ /*
+ * If we're in an interrupt or softirq, we're done
+ * (this also catches softirq-disabled code). We will
+ * actually run the softirq once we return from
+ * the irq or softirq.
+ *
+ * Otherwise we wake up ksoftirqd to make sure we
+ * schedule the softirq soon.
+ */
+ if (!in_interrupt())
+ wakeup_softirqd();
+}
+
+void raise_softirq(unsigned int nr)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ raise_softirq_irqoff(nr);
+ local_irq_restore(flags);
+}
+
+void __raise_softirq_irqoff(unsigned int nr)
+{
+ trace_softirq_raise(nr);
+ or_softirq_pending(1UL << nr);
+}
+
+void open_softirq(int nr, void (*action)(struct softirq_action *))
+{
+ softirq_vec[nr].action = action;
+}
+
+/*
+ * Tasklets
+ */
+struct tasklet_head {
+ struct tasklet_struct *head;
+ struct tasklet_struct **tail;
+};
+
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
+
+void __tasklet_schedule(struct tasklet_struct *t)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ t->next = NULL;
+ *__this_cpu_read(tasklet_vec.tail) = t;
+ __this_cpu_write(tasklet_vec.tail, &(t->next));
+ raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__tasklet_schedule);
+
+void __tasklet_hi_schedule(struct tasklet_struct *t)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ t->next = NULL;
+ *__this_cpu_read(tasklet_hi_vec.tail) = t;
+ __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
+ raise_softirq_irqoff(HI_SOFTIRQ);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__tasklet_hi_schedule);
+
+void __tasklet_hi_schedule_first(struct tasklet_struct *t)
+{
+ BUG_ON(!irqs_disabled());
+
+ t->next = __this_cpu_read(tasklet_hi_vec.head);
+ __this_cpu_write(tasklet_hi_vec.head, t);
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+}
+EXPORT_SYMBOL(__tasklet_hi_schedule_first);
+
+static void tasklet_action(struct softirq_action *a)
+{
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = __this_cpu_read(tasklet_vec.head);
+ __this_cpu_write(tasklet_vec.head, NULL);
+ __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
+ local_irq_enable();
+
+ while (list) {
+ struct tasklet_struct *t = list;
+
+ list = list->next;
+
+ if (tasklet_trylock(t)) {
+ if (!atomic_read(&t->count)) {
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED,
+ &t->state))
+ BUG();
+ t->func(t->data);
+ tasklet_unlock(t);
+ continue;
+ }
+ tasklet_unlock(t);
+ }
+
+ local_irq_disable();
+ t->next = NULL;
+ *__this_cpu_read(tasklet_vec.tail) = t;
+ __this_cpu_write(tasklet_vec.tail, &(t->next));
+ __raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ local_irq_enable();
+ }
+}
+
+static void tasklet_hi_action(struct softirq_action *a)
+{
+ struct tasklet_struct *list;
+
+ local_irq_disable();
+ list = __this_cpu_read(tasklet_hi_vec.head);
+ __this_cpu_write(tasklet_hi_vec.head, NULL);
+ __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
+ local_irq_enable();
+
+ while (list) {
+ struct tasklet_struct *t = list;
+
+ list = list->next;
+
+ if (tasklet_trylock(t)) {
+ if (!atomic_read(&t->count)) {
+ if (!test_and_clear_bit(TASKLET_STATE_SCHED,
+ &t->state))
+ BUG();
+ t->func(t->data);
+ tasklet_unlock(t);
+ continue;
+ }
+ tasklet_unlock(t);
+ }
+
+ local_irq_disable();
+ t->next = NULL;
+ *__this_cpu_read(tasklet_hi_vec.tail) = t;
+ __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+ local_irq_enable();
+ }
+}
+
+void tasklet_init(struct tasklet_struct *t,
+ void (*func)(unsigned long), unsigned long data)
+{
+ t->next = NULL;
+ t->state = 0;
+ atomic_set(&t->count, 0);
+ t->func = func;
+ t->data = data;
+}
+EXPORT_SYMBOL(tasklet_init);
+
+void tasklet_kill(struct tasklet_struct *t)
+{
+ if (in_interrupt())
+ pr_notice("Attempt to kill tasklet from interrupt\n");
+
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
+ do {
+ yield();
+ } while (test_bit(TASKLET_STATE_SCHED, &t->state));
+ }
+ tasklet_unlock_wait(t);
+ clear_bit(TASKLET_STATE_SCHED, &t->state);
+}
+EXPORT_SYMBOL(tasklet_kill);
+
+/*
+ * tasklet_hrtimer
+ */
+
+/*
+ * The trampoline is called when the hrtimer expires. It schedules a tasklet
+ * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
+ * hrtimer callback, but from softirq context.
+ */
+static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
+{
+ struct tasklet_hrtimer *ttimer =
+ container_of(timer, struct tasklet_hrtimer, timer);
+
+ tasklet_hi_schedule(&ttimer->tasklet);
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Helper function which calls the hrtimer callback from
+ * tasklet/softirq context
+ */
+static void __tasklet_hrtimer_trampoline(unsigned long data)
+{
+ struct tasklet_hrtimer *ttimer = (void *)data;
+ enum hrtimer_restart restart;
+
+ restart = ttimer->function(&ttimer->timer);
+ if (restart != HRTIMER_NORESTART)
+ hrtimer_restart(&ttimer->timer);
+}
+
+/**
+ * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
+ * @ttimer: tasklet_hrtimer which is initialized
+ * @function: hrtimer callback function which gets called from softirq context
+ * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
+ * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
+ */
+void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
+ enum hrtimer_restart (*function)(struct hrtimer *),
+ clockid_t which_clock, enum hrtimer_mode mode)
+{
+ hrtimer_init(&ttimer->timer, which_clock, mode);
+ ttimer->timer.function = __hrtimer_tasklet_trampoline;
+ tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
+ (unsigned long)ttimer);
+ ttimer->function = function;
+}
+EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
+
+void __init softirq_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(tasklet_vec, cpu).tail =
+ &per_cpu(tasklet_vec, cpu).head;
+ per_cpu(tasklet_hi_vec, cpu).tail =
+ &per_cpu(tasklet_hi_vec, cpu).head;
+ }
+
+ open_softirq(TASKLET_SOFTIRQ, tasklet_action);
+ open_softirq(HI_SOFTIRQ, tasklet_hi_action);
+}
+
+static int ksoftirqd_should_run(unsigned int cpu)
+{
+ return local_softirq_pending();
+}
+
+static void run_ksoftirqd(unsigned int cpu)
+{
+ local_irq_disable();
+ if (local_softirq_pending()) {
+ /*
+ * We can safely run softirq on inline stack, as we are not deep
+ * in the task stack here.
+ */
+ __do_softirq();
+ local_irq_enable();
+ cond_resched_rcu_qs();
+ return;
+ }
+ local_irq_enable();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * tasklet_kill_immediate is called to remove a tasklet which can already be
+ * scheduled for execution on @cpu.
+ *
+ * Unlike tasklet_kill, this function removes the tasklet
+ * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
+ *
+ * When this function is called, @cpu must be in the CPU_DEAD state.
+ */
+void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
+{
+ struct tasklet_struct **i;
+
+ BUG_ON(cpu_online(cpu));
+ BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
+
+ if (!test_bit(TASKLET_STATE_SCHED, &t->state))
+ return;
+
+ /* CPU is dead, so no lock needed. */
+ for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
+ if (*i == t) {
+ *i = t->next;
+ /* If this was the tail element, move the tail ptr */
+ if (*i == NULL)
+ per_cpu(tasklet_vec, cpu).tail = i;
+ return;
+ }
+ }
+ BUG();
+}
+
+static void takeover_tasklets(unsigned int cpu)
+{
+ /* CPU is dead, so no lock needed. */
+ local_irq_disable();
+
+ /* Find end, append list for that CPU. */
+ if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
+ *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
+ this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
+ per_cpu(tasklet_vec, cpu).head = NULL;
+ per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
+ }
+ raise_softirq_irqoff(TASKLET_SOFTIRQ);
+
+ if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
+ *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
+ __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
+ per_cpu(tasklet_hi_vec, cpu).head = NULL;
+ per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
+ }
+ raise_softirq_irqoff(HI_SOFTIRQ);
+
+ local_irq_enable();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action) {
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ takeover_tasklets((unsigned long)hcpu);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+static struct smp_hotplug_thread softirq_threads = {
+ .store = &ksoftirqd,
+ .thread_should_run = ksoftirqd_should_run,
+ .thread_fn = run_ksoftirqd,
+ .thread_comm = "ksoftirqd/%u",
+};
+
+static __init int spawn_ksoftirqd(void)
+{
+ register_cpu_notifier(&cpu_nfb);
+
+ BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
+ return 0;
+}
+early_initcall(spawn_ksoftirqd);
+
+/*
+ * [ These __weak aliases are kept in a separate compilation unit, so that
+ * GCC does not inline them incorrectly. ]
+ */
+
+int __init __weak early_irq_init(void)
+{
+ return 0;
+}
+
+int __init __weak arch_probe_nr_irqs(void)
+{
+ return NR_IRQS_LEGACY;
+}
+
+int __init __weak arch_early_irq_init(void)
+{
+ return 0;
+}
+
+unsigned int __weak arch_dynirq_lower_bound(unsigned int from)
+{
+ return from;
+}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
new file mode 100644
index 000000000..b6e4c1637
--- /dev/null
+++ b/kernel/stacktrace.c
@@ -0,0 +1,75 @@
+/*
+ * kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
+
+void print_stack_trace(struct stack_trace *trace, int spaces)
+{
+ int i;
+
+ if (WARN_ON(!trace->entries))
+ return;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ printk("%*c", 1 + spaces, ' ');
+ print_ip_sym(trace->entries[i]);
+ }
+}
+EXPORT_SYMBOL_GPL(print_stack_trace);
+
+int snprint_stack_trace(char *buf, size_t size,
+ struct stack_trace *trace, int spaces)
+{
+ int i;
+ unsigned long ip;
+ int generated;
+ int total = 0;
+
+ if (WARN_ON(!trace->entries))
+ return 0;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ ip = trace->entries[i];
+ generated = snprintf(buf, size, "%*c[<%p>] %pS\n",
+ 1 + spaces, ' ', (void *) ip, (void *) ip);
+
+ total += generated;
+
+ /* Assume that generated isn't a negative number */
+ if (generated >= size) {
+ buf += size;
+ size = 0;
+ } else {
+ buf += generated;
+ size -= generated;
+ }
+ }
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(snprint_stack_trace);
+
+/*
+ * Architectures that do not implement save_stack_trace_tsk or
+ * save_stack_trace_regs get this weak alias and a once-per-bootup warning
+ * (whenever this facility is utilized - for example by procfs):
+ */
+__weak void
+save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+ WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
+}
+
+__weak void
+save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+ WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
+}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
new file mode 100644
index 000000000..263b0e1ad
--- /dev/null
+++ b/kernel/stop_machine.c
@@ -0,0 +1,649 @@
+/*
+ * kernel/stop_machine.c
+ *
+ * Copyright (C) 2008, 2005 IBM Corporation.
+ * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2 and any later version.
+ */
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/stop_machine.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/smpboot.h>
+#include <linux/atomic.h>
+#include <linux/lglock.h>
+
+/*
+ * Structure to determine completion condition and record errors. May
+ * be shared by works on different cpus.
+ */
+struct cpu_stop_done {
+ atomic_t nr_todo; /* nr left to execute */
+ bool executed; /* actually executed? */
+ int ret; /* collected return value */
+ struct completion completion; /* fired if nr_todo reaches 0 */
+};
+
+/* the actual stopper, one per every possible cpu, enabled on online cpus */
+struct cpu_stopper {
+ spinlock_t lock;
+ bool enabled; /* is this stopper enabled? */
+ struct list_head works; /* list of pending works */
+};
+
+static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
+
+static bool stop_machine_initialized = false;
+
+/*
+ * Avoids a race between stop_two_cpus and global stop_cpus, where
+ * the stoppers could get queued up in reverse order, leading to
+ * system deadlock. Using an lglock means stop_two_cpus remains
+ * relatively cheap.
+ */
+DEFINE_STATIC_LGLOCK(stop_cpus_lock);
+
+static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
+{
+ memset(done, 0, sizeof(*done));
+ atomic_set(&done->nr_todo, nr_todo);
+ init_completion(&done->completion);
+}
+
+/* signal completion unless @done is NULL */
+static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+{
+ if (done) {
+ if (executed)
+ done->executed = true;
+ if (atomic_dec_and_test(&done->nr_todo))
+ complete(&done->completion);
+ }
+}
+
+/* queue @work to @stopper. if offline, @work is completed immediately */
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
+
+ unsigned long flags;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+
+ if (stopper->enabled) {
+ list_add_tail(&work->list, &stopper->works);
+ wake_up_process(p);
+ } else
+ cpu_stop_signal_done(work->done, false);
+
+ spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+/**
+ * stop_one_cpu - stop a cpu
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on @cpu. @fn is run in a process context with
+ * the highest priority preempting any task on the cpu and
+ * monopolizing it. This function returns after the execution is
+ * complete.
+ *
+ * This function doesn't guarantee @cpu stays online till @fn
+ * completes. If @cpu goes down in the middle, execution may happen
+ * partially or fully on different cpus. @fn should either be ready
+ * for that or the caller should ensure that @cpu stays online until
+ * this function completes.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
+ * otherwise, the return value of @fn.
+ */
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+
+ cpu_stop_init_done(&done, 1);
+ cpu_stop_queue_work(cpu, &work);
+ wait_for_completion(&done.completion);
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/* This controls the threads on each CPU. */
+enum multi_stop_state {
+ /* Dummy starting state for thread. */
+ MULTI_STOP_NONE,
+ /* Awaiting everyone to be scheduled. */
+ MULTI_STOP_PREPARE,
+ /* Disable interrupts. */
+ MULTI_STOP_DISABLE_IRQ,
+ /* Run the function */
+ MULTI_STOP_RUN,
+ /* Exit */
+ MULTI_STOP_EXIT,
+};
+
+struct multi_stop_data {
+ int (*fn)(void *);
+ void *data;
+ /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+ unsigned int num_threads;
+ const struct cpumask *active_cpus;
+
+ enum multi_stop_state state;
+ atomic_t thread_ack;
+};
+
+static void set_state(struct multi_stop_data *msdata,
+ enum multi_stop_state newstate)
+{
+ /* Reset ack counter. */
+ atomic_set(&msdata->thread_ack, msdata->num_threads);
+ smp_wmb();
+ msdata->state = newstate;
+}
+
+/* Last one to ack a state moves to the next state. */
+static void ack_state(struct multi_stop_data *msdata)
+{
+ if (atomic_dec_and_test(&msdata->thread_ack))
+ set_state(msdata, msdata->state + 1);
+}
+
+/* This is the cpu_stop function which stops the CPU. */
+static int multi_cpu_stop(void *data)
+{
+ struct multi_stop_data *msdata = data;
+ enum multi_stop_state curstate = MULTI_STOP_NONE;
+ int cpu = smp_processor_id(), err = 0;
+ unsigned long flags;
+ bool is_active;
+
+ /*
+ * When called from stop_machine_from_inactive_cpu(), irq might
+ * already be disabled. Save the state and restore it on exit.
+ */
+ local_save_flags(flags);
+
+ if (!msdata->active_cpus)
+ is_active = cpu == cpumask_first(cpu_online_mask);
+ else
+ is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
+
+ /* Simple state machine */
+ do {
+ /* Chill out and ensure we re-read multi_stop_state. */
+ cpu_relax();
+ if (msdata->state != curstate) {
+ curstate = msdata->state;
+ switch (curstate) {
+ case MULTI_STOP_DISABLE_IRQ:
+ local_irq_disable();
+ hard_irq_disable();
+ break;
+ case MULTI_STOP_RUN:
+ if (is_active)
+ err = msdata->fn(msdata->data);
+ break;
+ default:
+ break;
+ }
+ ack_state(msdata);
+ }
+ } while (curstate != MULTI_STOP_EXIT);
+
+ local_irq_restore(flags);
+ return err;
+}
+
+struct irq_cpu_stop_queue_work_info {
+ int cpu1;
+ int cpu2;
+ struct cpu_stop_work *work1;
+ struct cpu_stop_work *work2;
+};
+
+/*
+ * This function is always run with irqs and preemption disabled.
+ * This guarantees that both work1 and work2 get queued, before
+ * our local migrate thread gets the chance to preempt us.
+ */
+static void irq_cpu_stop_queue_work(void *arg)
+{
+ struct irq_cpu_stop_queue_work_info *info = arg;
+ cpu_stop_queue_work(info->cpu1, info->work1);
+ cpu_stop_queue_work(info->cpu2, info->work2);
+}
+
+/**
+ * stop_two_cpus - stops two cpus
+ * @cpu1: the cpu to stop
+ * @cpu2: the other cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Stops both the current and specified CPU and runs @fn on one of them.
+ *
+ * returns when both are completed.
+ */
+int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+ struct cpu_stop_work work1, work2;
+ struct irq_cpu_stop_queue_work_info call_args;
+ struct multi_stop_data msdata;
+
+ preempt_disable();
+ msdata = (struct multi_stop_data){
+ .fn = fn,
+ .data = arg,
+ .num_threads = 2,
+ .active_cpus = cpumask_of(cpu1),
+ };
+
+ work1 = work2 = (struct cpu_stop_work){
+ .fn = multi_cpu_stop,
+ .arg = &msdata,
+ .done = &done
+ };
+
+ call_args = (struct irq_cpu_stop_queue_work_info){
+ .cpu1 = cpu1,
+ .cpu2 = cpu2,
+ .work1 = &work1,
+ .work2 = &work2,
+ };
+
+ cpu_stop_init_done(&done, 2);
+ set_state(&msdata, MULTI_STOP_PREPARE);
+
+ /*
+ * If we observe both CPUs active we know _cpu_down() cannot yet have
+ * queued its stop_machine works and therefore ours will get executed
+ * first. Or its not either one of our CPUs that's getting unplugged,
+ * in which case we don't care.
+ *
+ * This relies on the stopper workqueues to be FIFO.
+ */
+ if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+ preempt_enable();
+ return -ENOENT;
+ }
+
+ lg_local_lock(&stop_cpus_lock);
+ /*
+ * Queuing needs to be done by the lowest numbered CPU, to ensure
+ * that works are always queued in the same order on every CPU.
+ * This prevents deadlocks.
+ */
+ smp_call_function_single(min(cpu1, cpu2),
+ &irq_cpu_stop_queue_work,
+ &call_args, 1);
+ lg_local_unlock(&stop_cpus_lock);
+ preempt_enable();
+
+ wait_for_completion(&done.completion);
+
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_one_cpu_nowait - stop a cpu but don't wait for completion
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ * @work_buf: pointer to cpu_stop_work structure
+ *
+ * Similar to stop_one_cpu() but doesn't wait for completion. The
+ * caller is responsible for ensuring @work_buf is currently unused
+ * and will remain untouched until stopper starts executing @fn.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+ struct cpu_stop_work *work_buf)
+{
+ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+ cpu_stop_queue_work(cpu, work_buf);
+}
+
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
+
+static void queue_stop_cpus_work(const struct cpumask *cpumask,
+ cpu_stop_fn_t fn, void *arg,
+ struct cpu_stop_done *done)
+{
+ struct cpu_stop_work *work;
+ unsigned int cpu;
+
+ /* initialize works and done */
+ for_each_cpu(cpu, cpumask) {
+ work = &per_cpu(stop_cpus_work, cpu);
+ work->fn = fn;
+ work->arg = arg;
+ work->done = done;
+ }
+
+ /*
+ * Disable preemption while queueing to avoid getting
+ * preempted by a stopper which might wait for other stoppers
+ * to enter @fn which can lead to deadlock.
+ */
+ lg_global_lock(&stop_cpus_lock);
+ for_each_cpu(cpu, cpumask)
+ cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
+ lg_global_unlock(&stop_cpus_lock);
+}
+
+static int __stop_cpus(const struct cpumask *cpumask,
+ cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+
+ cpu_stop_init_done(&done, cpumask_weight(cpumask));
+ queue_stop_cpus_work(cpumask, fn, arg, &done);
+ wait_for_completion(&done.completion);
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_cpus - stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
+ * @fn is run in a process context with the highest priority
+ * preempting any task on the cpu and monopolizing it. This function
+ * returns after all executions are complete.
+ *
+ * This function doesn't guarantee the cpus in @cpumask stay online
+ * till @fn completes. If some cpus go down in the middle, execution
+ * on the cpu may happen partially or fully on different cpus. @fn
+ * should either be ready for that or the caller should ensure that
+ * the cpus stay online until this function completes.
+ *
+ * All stop_cpus() calls are serialized making it safe for @fn to wait
+ * for all cpus to start executing it.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed at all because all cpus in
+ * @cpumask were offline; otherwise, 0 if all executions of @fn
+ * returned 0, any non zero return value if any returned non zero.
+ */
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ int ret;
+
+ /* static works are used, process one request at a time */
+ mutex_lock(&stop_cpus_mutex);
+ ret = __stop_cpus(cpumask, fn, arg);
+ mutex_unlock(&stop_cpus_mutex);
+ return ret;
+}
+
+/**
+ * try_stop_cpus - try to stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Identical to stop_cpus() except that it fails with -EAGAIN if
+ * someone else is already using the facility.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -EAGAIN if someone else is already stopping cpus, -ENOENT if
+ * @fn(@arg) was not executed at all because all cpus in @cpumask were
+ * offline; otherwise, 0 if all executions of @fn returned 0, any non
+ * zero return value if any returned non zero.
+ */
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ int ret;
+
+ /* static works are used, process one request at a time */
+ if (!mutex_trylock(&stop_cpus_mutex))
+ return -EAGAIN;
+ ret = __stop_cpus(cpumask, fn, arg);
+ mutex_unlock(&stop_cpus_mutex);
+ return ret;
+}
+
+static int cpu_stop_should_run(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ unsigned long flags;
+ int run;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+ run = !list_empty(&stopper->works);
+ spin_unlock_irqrestore(&stopper->lock, flags);
+ return run;
+}
+
+static void cpu_stopper_thread(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct cpu_stop_work *work;
+ int ret;
+
+repeat:
+ work = NULL;
+ spin_lock_irq(&stopper->lock);
+ if (!list_empty(&stopper->works)) {
+ work = list_first_entry(&stopper->works,
+ struct cpu_stop_work, list);
+ list_del_init(&work->list);
+ }
+ spin_unlock_irq(&stopper->lock);
+
+ if (work) {
+ cpu_stop_fn_t fn = work->fn;
+ void *arg = work->arg;
+ struct cpu_stop_done *done = work->done;
+ char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
+
+ /* cpu stop callbacks are not allowed to sleep */
+ preempt_disable();
+
+ ret = fn(arg);
+ if (ret)
+ done->ret = ret;
+
+ /* restore preemption and check it's still balanced */
+ preempt_enable();
+ WARN_ONCE(preempt_count(),
+ "cpu_stop: %s(%p) leaked preempt count\n",
+ kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
+ ksym_buf), arg);
+
+ cpu_stop_signal_done(done, true);
+ goto repeat;
+ }
+}
+
+extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+
+static void cpu_stop_create(unsigned int cpu)
+{
+ sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
+}
+
+static void cpu_stop_park(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct cpu_stop_work *work;
+ unsigned long flags;
+
+ /* drain remaining works */
+ spin_lock_irqsave(&stopper->lock, flags);
+ list_for_each_entry(work, &stopper->works, list)
+ cpu_stop_signal_done(work->done, false);
+ stopper->enabled = false;
+ spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+static void cpu_stop_unpark(unsigned int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+ spin_lock_irq(&stopper->lock);
+ stopper->enabled = true;
+ spin_unlock_irq(&stopper->lock);
+}
+
+static struct smp_hotplug_thread cpu_stop_threads = {
+ .store = &cpu_stopper_task,
+ .thread_should_run = cpu_stop_should_run,
+ .thread_fn = cpu_stopper_thread,
+ .thread_comm = "migration/%u",
+ .create = cpu_stop_create,
+ .setup = cpu_stop_unpark,
+ .park = cpu_stop_park,
+ .pre_unpark = cpu_stop_unpark,
+ .selfparking = true,
+};
+
+static int __init cpu_stop_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+ spin_lock_init(&stopper->lock);
+ INIT_LIST_HEAD(&stopper->works);
+ }
+
+ BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
+ stop_machine_initialized = true;
+ return 0;
+}
+early_initcall(cpu_stop_init);
+
+#ifdef CONFIG_STOP_MACHINE
+
+int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+{
+ struct multi_stop_data msdata = {
+ .fn = fn,
+ .data = data,
+ .num_threads = num_online_cpus(),
+ .active_cpus = cpus,
+ };
+
+ if (!stop_machine_initialized) {
+ /*
+ * Handle the case where stop_machine() is called
+ * early in boot before stop_machine() has been
+ * initialized.
+ */
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(msdata.num_threads != 1);
+
+ local_irq_save(flags);
+ hard_irq_disable();
+ ret = (*fn)(data);
+ local_irq_restore(flags);
+
+ return ret;
+ }
+
+ /* Set the initial state and stop all online cpus. */
+ set_state(&msdata, MULTI_STOP_PREPARE);
+ return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
+}
+
+int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+{
+ int ret;
+
+ /* No CPUs can come up or down during this. */
+ get_online_cpus();
+ ret = __stop_machine(fn, data, cpus);
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(stop_machine);
+
+/**
+ * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
+ * @fn: the function to run
+ * @data: the data ptr for the @fn()
+ * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
+ *
+ * This is identical to stop_machine() but can be called from a CPU which
+ * is not active. The local CPU is in the process of hotplug (so no other
+ * CPU hotplug can start) and not marked active and doesn't have enough
+ * context to sleep.
+ *
+ * This function provides stop_machine() functionality for such state by
+ * using busy-wait for synchronization and executing @fn directly for local
+ * CPU.
+ *
+ * CONTEXT:
+ * Local CPU is inactive. Temporarily stops all active CPUs.
+ *
+ * RETURNS:
+ * 0 if all executions of @fn returned 0, any non zero return value if any
+ * returned non zero.
+ */
+int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+ const struct cpumask *cpus)
+{
+ struct multi_stop_data msdata = { .fn = fn, .data = data,
+ .active_cpus = cpus };
+ struct cpu_stop_done done;
+ int ret;
+
+ /* Local CPU must be inactive and CPU hotplug in progress. */
+ BUG_ON(cpu_active(raw_smp_processor_id()));
+ msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
+
+ /* No proper task established and can't sleep - busy wait for lock. */
+ while (!mutex_trylock(&stop_cpus_mutex))
+ cpu_relax();
+
+ /* Schedule work on other CPUs and execute directly for local CPU */
+ set_state(&msdata, MULTI_STOP_PREPARE);
+ cpu_stop_init_done(&done, num_active_cpus());
+ queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
+ &done);
+ ret = multi_cpu_stop(&msdata);
+
+ /* Busy wait for completion. */
+ while (!completion_done(&done.completion))
+ cpu_relax();
+
+ mutex_unlock(&stop_cpus_mutex);
+ return ret ?: done.ret;
+}
+
+#endif /* CONFIG_STOP_MACHINE */
diff --git a/kernel/sys.c b/kernel/sys.c
new file mode 100644
index 000000000..a4e372b79
--- /dev/null
+++ b/kernel/sys.c
@@ -0,0 +1,2408 @@
+/*
+ * linux/kernel/sys.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/utsname.h>
+#include <linux/mman.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/highuid.h>
+#include <linux/fs.h>
+#include <linux/kmod.h>
+#include <linux/perf_event.h>
+#include <linux/resource.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/key.h>
+#include <linux/times.h>
+#include <linux/posix-timers.h>
+#include <linux/security.h>
+#include <linux/dcookies.h>
+#include <linux/suspend.h>
+#include <linux/tty.h>
+#include <linux/signal.h>
+#include <linux/cn_proc.h>
+#include <linux/getcpu.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/seccomp.h>
+#include <linux/cpu.h>
+#include <linux/personality.h>
+#include <linux/ptrace.h>
+#include <linux/fs_struct.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/gfp.h>
+#include <linux/syscore_ops.h>
+#include <linux/version.h>
+#include <linux/ctype.h>
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/kprobes.h>
+#include <linux/user_namespace.h>
+#include <linux/binfmts.h>
+
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+#include <linux/uidgid.h>
+#include <linux/cred.h>
+
+#include <linux/kmsg_dump.h>
+/* Move somewhere else to avoid recompiling? */
+#include <generated/utsrelease.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#ifndef SET_UNALIGN_CTL
+# define SET_UNALIGN_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_UNALIGN_CTL
+# define GET_UNALIGN_CTL(a, b) (-EINVAL)
+#endif
+#ifndef SET_FPEMU_CTL
+# define SET_FPEMU_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_FPEMU_CTL
+# define GET_FPEMU_CTL(a, b) (-EINVAL)
+#endif
+#ifndef SET_FPEXC_CTL
+# define SET_FPEXC_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_FPEXC_CTL
+# define GET_FPEXC_CTL(a, b) (-EINVAL)
+#endif
+#ifndef GET_ENDIAN
+# define GET_ENDIAN(a, b) (-EINVAL)
+#endif
+#ifndef SET_ENDIAN
+# define SET_ENDIAN(a, b) (-EINVAL)
+#endif
+#ifndef GET_TSC_CTL
+# define GET_TSC_CTL(a) (-EINVAL)
+#endif
+#ifndef SET_TSC_CTL
+# define SET_TSC_CTL(a) (-EINVAL)
+#endif
+#ifndef MPX_ENABLE_MANAGEMENT
+# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
+#endif
+#ifndef MPX_DISABLE_MANAGEMENT
+# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
+#endif
+#ifndef GET_FP_MODE
+# define GET_FP_MODE(a) (-EINVAL)
+#endif
+#ifndef SET_FP_MODE
+# define SET_FP_MODE(a,b) (-EINVAL)
+#endif
+
+/*
+ * this is where the system-wide overflow UID and GID are defined, for
+ * architectures that now have 32-bit UID/GID but didn't in the past
+ */
+
+int overflowuid = DEFAULT_OVERFLOWUID;
+int overflowgid = DEFAULT_OVERFLOWGID;
+
+EXPORT_SYMBOL(overflowuid);
+EXPORT_SYMBOL(overflowgid);
+
+/*
+ * the same as above, but for filesystems which can only store a 16-bit
+ * UID and GID. as such, this is needed on all architectures
+ */
+
+int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
+
+EXPORT_SYMBOL(fs_overflowuid);
+EXPORT_SYMBOL(fs_overflowgid);
+
+/*
+ * Returns true if current's euid is same as p's uid or euid,
+ * or has CAP_SYS_NICE to p's user_ns.
+ *
+ * Called with rcu_read_lock, creds are safe
+ */
+static bool set_one_prio_perm(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred = __task_cred(p);
+
+ if (uid_eq(pcred->uid, cred->euid) ||
+ uid_eq(pcred->euid, cred->euid))
+ return true;
+ if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
+ return true;
+ return false;
+}
+
+/*
+ * set the priority of a task
+ * - the caller must hold the RCU read lock
+ */
+static int set_one_prio(struct task_struct *p, int niceval, int error)
+{
+ int no_nice;
+
+ if (!set_one_prio_perm(p)) {
+ error = -EPERM;
+ goto out;
+ }
+ if (niceval < task_nice(p) && !can_nice(p, niceval)) {
+ error = -EACCES;
+ goto out;
+ }
+ no_nice = security_task_setnice(p, niceval);
+ if (no_nice) {
+ error = no_nice;
+ goto out;
+ }
+ if (error == -ESRCH)
+ error = 0;
+ set_user_nice(p, niceval);
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
+{
+ struct task_struct *g, *p;
+ struct user_struct *user;
+ const struct cred *cred = current_cred();
+ int error = -EINVAL;
+ struct pid *pgrp;
+ kuid_t uid;
+
+ if (which > PRIO_USER || which < PRIO_PROCESS)
+ goto out;
+
+ /* normalize: avoid signed division (rounding problems) */
+ error = -ESRCH;
+ if (niceval < MIN_NICE)
+ niceval = MIN_NICE;
+ if (niceval > MAX_NICE)
+ niceval = MAX_NICE;
+
+ rcu_read_lock();
+ read_lock(&tasklist_lock);
+ switch (which) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
+ goto out_unlock; /* No processes for this user */
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid))
+ error = set_one_prio(p, niceval, error);
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* For find_user() */
+ break;
+ }
+out_unlock:
+ read_unlock(&tasklist_lock);
+ rcu_read_unlock();
+out:
+ return error;
+}
+
+/*
+ * Ugh. To avoid negative return values, "getpriority()" will
+ * not return the normal nice-value, but a negated value that
+ * has been offset by 20 (ie it returns 40..1 instead of -20..19)
+ * to stay compatible.
+ */
+SYSCALL_DEFINE2(getpriority, int, which, int, who)
+{
+ struct task_struct *g, *p;
+ struct user_struct *user;
+ const struct cred *cred = current_cred();
+ long niceval, retval = -ESRCH;
+ struct pid *pgrp;
+ kuid_t uid;
+
+ if (which > PRIO_USER || which < PRIO_PROCESS)
+ return -EINVAL;
+
+ rcu_read_lock();
+ read_lock(&tasklist_lock);
+ switch (which) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ }
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
+ goto out_unlock; /* No processes for this user */
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid)) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ }
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* for find_user() */
+ break;
+ }
+out_unlock:
+ read_unlock(&tasklist_lock);
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/*
+ * Unprivileged users may change the real gid to the effective gid
+ * or vice versa. (BSD-style)
+ *
+ * If you set the real gid at all, or set the effective gid to a value not
+ * equal to the real gid, then the saved gid is set to the new effective gid.
+ *
+ * This makes it possible for a setgid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit over a program.
+ *
+ * The general idea is that a program which uses just setregid() will be
+ * 100% compatible with BSD. A program which uses just setgid() will be
+ * 100% compatible with POSIX with saved IDs.
+ *
+ * SMP: There are not races, the GIDs are checked only by filesystem
+ * operations (as far as semantic preservation is concerned).
+ */
+#ifdef CONFIG_MULTIUSER
+SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
+{
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *old;
+ struct cred *new;
+ int retval;
+ kgid_t krgid, kegid;
+
+ krgid = make_kgid(ns, rgid);
+ kegid = make_kgid(ns, egid);
+
+ if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+ return -EINVAL;
+ if ((egid != (gid_t) -1) && !gid_valid(kegid))
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
+ retval = -EPERM;
+ if (rgid != (gid_t) -1) {
+ if (gid_eq(old->gid, krgid) ||
+ gid_eq(old->egid, krgid) ||
+ ns_capable(old->user_ns, CAP_SETGID))
+ new->gid = krgid;
+ else
+ goto error;
+ }
+ if (egid != (gid_t) -1) {
+ if (gid_eq(old->gid, kegid) ||
+ gid_eq(old->egid, kegid) ||
+ gid_eq(old->sgid, kegid) ||
+ ns_capable(old->user_ns, CAP_SETGID))
+ new->egid = kegid;
+ else
+ goto error;
+ }
+
+ if (rgid != (gid_t) -1 ||
+ (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
+ new->sgid = new->egid;
+ new->fsgid = new->egid;
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
+}
+
+/*
+ * setgid() is implemented like SysV w/ SAVED_IDS
+ *
+ * SMP: Same implicit races as above.
+ */
+SYSCALL_DEFINE1(setgid, gid_t, gid)
+{
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *old;
+ struct cred *new;
+ int retval;
+ kgid_t kgid;
+
+ kgid = make_kgid(ns, gid);
+ if (!gid_valid(kgid))
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
+ retval = -EPERM;
+ if (ns_capable(old->user_ns, CAP_SETGID))
+ new->gid = new->egid = new->sgid = new->fsgid = kgid;
+ else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
+ new->egid = new->fsgid = kgid;
+ else
+ goto error;
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
+}
+
+/*
+ * change the user struct in a credentials set to match the new UID
+ */
+static int set_user(struct cred *new)
+{
+ struct user_struct *new_user;
+
+ new_user = alloc_uid(new->uid);
+ if (!new_user)
+ return -EAGAIN;
+
+ /*
+ * We don't fail in case of NPROC limit excess here because too many
+ * poorly written programs don't check set*uid() return code, assuming
+ * it never fails if called by root. We may still enforce NPROC limit
+ * for programs doing set*uid()+execve() by harmlessly deferring the
+ * failure to the execve() stage.
+ */
+ if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
+ new_user != INIT_USER)
+ current->flags |= PF_NPROC_EXCEEDED;
+ else
+ current->flags &= ~PF_NPROC_EXCEEDED;
+
+ free_uid(new->user);
+ new->user = new_user;
+ return 0;
+}
+
+/*
+ * Unprivileged users may change the real uid to the effective uid
+ * or vice versa. (BSD-style)
+ *
+ * If you set the real uid at all, or set the effective uid to a value not
+ * equal to the real uid, then the saved uid is set to the new effective uid.
+ *
+ * This makes it possible for a setuid program to completely drop its
+ * privileges, which is often a useful assertion to make when you are doing
+ * a security audit over a program.
+ *
+ * The general idea is that a program which uses just setreuid() will be
+ * 100% compatible with BSD. A program which uses just setuid() will be
+ * 100% compatible with POSIX with saved IDs.
+ */
+SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
+{
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *old;
+ struct cred *new;
+ int retval;
+ kuid_t kruid, keuid;
+
+ kruid = make_kuid(ns, ruid);
+ keuid = make_kuid(ns, euid);
+
+ if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+ return -EINVAL;
+ if ((euid != (uid_t) -1) && !uid_valid(keuid))
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
+ retval = -EPERM;
+ if (ruid != (uid_t) -1) {
+ new->uid = kruid;
+ if (!uid_eq(old->uid, kruid) &&
+ !uid_eq(old->euid, kruid) &&
+ !ns_capable(old->user_ns, CAP_SETUID))
+ goto error;
+ }
+
+ if (euid != (uid_t) -1) {
+ new->euid = keuid;
+ if (!uid_eq(old->uid, keuid) &&
+ !uid_eq(old->euid, keuid) &&
+ !uid_eq(old->suid, keuid) &&
+ !ns_capable(old->user_ns, CAP_SETUID))
+ goto error;
+ }
+
+ if (!uid_eq(new->uid, old->uid)) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
+ }
+ if (ruid != (uid_t) -1 ||
+ (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
+ new->suid = new->euid;
+ new->fsuid = new->euid;
+
+ retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
+ if (retval < 0)
+ goto error;
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
+}
+
+/*
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
+ * Note that SAVED_ID's is deficient in that a setuid root program
+ * like sendmail, for example, cannot set its uid to be a normal
+ * user and then switch back, because if you're root, setuid() sets
+ * the saved uid too. If you don't like this, blame the bright people
+ * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
+ * will allow a root program to temporarily drop privileges and be able to
+ * regain them by swapping the real and effective uid.
+ */
+SYSCALL_DEFINE1(setuid, uid_t, uid)
+{
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *old;
+ struct cred *new;
+ int retval;
+ kuid_t kuid;
+
+ kuid = make_kuid(ns, uid);
+ if (!uid_valid(kuid))
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
+ retval = -EPERM;
+ if (ns_capable(old->user_ns, CAP_SETUID)) {
+ new->suid = new->uid = kuid;
+ if (!uid_eq(kuid, old->uid)) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
+ }
+ } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
+ goto error;
+ }
+
+ new->fsuid = new->euid = kuid;
+
+ retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
+ if (retval < 0)
+ goto error;
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
+}
+
+
+/*
+ * This function implements a generic ability to update ruid, euid,
+ * and suid. This allows you to implement the 4.4 compatible seteuid().
+ */
+SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
+{
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *old;
+ struct cred *new;
+ int retval;
+ kuid_t kruid, keuid, ksuid;
+
+ kruid = make_kuid(ns, ruid);
+ keuid = make_kuid(ns, euid);
+ ksuid = make_kuid(ns, suid);
+
+ if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+ return -EINVAL;
+
+ if ((euid != (uid_t) -1) && !uid_valid(keuid))
+ return -EINVAL;
+
+ if ((suid != (uid_t) -1) && !uid_valid(ksuid))
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
+ old = current_cred();
+
+ retval = -EPERM;
+ if (!ns_capable(old->user_ns, CAP_SETUID)) {
+ if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
+ goto error;
+ if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
+ goto error;
+ if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
+ goto error;
+ }
+
+ if (ruid != (uid_t) -1) {
+ new->uid = kruid;
+ if (!uid_eq(kruid, old->uid)) {
+ retval = set_user(new);
+ if (retval < 0)
+ goto error;
+ }
+ }
+ if (euid != (uid_t) -1)
+ new->euid = keuid;
+ if (suid != (uid_t) -1)
+ new->suid = ksuid;
+ new->fsuid = new->euid;
+
+ retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
+ if (retval < 0)
+ goto error;
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
+}
+
+SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
+{
+ const struct cred *cred = current_cred();
+ int retval;
+ uid_t ruid, euid, suid;
+
+ ruid = from_kuid_munged(cred->user_ns, cred->uid);
+ euid = from_kuid_munged(cred->user_ns, cred->euid);
+ suid = from_kuid_munged(cred->user_ns, cred->suid);
+
+ retval = put_user(ruid, ruidp);
+ if (!retval) {
+ retval = put_user(euid, euidp);
+ if (!retval)
+ return put_user(suid, suidp);
+ }
+ return retval;
+}
+
+/*
+ * Same as above, but for rgid, egid, sgid.
+ */
+SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
+{
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *old;
+ struct cred *new;
+ int retval;
+ kgid_t krgid, kegid, ksgid;
+
+ krgid = make_kgid(ns, rgid);
+ kegid = make_kgid(ns, egid);
+ ksgid = make_kgid(ns, sgid);
+
+ if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+ return -EINVAL;
+ if ((egid != (gid_t) -1) && !gid_valid(kegid))
+ return -EINVAL;
+ if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
+ return -EINVAL;
+
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
+ retval = -EPERM;
+ if (!ns_capable(old->user_ns, CAP_SETGID)) {
+ if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
+ goto error;
+ if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
+ goto error;
+ if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
+ goto error;
+ }
+
+ if (rgid != (gid_t) -1)
+ new->gid = krgid;
+ if (egid != (gid_t) -1)
+ new->egid = kegid;
+ if (sgid != (gid_t) -1)
+ new->sgid = ksgid;
+ new->fsgid = new->egid;
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
+}
+
+SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
+{
+ const struct cred *cred = current_cred();
+ int retval;
+ gid_t rgid, egid, sgid;
+
+ rgid = from_kgid_munged(cred->user_ns, cred->gid);
+ egid = from_kgid_munged(cred->user_ns, cred->egid);
+ sgid = from_kgid_munged(cred->user_ns, cred->sgid);
+
+ retval = put_user(rgid, rgidp);
+ if (!retval) {
+ retval = put_user(egid, egidp);
+ if (!retval)
+ retval = put_user(sgid, sgidp);
+ }
+
+ return retval;
+}
+
+
+/*
+ * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
+ * is used for "access()" and for the NFS daemon (letting nfsd stay at
+ * whatever uid it wants to). It normally shadows "euid", except when
+ * explicitly set by setfsuid() or for access..
+ */
+SYSCALL_DEFINE1(setfsuid, uid_t, uid)
+{
+ const struct cred *old;
+ struct cred *new;
+ uid_t old_fsuid;
+ kuid_t kuid;
+
+ old = current_cred();
+ old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
+
+ kuid = make_kuid(old->user_ns, uid);
+ if (!uid_valid(kuid))
+ return old_fsuid;
+
+ new = prepare_creds();
+ if (!new)
+ return old_fsuid;
+
+ if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
+ uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
+ ns_capable(old->user_ns, CAP_SETUID)) {
+ if (!uid_eq(kuid, old->fsuid)) {
+ new->fsuid = kuid;
+ if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
+ goto change_okay;
+ }
+ }
+
+ abort_creds(new);
+ return old_fsuid;
+
+change_okay:
+ commit_creds(new);
+ return old_fsuid;
+}
+
+/*
+ * Samma pĂ¥ svenska..
+ */
+SYSCALL_DEFINE1(setfsgid, gid_t, gid)
+{
+ const struct cred *old;
+ struct cred *new;
+ gid_t old_fsgid;
+ kgid_t kgid;
+
+ old = current_cred();
+ old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
+
+ kgid = make_kgid(old->user_ns, gid);
+ if (!gid_valid(kgid))
+ return old_fsgid;
+
+ new = prepare_creds();
+ if (!new)
+ return old_fsgid;
+
+ if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
+ gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
+ ns_capable(old->user_ns, CAP_SETGID)) {
+ if (!gid_eq(kgid, old->fsgid)) {
+ new->fsgid = kgid;
+ goto change_okay;
+ }
+ }
+
+ abort_creds(new);
+ return old_fsgid;
+
+change_okay:
+ commit_creds(new);
+ return old_fsgid;
+}
+#endif /* CONFIG_MULTIUSER */
+
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * Note, despite the name, this returns the tgid not the pid. The tgid and
+ * the pid are identical unless CLONE_THREAD was specified on clone() in
+ * which case the tgid is the same in all threads of the same group.
+ *
+ * This is SMP safe as current->tgid does not change.
+ */
+SYSCALL_DEFINE0(getpid)
+{
+ return task_tgid_vnr(current);
+}
+
+/* Thread ID - the internal kernel "pid" */
+SYSCALL_DEFINE0(gettid)
+{
+ return task_pid_vnr(current);
+}
+
+/*
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
+ */
+SYSCALL_DEFINE0(getppid)
+{
+ int pid;
+
+ rcu_read_lock();
+ pid = task_tgid_vnr(rcu_dereference(current->real_parent));
+ rcu_read_unlock();
+
+ return pid;
+}
+
+SYSCALL_DEFINE0(getuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_uid());
+}
+
+SYSCALL_DEFINE0(geteuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_euid());
+}
+
+SYSCALL_DEFINE0(getgid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_gid());
+}
+
+SYSCALL_DEFINE0(getegid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_egid());
+}
+
+void do_sys_times(struct tms *tms)
+{
+ cputime_t tgutime, tgstime, cutime, cstime;
+
+ thread_group_cputime_adjusted(current, &tgutime, &tgstime);
+ cutime = current->signal->cutime;
+ cstime = current->signal->cstime;
+ tms->tms_utime = cputime_to_clock_t(tgutime);
+ tms->tms_stime = cputime_to_clock_t(tgstime);
+ tms->tms_cutime = cputime_to_clock_t(cutime);
+ tms->tms_cstime = cputime_to_clock_t(cstime);
+}
+
+SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
+{
+ if (tbuf) {
+ struct tms tmp;
+
+ do_sys_times(&tmp);
+ if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
+ return -EFAULT;
+ }
+ force_successful_syscall_return();
+ return (long) jiffies_64_to_clock_t(get_jiffies_64());
+}
+
+/*
+ * This needs some heavy checking ...
+ * I just haven't the stomach for it. I also don't fully
+ * understand sessions/pgrp etc. Let somebody who does explain it.
+ *
+ * OK, I think I have the protection semantics right.... this is really
+ * only important on a multi-user system anyway, to make sure one user
+ * can't send a signal to a process owned by another. -TYT, 12/12/91
+ *
+ * !PF_FORKNOEXEC check to conform completely to POSIX.
+ */
+SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
+{
+ struct task_struct *p;
+ struct task_struct *group_leader = current->group_leader;
+ struct pid *pgrp;
+ int err;
+
+ if (!pid)
+ pid = task_pid_vnr(group_leader);
+ if (!pgid)
+ pgid = pid;
+ if (pgid < 0)
+ return -EINVAL;
+ rcu_read_lock();
+
+ /* From this point forward we keep holding onto the tasklist lock
+ * so that our parent does not change from under us. -DaveM
+ */
+ write_lock_irq(&tasklist_lock);
+
+ err = -ESRCH;
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto out;
+
+ err = -EINVAL;
+ if (!thread_group_leader(p))
+ goto out;
+
+ if (same_thread_group(p->real_parent, group_leader)) {
+ err = -EPERM;
+ if (task_session(p) != task_session(group_leader))
+ goto out;
+ err = -EACCES;
+ if (!(p->flags & PF_FORKNOEXEC))
+ goto out;
+ } else {
+ err = -ESRCH;
+ if (p != group_leader)
+ goto out;
+ }
+
+ err = -EPERM;
+ if (p->signal->leader)
+ goto out;
+
+ pgrp = task_pid(p);
+ if (pgid != pid) {
+ struct task_struct *g;
+
+ pgrp = find_vpid(pgid);
+ g = pid_task(pgrp, PIDTYPE_PGID);
+ if (!g || task_session(g) != task_session(group_leader))
+ goto out;
+ }
+
+ err = security_task_setpgid(p, pgid);
+ if (err)
+ goto out;
+
+ if (task_pgrp(p) != pgrp)
+ change_pid(p, PIDTYPE_PGID, pgrp);
+
+ err = 0;
+out:
+ /* All paths lead to here, thus we are safe. -DaveM */
+ write_unlock_irq(&tasklist_lock);
+ rcu_read_unlock();
+ return err;
+}
+
+SYSCALL_DEFINE1(getpgid, pid_t, pid)
+{
+ struct task_struct *p;
+ struct pid *grp;
+ int retval;
+
+ rcu_read_lock();
+ if (!pid)
+ grp = task_pgrp(current);
+ else {
+ retval = -ESRCH;
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto out;
+ grp = task_pgrp(p);
+ if (!grp)
+ goto out;
+
+ retval = security_task_getpgid(p);
+ if (retval)
+ goto out;
+ }
+ retval = pid_vnr(grp);
+out:
+ rcu_read_unlock();
+ return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_GETPGRP
+
+SYSCALL_DEFINE0(getpgrp)
+{
+ return sys_getpgid(0);
+}
+
+#endif
+
+SYSCALL_DEFINE1(getsid, pid_t, pid)
+{
+ struct task_struct *p;
+ struct pid *sid;
+ int retval;
+
+ rcu_read_lock();
+ if (!pid)
+ sid = task_session(current);
+ else {
+ retval = -ESRCH;
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto out;
+ sid = task_session(p);
+ if (!sid)
+ goto out;
+
+ retval = security_task_getsid(p);
+ if (retval)
+ goto out;
+ }
+ retval = pid_vnr(sid);
+out:
+ rcu_read_unlock();
+ return retval;
+}
+
+static void set_special_pids(struct pid *pid)
+{
+ struct task_struct *curr = current->group_leader;
+
+ if (task_session(curr) != pid)
+ change_pid(curr, PIDTYPE_SID, pid);
+
+ if (task_pgrp(curr) != pid)
+ change_pid(curr, PIDTYPE_PGID, pid);
+}
+
+SYSCALL_DEFINE0(setsid)
+{
+ struct task_struct *group_leader = current->group_leader;
+ struct pid *sid = task_pid(group_leader);
+ pid_t session = pid_vnr(sid);
+ int err = -EPERM;
+
+ write_lock_irq(&tasklist_lock);
+ /* Fail if I am already a session leader */
+ if (group_leader->signal->leader)
+ goto out;
+
+ /* Fail if a process group id already exists that equals the
+ * proposed session id.
+ */
+ if (pid_task(sid, PIDTYPE_PGID))
+ goto out;
+
+ group_leader->signal->leader = 1;
+ set_special_pids(sid);
+
+ proc_clear_tty(group_leader);
+
+ err = session;
+out:
+ write_unlock_irq(&tasklist_lock);
+ if (err > 0) {
+ proc_sid_connector(group_leader);
+ sched_autogroup_create_attach(group_leader);
+ }
+ return err;
+}
+
+DECLARE_RWSEM(uts_sem);
+
+#ifdef COMPAT_UTS_MACHINE
+#define override_architecture(name) \
+ (personality(current->personality) == PER_LINUX32 && \
+ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
+ sizeof(COMPAT_UTS_MACHINE)))
+#else
+#define override_architecture(name) 0
+#endif
+
+/*
+ * Work around broken programs that cannot handle "Linux 3.0".
+ * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
+ * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60.
+ */
+static int override_release(char __user *release, size_t len)
+{
+ int ret = 0;
+
+ if (current->personality & UNAME26) {
+ const char *rest = UTS_RELEASE;
+ char buf[65] = { 0 };
+ int ndots = 0;
+ unsigned v;
+ size_t copy;
+
+ while (*rest) {
+ if (*rest == '.' && ++ndots >= 3)
+ break;
+ if (!isdigit(*rest) && *rest != '.')
+ break;
+ rest++;
+ }
+ v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60;
+ copy = clamp_t(size_t, len, 1, sizeof(buf));
+ copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
+ ret = copy_to_user(release, buf, copy + 1);
+ }
+ return ret;
+}
+
+SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
+{
+ int errno = 0;
+
+ down_read(&uts_sem);
+ if (copy_to_user(name, utsname(), sizeof *name))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+
+ if (!errno && override_release(name->release, sizeof(name->release)))
+ errno = -EFAULT;
+ if (!errno && override_architecture(name))
+ errno = -EFAULT;
+ return errno;
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_UNAME
+/*
+ * Old cruft
+ */
+SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
+{
+ int error = 0;
+
+ if (!name)
+ return -EFAULT;
+
+ down_read(&uts_sem);
+ if (copy_to_user(name, utsname(), sizeof(*name)))
+ error = -EFAULT;
+ up_read(&uts_sem);
+
+ if (!error && override_release(name->release, sizeof(name->release)))
+ error = -EFAULT;
+ if (!error && override_architecture(name))
+ error = -EFAULT;
+ return error;
+}
+
+SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
+{
+ int error;
+
+ if (!name)
+ return -EFAULT;
+ if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
+ return -EFAULT;
+
+ down_read(&uts_sem);
+ error = __copy_to_user(&name->sysname, &utsname()->sysname,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->nodename, &utsname()->nodename,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->release, &utsname()->release,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->release + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->version, &utsname()->version,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->version + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->machine, &utsname()->machine,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->machine + __OLD_UTS_LEN);
+ up_read(&uts_sem);
+
+ if (!error && override_architecture(name))
+ error = -EFAULT;
+ if (!error && override_release(name->release, sizeof(name->release)))
+ error = -EFAULT;
+ return error ? -EFAULT : 0;
+}
+#endif
+
+SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
+{
+ int errno;
+ char tmp[__NEW_UTS_LEN];
+
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(tmp, name, len)) {
+ struct new_utsname *u = utsname();
+
+ memcpy(u->nodename, tmp, len);
+ memset(u->nodename + len, 0, sizeof(u->nodename) - len);
+ errno = 0;
+ uts_proc_notify(UTS_PROC_HOSTNAME);
+ }
+ up_write(&uts_sem);
+ return errno;
+}
+
+#ifdef __ARCH_WANT_SYS_GETHOSTNAME
+
+SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
+{
+ int i, errno;
+ struct new_utsname *u;
+
+ if (len < 0)
+ return -EINVAL;
+ down_read(&uts_sem);
+ u = utsname();
+ i = 1 + strlen(u->nodename);
+ if (i > len)
+ i = len;
+ errno = 0;
+ if (copy_to_user(name, u->nodename, i))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+ return errno;
+}
+
+#endif
+
+/*
+ * Only setdomainname; getdomainname can be implemented by calling
+ * uname()
+ */
+SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
+{
+ int errno;
+ char tmp[__NEW_UTS_LEN];
+
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(tmp, name, len)) {
+ struct new_utsname *u = utsname();
+
+ memcpy(u->domainname, tmp, len);
+ memset(u->domainname + len, 0, sizeof(u->domainname) - len);
+ errno = 0;
+ uts_proc_notify(UTS_PROC_DOMAINNAME);
+ }
+ up_write(&uts_sem);
+ return errno;
+}
+
+SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+{
+ struct rlimit value;
+ int ret;
+
+ ret = do_prlimit(current, resource, NULL, &value);
+ if (!ret)
+ ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
+
+ return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
+
+/*
+ * Back compatibility for getrlimit. Needed for some apps.
+ */
+SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct rlimit __user *, rlim)
+{
+ struct rlimit x;
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ task_lock(current->group_leader);
+ x = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
+ if (x.rlim_cur > 0x7FFFFFFF)
+ x.rlim_cur = 0x7FFFFFFF;
+ if (x.rlim_max > 0x7FFFFFFF)
+ x.rlim_max = 0x7FFFFFFF;
+ return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
+}
+
+#endif
+
+static inline bool rlim64_is_infinity(__u64 rlim64)
+{
+#if BITS_PER_LONG < 64
+ return rlim64 >= ULONG_MAX;
+#else
+ return rlim64 == RLIM64_INFINITY;
+#endif
+}
+
+static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
+{
+ if (rlim->rlim_cur == RLIM_INFINITY)
+ rlim64->rlim_cur = RLIM64_INFINITY;
+ else
+ rlim64->rlim_cur = rlim->rlim_cur;
+ if (rlim->rlim_max == RLIM_INFINITY)
+ rlim64->rlim_max = RLIM64_INFINITY;
+ else
+ rlim64->rlim_max = rlim->rlim_max;
+}
+
+static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
+{
+ if (rlim64_is_infinity(rlim64->rlim_cur))
+ rlim->rlim_cur = RLIM_INFINITY;
+ else
+ rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
+ if (rlim64_is_infinity(rlim64->rlim_max))
+ rlim->rlim_max = RLIM_INFINITY;
+ else
+ rlim->rlim_max = (unsigned long)rlim64->rlim_max;
+}
+
+/* make sure you are allowed to change @tsk limits before calling this */
+int do_prlimit(struct task_struct *tsk, unsigned int resource,
+ struct rlimit *new_rlim, struct rlimit *old_rlim)
+{
+ struct rlimit *rlim;
+ int retval = 0;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ if (new_rlim) {
+ if (new_rlim->rlim_cur > new_rlim->rlim_max)
+ return -EINVAL;
+ if (resource == RLIMIT_NOFILE &&
+ new_rlim->rlim_max > sysctl_nr_open)
+ return -EPERM;
+ }
+
+ /* protect tsk->signal and tsk->sighand from disappearing */
+ read_lock(&tasklist_lock);
+ if (!tsk->sighand) {
+ retval = -ESRCH;
+ goto out;
+ }
+
+ rlim = tsk->signal->rlim + resource;
+ task_lock(tsk->group_leader);
+ if (new_rlim) {
+ /* Keep the capable check against init_user_ns until
+ cgroups can contain all limits */
+ if (new_rlim->rlim_max > rlim->rlim_max &&
+ !capable(CAP_SYS_RESOURCE))
+ retval = -EPERM;
+ if (!retval)
+ retval = security_task_setrlimit(tsk->group_leader,
+ resource, new_rlim);
+ if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ new_rlim->rlim_cur = 1;
+ }
+ }
+ if (!retval) {
+ if (old_rlim)
+ *old_rlim = *rlim;
+ if (new_rlim)
+ *rlim = *new_rlim;
+ }
+ task_unlock(tsk->group_leader);
+
+ /*
+ * RLIMIT_CPU handling. Note that the kernel fails to return an error
+ * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
+ * very long-standing error, and fixing it now risks breakage of
+ * applications, so we live with it
+ */
+ if (!retval && new_rlim && resource == RLIMIT_CPU &&
+ new_rlim->rlim_cur != RLIM_INFINITY)
+ update_rlimit_cpu(tsk, new_rlim->rlim_cur);
+out:
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+/* rcu lock must be held */
+static int check_prlimit_permission(struct task_struct *task)
+{
+ const struct cred *cred = current_cred(), *tcred;
+
+ if (current == task)
+ return 0;
+
+ tcred = __task_cred(task);
+ if (uid_eq(cred->uid, tcred->euid) &&
+ uid_eq(cred->uid, tcred->suid) &&
+ uid_eq(cred->uid, tcred->uid) &&
+ gid_eq(cred->gid, tcred->egid) &&
+ gid_eq(cred->gid, tcred->sgid) &&
+ gid_eq(cred->gid, tcred->gid))
+ return 0;
+ if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
+ return 0;
+
+ return -EPERM;
+}
+
+SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
+ const struct rlimit64 __user *, new_rlim,
+ struct rlimit64 __user *, old_rlim)
+{
+ struct rlimit64 old64, new64;
+ struct rlimit old, new;
+ struct task_struct *tsk;
+ int ret;
+
+ if (new_rlim) {
+ if (copy_from_user(&new64, new_rlim, sizeof(new64)))
+ return -EFAULT;
+ rlim64_to_rlim(&new64, &new);
+ }
+
+ rcu_read_lock();
+ tsk = pid ? find_task_by_vpid(pid) : current;
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ ret = check_prlimit_permission(tsk);
+ if (ret) {
+ rcu_read_unlock();
+ return ret;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
+ old_rlim ? &old : NULL);
+
+ if (!ret && old_rlim) {
+ rlim_to_rlim64(&old, &old64);
+ if (copy_to_user(old_rlim, &old64, sizeof(old64)))
+ ret = -EFAULT;
+ }
+
+ put_task_struct(tsk);
+ return ret;
+}
+
+SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+{
+ struct rlimit new_rlim;
+
+ if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+ return -EFAULT;
+ return do_prlimit(current, resource, &new_rlim, NULL);
+}
+
+/*
+ * It would make sense to put struct rusage in the task_struct,
+ * except that would make the task_struct be *really big*. After
+ * task_struct gets moved into malloc'ed memory, it would
+ * make sense to do this. It will make moving the rest of the information
+ * a lot simpler! (Which we're not doing right now because we're not
+ * measuring them yet).
+ *
+ * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
+ * races with threads incrementing their own counters. But since word
+ * reads are atomic, we either get new values or old values and we don't
+ * care which for the sums. We always take the siglock to protect reading
+ * the c* fields from p->signal from races with exit.c updating those
+ * fields when reaping, so a sample either gets all the additions of a
+ * given child after it's reaped, or none so this sample is before reaping.
+ *
+ * Locking:
+ * We need to take the siglock for CHILDEREN, SELF and BOTH
+ * for the cases current multithreaded, non-current single threaded
+ * non-current multithreaded. Thread traversal is now safe with
+ * the siglock held.
+ * Strictly speaking, we donot need to take the siglock if we are current and
+ * single threaded, as no one else can take our signal_struct away, no one
+ * else can reap the children to update signal->c* counters, and no one else
+ * can race with the signal-> fields. If we do not take any lock, the
+ * signal-> fields could be read out of order while another thread was just
+ * exiting. So we should place a read memory barrier when we avoid the lock.
+ * On the writer side, write memory barrier is implied in __exit_signal
+ * as __exit_signal releases the siglock spinlock after updating the signal->
+ * fields. But we don't do this yet to keep things simple.
+ *
+ */
+
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
+{
+ r->ru_nvcsw += t->nvcsw;
+ r->ru_nivcsw += t->nivcsw;
+ r->ru_minflt += t->min_flt;
+ r->ru_majflt += t->maj_flt;
+ r->ru_inblock += task_io_get_inblock(t);
+ r->ru_oublock += task_io_get_oublock(t);
+}
+
+static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+{
+ struct task_struct *t;
+ unsigned long flags;
+ cputime_t tgutime, tgstime, utime, stime;
+ unsigned long maxrss = 0;
+
+ memset((char *)r, 0, sizeof (*r));
+ utime = stime = 0;
+
+ if (who == RUSAGE_THREAD) {
+ task_cputime_adjusted(current, &utime, &stime);
+ accumulate_thread_rusage(p, r);
+ maxrss = p->signal->maxrss;
+ goto out;
+ }
+
+ if (!lock_task_sighand(p, &flags))
+ return;
+
+ switch (who) {
+ case RUSAGE_BOTH:
+ case RUSAGE_CHILDREN:
+ utime = p->signal->cutime;
+ stime = p->signal->cstime;
+ r->ru_nvcsw = p->signal->cnvcsw;
+ r->ru_nivcsw = p->signal->cnivcsw;
+ r->ru_minflt = p->signal->cmin_flt;
+ r->ru_majflt = p->signal->cmaj_flt;
+ r->ru_inblock = p->signal->cinblock;
+ r->ru_oublock = p->signal->coublock;
+ maxrss = p->signal->cmaxrss;
+
+ if (who == RUSAGE_CHILDREN)
+ break;
+
+ case RUSAGE_SELF:
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+ r->ru_nvcsw += p->signal->nvcsw;
+ r->ru_nivcsw += p->signal->nivcsw;
+ r->ru_minflt += p->signal->min_flt;
+ r->ru_majflt += p->signal->maj_flt;
+ r->ru_inblock += p->signal->inblock;
+ r->ru_oublock += p->signal->oublock;
+ if (maxrss < p->signal->maxrss)
+ maxrss = p->signal->maxrss;
+ t = p;
+ do {
+ accumulate_thread_rusage(t, r);
+ } while_each_thread(p, t);
+ break;
+
+ default:
+ BUG();
+ }
+ unlock_task_sighand(p, &flags);
+
+out:
+ cputime_to_timeval(utime, &r->ru_utime);
+ cputime_to_timeval(stime, &r->ru_stime);
+
+ if (who != RUSAGE_CHILDREN) {
+ struct mm_struct *mm = get_task_mm(p);
+
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
+ }
+ }
+ r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+}
+
+int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+{
+ struct rusage r;
+
+ k_getrusage(p, who, &r);
+ return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
+}
+
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
+{
+ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+ who != RUSAGE_THREAD)
+ return -EINVAL;
+ return getrusage(current, who, ru);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
+{
+ struct rusage r;
+
+ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+ who != RUSAGE_THREAD)
+ return -EINVAL;
+
+ k_getrusage(current, who, &r);
+ return put_compat_rusage(&r, ru);
+}
+#endif
+
+SYSCALL_DEFINE1(umask, int, mask)
+{
+ mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
+ return mask;
+}
+
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+{
+ struct fd exe;
+ struct file *old_exe, *exe_file;
+ struct inode *inode;
+ int err;
+
+ exe = fdget(fd);
+ if (!exe.file)
+ return -EBADF;
+
+ inode = file_inode(exe.file);
+
+ /*
+ * Because the original mm->exe_file points to executable file, make
+ * sure that this one is executable as well, to avoid breaking an
+ * overall picture.
+ */
+ err = -EACCES;
+ if (!S_ISREG(inode->i_mode) ||
+ exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ goto exit;
+
+ err = inode_permission(inode, MAY_EXEC);
+ if (err)
+ goto exit;
+
+ /*
+ * Forbid mm->exe_file change if old file still mapped.
+ */
+ exe_file = get_mm_exe_file(mm);
+ err = -EBUSY;
+ if (exe_file) {
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &exe_file->f_path))
+ goto exit_err;
+ }
+
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ }
+
+ /*
+ * The symlink can be changed only once, just to disallow arbitrary
+ * transitions malicious software might bring in. This means one
+ * could make a snapshot over all processes running and monitor
+ * /proc/pid/exe changes to notice unusual activity if needed.
+ */
+ err = -EPERM;
+ if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
+ goto exit;
+
+ err = 0;
+ /* set the new file, lockless */
+ get_file(exe.file);
+ old_exe = xchg(&mm->exe_file, exe.file);
+ if (old_exe)
+ fput(old_exe);
+exit:
+ fdput(exe);
+ return err;
+exit_err:
+ up_read(&mm->mmap_sem);
+ fput(exe_file);
+ goto exit;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * WARNING: we don't require any capability here so be very careful
+ * in what is allowed for modification from userspace.
+ */
+static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+{
+ unsigned long mmap_max_addr = TASK_SIZE;
+ struct mm_struct *mm = current->mm;
+ int error = -EINVAL, i;
+
+ static const unsigned char offsets[] = {
+ offsetof(struct prctl_mm_map, start_code),
+ offsetof(struct prctl_mm_map, end_code),
+ offsetof(struct prctl_mm_map, start_data),
+ offsetof(struct prctl_mm_map, end_data),
+ offsetof(struct prctl_mm_map, start_brk),
+ offsetof(struct prctl_mm_map, brk),
+ offsetof(struct prctl_mm_map, start_stack),
+ offsetof(struct prctl_mm_map, arg_start),
+ offsetof(struct prctl_mm_map, arg_end),
+ offsetof(struct prctl_mm_map, env_start),
+ offsetof(struct prctl_mm_map, env_end),
+ };
+
+ /*
+ * Make sure the members are not somewhere outside
+ * of allowed address space.
+ */
+ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+ u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
+
+ if ((unsigned long)val >= mmap_max_addr ||
+ (unsigned long)val < mmap_min_addr)
+ goto out;
+ }
+
+ /*
+ * Make sure the pairs are ordered.
+ */
+#define __prctl_check_order(__m1, __op, __m2) \
+ ((unsigned long)prctl_map->__m1 __op \
+ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
+ error = __prctl_check_order(start_code, <, end_code);
+ error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_brk, <=, brk);
+ error |= __prctl_check_order(arg_start, <=, arg_end);
+ error |= __prctl_check_order(env_start, <=, env_end);
+ if (error)
+ goto out;
+#undef __prctl_check_order
+
+ error = -EINVAL;
+
+ /*
+ * @brk should be after @end_data in traditional maps.
+ */
+ if (prctl_map->start_brk <= prctl_map->end_data ||
+ prctl_map->brk <= prctl_map->end_data)
+ goto out;
+
+ /*
+ * Neither we should allow to override limits if they set.
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
+ prctl_map->start_brk, prctl_map->end_data,
+ prctl_map->start_data))
+ goto out;
+
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (prctl_map->auxv_size) {
+ if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
+ goto out;
+ }
+
+ /*
+ * Finally, make sure the caller has the rights to
+ * change /proc/pid/exe link: only local root should
+ * be allowed to.
+ */
+ if (prctl_map->exe_fd != (u32)-1) {
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *cred = current_cred();
+
+ if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
+ !gid_eq(cred->gid, make_kgid(ns, 0)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
+{
+ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+ BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
+
+ if (opt == PR_SET_MM_MAP_SIZE)
+ return put_user((unsigned int)sizeof(prctl_map),
+ (unsigned int __user *)addr);
+
+ if (data_size != sizeof(prctl_map))
+ return -EINVAL;
+
+ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
+ return -EFAULT;
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ return error;
+
+ if (prctl_map.auxv_size) {
+ memset(user_auxv, 0, sizeof(user_auxv));
+ if (copy_from_user(user_auxv,
+ (const void __user *)prctl_map.auxv,
+ prctl_map.auxv_size))
+ return -EFAULT;
+
+ /* Last entry must be AT_NULL as specification requires */
+ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
+ user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+ }
+
+ if (prctl_map.exe_fd != (u32)-1)
+ error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+ down_read(&mm->mmap_sem);
+ if (error)
+ goto out;
+
+ /*
+ * We don't validate if these members are pointing to
+ * real present VMAs because application may have correspond
+ * VMAs already unmapped and kernel uses these members for statistics
+ * output in procfs mostly, except
+ *
+ * - @start_brk/@brk which are used in do_brk but kernel lookups
+ * for VMAs when updating these memvers so anything wrong written
+ * here cause kernel to swear at userspace program but won't lead
+ * to any problem in kernel itself
+ */
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ /*
+ * Note this update of @saved_auxv is lockless thus
+ * if someone reads this member in procfs while we're
+ * updating -- it may get partly updated results. It's
+ * known and acceptable trade off: we leave it as is to
+ * not introduce additional locks here making the kernel
+ * more complex.
+ */
+ if (prctl_map.auxv_size)
+ memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+
+ error = 0;
+out:
+ up_read(&mm->mmap_sem);
+ return error;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
+static int prctl_set_mm(int opt, unsigned long addr,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int error;
+
+ if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
+ opt != PR_SET_MM_MAP &&
+ opt != PR_SET_MM_MAP_SIZE)))
+ return -EINVAL;
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
+ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
+#endif
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (opt == PR_SET_MM_EXE_FILE)
+ return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+
+ if (addr >= TASK_SIZE || addr < mmap_min_addr)
+ return -EINVAL;
+
+ error = -EINVAL;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+
+ switch (opt) {
+ case PR_SET_MM_START_CODE:
+ mm->start_code = addr;
+ break;
+ case PR_SET_MM_END_CODE:
+ mm->end_code = addr;
+ break;
+ case PR_SET_MM_START_DATA:
+ mm->start_data = addr;
+ break;
+ case PR_SET_MM_END_DATA:
+ mm->end_data = addr;
+ break;
+
+ case PR_SET_MM_START_BRK:
+ if (addr <= mm->end_data)
+ goto out;
+
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
+ mm->end_data, mm->start_data))
+ goto out;
+
+ mm->start_brk = addr;
+ break;
+
+ case PR_SET_MM_BRK:
+ if (addr <= mm->end_data)
+ goto out;
+
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
+ mm->end_data, mm->start_data))
+ goto out;
+
+ mm->brk = addr;
+ break;
+
+ /*
+ * If command line arguments and environment
+ * are placed somewhere else on stack, we can
+ * set them up here, ARG_START/END to setup
+ * command line argumets and ENV_START/END
+ * for environment.
+ */
+ case PR_SET_MM_START_STACK:
+ case PR_SET_MM_ARG_START:
+ case PR_SET_MM_ARG_END:
+ case PR_SET_MM_ENV_START:
+ case PR_SET_MM_ENV_END:
+ if (!vma) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (opt == PR_SET_MM_START_STACK)
+ mm->start_stack = addr;
+ else if (opt == PR_SET_MM_ARG_START)
+ mm->arg_start = addr;
+ else if (opt == PR_SET_MM_ARG_END)
+ mm->arg_end = addr;
+ else if (opt == PR_SET_MM_ENV_START)
+ mm->env_start = addr;
+ else if (opt == PR_SET_MM_ENV_END)
+ mm->env_end = addr;
+ break;
+
+ /*
+ * This doesn't move auxiliary vector itself
+ * since it's pinned to mm_struct, but allow
+ * to fill vector with new values. It's up
+ * to a caller to provide sane values here
+ * otherwise user space tools which use this
+ * vector might be unhappy.
+ */
+ case PR_SET_MM_AUXV: {
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+
+ if (arg4 > sizeof(user_auxv))
+ goto out;
+ up_read(&mm->mmap_sem);
+
+ if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
+ return -EFAULT;
+
+ /* Make sure the last entry is always AT_NULL */
+ user_auxv[AT_VECTOR_SIZE - 2] = 0;
+ user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+ task_lock(current);
+ memcpy(mm->saved_auxv, user_auxv, arg4);
+ task_unlock(current);
+
+ return 0;
+ }
+ default:
+ goto out;
+ }
+
+ error = 0;
+out:
+ up_read(&mm->mmap_sem);
+ return error;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+ return put_user(me->clear_child_tid, tid_addr);
+}
+#else
+static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
+{
+ return -EINVAL;
+}
+#endif
+
+SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+ unsigned long, arg4, unsigned long, arg5)
+{
+ struct task_struct *me = current;
+ unsigned char comm[sizeof(me->comm)];
+ long error;
+
+ error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+ if (error != -ENOSYS)
+ return error;
+
+ error = 0;
+ switch (option) {
+ case PR_SET_PDEATHSIG:
+ if (!valid_signal(arg2)) {
+ error = -EINVAL;
+ break;
+ }
+ me->pdeath_signal = arg2;
+ break;
+ case PR_GET_PDEATHSIG:
+ error = put_user(me->pdeath_signal, (int __user *)arg2);
+ break;
+ case PR_GET_DUMPABLE:
+ error = get_dumpable(me->mm);
+ break;
+ case PR_SET_DUMPABLE:
+ if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
+ error = -EINVAL;
+ break;
+ }
+ set_dumpable(me->mm, arg2);
+ break;
+
+ case PR_SET_UNALIGN:
+ error = SET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_GET_UNALIGN:
+ error = GET_UNALIGN_CTL(me, arg2);
+ break;
+ case PR_SET_FPEMU:
+ error = SET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_GET_FPEMU:
+ error = GET_FPEMU_CTL(me, arg2);
+ break;
+ case PR_SET_FPEXC:
+ error = SET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_FPEXC:
+ error = GET_FPEXC_CTL(me, arg2);
+ break;
+ case PR_GET_TIMING:
+ error = PR_TIMING_STATISTICAL;
+ break;
+ case PR_SET_TIMING:
+ if (arg2 != PR_TIMING_STATISTICAL)
+ error = -EINVAL;
+ break;
+ case PR_SET_NAME:
+ comm[sizeof(me->comm) - 1] = 0;
+ if (strncpy_from_user(comm, (char __user *)arg2,
+ sizeof(me->comm) - 1) < 0)
+ return -EFAULT;
+ set_task_comm(me, comm);
+ proc_comm_connector(me);
+ break;
+ case PR_GET_NAME:
+ get_task_comm(comm, me);
+ if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
+ return -EFAULT;
+ break;
+ case PR_GET_ENDIAN:
+ error = GET_ENDIAN(me, arg2);
+ break;
+ case PR_SET_ENDIAN:
+ error = SET_ENDIAN(me, arg2);
+ break;
+ case PR_GET_SECCOMP:
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+ error = prctl_set_seccomp(arg2, (char __user *)arg3);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+ break;
+ case PR_SET_TSC:
+ error = SET_TSC_CTL(arg2);
+ break;
+ case PR_TASK_PERF_EVENTS_DISABLE:
+ error = perf_event_task_disable();
+ break;
+ case PR_TASK_PERF_EVENTS_ENABLE:
+ error = perf_event_task_enable();
+ break;
+ case PR_GET_TIMERSLACK:
+ error = current->timer_slack_ns;
+ break;
+ case PR_SET_TIMERSLACK:
+ if (arg2 <= 0)
+ current->timer_slack_ns =
+ current->default_timer_slack_ns;
+ else
+ current->timer_slack_ns = arg2;
+ break;
+ case PR_MCE_KILL:
+ if (arg4 | arg5)
+ return -EINVAL;
+ switch (arg2) {
+ case PR_MCE_KILL_CLEAR:
+ if (arg3 != 0)
+ return -EINVAL;
+ current->flags &= ~PF_MCE_PROCESS;
+ break;
+ case PR_MCE_KILL_SET:
+ current->flags |= PF_MCE_PROCESS;
+ if (arg3 == PR_MCE_KILL_EARLY)
+ current->flags |= PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_LATE)
+ current->flags &= ~PF_MCE_EARLY;
+ else if (arg3 == PR_MCE_KILL_DEFAULT)
+ current->flags &=
+ ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+ else
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ case PR_MCE_KILL_GET:
+ if (arg2 | arg3 | arg4 | arg5)
+ return -EINVAL;
+ if (current->flags & PF_MCE_PROCESS)
+ error = (current->flags & PF_MCE_EARLY) ?
+ PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+ else
+ error = PR_MCE_KILL_DEFAULT;
+ break;
+ case PR_SET_MM:
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_TID_ADDRESS:
+ error = prctl_get_tid_address(me, (int __user **)arg2);
+ break;
+ case PR_SET_CHILD_SUBREAPER:
+ me->signal->is_child_subreaper = !!arg2;
+ break;
+ case PR_GET_CHILD_SUBREAPER:
+ error = put_user(me->signal->is_child_subreaper,
+ (int __user *)arg2);
+ break;
+ case PR_SET_NO_NEW_PRIVS:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ task_set_no_new_privs(current);
+ break;
+ case PR_GET_NO_NEW_PRIVS:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return task_no_new_privs(current) ? 1 : 0;
+ case PR_GET_THP_DISABLE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+ break;
+ case PR_SET_THP_DISABLE:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ down_write(&me->mm->mmap_sem);
+ if (arg2)
+ me->mm->def_flags |= VM_NOHUGEPAGE;
+ else
+ me->mm->def_flags &= ~VM_NOHUGEPAGE;
+ up_write(&me->mm->mmap_sem);
+ break;
+ case PR_MPX_ENABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = MPX_ENABLE_MANAGEMENT(me);
+ break;
+ case PR_MPX_DISABLE_MANAGEMENT:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = MPX_DISABLE_MANAGEMENT(me);
+ break;
+ case PR_SET_FP_MODE:
+ error = SET_FP_MODE(me, arg2);
+ break;
+ case PR_GET_FP_MODE:
+ error = GET_FP_MODE(me);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+ return error;
+}
+
+SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
+ struct getcpu_cache __user *, unused)
+{
+ int err = 0;
+ int cpu = raw_smp_processor_id();
+
+ if (cpup)
+ err |= put_user(cpu, cpup);
+ if (nodep)
+ err |= put_user(cpu_to_node(cpu), nodep);
+ return err ? -EFAULT : 0;
+}
+
+/**
+ * do_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
+ */
+static int do_sysinfo(struct sysinfo *info)
+{
+ unsigned long mem_total, sav_total;
+ unsigned int mem_unit, bitcount;
+ struct timespec tp;
+
+ memset(info, 0, sizeof(struct sysinfo));
+
+ get_monotonic_boottime(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+
+ info->procs = nr_threads;
+
+ si_meminfo(info);
+ si_swapinfo(info);
+
+ /*
+ * If the sum of all the available memory (i.e. ram + swap)
+ * is less than can be stored in a 32 bit unsigned long then
+ * we can be binary compatible with 2.2.x kernels. If not,
+ * well, in that case 2.2.x was broken anyways...
+ *
+ * -Erik Andersen <andersee@debian.org>
+ */
+
+ mem_total = info->totalram + info->totalswap;
+ if (mem_total < info->totalram || mem_total < info->totalswap)
+ goto out;
+ bitcount = 0;
+ mem_unit = info->mem_unit;
+ while (mem_unit > 1) {
+ bitcount++;
+ mem_unit >>= 1;
+ sav_total = mem_total;
+ mem_total <<= 1;
+ if (mem_total < sav_total)
+ goto out;
+ }
+
+ /*
+ * If mem_total did not overflow, multiply all memory values by
+ * info->mem_unit and set it to 1. This leaves things compatible
+ * with 2.2.x, and also retains compatibility with earlier 2.4.x
+ * kernels...
+ */
+
+ info->mem_unit = 1;
+ info->totalram <<= bitcount;
+ info->freeram <<= bitcount;
+ info->sharedram <<= bitcount;
+ info->bufferram <<= bitcount;
+ info->totalswap <<= bitcount;
+ info->freeswap <<= bitcount;
+ info->totalhigh <<= bitcount;
+ info->freehigh <<= bitcount;
+
+out:
+ return 0;
+}
+
+SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
+{
+ struct sysinfo val;
+
+ do_sysinfo(&val);
+
+ if (copy_to_user(info, &val, sizeof(struct sysinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sysinfo {
+ s32 uptime;
+ u32 loads[3];
+ u32 totalram;
+ u32 freeram;
+ u32 sharedram;
+ u32 bufferram;
+ u32 totalswap;
+ u32 freeswap;
+ u16 procs;
+ u16 pad;
+ u32 totalhigh;
+ u32 freehigh;
+ u32 mem_unit;
+ char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
+{
+ struct sysinfo s;
+
+ do_sysinfo(&s);
+
+ /* Check to see if any memory value is too large for 32-bit and scale
+ * down if needed
+ */
+ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
+ int bitcount = 0;
+
+ while (s.mem_unit < PAGE_SIZE) {
+ s.mem_unit <<= 1;
+ bitcount++;
+ }
+
+ s.totalram >>= bitcount;
+ s.freeram >>= bitcount;
+ s.sharedram >>= bitcount;
+ s.bufferram >>= bitcount;
+ s.totalswap >>= bitcount;
+ s.freeswap >>= bitcount;
+ s.totalhigh >>= bitcount;
+ s.freehigh >>= bitcount;
+ }
+
+ if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+ __put_user(s.uptime, &info->uptime) ||
+ __put_user(s.loads[0], &info->loads[0]) ||
+ __put_user(s.loads[1], &info->loads[1]) ||
+ __put_user(s.loads[2], &info->loads[2]) ||
+ __put_user(s.totalram, &info->totalram) ||
+ __put_user(s.freeram, &info->freeram) ||
+ __put_user(s.sharedram, &info->sharedram) ||
+ __put_user(s.bufferram, &info->bufferram) ||
+ __put_user(s.totalswap, &info->totalswap) ||
+ __put_user(s.freeswap, &info->freeswap) ||
+ __put_user(s.procs, &info->procs) ||
+ __put_user(s.totalhigh, &info->totalhigh) ||
+ __put_user(s.freehigh, &info->freehigh) ||
+ __put_user(s.mem_unit, &info->mem_unit))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
new file mode 100644
index 000000000..7995ef586
--- /dev/null
+++ b/kernel/sys_ni.c
@@ -0,0 +1,245 @@
+
+#include <linux/linkage.h>
+#include <linux/errno.h>
+
+#include <asm/unistd.h>
+
+/* we can't #include <linux/syscalls.h> here,
+ but tell gcc to not warn with -Wmissing-prototypes */
+asmlinkage long sys_ni_syscall(void);
+
+/*
+ * Non-implemented system calls get redirected here.
+ */
+asmlinkage long sys_ni_syscall(void)
+{
+ return -ENOSYS;
+}
+
+cond_syscall(sys_quotactl);
+cond_syscall(sys32_quotactl);
+cond_syscall(sys_acct);
+cond_syscall(sys_lookup_dcookie);
+cond_syscall(compat_sys_lookup_dcookie);
+cond_syscall(sys_swapon);
+cond_syscall(sys_swapoff);
+cond_syscall(sys_kexec_load);
+cond_syscall(compat_sys_kexec_load);
+cond_syscall(sys_kexec_file_load);
+cond_syscall(sys_init_module);
+cond_syscall(sys_finit_module);
+cond_syscall(sys_delete_module);
+cond_syscall(sys_socketpair);
+cond_syscall(sys_bind);
+cond_syscall(sys_listen);
+cond_syscall(sys_accept);
+cond_syscall(sys_accept4);
+cond_syscall(sys_connect);
+cond_syscall(sys_getsockname);
+cond_syscall(sys_getpeername);
+cond_syscall(sys_sendto);
+cond_syscall(sys_send);
+cond_syscall(sys_recvfrom);
+cond_syscall(sys_recv);
+cond_syscall(sys_socket);
+cond_syscall(sys_setsockopt);
+cond_syscall(compat_sys_setsockopt);
+cond_syscall(sys_getsockopt);
+cond_syscall(compat_sys_getsockopt);
+cond_syscall(sys_shutdown);
+cond_syscall(sys_sendmsg);
+cond_syscall(sys_sendmmsg);
+cond_syscall(compat_sys_sendmsg);
+cond_syscall(compat_sys_sendmmsg);
+cond_syscall(sys_recvmsg);
+cond_syscall(sys_recvmmsg);
+cond_syscall(compat_sys_recvmsg);
+cond_syscall(compat_sys_recv);
+cond_syscall(compat_sys_recvfrom);
+cond_syscall(compat_sys_recvmmsg);
+cond_syscall(sys_socketcall);
+cond_syscall(sys_futex);
+cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
+cond_syscall(sys_epoll_create);
+cond_syscall(sys_epoll_create1);
+cond_syscall(sys_epoll_ctl);
+cond_syscall(sys_epoll_wait);
+cond_syscall(sys_epoll_pwait);
+cond_syscall(compat_sys_epoll_pwait);
+cond_syscall(sys_semget);
+cond_syscall(sys_semop);
+cond_syscall(sys_semtimedop);
+cond_syscall(compat_sys_semtimedop);
+cond_syscall(sys_semctl);
+cond_syscall(compat_sys_semctl);
+cond_syscall(sys_msgget);
+cond_syscall(sys_msgsnd);
+cond_syscall(compat_sys_msgsnd);
+cond_syscall(sys_msgrcv);
+cond_syscall(compat_sys_msgrcv);
+cond_syscall(sys_msgctl);
+cond_syscall(compat_sys_msgctl);
+cond_syscall(sys_shmget);
+cond_syscall(sys_shmat);
+cond_syscall(compat_sys_shmat);
+cond_syscall(sys_shmdt);
+cond_syscall(sys_shmctl);
+cond_syscall(compat_sys_shmctl);
+cond_syscall(sys_mq_open);
+cond_syscall(sys_mq_unlink);
+cond_syscall(sys_mq_timedsend);
+cond_syscall(sys_mq_timedreceive);
+cond_syscall(sys_mq_notify);
+cond_syscall(sys_mq_getsetattr);
+cond_syscall(compat_sys_mq_open);
+cond_syscall(compat_sys_mq_timedsend);
+cond_syscall(compat_sys_mq_timedreceive);
+cond_syscall(compat_sys_mq_notify);
+cond_syscall(compat_sys_mq_getsetattr);
+cond_syscall(sys_mbind);
+cond_syscall(sys_get_mempolicy);
+cond_syscall(sys_set_mempolicy);
+cond_syscall(compat_sys_mbind);
+cond_syscall(compat_sys_get_mempolicy);
+cond_syscall(compat_sys_set_mempolicy);
+cond_syscall(sys_add_key);
+cond_syscall(sys_request_key);
+cond_syscall(sys_keyctl);
+cond_syscall(compat_sys_keyctl);
+cond_syscall(compat_sys_socketcall);
+cond_syscall(sys_inotify_init);
+cond_syscall(sys_inotify_init1);
+cond_syscall(sys_inotify_add_watch);
+cond_syscall(sys_inotify_rm_watch);
+cond_syscall(sys_migrate_pages);
+cond_syscall(sys_move_pages);
+cond_syscall(sys_chown16);
+cond_syscall(sys_fchown16);
+cond_syscall(sys_getegid16);
+cond_syscall(sys_geteuid16);
+cond_syscall(sys_getgid16);
+cond_syscall(sys_getgroups16);
+cond_syscall(sys_getresgid16);
+cond_syscall(sys_getresuid16);
+cond_syscall(sys_getuid16);
+cond_syscall(sys_lchown16);
+cond_syscall(sys_setfsgid16);
+cond_syscall(sys_setfsuid16);
+cond_syscall(sys_setgid16);
+cond_syscall(sys_setgroups16);
+cond_syscall(sys_setregid16);
+cond_syscall(sys_setresgid16);
+cond_syscall(sys_setresuid16);
+cond_syscall(sys_setreuid16);
+cond_syscall(sys_setuid16);
+cond_syscall(sys_sgetmask);
+cond_syscall(sys_ssetmask);
+cond_syscall(sys_vm86old);
+cond_syscall(sys_vm86);
+cond_syscall(sys_ipc);
+cond_syscall(compat_sys_ipc);
+cond_syscall(compat_sys_sysctl);
+cond_syscall(sys_flock);
+cond_syscall(sys_io_setup);
+cond_syscall(sys_io_destroy);
+cond_syscall(sys_io_submit);
+cond_syscall(sys_io_cancel);
+cond_syscall(sys_io_getevents);
+cond_syscall(sys_sysfs);
+cond_syscall(sys_syslog);
+cond_syscall(sys_process_vm_readv);
+cond_syscall(sys_process_vm_writev);
+cond_syscall(compat_sys_process_vm_readv);
+cond_syscall(compat_sys_process_vm_writev);
+cond_syscall(sys_uselib);
+cond_syscall(sys_fadvise64);
+cond_syscall(sys_fadvise64_64);
+cond_syscall(sys_madvise);
+cond_syscall(sys_setuid);
+cond_syscall(sys_setregid);
+cond_syscall(sys_setgid);
+cond_syscall(sys_setreuid);
+cond_syscall(sys_setresuid);
+cond_syscall(sys_getresuid);
+cond_syscall(sys_setresgid);
+cond_syscall(sys_getresgid);
+cond_syscall(sys_setgroups);
+cond_syscall(sys_getgroups);
+cond_syscall(sys_setfsuid);
+cond_syscall(sys_setfsgid);
+cond_syscall(sys_capget);
+cond_syscall(sys_capset);
+
+/* arch-specific weak syscall entries */
+cond_syscall(sys_pciconfig_read);
+cond_syscall(sys_pciconfig_write);
+cond_syscall(sys_pciconfig_iobase);
+cond_syscall(compat_sys_s390_ipc);
+cond_syscall(ppc_rtas);
+cond_syscall(sys_spu_run);
+cond_syscall(sys_spu_create);
+cond_syscall(sys_subpage_prot);
+cond_syscall(sys_s390_pci_mmio_read);
+cond_syscall(sys_s390_pci_mmio_write);
+
+/* mmu depending weak syscall entries */
+cond_syscall(sys_mprotect);
+cond_syscall(sys_msync);
+cond_syscall(sys_mlock);
+cond_syscall(sys_munlock);
+cond_syscall(sys_mlockall);
+cond_syscall(sys_munlockall);
+cond_syscall(sys_mincore);
+cond_syscall(sys_madvise);
+cond_syscall(sys_mremap);
+cond_syscall(sys_remap_file_pages);
+cond_syscall(compat_sys_move_pages);
+cond_syscall(compat_sys_migrate_pages);
+
+/* block-layer dependent */
+cond_syscall(sys_bdflush);
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
+
+/* New file descriptors */
+cond_syscall(sys_signalfd);
+cond_syscall(sys_signalfd4);
+cond_syscall(compat_sys_signalfd);
+cond_syscall(compat_sys_signalfd4);
+cond_syscall(sys_timerfd_create);
+cond_syscall(sys_timerfd_settime);
+cond_syscall(sys_timerfd_gettime);
+cond_syscall(compat_sys_timerfd_settime);
+cond_syscall(compat_sys_timerfd_gettime);
+cond_syscall(sys_eventfd);
+cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
+
+/* performance counters: */
+cond_syscall(sys_perf_event_open);
+
+/* fanotify! */
+cond_syscall(sys_fanotify_init);
+cond_syscall(sys_fanotify_mark);
+cond_syscall(compat_sys_fanotify_mark);
+
+/* open by handle */
+cond_syscall(sys_name_to_handle_at);
+cond_syscall(sys_open_by_handle_at);
+cond_syscall(compat_sys_open_by_handle_at);
+
+/* compare kernel pointers */
+cond_syscall(sys_kcmp);
+
+/* operate on Secure Computing state */
+cond_syscall(sys_seccomp);
+
+/* access BPF programs and maps */
+cond_syscall(sys_bpf);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
new file mode 100644
index 000000000..7f45887fa
--- /dev/null
+++ b/kernel/sysctl.c
@@ -0,0 +1,2819 @@
+/*
+ * sysctl.c: General linux system control interface
+ *
+ * Begun 24 March 1995, Stephen Tweedie
+ * Added /proc support, Dec 1995
+ * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
+ * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
+ * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
+ * Dynamic registration fixes, Stephen Tweedie.
+ * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
+ * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
+ * Horn.
+ * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
+ * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
+ * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
+ * Wendling.
+ * The list_for_each() macro wasn't appropriate for the sysctl loop.
+ * Removed it and replaced it with older style, 03/23/00, Bill Wendling
+ */
+
+#include <linux/module.h>
+#include <linux/aio.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/bitmap.h>
+#include <linux/signal.h>
+#include <linux/printk.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/ctype.h>
+#include <linux/kmemcheck.h>
+#include <linux/kmemleak.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/net.h>
+#include <linux/sysrq.h>
+#include <linux/highuid.h>
+#include <linux/writeback.h>
+#include <linux/ratelimit.h>
+#include <linux/compaction.h>
+#include <linux/hugetlb.h>
+#include <linux/initrd.h>
+#include <linux/key.h>
+#include <linux/times.h>
+#include <linux/limits.h>
+#include <linux/dcache.h>
+#include <linux/dnotify.h>
+#include <linux/syscalls.h>
+#include <linux/vmstat.h>
+#include <linux/nfs_fs.h>
+#include <linux/acpi.h>
+#include <linux/reboot.h>
+#include <linux/ftrace.h>
+#include <linux/perf_event.h>
+#include <linux/kprobes.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/kmod.h>
+#include <linux/capability.h>
+#include <linux/binfmts.h>
+#include <linux/sched/sysctl.h>
+#include <linux/kexec.h>
+
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_X86
+#include <asm/nmi.h>
+#include <asm/stacktrace.h>
+#include <asm/io.h>
+#endif
+#ifdef CONFIG_SPARC
+#include <asm/setup.h>
+#endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+#include <linux/acct.h>
+#endif
+#ifdef CONFIG_RT_MUTEXES
+#include <linux/rtmutex.h>
+#endif
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
+#include <linux/lockdep.h>
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+#include <scsi/sg.h>
+#endif
+
+#ifdef CONFIG_LOCKUP_DETECTOR
+#include <linux/nmi.h>
+#endif
+
+#if defined(CONFIG_SYSCTL)
+
+/* External variables not in a header file. */
+extern int suid_dumpable;
+#ifdef CONFIG_COREDUMP
+extern int core_uses_pid;
+extern char core_pattern[];
+extern unsigned int core_pipe_limit;
+#endif
+extern int pid_max;
+extern int pid_max_min, pid_max_max;
+extern int percpu_pagelist_fraction;
+extern int compat_log;
+extern int latencytop_enabled;
+extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifndef CONFIG_MMU
+extern int sysctl_nr_trim_pages;
+#endif
+
+/* Constants used for minimum and maximum */
+#ifdef CONFIG_LOCKUP_DETECTOR
+static int sixty = 60;
+#endif
+
+static int __maybe_unused neg_one = -1;
+
+static int zero;
+static int __maybe_unused one = 1;
+static int __maybe_unused two = 2;
+static int __maybe_unused four = 4;
+static unsigned long one_ul = 1;
+static int __maybe_unused one_hundred = 100;
+#ifdef CONFIG_SCHED_BFS
+extern int rr_interval;
+extern int sched_iso_cpu;
+static int __read_mostly one_thousand = 1000;
+#endif
+#ifdef CONFIG_PRINTK
+static int ten_thousand = 10000;
+#endif
+
+/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
+
+/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+static int maxolduid = 65535;
+static int minolduid;
+
+static int ngroups_max = NGROUPS_MAX;
+static const int cap_last_cap = CAP_LAST_CAP;
+
+/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
+#ifdef CONFIG_DETECT_HUNG_TASK
+static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
+#endif
+
+#ifdef CONFIG_INOTIFY_USER
+#include <linux/inotify.h>
+#endif
+#ifdef CONFIG_SPARC
+#endif
+
+#ifdef __hppa__
+extern int pwrsw_enabled;
+#endif
+
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
+extern int unaligned_enabled;
+#endif
+
+#ifdef CONFIG_IA64
+extern int unaligned_dump_stack;
+#endif
+
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
+extern int no_unaligned_warning;
+#endif
+
+#ifdef CONFIG_PROC_SYSCTL
+
+#define SYSCTL_WRITES_LEGACY -1
+#define SYSCTL_WRITES_WARN 0
+#define SYSCTL_WRITES_STRICT 1
+
+static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
+
+static int proc_do_cad_pid(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_taint(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
+#ifdef CONFIG_PRINTK
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_COREDUMP
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+/* Note: sysrq code uses it's own private copy */
+static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
+
+static int sysrq_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int error;
+
+ error = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (error)
+ return error;
+
+ if (write)
+ sysrq_toggle_support(__sysrq_enabled);
+
+ return 0;
+}
+
+#endif
+
+static struct ctl_table kern_table[];
+static struct ctl_table vm_table[];
+static struct ctl_table fs_table[];
+static struct ctl_table debug_table[];
+static struct ctl_table dev_table[];
+extern struct ctl_table random_table[];
+#ifdef CONFIG_EPOLL
+extern struct ctl_table epoll_table[];
+#endif
+
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+int sysctl_legacy_va_layout;
+#endif
+
+/* The default sysctl tables: */
+
+static struct ctl_table sysctl_base_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = kern_table,
+ },
+ {
+ .procname = "vm",
+ .mode = 0555,
+ .child = vm_table,
+ },
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = fs_table,
+ },
+ {
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ {
+ .procname = "dev",
+ .mode = 0555,
+ .child = dev_table,
+ },
+ { }
+};
+
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
+static int min_sched_granularity_ns = 100000; /* 100 usecs */
+static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_wakeup_granularity_ns; /* 0 usecs */
+static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
+
+#ifdef CONFIG_COMPACTION
+static int min_extfrag_threshold;
+static int max_extfrag_threshold = 1000;
+#endif
+
+static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_BFS
+ {
+ .procname = "sched_child_runs_first",
+ .data = &sysctl_sched_child_runs_first,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_SCHED_DEBUG
+ {
+ .procname = "sched_min_granularity_ns",
+ .data = &sysctl_sched_min_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_proc_update_handler,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .procname = "sched_latency_ns",
+ .data = &sysctl_sched_latency,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_proc_update_handler,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .procname = "sched_wakeup_granularity_ns",
+ .data = &sysctl_sched_wakeup_granularity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_proc_update_handler,
+ .extra1 = &min_wakeup_granularity_ns,
+ .extra2 = &max_wakeup_granularity_ns,
+ },
+#ifdef CONFIG_SMP
+ {
+ .procname = "sched_tunable_scaling",
+ .data = &sysctl_sched_tunable_scaling,
+ .maxlen = sizeof(enum sched_tunable_scaling),
+ .mode = 0644,
+ .proc_handler = sched_proc_update_handler,
+ .extra1 = &min_sched_tunable_scaling,
+ .extra2 = &max_sched_tunable_scaling,
+ },
+ {
+ .procname = "sched_migration_cost_ns",
+ .data = &sysctl_sched_migration_cost,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_nr_migrate",
+ .data = &sysctl_sched_nr_migrate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_time_avg_ms",
+ .data = &sysctl_sched_time_avg,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_shares_window_ns",
+ .data = &sysctl_sched_shares_window,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_scan_delay_ms",
+ .data = &sysctl_numa_balancing_scan_delay,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_min_ms",
+ .data = &sysctl_numa_balancing_scan_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_max_ms",
+ .data = &sysctl_numa_balancing_scan_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_size_mb",
+ .data = &sysctl_numa_balancing_scan_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
+ {
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ },
+ {
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ },
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
+#ifdef CONFIG_SCHED_AUTOGROUP
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_slice_us",
+ .data = &sysctl_sched_cfs_bandwidth_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+#endif
+#endif /* !CONFIG_SCHED_BFS */
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "panic",
+ .data = &panic_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_COREDUMP
+ {
+ .procname = "core_uses_pid",
+ .data = &core_uses_pid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "core_pattern",
+ .data = core_pattern,
+ .maxlen = CORENAME_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = proc_dostring_coredump,
+ },
+ {
+ .procname = "core_pipe_limit",
+ .data = &core_pipe_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "tainted",
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = proc_taint,
+ },
+ {
+ .procname = "sysctl_writes_strict",
+ .data = &sysctl_writes_strict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &neg_one,
+ .extra2 = &one,
+ },
+#endif
+#ifdef CONFIG_LATENCYTOP
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_BLK_DEV_INITRD
+ {
+ .procname = "real-root-dev",
+ .data = &real_root_dev,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "print-fatal-signals",
+ .data = &print_fatal_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_SPARC
+ {
+ .procname = "reboot-cmd",
+ .data = reboot_command,
+ .maxlen = 256,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "stop-a",
+ .data = &stop_a_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "scons-poweroff",
+ .data = &scons_pwroff,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_SPARC64
+ {
+ .procname = "tsb-ratio",
+ .data = &sysctl_tsb_ratio,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef __hppa__
+ {
+ .procname = "soft-power",
+ .data = &pwrsw_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
+ {
+ .procname = "unaligned-trap",
+ .data = &unaligned_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "ctrl-alt-del",
+ .data = &C_A_D,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_FUNCTION_TRACER
+ {
+ .procname = "ftrace_enabled",
+ .data = &ftrace_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ftrace_enable_sysctl,
+ },
+#endif
+#ifdef CONFIG_STACK_TRACER
+ {
+ .procname = "stack_tracer_enabled",
+ .data = &stack_tracer_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = stack_trace_sysctl,
+ },
+#endif
+#ifdef CONFIG_TRACING
+ {
+ .procname = "ftrace_dump_on_oops",
+ .data = &ftrace_dump_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "traceoff_on_warning",
+ .data = &__disable_trace_on_warning,
+ .maxlen = sizeof(__disable_trace_on_warning),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_KEXEC
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
+#endif
+#ifdef CONFIG_MODULES
+ {
+ .procname = "modprobe",
+ .data = &modprobe_path,
+ .maxlen = KMOD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "modules_disabled",
+ .data = &modules_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
+#endif
+#ifdef CONFIG_UEVENT_HELPER
+ {
+ .procname = "hotplug",
+ .data = &uevent_helper,
+ .maxlen = UEVENT_HELPER_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+ {
+ .procname = "sg-big-buff",
+ .data = &sg_big_buff,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+ {
+ .procname = "acct",
+ .data = &acct_parm,
+ .maxlen = 3*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_MAGIC_SYSRQ
+ {
+ .procname = "sysrq",
+ .data = &__sysrq_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = sysrq_sysctl_handler,
+ },
+#endif
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "cad_pid",
+ .data = NULL,
+ .maxlen = sizeof (int),
+ .mode = 0600,
+ .proc_handler = proc_do_cad_pid,
+ },
+#endif
+ {
+ .procname = "threads-max",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_max_threads,
+ },
+ {
+ .procname = "random",
+ .mode = 0555,
+ .child = random_table,
+ },
+ {
+ .procname = "usermodehelper",
+ .mode = 0555,
+ .child = usermodehelper_table,
+ },
+ {
+ .procname = "overflowuid",
+ .data = &overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+ {
+ .procname = "overflowgid",
+ .data = &overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+#ifdef CONFIG_S390
+#ifdef CONFIG_MATHEMU
+ {
+ .procname = "ieee_emulation_warnings",
+ .data = &sysctl_ieee_emulation_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "userprocess_debug",
+ .data = &show_unhandled_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "pid_max",
+ .data = &pid_max,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+ {
+ .procname = "panic_on_oops",
+ .data = &panic_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#if defined CONFIG_PRINTK
+ {
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_ratelimit",
+ .data = &printk_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "printk_ratelimit_burst",
+ .data = &printk_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_delay",
+ .data = &printk_delay_msec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &ten_thousand,
+ },
+ {
+ .procname = "dmesg_restrict",
+ .data = &dmesg_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+#endif
+ {
+ .procname = "ngroups_max",
+ .data = &ngroups_max,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cap_last_cap",
+ .data = (void *)&cap_last_cap,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+#if defined(CONFIG_LOCKUP_DETECTOR)
+ {
+ .procname = "watchdog",
+ .data = &watchdog_user_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_watchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "watchdog_thresh",
+ .data = &watchdog_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_watchdog_thresh,
+ .extra1 = &zero,
+ .extra2 = &sixty,
+ },
+ {
+ .procname = "nmi_watchdog",
+ .data = &nmi_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_nmi_watchdog,
+ .extra1 = &zero,
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+ .extra2 = &one,
+#else
+ .extra2 = &zero,
+#endif
+ },
+ {
+ .procname = "soft_watchdog",
+ .data = &soft_watchdog_enabled,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_soft_watchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "softlockup_panic",
+ .data = &softlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#ifdef CONFIG_SMP
+ {
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SMP */
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+ {
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_X86)
+ {
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ {
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "bootloader_type",
+ .data = &bootloader_type,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_version",
+ .data = &bootloader_version,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "kstack_depth_to_print",
+ .data = &kstack_depth_to_print,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "io_delay_type",
+ .data = &io_delay_type,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_MMU)
+ {
+ .procname = "randomize_va_space",
+ .data = &randomize_va_space,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_SCHED_BFS
+ {
+ .procname = "rr_interval",
+ .data = &rr_interval,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one_thousand,
+ },
+ {
+ .procname = "iso_cpu",
+ .data = &sched_iso_cpu,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
+#if defined(CONFIG_S390) && defined(CONFIG_SMP)
+ {
+ .procname = "spin_retry",
+ .data = &spin_retry,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
+ {
+ .procname = "acpi_video_flags",
+ .data = &acpi_realmode_flags,
+ .maxlen = sizeof (unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+#endif
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
+ {
+ .procname = "ignore-unaligned-usertrap",
+ .data = &no_unaligned_warning,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_IA64
+ {
+ .procname = "unaligned-dump-stack",
+ .data = &unaligned_dump_stack,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+ {
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = &hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &neg_one,
+ },
+#endif
+#ifdef CONFIG_COMPAT
+ {
+ .procname = "compat-log",
+ .data = &compat_log,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_RT_MUTEXES
+ {
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "poweroff_cmd",
+ .data = &poweroff_cmd,
+ .maxlen = POWEROFF_CMD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+#ifdef CONFIG_KEYS
+ {
+ .procname = "keys",
+ .mode = 0555,
+ .child = key_sysctls,
+ },
+#endif
+#ifdef CONFIG_PERF_EVENTS
+ /*
+ * User-space scripts rely on the existence of this file
+ * as a feature check for perf_events being enabled.
+ *
+ * So it's an ABI, do not remove!
+ */
+ {
+ .procname = "perf_event_paranoid",
+ .data = &sysctl_perf_event_paranoid,
+ .maxlen = sizeof(sysctl_perf_event_paranoid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_mlock_kb",
+ .data = &sysctl_perf_event_mlock,
+ .maxlen = sizeof(sysctl_perf_event_mlock),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_max_sample_rate",
+ .data = &sysctl_perf_event_sample_rate,
+ .maxlen = sizeof(sysctl_perf_event_sample_rate),
+ .mode = 0644,
+ .proc_handler = perf_proc_update_handler,
+ .extra1 = &one,
+ },
+ {
+ .procname = "perf_cpu_time_max_percent",
+ .data = &sysctl_perf_cpu_time_max_percent,
+ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
+ .mode = 0644,
+ .proc_handler = perf_cpu_time_max_percent_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
+#ifdef CONFIG_KMEMCHECK
+ {
+ .procname = "kmemcheck",
+ .data = &kmemcheck_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ { }
+};
+
+static struct ctl_table vm_table[] = {
+ {
+ .procname = "overcommit_memory",
+ .data = &sysctl_overcommit_memory,
+ .maxlen = sizeof(sysctl_overcommit_memory),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
+ .procname = "panic_on_oom",
+ .data = &sysctl_panic_on_oom,
+ .maxlen = sizeof(sysctl_panic_on_oom),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+ {
+ .procname = "oom_kill_allocating_task",
+ .data = &sysctl_oom_kill_allocating_task,
+ .maxlen = sizeof(sysctl_oom_kill_allocating_task),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "oom_dump_tasks",
+ .data = &sysctl_oom_dump_tasks,
+ .maxlen = sizeof(sysctl_oom_dump_tasks),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "overcommit_ratio",
+ .data = &sysctl_overcommit_ratio,
+ .maxlen = sizeof(sysctl_overcommit_ratio),
+ .mode = 0644,
+ .proc_handler = overcommit_ratio_handler,
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler,
+ },
+ {
+ .procname = "page-cluster",
+ .data = &page_cluster,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "dirty_background_ratio",
+ .data = &dirty_background_ratio,
+ .maxlen = sizeof(dirty_background_ratio),
+ .mode = 0644,
+ .proc_handler = dirty_background_ratio_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "dirty_background_bytes",
+ .data = &dirty_background_bytes,
+ .maxlen = sizeof(dirty_background_bytes),
+ .mode = 0644,
+ .proc_handler = dirty_background_bytes_handler,
+ .extra1 = &one_ul,
+ },
+ {
+ .procname = "dirty_ratio",
+ .data = &vm_dirty_ratio,
+ .maxlen = sizeof(vm_dirty_ratio),
+ .mode = 0644,
+ .proc_handler = dirty_ratio_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "dirty_bytes",
+ .data = &vm_dirty_bytes,
+ .maxlen = sizeof(vm_dirty_bytes),
+ .mode = 0644,
+ .proc_handler = dirty_bytes_handler,
+ .extra1 = &dirty_bytes_min,
+ },
+ {
+ .procname = "dirty_writeback_centisecs",
+ .data = &dirty_writeback_interval,
+ .maxlen = sizeof(dirty_writeback_interval),
+ .mode = 0644,
+ .proc_handler = dirty_writeback_centisecs_handler,
+ },
+ {
+ .procname = "dirty_expire_centisecs",
+ .data = &dirty_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "dirtytime_expire_seconds",
+ .data = &dirtytime_expire_interval,
+ .maxlen = sizeof(dirty_expire_interval),
+ .mode = 0644,
+ .proc_handler = dirtytime_interval_handler,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "nr_pdflush_threads",
+ .mode = 0444 /* read-only */,
+ .proc_handler = pdflush_proc_obsolete,
+ },
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#ifdef CONFIG_HUGETLB_PAGE
+ {
+ .procname = "nr_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = hugetlb_sysctl_handler,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "nr_hugepages_mempolicy",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &hugetlb_mempolicy_sysctl_handler,
+ },
+#endif
+ {
+ .procname = "hugetlb_shm_group",
+ .data = &sysctl_hugetlb_shm_group,
+ .maxlen = sizeof(gid_t),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "hugepages_treat_as_movable",
+ .data = &hugepages_treat_as_movable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nr_overcommit_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = hugetlb_overcommit_handler,
+ },
+#endif
+ {
+ .procname = "lowmem_reserve_ratio",
+ .data = &sysctl_lowmem_reserve_ratio,
+ .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
+ .mode = 0644,
+ .proc_handler = lowmem_reserve_ratio_sysctl_handler,
+ },
+ {
+ .procname = "drop_caches",
+ .data = &sysctl_drop_caches,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = drop_caches_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &four,
+ },
+#ifdef CONFIG_COMPACTION
+ {
+ .procname = "compact_memory",
+ .data = &sysctl_compact_memory,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = sysctl_compaction_handler,
+ },
+ {
+ .procname = "extfrag_threshold",
+ .data = &sysctl_extfrag_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_extfrag_handler,
+ .extra1 = &min_extfrag_threshold,
+ .extra2 = &max_extfrag_threshold,
+ },
+ {
+ .procname = "compact_unevictable_allowed",
+ .data = &sysctl_compact_unevictable_allowed,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+
+#endif /* CONFIG_COMPACTION */
+ {
+ .procname = "min_free_kbytes",
+ .data = &min_free_kbytes,
+ .maxlen = sizeof(min_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "percpu_pagelist_fraction",
+ .data = &percpu_pagelist_fraction,
+ .maxlen = sizeof(percpu_pagelist_fraction),
+ .mode = 0644,
+ .proc_handler = percpu_pagelist_fraction_sysctl_handler,
+ .extra1 = &zero,
+ },
+#ifdef CONFIG_MMU
+ {
+ .procname = "max_map_count",
+ .data = &sysctl_max_map_count,
+ .maxlen = sizeof(sysctl_max_map_count),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+#else
+ {
+ .procname = "nr_trim_pages",
+ .data = &sysctl_nr_trim_pages,
+ .maxlen = sizeof(sysctl_nr_trim_pages),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+#endif
+ {
+ .procname = "laptop_mode",
+ .data = &laptop_mode,
+ .maxlen = sizeof(laptop_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "block_dump",
+ .data = &block_dump,
+ .maxlen = sizeof(block_dump),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "vfs_cache_pressure",
+ .data = &sysctl_vfs_cache_pressure,
+ .maxlen = sizeof(sysctl_vfs_cache_pressure),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+ {
+ .procname = "legacy_va_layout",
+ .data = &sysctl_legacy_va_layout,
+ .maxlen = sizeof(sysctl_legacy_va_layout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
+#endif
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &zone_reclaim_mode,
+ .maxlen = sizeof(zone_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+ {
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
+#ifdef CONFIG_SMP
+ {
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
+#ifdef CONFIG_MMU
+ {
+ .procname = "mmap_min_addr",
+ .data = &dac_mmap_min_addr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = mmap_min_addr_handler,
+ },
+#endif
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_zonelist_order",
+ .data = &numa_zonelist_order,
+ .maxlen = NUMA_ZONELIST_ORDER_LEN,
+ .mode = 0644,
+ .proc_handler = numa_zonelist_order_handler,
+ },
+#endif
+#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
+ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
+ {
+ .procname = "vdso_enabled",
+#ifdef CONFIG_X86_32
+ .data = &vdso32_enabled,
+ .maxlen = sizeof(vdso32_enabled),
+#else
+ .data = &vdso_enabled,
+ .maxlen = sizeof(vdso_enabled),
+#endif
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ },
+#endif
+#ifdef CONFIG_HIGHMEM
+ {
+ .procname = "highmem_is_dirtyable",
+ .data = &vm_highmem_is_dirtyable,
+ .maxlen = sizeof(vm_highmem_is_dirtyable),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ {
+ .procname = "memory_failure_early_kill",
+ .data = &sysctl_memory_failure_early_kill,
+ .maxlen = sizeof(sysctl_memory_failure_early_kill),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "memory_failure_recovery",
+ .data = &sysctl_memory_failure_recovery,
+ .maxlen = sizeof(sysctl_memory_failure_recovery),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ { }
+};
+
+static struct ctl_table fs_table[] = {
+ {
+ .procname = "inode-nr",
+ .data = &inodes_stat,
+ .maxlen = 2*sizeof(long),
+ .mode = 0444,
+ .proc_handler = proc_nr_inodes,
+ },
+ {
+ .procname = "inode-state",
+ .data = &inodes_stat,
+ .maxlen = 7*sizeof(long),
+ .mode = 0444,
+ .proc_handler = proc_nr_inodes,
+ },
+ {
+ .procname = "file-nr",
+ .data = &files_stat,
+ .maxlen = sizeof(files_stat),
+ .mode = 0444,
+ .proc_handler = proc_nr_files,
+ },
+ {
+ .procname = "file-max",
+ .data = &files_stat.max_files,
+ .maxlen = sizeof(files_stat.max_files),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "nr_open",
+ .data = &sysctl_nr_open,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &sysctl_nr_open_min,
+ .extra2 = &sysctl_nr_open_max,
+ },
+ {
+ .procname = "dentry-state",
+ .data = &dentry_stat,
+ .maxlen = 6*sizeof(long),
+ .mode = 0444,
+ .proc_handler = proc_nr_dentry,
+ },
+ {
+ .procname = "overflowuid",
+ .data = &fs_overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+ {
+ .procname = "overflowgid",
+ .data = &fs_overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &minolduid,
+ .extra2 = &maxolduid,
+ },
+#ifdef CONFIG_FILE_LOCKING
+ {
+ .procname = "leases-enable",
+ .data = &leases_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_DNOTIFY
+ {
+ .procname = "dir-notify-enable",
+ .data = &dir_notify_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_MMU
+#ifdef CONFIG_FILE_LOCKING
+ {
+ .procname = "lease-break-time",
+ .data = &lease_break_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_AIO
+ {
+ .procname = "aio-nr",
+ .data = &aio_nr,
+ .maxlen = sizeof(aio_nr),
+ .mode = 0444,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "aio-max-nr",
+ .data = &aio_max_nr,
+ .maxlen = sizeof(aio_max_nr),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+#endif /* CONFIG_AIO */
+#ifdef CONFIG_INOTIFY_USER
+ {
+ .procname = "inotify",
+ .mode = 0555,
+ .child = inotify_table,
+ },
+#endif
+#ifdef CONFIG_EPOLL
+ {
+ .procname = "epoll",
+ .mode = 0555,
+ .child = epoll_table,
+ },
+#endif
+#endif
+ {
+ .procname = "protected_symlinks",
+ .data = &sysctl_protected_symlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "protected_hardlinks",
+ .data = &sysctl_protected_hardlinks,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "suid_dumpable",
+ .data = &suid_dumpable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_coredump,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+ {
+ .procname = "binfmt_misc",
+ .mode = 0555,
+ .child = sysctl_mount_point,
+ },
+#endif
+ {
+ .procname = "pipe-max-size",
+ .data = &pipe_max_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &pipe_proc_fn,
+ .extra1 = &pipe_min_size,
+ },
+ { }
+};
+
+static struct ctl_table debug_table[] = {
+#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
+ {
+ .procname = "exception-trace",
+ .data = &show_unhandled_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+#if defined(CONFIG_OPTPROBES)
+ {
+ .procname = "kprobes-optimization",
+ .data = &sysctl_kprobes_optimization,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_kprobes_optimization_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
+ { }
+};
+
+static struct ctl_table dev_table[] = {
+ { }
+};
+
+int __init sysctl_init(void)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_table(sysctl_base_table);
+ kmemleak_not_leak(hdr);
+ return 0;
+}
+
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * /proc/sys support
+ */
+
+#ifdef CONFIG_PROC_SYSCTL
+
+static int _proc_do_string(char *data, int maxlen, int write,
+ char __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ size_t len;
+ char __user *p;
+ char c;
+
+ if (!data || !maxlen || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ /* Only continue writes not past the end of buffer. */
+ len = strlen(data);
+ if (len > maxlen - 1)
+ len = maxlen - 1;
+
+ if (*ppos > len)
+ return 0;
+ len = *ppos;
+ } else {
+ /* Start writing from beginning of buffer. */
+ len = 0;
+ }
+
+ *ppos += *lenp;
+ p = buffer;
+ while ((p - buffer) < *lenp && len < maxlen - 1) {
+ if (get_user(c, p++))
+ return -EFAULT;
+ if (c == 0 || c == '\n')
+ break;
+ data[len++] = c;
+ }
+ data[len] = 0;
+ } else {
+ len = strlen(data);
+ if (len > maxlen)
+ len = maxlen;
+
+ if (*ppos > len) {
+ *lenp = 0;
+ return 0;
+ }
+
+ data += *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ if (copy_to_user(buffer, data, len))
+ return -EFAULT;
+ if (len < *lenp) {
+ if (put_user('\n', buffer + len))
+ return -EFAULT;
+ len++;
+ }
+ *lenp = len;
+ *ppos += len;
+ }
+ return 0;
+}
+
+static void warn_sysctl_write(struct ctl_table *table)
+{
+ pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ "This will not be supported in the future. To silence this\n"
+ "warning, set kernel.sysctl_writes_strict = -1\n",
+ current->comm, table->procname);
+}
+
+/**
+ * proc_dostring - read a string sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a string from/to the user buffer. If the kernel
+ * buffer provided is not large enough to hold the string, the
+ * string is truncated. The copied string is %NULL-terminated.
+ * If the string is being read by the user process, it is copied
+ * and a newline '\n' is added. It is truncated if the buffer is
+ * not large enough.
+ *
+ * Returns 0 on success.
+ */
+int proc_dostring(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
+ warn_sysctl_write(table);
+
+ return _proc_do_string((char *)(table->data), table->maxlen, write,
+ (char __user *)buffer, lenp, ppos);
+}
+
+static size_t proc_skip_spaces(char **buf)
+{
+ size_t ret;
+ char *tmp = skip_spaces(*buf);
+ ret = tmp - *buf;
+ *buf = tmp;
+ return ret;
+}
+
+static void proc_skip_char(char **buf, size_t *size, const char v)
+{
+ while (*size) {
+ if (**buf != v)
+ break;
+ (*size)--;
+ (*buf)++;
+ }
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formatted integer from a user buffer
+ *
+ * @buf: a kernel buffer
+ * @size: size of the kernel buffer
+ * @val: this is where the number will be stored
+ * @neg: set to %TRUE if number is negative
+ * @perm_tr: a vector which contains the allowed trailers
+ * @perm_tr_len: size of the perm_tr vector
+ * @tr: pointer to store the trailer character
+ *
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes read. If @tr is non-NULL and a trailing
+ * character exists (size is non-zero after returning from this
+ * function), @tr is updated with the trailing character.
+ */
+static int proc_get_long(char **buf, size_t *size,
+ unsigned long *val, bool *neg,
+ const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+ int len;
+ char *p, tmp[TMPBUFLEN];
+
+ if (!*size)
+ return -EINVAL;
+
+ len = *size;
+ if (len > TMPBUFLEN - 1)
+ len = TMPBUFLEN - 1;
+
+ memcpy(tmp, *buf, len);
+
+ tmp[len] = 0;
+ p = tmp;
+ if (*p == '-' && *size > 1) {
+ *neg = true;
+ p++;
+ } else
+ *neg = false;
+ if (!isdigit(*p))
+ return -EINVAL;
+
+ *val = simple_strtoul(p, &p, 0);
+
+ len = p - tmp;
+
+ /* We don't know if the next char is whitespace thus we may accept
+ * invalid integers (e.g. 1234...a) or two integers instead of one
+ * (e.g. 123...1). So lets not allow such large numbers. */
+ if (len == TMPBUFLEN - 1)
+ return -EINVAL;
+
+ if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+ return -EINVAL;
+
+ if (tr && (len < *size))
+ *tr = *p;
+
+ *buf += len;
+ *size -= len;
+
+ return 0;
+}
+
+/**
+ * proc_put_long - converts an integer to a decimal ASCII formatted string
+ *
+ * @buf: the user buffer
+ * @size: the size of the user buffer
+ * @val: the integer to be converted
+ * @neg: sign of the number, %TRUE for negative
+ *
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes written.
+ */
+static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
+ bool neg)
+{
+ int len;
+ char tmp[TMPBUFLEN], *p = tmp;
+
+ sprintf(p, "%s%lu", neg ? "-" : "", val);
+ len = strlen(tmp);
+ if (len > *size)
+ len = *size;
+ if (copy_to_user(*buf, tmp, len))
+ return -EFAULT;
+ *size -= len;
+ *buf += len;
+ return 0;
+}
+#undef TMPBUFLEN
+
+static int proc_put_char(void __user **buf, size_t *size, char c)
+{
+ if (*size) {
+ char __user **buffer = (char __user **)buf;
+ if (put_user(c, *buffer))
+ return -EFAULT;
+ (*size)--, (*buffer)++;
+ *buf = *buffer;
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ *valp = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ *valp = *lvalp;
+ }
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = true;
+ *lvalp = (unsigned long)-val;
+ } else {
+ *negp = false;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
+static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+ int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ int *i, vleft, first = 1, err = 0;
+ unsigned long page = 0;
+ size_t left;
+ char *kbuf;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+ left = *lenp;
+
+ if (!conv)
+ conv = do_proc_dointvec_conv;
+
+ if (write) {
+ if (*ppos) {
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ goto out;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ page = __get_free_page(GFP_TEMPORARY);
+ kbuf = (char *) page;
+ if (!kbuf)
+ return -ENOMEM;
+ if (copy_from_user(kbuf, buffer, left)) {
+ err = -EFAULT;
+ goto free;
+ }
+ kbuf[left] = 0;
+ }
+
+ for (; left && vleft--; i++, first=0) {
+ unsigned long lval;
+ bool neg;
+
+ if (write) {
+ left -= proc_skip_spaces(&kbuf);
+
+ if (!left)
+ break;
+ err = proc_get_long(&kbuf, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (conv(&neg, &lval, i, 1, data)) {
+ err = -EINVAL;
+ break;
+ }
+ } else {
+ if (conv(&neg, &lval, i, 0, data)) {
+ err = -EINVAL;
+ break;
+ }
+ if (!first)
+ err = proc_put_char(&buffer, &left, '\t');
+ if (err)
+ break;
+ err = proc_put_long(&buffer, &left, lval, neg);
+ if (err)
+ break;
+ }
+ }
+
+ if (!write && !first && left && !err)
+ err = proc_put_char(&buffer, &left, '\n');
+ if (write && !err && left)
+ left -= proc_skip_spaces(&kbuf);
+free:
+ if (write) {
+ free_page(page);
+ if (first)
+ return err ? : -EINVAL;
+ }
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_dointvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_dointvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
+/**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ NULL,NULL);
+}
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ int i;
+ for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+ if ((tmptaint >> i) & 1)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+ }
+
+ return err;
+}
+
+#ifdef CONFIG_PRINTK
+static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
+struct do_proc_dointvec_minmax_conv_param {
+ int *min;
+ int *max;
+};
+
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ if (write) {
+ int val = *negp ? -*lvalp : *lvalp;
+ if ((param->min && *param->min > val) ||
+ (param->max && *param->max < val))
+ return -EINVAL;
+ *valp = val;
+ } else {
+ int val = *valp;
+ if (val < 0) {
+ *negp = true;
+ *lvalp = (unsigned long)-val;
+ } else {
+ *negp = false;
+ *lvalp = (unsigned long)val;
+ }
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_minmax - read a vector of integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_minmax_conv, &param);
+}
+
+static void validate_coredump_safety(void)
+{
+#ifdef CONFIG_COREDUMP
+ if (suid_dumpable == SUID_DUMP_ROOT &&
+ core_pattern[0] != '/' && core_pattern[0] != '|') {
+ printk(KERN_WARNING "Unsafe core_pattern used with "\
+ "suid_dumpable=2. Pipe handler or fully qualified "\
+ "core dump path required.\n");
+ }
+#endif
+}
+
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+
+#ifdef CONFIG_COREDUMP
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (!error)
+ validate_coredump_safety();
+ return error;
+}
+#endif
+
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ unsigned long convmul,
+ unsigned long convdiv)
+{
+ unsigned long *i, *min, *max;
+ int vleft, first = 1, err = 0;
+ unsigned long page = 0;
+ size_t left;
+ char *kbuf;
+
+ if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned long *) data;
+ min = (unsigned long *) table->extra1;
+ max = (unsigned long *) table->extra2;
+ vleft = table->maxlen / sizeof(unsigned long);
+ left = *lenp;
+
+ if (write) {
+ if (*ppos) {
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ goto out;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+ page = __get_free_page(GFP_TEMPORARY);
+ kbuf = (char *) page;
+ if (!kbuf)
+ return -ENOMEM;
+ if (copy_from_user(kbuf, buffer, left)) {
+ err = -EFAULT;
+ goto free;
+ }
+ kbuf[left] = 0;
+ }
+
+ for (; left && vleft--; i++, first = 0) {
+ unsigned long val;
+
+ if (write) {
+ bool neg;
+
+ left -= proc_skip_spaces(&kbuf);
+
+ err = proc_get_long(&kbuf, &left, &val, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err)
+ break;
+ if (neg)
+ continue;
+ if ((min && val < *min) || (max && val > *max))
+ continue;
+ *i = val;
+ } else {
+ val = convdiv * (*i) / convmul;
+ if (!first) {
+ err = proc_put_char(&buffer, &left, '\t');
+ if (err)
+ break;
+ }
+ err = proc_put_long(&buffer, &left, val, false);
+ if (err)
+ break;
+ }
+ }
+
+ if (!write && !first && left && !err)
+ err = proc_put_char(&buffer, &left, '\n');
+ if (write && !err)
+ left -= proc_skip_spaces(&kbuf);
+free:
+ if (write) {
+ free_page(page);
+ if (first)
+ return err ? : -EINVAL;
+ }
+ *lenp -= left;
+out:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ unsigned long convmul,
+ unsigned long convdiv)
+{
+ return __do_proc_doulongvec_minmax(table->data, table, write,
+ buffer, lenp, ppos, convmul, convdiv);
+}
+
+/**
+ * proc_doulongvec_minmax - read a vector of long integers with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
+}
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return do_proc_doulongvec_minmax(table, write, buffer,
+ lenp, ppos, HZ, 1000l);
+}
+
+
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*lvalp > LONG_MAX / HZ)
+ return 1;
+ *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = lval / HZ;
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ return 1;
+ *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_clock_t(lval);
+ }
+ return 0;
+}
+
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+
+ if (jif > INT_MAX)
+ return 1;
+ *valp = (int)jif;
+ } else {
+ int val = *valp;
+ unsigned long lval;
+ if (val < 0) {
+ *negp = true;
+ lval = (unsigned long)-val;
+ } else {
+ *negp = false;
+ lval = (unsigned long)val;
+ }
+ *lvalp = jiffies_to_msecs(lval);
+ }
+ return 0;
+}
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+static int proc_do_cad_pid(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp;
+ int r;
+
+ tmp = pid_vnr(cad_pid);
+
+ r = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+ bool first = 1;
+ size_t left = *lenp;
+ unsigned long bitmap_len = table->maxlen;
+ unsigned long *bitmap = *(unsigned long **) table->data;
+ unsigned long *tmp_bitmap = NULL;
+ char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (write) {
+ unsigned long page = 0;
+ char *kbuf;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+
+ page = __get_free_page(GFP_TEMPORARY);
+ kbuf = (char *) page;
+ if (!kbuf)
+ return -ENOMEM;
+ if (copy_from_user(kbuf, buffer, left)) {
+ free_page(page);
+ return -EFAULT;
+ }
+ kbuf[left] = 0;
+
+ tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!tmp_bitmap) {
+ free_page(page);
+ return -ENOMEM;
+ }
+ proc_skip_char(&kbuf, &left, '\n');
+ while (!err && left) {
+ unsigned long val_a, val_b;
+ bool neg;
+
+ err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
+ sizeof(tr_a), &c);
+ if (err)
+ break;
+ if (val_a >= bitmap_len || neg) {
+ err = -EINVAL;
+ break;
+ }
+
+ val_b = val_a;
+ if (left) {
+ kbuf++;
+ left--;
+ }
+
+ if (c == '-') {
+ err = proc_get_long(&kbuf, &left, &val_b,
+ &neg, tr_b, sizeof(tr_b),
+ &c);
+ if (err)
+ break;
+ if (val_b >= bitmap_len || neg ||
+ val_a > val_b) {
+ err = -EINVAL;
+ break;
+ }
+ if (left) {
+ kbuf++;
+ left--;
+ }
+ }
+
+ bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
+ first = 0;
+ proc_skip_char(&kbuf, &left, '\n');
+ }
+ free_page(page);
+ } else {
+ unsigned long bit_a, bit_b = 0;
+
+ while (left) {
+ bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+ if (bit_a >= bitmap_len)
+ break;
+ bit_b = find_next_zero_bit(bitmap, bitmap_len,
+ bit_a + 1) - 1;
+
+ if (!first) {
+ err = proc_put_char(&buffer, &left, ',');
+ if (err)
+ break;
+ }
+ err = proc_put_long(&buffer, &left, bit_a, false);
+ if (err)
+ break;
+ if (bit_a != bit_b) {
+ err = proc_put_char(&buffer, &left, '-');
+ if (err)
+ break;
+ err = proc_put_long(&buffer, &left, bit_b, false);
+ if (err)
+ break;
+ }
+
+ first = 0; bit_b++;
+ }
+ if (!err)
+ err = proc_put_char(&buffer, &left, '\n');
+ }
+
+ if (!err) {
+ if (write) {
+ if (*ppos)
+ bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
+ else
+ bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
+ }
+ kfree(tmp_bitmap);
+ *lenp -= left;
+ *ppos += *lenp;
+ return 0;
+ } else {
+ kfree(tmp_bitmap);
+ return err;
+ }
+}
+
+#else /* CONFIG_PROC_SYSCTL */
+
+int proc_dostring(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
+
+#endif /* CONFIG_PROC_SYSCTL */
+
+/*
+ * No sense putting this after each symbol definition, twice,
+ * exception granted :-)
+ */
+EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+EXPORT_SYMBOL(proc_dointvec_minmax);
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+EXPORT_SYMBOL(proc_dostring);
+EXPORT_SYMBOL(proc_doulongvec_minmax);
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 000000000..7e7746a42
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,1494 @@
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include "../fs/xfs/xfs_sysctl.h"
+#include <linux/sunrpc/debug.h>
+#include <linux/string.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/file.h>
+#include <linux/ctype.h>
+#include <linux/netdevice.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+
+struct bin_table;
+typedef ssize_t bin_convert_t(struct file *file,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
+
+static bin_convert_t bin_dir;
+static bin_convert_t bin_string;
+static bin_convert_t bin_intvec;
+static bin_convert_t bin_ulongvec;
+static bin_convert_t bin_uuid;
+static bin_convert_t bin_dn_node_address;
+
+#define CTL_DIR bin_dir
+#define CTL_STR bin_string
+#define CTL_INT bin_intvec
+#define CTL_ULONG bin_ulongvec
+#define CTL_UUID bin_uuid
+#define CTL_DNADR bin_dn_node_address
+
+#define BUFSZ 256
+
+struct bin_table {
+ bin_convert_t *convert;
+ int ctl_name;
+ const char *procname;
+ const struct bin_table *child;
+};
+
+static const struct bin_table bin_random_table[] = {
+ { CTL_INT, RANDOM_POOLSIZE, "poolsize" },
+ { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" },
+ { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" },
+ { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" },
+ { CTL_UUID, RANDOM_BOOT_ID, "boot_id" },
+ { CTL_UUID, RANDOM_UUID, "uuid" },
+ {}
+};
+
+static const struct bin_table bin_pty_table[] = {
+ { CTL_INT, PTY_MAX, "max" },
+ { CTL_INT, PTY_NR, "nr" },
+ {}
+};
+
+static const struct bin_table bin_kern_table[] = {
+ { CTL_STR, KERN_OSTYPE, "ostype" },
+ { CTL_STR, KERN_OSRELEASE, "osrelease" },
+ /* KERN_OSREV not used */
+ { CTL_STR, KERN_VERSION, "version" },
+ /* KERN_SECUREMASK not used */
+ /* KERN_PROF not used */
+ { CTL_STR, KERN_NODENAME, "hostname" },
+ { CTL_STR, KERN_DOMAINNAME, "domainname" },
+
+ { CTL_INT, KERN_PANIC, "panic" },
+ { CTL_INT, KERN_REALROOTDEV, "real-root-dev" },
+
+ { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" },
+ { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" },
+ { CTL_INT, KERN_PRINTK, "printk" },
+
+ /* KERN_NAMETRANS not used */
+ /* KERN_PPC_HTABRECLAIM not used */
+ /* KERN_PPC_ZEROPAGED not used */
+ { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" },
+
+ { CTL_STR, KERN_MODPROBE, "modprobe" },
+ { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" },
+ { CTL_INT, KERN_ACCT, "acct" },
+ /* KERN_PPC_L2CR "l2cr" no longer used */
+
+ /* KERN_RTSIGNR not used */
+ /* KERN_RTSIGMAX not used */
+
+ { CTL_ULONG, KERN_SHMMAX, "shmmax" },
+ { CTL_INT, KERN_MSGMAX, "msgmax" },
+ { CTL_INT, KERN_MSGMNB, "msgmnb" },
+ /* KERN_MSGPOOL not used*/
+ { CTL_INT, KERN_SYSRQ, "sysrq" },
+ { CTL_INT, KERN_MAX_THREADS, "threads-max" },
+ { CTL_DIR, KERN_RANDOM, "random", bin_random_table },
+ { CTL_ULONG, KERN_SHMALL, "shmall" },
+ { CTL_INT, KERN_MSGMNI, "msgmni" },
+ { CTL_INT, KERN_SEM, "sem" },
+ { CTL_INT, KERN_SPARC_STOP_A, "stop-a" },
+ { CTL_INT, KERN_SHMMNI, "shmmni" },
+
+ { CTL_INT, KERN_OVERFLOWUID, "overflowuid" },
+ { CTL_INT, KERN_OVERFLOWGID, "overflowgid" },
+
+ { CTL_STR, KERN_HOTPLUG, "hotplug", },
+ { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
+
+ { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
+ { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" },
+ /* KERN_TAINTED "tainted" no longer used */
+ { CTL_INT, KERN_CADPID, "cad_pid" },
+ { CTL_INT, KERN_PIDMAX, "pid_max" },
+ { CTL_STR, KERN_CORE_PATTERN, "core_pattern" },
+ { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" },
+ { CTL_INT, KERN_HPPA_PWRSW, "soft-power" },
+ { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" },
+
+ { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" },
+ { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" },
+
+ { CTL_DIR, KERN_PTY, "pty", bin_pty_table },
+ { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" },
+ { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
+ /* KERN_HZ_TIMER "hz_timer" no longer used */
+ { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
+ { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" },
+ { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" },
+
+ { CTL_INT, KERN_SPIN_RETRY, "spin_retry" },
+ /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
+ { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
+ { CTL_INT, KERN_COMPAT_LOG, "compat-log" },
+ { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
+ { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
+ {}
+};
+
+static const struct bin_table bin_vm_table[] = {
+ { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
+ { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" },
+ { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
+ { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
+ /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
+ /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
+ /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
+ { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
+ /* VM_PAGEBUF unused */
+ /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
+ { CTL_INT, VM_SWAPPINESS, "swappiness" },
+ { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" },
+ { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" },
+ { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" },
+ { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" },
+ { CTL_INT, VM_BLOCK_DUMP, "block_dump" },
+ { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" },
+ { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" },
+ { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" },
+ /* VM_SWAP_TOKEN_TIMEOUT unused */
+ { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" },
+ { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" },
+ { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" },
+ { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" },
+ { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" },
+ { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" },
+ { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" },
+
+ {}
+};
+
+static const struct bin_table bin_net_core_table[] = {
+ { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" },
+ { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" },
+ { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" },
+ { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" },
+ /* NET_CORE_DESTROY_DELAY unused */
+ { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" },
+ /* NET_CORE_FASTROUTE unused */
+ { CTL_INT, NET_CORE_MSG_COST, "message_cost" },
+ { CTL_INT, NET_CORE_MSG_BURST, "message_burst" },
+ { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" },
+ /* NET_CORE_HOT_LIST_LENGTH unused */
+ /* NET_CORE_DIVERT_VERSION unused */
+ /* NET_CORE_NO_CONG_THRESH unused */
+ /* NET_CORE_NO_CONG unused */
+ /* NET_CORE_LO_CONG unused */
+ /* NET_CORE_MOD_CONG unused */
+ { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" },
+ { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" },
+ { CTL_INT, NET_CORE_BUDGET, "netdev_budget" },
+ { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" },
+ { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
+ { CTL_INT, NET_CORE_WARNINGS, "warnings" },
+ {},
+};
+
+static const struct bin_table bin_net_unix_table[] = {
+ /* NET_UNIX_DESTROY_DELAY unused */
+ /* NET_UNIX_DELETE_DELAY unused */
+ { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv4_route_table[] = {
+ { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" },
+ /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
+ /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
+ { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" },
+ { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" },
+ { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
+ { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
+ { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
+ /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
+ { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
+ { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
+ { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
+ { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" },
+ { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" },
+ { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" },
+ { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
+ { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
+ { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
+ { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" },
+ { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
+
+ { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" },
+ { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" },
+ { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" },
+ { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" },
+ { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" },
+ { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
+ { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" },
+ { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" },
+ { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" },
+ { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" },
+ { CTL_INT, NET_IPV4_CONF_TAG, "tag" },
+ { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" },
+ { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" },
+ { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
+ { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
+ { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
+
+ { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" },
+ { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
+ { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
+ { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv4_conf_table[] = {
+ { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table },
+ { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table },
+ { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table },
+ {}
+};
+
+static const struct bin_table bin_net_neigh_vars_table[] = {
+ { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
+ { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
+ { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" },
+ /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
+ { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" },
+ { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" },
+ { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" },
+ { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" },
+ { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" },
+ /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
+ /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
+ /* NET_NEIGH_LOCKTIME "locktime" no longer used */
+ { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" },
+ { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" },
+ { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" },
+ { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" },
+ { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" },
+ { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" },
+ {}
+};
+
+static const struct bin_table bin_net_neigh_table[] = {
+ { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
+ { CTL_DIR, 0, NULL, bin_net_neigh_vars_table },
+ {}
+};
+
+static const struct bin_table bin_net_ipv4_netfilter_table[] = {
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
+
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
+
+ /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
+
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" },
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" },
+ /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" },
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" },
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" },
+
+ /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
+ /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
+
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" },
+ { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv4_table[] = {
+ {CTL_INT, NET_IPV4_FORWARD, "ip_forward" },
+
+ { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table },
+ { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table },
+ { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table },
+ /* NET_IPV4_FIB_HASH unused */
+ { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table },
+
+ { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
+ { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
+ { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" },
+ { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" },
+ { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" },
+ /* NET_IPV4_AUTOCONFIG unused */
+ { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" },
+ { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" },
+ { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" },
+ { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" },
+ { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" },
+ { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" },
+ { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" },
+ { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" },
+ { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" },
+ { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" },
+ { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" },
+ { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" },
+ { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" },
+ { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" },
+ { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" },
+ { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" },
+ { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" },
+ { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" },
+ { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" },
+ { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" },
+ { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" },
+ { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" },
+ { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" },
+ { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" },
+ { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" },
+ { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" },
+ { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" },
+ { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" },
+ { CTL_INT, NET_TCP_FACK, "tcp_fack" },
+ { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
+ { CTL_INT, NET_TCP_ECN, "tcp_ecn" },
+ { CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
+ { CTL_INT, NET_TCP_MEM, "tcp_mem" },
+ { CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
+ { CTL_INT, NET_TCP_RMEM, "tcp_rmem" },
+ { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" },
+ { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" },
+ { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" },
+ { CTL_INT, NET_TCP_FRTO, "tcp_frto" },
+ { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" },
+ { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" },
+ { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" },
+ { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
+ { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
+ { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
+ { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
+ { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
+ { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
+ { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
+ { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
+ { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
+ { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" },
+ { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" },
+ /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
+ { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" },
+ { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" },
+
+ { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" },
+ { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
+ { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" },
+ { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" },
+ { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" },
+ { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" },
+
+ { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" },
+ { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" },
+ { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" },
+
+ { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
+ /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
+
+ { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
+
+ /* NET_TCP_DEFAULT_WIN_SCALE unused */
+ /* NET_TCP_BIC_BETA unused */
+ /* NET_IPV4_TCP_MAX_KA_PROBES unused */
+ /* NET_IPV4_IP_MASQ_DEBUG unused */
+ /* NET_TCP_SYN_TAILDROP unused */
+ /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
+ /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
+ /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
+ /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
+ /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
+ /* NET_IPV4_ALWAYS_DEFRAG unused */
+ {}
+};
+
+static const struct bin_table bin_net_ipx_table[] = {
+ { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
+ /* NET_IPX_FORWARDING unused */
+ {}
+};
+
+static const struct bin_table bin_net_atalk_table[] = {
+ { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
+ { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
+ { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
+ { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" },
+ {},
+};
+
+static const struct bin_table bin_net_netrom_table[] = {
+ { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
+ { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
+ { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
+ { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" },
+ { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" },
+ { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" },
+ { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" },
+ { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" },
+ { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" },
+ { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" },
+ { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" },
+ { CTL_INT, NET_NETROM_RESET, "reset" },
+ {}
+};
+
+static const struct bin_table bin_net_ax25_param_table[] = {
+ { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
+ { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
+ { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" },
+ { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" },
+ { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" },
+ { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" },
+ { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" },
+ { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" },
+ { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" },
+ { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" },
+ { CTL_INT, NET_AX25_N2, "maximum_retry_count" },
+ { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" },
+ { CTL_INT, NET_AX25_PROTOCOL, "protocol" },
+ { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" },
+ {}
+};
+
+static const struct bin_table bin_net_ax25_table[] = {
+ { CTL_DIR, 0, NULL, bin_net_ax25_param_table },
+ {}
+};
+
+static const struct bin_table bin_net_rose_table[] = {
+ { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
+ { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
+ { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
+ { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
+ { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" },
+ { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" },
+ { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" },
+ { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" },
+ { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" },
+ { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv6_conf_var_table[] = {
+ { CTL_INT, NET_IPV6_FORWARDING, "forwarding" },
+ { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" },
+ { CTL_INT, NET_IPV6_MTU, "mtu" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" },
+ { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" },
+ { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" },
+ { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" },
+ { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" },
+ { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" },
+ { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" },
+ { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" },
+ { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" },
+ { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" },
+ { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" },
+ { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" },
+ { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" },
+ { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" },
+ { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
+ { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
+ { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv6_conf_table[] = {
+ { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table },
+ { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table },
+ { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table },
+ {}
+};
+
+static const struct bin_table bin_net_ipv6_route_table[] = {
+ /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */
+ { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
+ { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
+ { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
+ { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
+ { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
+ { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
+ { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
+ { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
+ { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv6_icmp_table[] = {
+ { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
+ {}
+};
+
+static const struct bin_table bin_net_ipv6_table[] = {
+ { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
+ { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
+ { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
+ { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
+ { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
+ { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
+ { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
+ { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
+ { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
+ { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
+ { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
+ {}
+};
+
+static const struct bin_table bin_net_x25_table[] = {
+ { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
+ { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
+ { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
+ { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
+ { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
+ { CTL_INT, NET_X25_FORWARD, "x25_forward" },
+ {}
+};
+
+static const struct bin_table bin_net_tr_table[] = {
+ { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
+ {}
+};
+
+
+static const struct bin_table bin_net_decnet_conf_vars[] = {
+ { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
+ { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
+ { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
+ { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
+ {}
+};
+
+static const struct bin_table bin_net_decnet_conf[] = {
+ { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
+ { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
+ { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
+ { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
+ { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
+ { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
+ { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
+ {}
+};
+
+static const struct bin_table bin_net_decnet_table[] = {
+ { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
+ { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
+ { CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
+ { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
+ { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
+ { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
+ { CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
+ { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
+ { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
+ { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
+ { CTL_INT, NET_DECNET_MEM, "decnet_mem" },
+ { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
+ { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
+ { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
+ {}
+};
+
+static const struct bin_table bin_net_sctp_table[] = {
+ { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
+ { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
+ { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
+ { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
+ { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
+ { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
+ { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
+ { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
+ { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
+ { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
+ { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
+ { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
+ { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
+ { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
+ { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
+ { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
+ { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
+ {}
+};
+
+static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
+ { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
+ { CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
+ { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
+ { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
+ {}
+};
+
+static const struct bin_table bin_net_llc_station_table[] = {
+ { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
+ {}
+};
+
+static const struct bin_table bin_net_llc_llc2_table[] = {
+ { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
+ {}
+};
+
+static const struct bin_table bin_net_llc_table[] = {
+ { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
+ { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
+ {}
+};
+
+static const struct bin_table bin_net_netfilter_table[] = {
+ { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
+ /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
+ /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
+ /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
+ /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
+ { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
+ { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
+ /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
+ { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
+ { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
+ { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
+ /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
+ /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
+ /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
+ /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
+ /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
+ /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
+ /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
+ { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
+ /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
+ /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
+ { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
+ { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
+ { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
+
+ {}
+};
+
+static const struct bin_table bin_net_irda_table[] = {
+ { CTL_INT, NET_IRDA_DISCOVERY, "discovery" },
+ { CTL_STR, NET_IRDA_DEVNAME, "devname" },
+ { CTL_INT, NET_IRDA_DEBUG, "debug" },
+ { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" },
+ { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" },
+ { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" },
+ { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" },
+ { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" },
+ { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" },
+ { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" },
+ { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" },
+ { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" },
+ { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" },
+ { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" },
+ {}
+};
+
+static const struct bin_table bin_net_table[] = {
+ { CTL_DIR, NET_CORE, "core", bin_net_core_table },
+ /* NET_ETHER not used */
+ /* NET_802 not used */
+ { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
+ { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
+ { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
+ { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
+ { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
+ { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
+ /* NET_BRIDGE "bridge" no longer used */
+ { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
+ { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
+ { CTL_DIR, NET_X25, "x25", bin_net_x25_table },
+ { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
+ { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
+ /* NET_ECONET not used */
+ { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
+ { CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
+ { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
+ /* NET_DCCP "dccp" no longer used */
+ { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table },
+ { CTL_INT, 2089, "nf_conntrack_max" },
+ {}
+};
+
+static const struct bin_table bin_fs_quota_table[] = {
+ { CTL_INT, FS_DQ_LOOKUPS, "lookups" },
+ { CTL_INT, FS_DQ_DROPS, "drops" },
+ { CTL_INT, FS_DQ_READS, "reads" },
+ { CTL_INT, FS_DQ_WRITES, "writes" },
+ { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
+ { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
+ { CTL_INT, FS_DQ_FREE, "free_dquots" },
+ { CTL_INT, FS_DQ_SYNCS, "syncs" },
+ { CTL_INT, FS_DQ_WARNINGS, "warnings" },
+ {}
+};
+
+static const struct bin_table bin_fs_xfs_table[] = {
+ { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
+ { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
+ { CTL_INT, XFS_PANIC_MASK, "panic_mask" },
+
+ { CTL_INT, XFS_ERRLEVEL, "error_level" },
+ { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
+ { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
+ { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
+ { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
+ { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
+ { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
+ { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
+ { CTL_INT, XFS_ROTORSTEP, "rotorstep" },
+ { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
+ { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
+ { CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
+ {}
+};
+
+static const struct bin_table bin_fs_ocfs2_nm_table[] = {
+ { CTL_STR, 1, "hb_ctl_path" },
+ {}
+};
+
+static const struct bin_table bin_fs_ocfs2_table[] = {
+ { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
+ {}
+};
+
+static const struct bin_table bin_inotify_table[] = {
+ { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
+ { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
+ { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
+ {}
+};
+
+static const struct bin_table bin_fs_table[] = {
+ { CTL_INT, FS_NRINODE, "inode-nr" },
+ { CTL_INT, FS_STATINODE, "inode-state" },
+ /* FS_MAXINODE unused */
+ /* FS_NRDQUOT unused */
+ /* FS_MAXDQUOT unused */
+ /* FS_NRFILE "file-nr" no longer used */
+ { CTL_INT, FS_MAXFILE, "file-max" },
+ { CTL_INT, FS_DENTRY, "dentry-state" },
+ /* FS_NRSUPER unused */
+ /* FS_MAXUPSER unused */
+ { CTL_INT, FS_OVERFLOWUID, "overflowuid" },
+ { CTL_INT, FS_OVERFLOWGID, "overflowgid" },
+ { CTL_INT, FS_LEASES, "leases-enable" },
+ { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
+ { CTL_INT, FS_LEASE_TIME, "lease-break-time" },
+ { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
+ { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
+ { CTL_ULONG, FS_AIO_NR, "aio-nr" },
+ { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
+ { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
+ { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
+ { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
+ {}
+};
+
+static const struct bin_table bin_ipmi_table[] = {
+ { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
+ {}
+};
+
+static const struct bin_table bin_mac_hid_files[] = {
+ /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
+ /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
+ { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
+ { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
+ { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
+ /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
+ {}
+};
+
+static const struct bin_table bin_raid_table[] = {
+ { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
+ { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
+ {}
+};
+
+static const struct bin_table bin_scsi_table[] = {
+ { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
+ {}
+};
+
+static const struct bin_table bin_dev_table[] = {
+ /* DEV_CDROM "cdrom" no longer used */
+ /* DEV_HWMON unused */
+ /* DEV_PARPORT "parport" no longer used */
+ { CTL_DIR, DEV_RAID, "raid", bin_raid_table },
+ { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
+ { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
+ { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
+ {}
+};
+
+static const struct bin_table bin_bus_isa_table[] = {
+ { CTL_INT, BUS_ISA_MEM_BASE, "membase" },
+ { CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
+ { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
+ {}
+};
+
+static const struct bin_table bin_bus_table[] = {
+ { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
+ {}
+};
+
+
+static const struct bin_table bin_s390dbf_table[] = {
+ { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
+ { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
+ {}
+};
+
+static const struct bin_table bin_sunrpc_table[] = {
+ /* CTL_RPCDEBUG "rpc_debug" no longer used */
+ /* CTL_NFSDEBUG "nfs_debug" no longer used */
+ /* CTL_NFSDDEBUG "nfsd_debug" no longer used */
+ /* CTL_NLMDEBUG "nlm_debug" no longer used */
+
+ { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
+ { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
+ { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
+ { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
+ {}
+};
+
+static const struct bin_table bin_pm_table[] = {
+ /* frv specific */
+ /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */
+ { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
+ { CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
+ { CTL_INT, 4 /* CTL_PM_CM */, "cm" },
+ {}
+};
+
+static const struct bin_table bin_root_table[] = {
+ { CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
+ { CTL_DIR, CTL_VM, "vm", bin_vm_table },
+ { CTL_DIR, CTL_NET, "net", bin_net_table },
+ /* CTL_PROC not used */
+ { CTL_DIR, CTL_FS, "fs", bin_fs_table },
+ /* CTL_DEBUG "debug" no longer used */
+ { CTL_DIR, CTL_DEV, "dev", bin_dev_table },
+ { CTL_DIR, CTL_BUS, "bus", bin_bus_table },
+ { CTL_DIR, CTL_ABI, "abi" },
+ /* CTL_CPU not used */
+ /* CTL_ARLAN "arlan" no longer used */
+ { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
+ { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
+ { CTL_DIR, CTL_PM, "pm", bin_pm_table },
+ {}
+};
+
+static ssize_t bin_dir(struct file *file,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ return -ENOTDIR;
+}
+
+
+static ssize_t bin_string(struct file *file,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ ssize_t result, copied = 0;
+
+ if (oldval && oldlen) {
+ char __user *lastp;
+ loff_t pos = 0;
+ int ch;
+
+ result = vfs_read(file, oldval, oldlen, &pos);
+ if (result < 0)
+ goto out;
+
+ copied = result;
+ lastp = oldval + copied - 1;
+
+ result = -EFAULT;
+ if (get_user(ch, lastp))
+ goto out;
+
+ /* Trim off the trailing newline */
+ if (ch == '\n') {
+ result = -EFAULT;
+ if (put_user('\0', lastp))
+ goto out;
+ copied -= 1;
+ }
+ }
+
+ if (newval && newlen) {
+ loff_t pos = 0;
+
+ result = vfs_write(file, newval, newlen, &pos);
+ if (result < 0)
+ goto out;
+ }
+
+ result = copied;
+out:
+ return result;
+}
+
+static ssize_t bin_intvec(struct file *file,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ ssize_t copied = 0;
+ char *buffer;
+ ssize_t result;
+
+ result = -ENOMEM;
+ buffer = kmalloc(BUFSZ, GFP_KERNEL);
+ if (!buffer)
+ goto out;
+
+ if (oldval && oldlen) {
+ unsigned __user *vec = oldval;
+ size_t length = oldlen / sizeof(*vec);
+ char *str, *end;
+ int i;
+
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ if (result < 0)
+ goto out_kfree;
+
+ str = buffer;
+ end = str + result;
+ *end++ = '\0';
+ for (i = 0; i < length; i++) {
+ unsigned long value;
+
+ value = simple_strtoul(str, &str, 10);
+ while (isspace(*str))
+ str++;
+
+ result = -EFAULT;
+ if (put_user(value, vec + i))
+ goto out_kfree;
+
+ copied += sizeof(*vec);
+ if (!isdigit(*str))
+ break;
+ }
+ }
+
+ if (newval && newlen) {
+ unsigned __user *vec = newval;
+ size_t length = newlen / sizeof(*vec);
+ char *str, *end;
+ int i;
+
+ str = buffer;
+ end = str + BUFSZ;
+ for (i = 0; i < length; i++) {
+ unsigned long value;
+
+ result = -EFAULT;
+ if (get_user(value, vec + i))
+ goto out_kfree;
+
+ str += scnprintf(str, end - str, "%lu\t", value);
+ }
+
+ result = kernel_write(file, buffer, str - buffer, 0);
+ if (result < 0)
+ goto out_kfree;
+ }
+ result = copied;
+out_kfree:
+ kfree(buffer);
+out:
+ return result;
+}
+
+static ssize_t bin_ulongvec(struct file *file,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ ssize_t copied = 0;
+ char *buffer;
+ ssize_t result;
+
+ result = -ENOMEM;
+ buffer = kmalloc(BUFSZ, GFP_KERNEL);
+ if (!buffer)
+ goto out;
+
+ if (oldval && oldlen) {
+ unsigned long __user *vec = oldval;
+ size_t length = oldlen / sizeof(*vec);
+ char *str, *end;
+ int i;
+
+ result = kernel_read(file, 0, buffer, BUFSZ - 1);
+ if (result < 0)
+ goto out_kfree;
+
+ str = buffer;
+ end = str + result;
+ *end++ = '\0';
+ for (i = 0; i < length; i++) {
+ unsigned long value;
+
+ value = simple_strtoul(str, &str, 10);
+ while (isspace(*str))
+ str++;
+
+ result = -EFAULT;
+ if (put_user(value, vec + i))
+ goto out_kfree;
+
+ copied += sizeof(*vec);
+ if (!isdigit(*str))
+ break;
+ }
+ }
+
+ if (newval && newlen) {
+ unsigned long __user *vec = newval;
+ size_t length = newlen / sizeof(*vec);
+ char *str, *end;
+ int i;
+
+ str = buffer;
+ end = str + BUFSZ;
+ for (i = 0; i < length; i++) {
+ unsigned long value;
+
+ result = -EFAULT;
+ if (get_user(value, vec + i))
+ goto out_kfree;
+
+ str += scnprintf(str, end - str, "%lu\t", value);
+ }
+
+ result = kernel_write(file, buffer, str - buffer, 0);
+ if (result < 0)
+ goto out_kfree;
+ }
+ result = copied;
+out_kfree:
+ kfree(buffer);
+out:
+ return result;
+}
+
+static ssize_t bin_uuid(struct file *file,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ ssize_t result, copied = 0;
+
+ /* Only supports reads */
+ if (oldval && oldlen) {
+ char buf[40], *str = buf;
+ unsigned char uuid[16];
+ int i;
+
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ if (result < 0)
+ goto out;
+
+ buf[result] = '\0';
+
+ /* Convert the uuid to from a string to binary */
+ for (i = 0; i < 16; i++) {
+ result = -EIO;
+ if (!isxdigit(str[0]) || !isxdigit(str[1]))
+ goto out;
+
+ uuid[i] = (hex_to_bin(str[0]) << 4) |
+ hex_to_bin(str[1]);
+ str += 2;
+ if (*str == '-')
+ str++;
+ }
+
+ if (oldlen > 16)
+ oldlen = 16;
+
+ result = -EFAULT;
+ if (copy_to_user(oldval, uuid, oldlen))
+ goto out;
+
+ copied = oldlen;
+ }
+ result = copied;
+out:
+ return result;
+}
+
+static ssize_t bin_dn_node_address(struct file *file,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ ssize_t result, copied = 0;
+
+ if (oldval && oldlen) {
+ char buf[15], *nodep;
+ unsigned long area, node;
+ __le16 dnaddr;
+
+ result = kernel_read(file, 0, buf, sizeof(buf) - 1);
+ if (result < 0)
+ goto out;
+
+ buf[result] = '\0';
+
+ /* Convert the decnet address to binary */
+ result = -EIO;
+ nodep = strchr(buf, '.');
+ if (!nodep)
+ goto out;
+ ++nodep;
+
+ area = simple_strtoul(buf, NULL, 10);
+ node = simple_strtoul(nodep, NULL, 10);
+
+ result = -EIO;
+ if ((area > 63)||(node > 1023))
+ goto out;
+
+ dnaddr = cpu_to_le16((area << 10) | node);
+
+ result = -EFAULT;
+ if (put_user(dnaddr, (__le16 __user *)oldval))
+ goto out;
+
+ copied = sizeof(dnaddr);
+ }
+
+ if (newval && newlen) {
+ __le16 dnaddr;
+ char buf[15];
+ int len;
+
+ result = -EINVAL;
+ if (newlen != sizeof(dnaddr))
+ goto out;
+
+ result = -EFAULT;
+ if (get_user(dnaddr, (__le16 __user *)newval))
+ goto out;
+
+ len = scnprintf(buf, sizeof(buf), "%hu.%hu",
+ le16_to_cpu(dnaddr) >> 10,
+ le16_to_cpu(dnaddr) & 0x3ff);
+
+ result = kernel_write(file, buf, len, 0);
+ if (result < 0)
+ goto out;
+ }
+
+ result = copied;
+out:
+ return result;
+}
+
+static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
+{
+ const struct bin_table *table = &bin_root_table[0];
+ int ctl_name;
+
+ /* The binary sysctl tables have a small maximum depth so
+ * there is no danger of overflowing our path as it PATH_MAX
+ * bytes long.
+ */
+ memcpy(path, "sys/", 4);
+ path += 4;
+
+repeat:
+ if (!nlen)
+ return ERR_PTR(-ENOTDIR);
+ ctl_name = *name;
+ name++;
+ nlen--;
+ for ( ; table->convert; table++) {
+ int len = 0;
+
+ /*
+ * For a wild card entry map from ifindex to network
+ * device name.
+ */
+ if (!table->ctl_name) {
+#ifdef CONFIG_NET
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev;
+ dev = dev_get_by_index(net, ctl_name);
+ if (dev) {
+ len = strlen(dev->name);
+ memcpy(path, dev->name, len);
+ dev_put(dev);
+ }
+#endif
+ /* Use the well known sysctl number to proc name mapping */
+ } else if (ctl_name == table->ctl_name) {
+ len = strlen(table->procname);
+ memcpy(path, table->procname, len);
+ }
+ if (len) {
+ path += len;
+ if (table->child) {
+ *path++ = '/';
+ table = table->child;
+ goto repeat;
+ }
+ *path = '\0';
+ return table;
+ }
+ }
+ return ERR_PTR(-ENOTDIR);
+}
+
+static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
+{
+ char *tmp, *result;
+
+ result = ERR_PTR(-ENOMEM);
+ tmp = __getname();
+ if (tmp) {
+ const struct bin_table *table = get_sysctl(name, nlen, tmp);
+ result = tmp;
+ *tablep = table;
+ if (IS_ERR(table)) {
+ __putname(tmp);
+ result = ERR_CAST(table);
+ }
+ }
+ return result;
+}
+
+static ssize_t binary_sysctl(const int *name, int nlen,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ const struct bin_table *table = NULL;
+ struct vfsmount *mnt;
+ struct file *file;
+ ssize_t result;
+ char *pathname;
+ int flags;
+
+ pathname = sysctl_getname(name, nlen, &table);
+ result = PTR_ERR(pathname);
+ if (IS_ERR(pathname))
+ goto out;
+
+ /* How should the sysctl be accessed? */
+ if (oldval && oldlen && newval && newlen) {
+ flags = O_RDWR;
+ } else if (newval && newlen) {
+ flags = O_WRONLY;
+ } else if (oldval && oldlen) {
+ flags = O_RDONLY;
+ } else {
+ result = 0;
+ goto out_putname;
+ }
+
+ mnt = task_active_pid_ns(current)->proc_mnt;
+ file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
+ result = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out_putname;
+
+ result = table->convert(file, oldval, oldlen, newval, newlen);
+
+ fput(file);
+out_putname:
+ __putname(pathname);
+out:
+ return result;
+}
+
+
+#else /* CONFIG_SYSCTL_SYSCALL */
+
+static ssize_t binary_sysctl(const int *name, int nlen,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+
+#endif /* CONFIG_SYSCTL_SYSCALL */
+
+
+static void deprecated_sysctl_warning(const int *name, int nlen)
+{
+ int i;
+
+ /*
+ * CTL_KERN/KERN_VERSION is used by older glibc and cannot
+ * ever go away.
+ */
+ if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
+ return;
+
+ if (printk_ratelimit()) {
+ printk(KERN_INFO
+ "warning: process `%s' used the deprecated sysctl "
+ "system call with ", current->comm);
+ for (i = 0; i < nlen; i++)
+ printk("%d.", name[i]);
+ printk("\n");
+ }
+ return;
+}
+
+#define WARN_ONCE_HASH_BITS 8
+#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
+
+static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
+
+#define FNV32_OFFSET 2166136261U
+#define FNV32_PRIME 0x01000193
+
+/*
+ * Print each legacy sysctl (approximately) only once.
+ * To avoid making the tables non-const use a external
+ * hash-table instead.
+ * Worst case hash collision: 6, but very rarely.
+ * NOTE! We don't use the SMP-safe bit tests. We simply
+ * don't care enough.
+ */
+static void warn_on_bintable(const int *name, int nlen)
+{
+ int i;
+ u32 hash = FNV32_OFFSET;
+
+ for (i = 0; i < nlen; i++)
+ hash = (hash ^ name[i]) * FNV32_PRIME;
+ hash %= WARN_ONCE_HASH_SIZE;
+ if (__test_and_set_bit(hash, warn_once_bitmap))
+ return;
+ deprecated_sysctl_warning(name, nlen);
+}
+
+static ssize_t do_sysctl(int __user *args_name, int nlen,
+ void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+ int name[CTL_MAXNAME];
+ int i;
+
+ /* Check args->nlen. */
+ if (nlen < 0 || nlen > CTL_MAXNAME)
+ return -ENOTDIR;
+ /* Read in the sysctl name for simplicity */
+ for (i = 0; i < nlen; i++)
+ if (get_user(name[i], args_name + i))
+ return -EFAULT;
+
+ warn_on_bintable(name, nlen);
+
+ return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
+}
+
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+{
+ struct __sysctl_args tmp;
+ size_t oldlen = 0;
+ ssize_t result;
+
+ if (copy_from_user(&tmp, args, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.oldval && !tmp.oldlenp)
+ return -EFAULT;
+
+ if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
+ return -EFAULT;
+
+ result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
+ tmp.newval, tmp.newlen);
+
+ if (result >= 0) {
+ oldlen = result;
+ result = 0;
+ }
+
+ if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
+ return -EFAULT;
+
+ return result;
+}
+
+
+#ifdef CONFIG_COMPAT
+
+struct compat_sysctl_args {
+ compat_uptr_t name;
+ int nlen;
+ compat_uptr_t oldval;
+ compat_uptr_t oldlenp;
+ compat_uptr_t newval;
+ compat_size_t newlen;
+ compat_ulong_t __unused[4];
+};
+
+COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
+{
+ struct compat_sysctl_args tmp;
+ compat_size_t __user *compat_oldlenp;
+ size_t oldlen = 0;
+ ssize_t result;
+
+ if (copy_from_user(&tmp, args, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.oldval && !tmp.oldlenp)
+ return -EFAULT;
+
+ compat_oldlenp = compat_ptr(tmp.oldlenp);
+ if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
+ return -EFAULT;
+
+ result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
+ compat_ptr(tmp.oldval), oldlen,
+ compat_ptr(tmp.newval), tmp.newlen);
+
+ if (result >= 0) {
+ oldlen = result;
+ result = 0;
+ }
+
+ if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
+ return -EFAULT;
+
+ return result;
+}
+
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
new file mode 100644
index 000000000..3e9868d47
--- /dev/null
+++ b/kernel/system_certificates.S
@@ -0,0 +1,20 @@
+#include <linux/export.h>
+#include <linux/init.h>
+
+ __INITRODATA
+
+ .align 8
+ .globl VMLINUX_SYMBOL(system_certificate_list)
+VMLINUX_SYMBOL(system_certificate_list):
+__cert_list_start:
+ .incbin "kernel/x509_certificate_list"
+__cert_list_end:
+
+ .align 8
+ .globl VMLINUX_SYMBOL(system_certificate_list_size)
+VMLINUX_SYMBOL(system_certificate_list_size):
+#ifdef CONFIG_64BIT
+ .quad __cert_list_end - __cert_list_start
+#else
+ .long __cert_list_end - __cert_list_start
+#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
new file mode 100644
index 000000000..875f64e89
--- /dev/null
+++ b/kernel/system_keyring.c
@@ -0,0 +1,106 @@
+/* System trusted keyring for trusted public keys
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+#include "module-internal.h"
+
+struct key *system_trusted_keyring;
+EXPORT_SYMBOL_GPL(system_trusted_keyring);
+
+extern __initconst const u8 system_certificate_list[];
+extern __initconst const unsigned long system_certificate_list_size;
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int system_trusted_keyring_init(void)
+{
+ pr_notice("Initialise system trusted keyring\n");
+
+ system_trusted_keyring =
+ keyring_alloc(".system_keyring",
+ KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
+ ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
+ if (IS_ERR(system_trusted_keyring))
+ panic("Can't allocate system trusted keyring\n");
+
+ set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
+ return 0;
+}
+
+/*
+ * Must be initialised before we try and load the keys into the keyring.
+ */
+device_initcall(system_trusted_keyring_init);
+
+/*
+ * Load the compiled-in list of X.509 certificates.
+ */
+static __init int load_system_certificate_list(void)
+{
+ key_ref_t key;
+ const u8 *p, *end;
+ size_t plen;
+
+ pr_notice("Loading compiled-in X.509 certificates\n");
+
+ p = system_certificate_list;
+ end = p + system_certificate_list_size;
+ while (p < end) {
+ /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+ * than 256 bytes in size.
+ */
+ if (end - p < 4)
+ goto dodgy_cert;
+ if (p[0] != 0x30 &&
+ p[1] != 0x82)
+ goto dodgy_cert;
+ plen = (p[2] << 8) | p[3];
+ plen += 4;
+ if (plen > end - p)
+ goto dodgy_cert;
+
+ key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
+ "asymmetric",
+ NULL,
+ p,
+ plen,
+ ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ),
+ KEY_ALLOC_NOT_IN_QUOTA |
+ KEY_ALLOC_TRUSTED);
+ if (IS_ERR(key)) {
+ pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
+ PTR_ERR(key));
+ } else {
+ set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
+ pr_notice("Loaded X.509 cert '%s'\n",
+ key_ref_to_ptr(key)->description);
+ key_ref_put(key);
+ }
+ p += plen;
+ }
+
+ return 0;
+
+dodgy_cert:
+ pr_err("Problem parsing in-kernel X.509 certificate list\n");
+ return 0;
+}
+late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
new file mode 100644
index 000000000..8727032e3
--- /dev/null
+++ b/kernel/task_work.c
@@ -0,0 +1,128 @@
+#include <linux/spinlock.h>
+#include <linux/task_work.h>
+#include <linux/tracehook.h>
+
+static struct callback_head work_exited; /* all we need is ->next == NULL */
+
+/**
+ * task_work_add - ask the @task to execute @work->func()
+ * @task: the task which should run the callback
+ * @work: the callback to run
+ * @notify: send the notification if true
+ *
+ * Queue @work for task_work_run() below and notify the @task if @notify.
+ * Fails if the @task is exiting/exited and thus it can't process this @work.
+ * Otherwise @work->func() will be called when the @task returns from kernel
+ * mode or exits.
+ *
+ * This is like the signal handler which runs in kernel mode, but it doesn't
+ * try to wake up the @task.
+ *
+ * RETURNS:
+ * 0 if succeeds or -ESRCH.
+ */
+int
+task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
+{
+ struct callback_head *head;
+
+ do {
+ head = ACCESS_ONCE(task->task_works);
+ if (unlikely(head == &work_exited))
+ return -ESRCH;
+ work->next = head;
+ } while (cmpxchg(&task->task_works, head, work) != head);
+
+ if (notify)
+ set_notify_resume(task);
+ return 0;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @func: identifies the work to remove
+ *
+ * Find the last queued pending work with ->func == @func and remove
+ * it from queue.
+ *
+ * RETURNS:
+ * The found work or NULL if not found.
+ */
+struct callback_head *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+ struct callback_head **pprev = &task->task_works;
+ struct callback_head *work;
+ unsigned long flags;
+ /*
+ * If cmpxchg() fails we continue without updating pprev.
+ * Either we raced with task_work_add() which added the
+ * new entry before this work, we will find it again. Or
+ * we raced with task_work_run(), *pprev == NULL/exited.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ while ((work = ACCESS_ONCE(*pprev))) {
+ smp_read_barrier_depends();
+ if (work->func != func)
+ pprev = &work->next;
+ else if (cmpxchg(pprev, work, work->next) == work)
+ break;
+ }
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ return work;
+}
+
+/**
+ * task_work_run - execute the works added by task_work_add()
+ *
+ * Flush the pending works. Should be used by the core kernel code.
+ * Called before the task returns to the user-mode or stops, or when
+ * it exits. In the latter case task_work_add() can no longer add the
+ * new work after task_work_run() returns.
+ */
+void task_work_run(void)
+{
+ struct task_struct *task = current;
+ struct callback_head *work, *head, *next;
+
+ for (;;) {
+ /*
+ * work->func() can do task_work_add(), do not set
+ * work_exited unless the list is empty.
+ */
+ do {
+ work = ACCESS_ONCE(task->task_works);
+ head = !work && (task->flags & PF_EXITING) ?
+ &work_exited : NULL;
+ } while (cmpxchg(&task->task_works, work, head) != work);
+
+ if (!work)
+ break;
+ /*
+ * Synchronize with task_work_cancel(). It can't remove
+ * the first entry == work, cmpxchg(task_works) should
+ * fail, but it can play with *work and other entries.
+ */
+ raw_spin_unlock_wait(&task->pi_lock);
+ smp_mb();
+
+ /* Reverse the list to run the works in fifo order */
+ head = NULL;
+ do {
+ next = work->next;
+ work->next = head;
+ head = work;
+ work = next;
+ } while (work);
+
+ work = head;
+ do {
+ next = work->next;
+ work->func(work);
+ work = next;
+ cond_resched();
+ } while (work);
+ }
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000..21f82c29c
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,710 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <linux/tsacct_kern.h>
+#include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/cgroupstats.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/pid_namespace.h>
+#include <net/genetlink.h>
+#include <linux/atomic.h>
+
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum);
+static int family_registered;
+struct kmem_cache *taskstats_cache;
+
+static struct genl_family family = {
+ .id = GENL_ID_GENERATE,
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .maxattr = TASKSTATS_CMD_ATTR_MAX,
+};
+
+static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
+ [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+ [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
+ [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
+};
+
+struct listener {
+ struct list_head list;
+ pid_t pid;
+ char valid;
+};
+
+struct listener_list {
+ struct rw_semaphore sem;
+ struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+ REGISTER,
+ DEREGISTER,
+ CPU_DONT_CARE
+};
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+ size_t size)
+{
+ struct sk_buff *skb;
+ void *reply;
+
+ /*
+ * If new attributes are added, please revisit this allocation
+ */
+ skb = genlmsg_new(size, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ if (!info) {
+ int seq = this_cpu_inc_return(taskstats_seqnum) - 1;
+
+ reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
+ } else
+ reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
+ if (reply == NULL) {
+ nlmsg_free(skb);
+ return -EINVAL;
+ }
+
+ *skbp = skb;
+ return 0;
+}
+
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
+ void *reply = genlmsg_data(genlhdr);
+
+ genlmsg_end(skb, reply);
+
+ return genlmsg_reply(skb, info);
+}
+
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static void send_cpu_listeners(struct sk_buff *skb,
+ struct listener_list *listeners)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
+ struct listener *s, *tmp;
+ struct sk_buff *skb_next, *skb_cur = skb;
+ void *reply = genlmsg_data(genlhdr);
+ int rc, delcount = 0;
+
+ genlmsg_end(skb, reply);
+
+ rc = 0;
+ down_read(&listeners->sem);
+ list_for_each_entry(s, &listeners->list, list) {
+ skb_next = NULL;
+ if (!list_is_last(&s->list, &listeners->list)) {
+ skb_next = skb_clone(skb_cur, GFP_KERNEL);
+ if (!skb_next)
+ break;
+ }
+ rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
+ if (rc == -ECONNREFUSED) {
+ s->valid = 0;
+ delcount++;
+ }
+ skb_cur = skb_next;
+ }
+ up_read(&listeners->sem);
+
+ if (skb_cur)
+ nlmsg_free(skb_cur);
+
+ if (!delcount)
+ return;
+
+ /* Delete invalidated entries */
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (!s->valid) {
+ list_del(&s->list);
+ kfree(s);
+ }
+ }
+ up_write(&listeners->sem);
+}
+
+static void fill_stats(struct user_namespace *user_ns,
+ struct pid_namespace *pid_ns,
+ struct task_struct *tsk, struct taskstats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ /*
+ * Each accounting subsystem adds calls to its functions to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+
+ delayacct_add_tsk(stats, tsk);
+
+ /* fill in basic acct fields */
+ stats->version = TASKSTATS_VERSION;
+ stats->nvcsw = tsk->nvcsw;
+ stats->nivcsw = tsk->nivcsw;
+ bacct_add_tsk(user_ns, pid_ns, stats, tsk);
+
+ /* fill in extended acct fields */
+ xacct_add_tsk(stats, tsk);
+}
+
+static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
+{
+ struct task_struct *tsk;
+
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return -ESRCH;
+ fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
+ put_task_struct(tsk);
+ return 0;
+}
+
+static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
+{
+ struct task_struct *tsk, *first;
+ unsigned long flags;
+ int rc = -ESRCH;
+
+ /*
+ * Add additional stats from live tasks except zombie thread group
+ * leaders who are already counted with the dead tasks
+ */
+ rcu_read_lock();
+ first = find_task_by_vpid(tgid);
+
+ if (!first || !lock_task_sighand(first, &flags))
+ goto out;
+
+ if (first->signal->stats)
+ memcpy(stats, first->signal->stats, sizeof(*stats));
+ else
+ memset(stats, 0, sizeof(*stats));
+
+ tsk = first;
+ do {
+ if (tsk->exit_state)
+ continue;
+ /*
+ * Accounting subsystem can call its functions here to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+ delayacct_add_tsk(stats, tsk);
+
+ stats->nvcsw += tsk->nvcsw;
+ stats->nivcsw += tsk->nivcsw;
+ } while_each_thread(first, tsk);
+
+ unlock_task_sighand(first, &flags);
+ rc = 0;
+out:
+ rcu_read_unlock();
+
+ stats->version = TASKSTATS_VERSION;
+ /*
+ * Accounting subsystems can also add calls here to modify
+ * fields of taskstats.
+ */
+ return rc;
+}
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->sighand->siglock, flags);
+ if (!tsk->signal->stats)
+ goto ret;
+
+ /*
+ * Each accounting subsystem calls its functions here to
+ * accumalate its per-task stats for tsk, into the per-tgid structure
+ *
+ * per-task-foo(tsk->signal->stats, tsk);
+ */
+ delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+ return;
+}
+
+static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
+{
+ struct listener_list *listeners;
+ struct listener *s, *tmp, *s2;
+ unsigned int cpu;
+ int ret = 0;
+
+ if (!cpumask_subset(mask, cpu_possible_mask))
+ return -EINVAL;
+
+ if (current_user_ns() != &init_user_ns)
+ return -EINVAL;
+
+ if (task_active_pid_ns(current) != &init_pid_ns)
+ return -EINVAL;
+
+ if (isadd == REGISTER) {
+ for_each_cpu(cpu, mask) {
+ s = kmalloc_node(sizeof(struct listener),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!s) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+ s->pid = pid;
+ s->valid = 1;
+
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_for_each_entry(s2, &listeners->list, list) {
+ if (s2->pid == pid && s2->valid)
+ goto exists;
+ }
+ list_add(&s->list, &listeners->list);
+ s = NULL;
+exists:
+ up_write(&listeners->sem);
+ kfree(s); /* nop if NULL */
+ }
+ return 0;
+ }
+
+ /* Deregister or cleanup */
+cleanup:
+ for_each_cpu(cpu, mask) {
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (s->pid == pid) {
+ list_del(&s->list);
+ kfree(s);
+ break;
+ }
+ }
+ up_write(&listeners->sem);
+ }
+ return ret;
+}
+
+static int parse(struct nlattr *na, struct cpumask *mask)
+{
+ char *data;
+ int len;
+ int ret;
+
+ if (na == NULL)
+ return 1;
+ len = nla_len(na);
+ if (len > TASKSTATS_CPUMASK_MAXLEN)
+ return -E2BIG;
+ if (len < 1)
+ return -EINVAL;
+ data = kmalloc(len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ nla_strlcpy(data, na, len);
+ ret = cpulist_parse(data, mask);
+ kfree(data);
+ return ret;
+}
+
+#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define TASKSTATS_NEEDS_PADDING 1
+#endif
+
+static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
+{
+ struct nlattr *na, *ret;
+ int aggr;
+
+ aggr = (type == TASKSTATS_TYPE_PID)
+ ? TASKSTATS_TYPE_AGGR_PID
+ : TASKSTATS_TYPE_AGGR_TGID;
+
+ /*
+ * The taskstats structure is internally aligned on 8 byte
+ * boundaries but the layout of the aggregrate reply, with
+ * two NLA headers and the pid (each 4 bytes), actually
+ * force the entire structure to be unaligned. This causes
+ * the kernel to issue unaligned access warnings on some
+ * architectures like ia64. Unfortunately, some software out there
+ * doesn't properly unroll the NLA packet and assumes that the start
+ * of the taskstats structure will always be 20 bytes from the start
+ * of the netlink payload. Aligning the start of the taskstats
+ * structure breaks this software, which we don't want. So, for now
+ * the alignment only happens on architectures that require it
+ * and those users will have to update to fixed versions of those
+ * packages. Space is reserved in the packet only when needed.
+ * This ifdef should be removed in several years e.g. 2012 once
+ * we can be confident that fixed versions are installed on most
+ * systems. We add the padding before the aggregate since the
+ * aggregate is already a defined type.
+ */
+#ifdef TASKSTATS_NEEDS_PADDING
+ if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
+ goto err;
+#endif
+ na = nla_nest_start(skb, aggr);
+ if (!na)
+ goto err;
+
+ if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
+ nla_nest_cancel(skb, na);
+ goto err;
+ }
+ ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+ if (!ret) {
+ nla_nest_cancel(skb, na);
+ goto err;
+ }
+ nla_nest_end(skb, na);
+
+ return nla_data(ret);
+err:
+ return NULL;
+}
+
+static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc = 0;
+ struct sk_buff *rep_skb;
+ struct cgroupstats *stats;
+ struct nlattr *na;
+ size_t size;
+ u32 fd;
+ struct fd f;
+
+ na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
+ if (!na)
+ return -EINVAL;
+
+ fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
+ f = fdget(fd);
+ if (!f.file)
+ return 0;
+
+ size = nla_total_size(sizeof(struct cgroupstats));
+
+ rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
+ size);
+ if (rc < 0)
+ goto err;
+
+ na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
+ sizeof(struct cgroupstats));
+ if (na == NULL) {
+ nlmsg_free(rep_skb);
+ rc = -EMSGSIZE;
+ goto err;
+ }
+
+ stats = nla_data(na);
+ memset(stats, 0, sizeof(*stats));
+
+ rc = cgroupstats_build(stats, f.file->f_path.dentry);
+ if (rc < 0) {
+ nlmsg_free(rep_skb);
+ goto err;
+ }
+
+ rc = send_reply(rep_skb, info);
+
+err:
+ fdput(f);
+ return rc;
+}
+
+static int cmd_attr_register_cpumask(struct genl_info *info)
+{
+ cpumask_var_t mask;
+ int rc;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
+ if (rc < 0)
+ goto out;
+ rc = add_del_listener(info->snd_portid, mask, REGISTER);
+out:
+ free_cpumask_var(mask);
+ return rc;
+}
+
+static int cmd_attr_deregister_cpumask(struct genl_info *info)
+{
+ cpumask_var_t mask;
+ int rc;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
+ if (rc < 0)
+ goto out;
+ rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
+out:
+ free_cpumask_var(mask);
+ return rc;
+}
+
+static size_t taskstats_packet_size(void)
+{
+ size_t size;
+
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+#ifdef TASKSTATS_NEEDS_PADDING
+ size += nla_total_size(0); /* Padding for alignment */
+#endif
+ return size;
+}
+
+static int cmd_attr_pid(struct genl_info *info)
+{
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ u32 pid;
+ int rc;
+
+ size = taskstats_packet_size();
+
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
+ if (rc < 0)
+ return rc;
+
+ rc = -EINVAL;
+ pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
+ if (!stats)
+ goto err;
+
+ rc = fill_stats_for_pid(pid, stats);
+ if (rc < 0)
+ goto err;
+ return send_reply(rep_skb, info);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+static int cmd_attr_tgid(struct genl_info *info)
+{
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ u32 tgid;
+ int rc;
+
+ size = taskstats_packet_size();
+
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
+ if (rc < 0)
+ return rc;
+
+ rc = -EINVAL;
+ tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ if (!stats)
+ goto err;
+
+ rc = fill_stats_for_tgid(tgid, stats);
+ if (rc < 0)
+ goto err;
+ return send_reply(rep_skb, info);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
+ return cmd_attr_register_cpumask(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
+ return cmd_attr_deregister_cpumask(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
+ return cmd_attr_pid(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
+ return cmd_attr_tgid(info);
+ else
+ return -EINVAL;
+}
+
+static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
+{
+ struct signal_struct *sig = tsk->signal;
+ struct taskstats *stats;
+
+ if (sig->stats || thread_group_empty(tsk))
+ goto ret;
+
+ /* No problem if kmem_cache_zalloc() fails */
+ stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!sig->stats) {
+ sig->stats = stats;
+ stats = NULL;
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (stats)
+ kmem_cache_free(taskstats_cache, stats);
+ret:
+ return sig->stats;
+}
+
+/* Send pid data out on exit */
+void taskstats_exit(struct task_struct *tsk, int group_dead)
+{
+ int rc;
+ struct listener_list *listeners;
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ int is_thread_group;
+
+ if (!family_registered)
+ return;
+
+ /*
+ * Size includes space for nested attributes
+ */
+ size = taskstats_packet_size();
+
+ is_thread_group = !!taskstats_tgid_alloc(tsk);
+ if (is_thread_group) {
+ /* PID + STATS + TGID + STATS */
+ size = 2 * size;
+ /* fill the tsk->signal->stats structure */
+ fill_tgid_exit(tsk);
+ }
+
+ listeners = raw_cpu_ptr(&listener_array);
+ if (list_empty(&listeners->list))
+ return;
+
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
+ if (rc < 0)
+ return;
+
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
+ task_pid_nr_ns(tsk, &init_pid_ns));
+ if (!stats)
+ goto err;
+
+ fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
+
+ /*
+ * Doesn't matter if tsk is the leader or the last group member leaving
+ */
+ if (!is_thread_group || !group_dead)
+ goto send;
+
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
+ task_tgid_nr_ns(tsk, &init_pid_ns));
+ if (!stats)
+ goto err;
+
+ memcpy(stats, tsk->signal->stats, sizeof(*stats));
+
+send:
+ send_cpu_listeners(rep_skb, listeners);
+ return;
+err:
+ nlmsg_free(rep_skb);
+}
+
+static const struct genl_ops taskstats_ops[] = {
+ {
+ .cmd = TASKSTATS_CMD_GET,
+ .doit = taskstats_user_cmd,
+ .policy = taskstats_cmd_get_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = CGROUPSTATS_CMD_GET,
+ .doit = cgroupstats_user_cmd,
+ .policy = cgroupstats_cmd_get_policy,
+ },
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+ unsigned int i;
+
+ taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
+ for_each_possible_cpu(i) {
+ INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+ init_rwsem(&(per_cpu(listener_array, i).sem));
+ }
+}
+
+static int __init taskstats_init(void)
+{
+ int rc;
+
+ rc = genl_register_family_with_ops(&family, taskstats_ops);
+ if (rc)
+ return rc;
+
+ family_registered = 1;
+ pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
+ return 0;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644
index 000000000..0dbab6d1a
--- /dev/null
+++ b/kernel/test_kprobes.c
@@ -0,0 +1,389 @@
+/*
+ * test_kprobes.c - simple sanity test for *probes
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "Kprobe smoke test: " fmt
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/random.h>
+
+#define div_factor 3
+
+static u32 rand1, preh_val, posth_val, jph_val;
+static int errors, handler_errors, num_tests;
+static u32 (*target)(u32 value);
+static u32 (*target2)(u32 value);
+
+static noinline u32 kprobe_target(u32 value)
+{
+ return (value / div_factor);
+}
+
+static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ preh_val = (rand1 / div_factor);
+ return 0;
+}
+
+static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+ if (preh_val != (rand1 / div_factor)) {
+ handler_errors++;
+ pr_err("incorrect value in post_handler\n");
+ }
+ posth_val = preh_val + div_factor;
+}
+
+static struct kprobe kp = {
+ .symbol_name = "kprobe_target",
+ .pre_handler = kp_pre_handler,
+ .post_handler = kp_post_handler
+};
+
+static int test_kprobe(void)
+{
+ int ret;
+
+ ret = register_kprobe(&kp);
+ if (ret < 0) {
+ pr_err("register_kprobe returned %d\n", ret);
+ return ret;
+ }
+
+ ret = target(rand1);
+ unregister_kprobe(&kp);
+
+ if (preh_val == 0) {
+ pr_err("kprobe pre_handler not called\n");
+ handler_errors++;
+ }
+
+ if (posth_val == 0) {
+ pr_err("kprobe post_handler not called\n");
+ handler_errors++;
+ }
+
+ return 0;
+}
+
+static noinline u32 kprobe_target2(u32 value)
+{
+ return (value / div_factor) + 1;
+}
+
+static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
+{
+ preh_val = (rand1 / div_factor) + 1;
+ return 0;
+}
+
+static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+ if (preh_val != (rand1 / div_factor) + 1) {
+ handler_errors++;
+ pr_err("incorrect value in post_handler2\n");
+ }
+ posth_val = preh_val + div_factor;
+}
+
+static struct kprobe kp2 = {
+ .symbol_name = "kprobe_target2",
+ .pre_handler = kp_pre_handler2,
+ .post_handler = kp_post_handler2
+};
+
+static int test_kprobes(void)
+{
+ int ret;
+ struct kprobe *kps[2] = {&kp, &kp2};
+
+ /* addr and flags should be cleard for reusing kprobe. */
+ kp.addr = NULL;
+ kp.flags = 0;
+ ret = register_kprobes(kps, 2);
+ if (ret < 0) {
+ pr_err("register_kprobes returned %d\n", ret);
+ return ret;
+ }
+
+ preh_val = 0;
+ posth_val = 0;
+ ret = target(rand1);
+
+ if (preh_val == 0) {
+ pr_err("kprobe pre_handler not called\n");
+ handler_errors++;
+ }
+
+ if (posth_val == 0) {
+ pr_err("kprobe post_handler not called\n");
+ handler_errors++;
+ }
+
+ preh_val = 0;
+ posth_val = 0;
+ ret = target2(rand1);
+
+ if (preh_val == 0) {
+ pr_err("kprobe pre_handler2 not called\n");
+ handler_errors++;
+ }
+
+ if (posth_val == 0) {
+ pr_err("kprobe post_handler2 not called\n");
+ handler_errors++;
+ }
+
+ unregister_kprobes(kps, 2);
+ return 0;
+
+}
+
+static u32 j_kprobe_target(u32 value)
+{
+ if (value != rand1) {
+ handler_errors++;
+ pr_err("incorrect value in jprobe handler\n");
+ }
+
+ jph_val = rand1;
+ jprobe_return();
+ return 0;
+}
+
+static struct jprobe jp = {
+ .entry = j_kprobe_target,
+ .kp.symbol_name = "kprobe_target"
+};
+
+static int test_jprobe(void)
+{
+ int ret;
+
+ ret = register_jprobe(&jp);
+ if (ret < 0) {
+ pr_err("register_jprobe returned %d\n", ret);
+ return ret;
+ }
+
+ ret = target(rand1);
+ unregister_jprobe(&jp);
+ if (jph_val == 0) {
+ pr_err("jprobe handler not called\n");
+ handler_errors++;
+ }
+
+ return 0;
+}
+
+static struct jprobe jp2 = {
+ .entry = j_kprobe_target,
+ .kp.symbol_name = "kprobe_target2"
+};
+
+static int test_jprobes(void)
+{
+ int ret;
+ struct jprobe *jps[2] = {&jp, &jp2};
+
+ /* addr and flags should be cleard for reusing kprobe. */
+ jp.kp.addr = NULL;
+ jp.kp.flags = 0;
+ ret = register_jprobes(jps, 2);
+ if (ret < 0) {
+ pr_err("register_jprobes returned %d\n", ret);
+ return ret;
+ }
+
+ jph_val = 0;
+ ret = target(rand1);
+ if (jph_val == 0) {
+ pr_err("jprobe handler not called\n");
+ handler_errors++;
+ }
+
+ jph_val = 0;
+ ret = target2(rand1);
+ if (jph_val == 0) {
+ pr_err("jprobe handler2 not called\n");
+ handler_errors++;
+ }
+ unregister_jprobes(jps, 2);
+
+ return 0;
+}
+#ifdef CONFIG_KRETPROBES
+static u32 krph_val;
+
+static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ krph_val = (rand1 / div_factor);
+ return 0;
+}
+
+static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ unsigned long ret = regs_return_value(regs);
+
+ if (ret != (rand1 / div_factor)) {
+ handler_errors++;
+ pr_err("incorrect value in kretprobe handler\n");
+ }
+ if (krph_val == 0) {
+ handler_errors++;
+ pr_err("call to kretprobe entry handler failed\n");
+ }
+
+ krph_val = rand1;
+ return 0;
+}
+
+static struct kretprobe rp = {
+ .handler = return_handler,
+ .entry_handler = entry_handler,
+ .kp.symbol_name = "kprobe_target"
+};
+
+static int test_kretprobe(void)
+{
+ int ret;
+
+ ret = register_kretprobe(&rp);
+ if (ret < 0) {
+ pr_err("register_kretprobe returned %d\n", ret);
+ return ret;
+ }
+
+ ret = target(rand1);
+ unregister_kretprobe(&rp);
+ if (krph_val != rand1) {
+ pr_err("kretprobe handler not called\n");
+ handler_errors++;
+ }
+
+ return 0;
+}
+
+static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ unsigned long ret = regs_return_value(regs);
+
+ if (ret != (rand1 / div_factor) + 1) {
+ handler_errors++;
+ pr_err("incorrect value in kretprobe handler2\n");
+ }
+ if (krph_val == 0) {
+ handler_errors++;
+ pr_err("call to kretprobe entry handler failed\n");
+ }
+
+ krph_val = rand1;
+ return 0;
+}
+
+static struct kretprobe rp2 = {
+ .handler = return_handler2,
+ .entry_handler = entry_handler,
+ .kp.symbol_name = "kprobe_target2"
+};
+
+static int test_kretprobes(void)
+{
+ int ret;
+ struct kretprobe *rps[2] = {&rp, &rp2};
+
+ /* addr and flags should be cleard for reusing kprobe. */
+ rp.kp.addr = NULL;
+ rp.kp.flags = 0;
+ ret = register_kretprobes(rps, 2);
+ if (ret < 0) {
+ pr_err("register_kretprobe returned %d\n", ret);
+ return ret;
+ }
+
+ krph_val = 0;
+ ret = target(rand1);
+ if (krph_val != rand1) {
+ pr_err("kretprobe handler not called\n");
+ handler_errors++;
+ }
+
+ krph_val = 0;
+ ret = target2(rand1);
+ if (krph_val != rand1) {
+ pr_err("kretprobe handler2 not called\n");
+ handler_errors++;
+ }
+ unregister_kretprobes(rps, 2);
+ return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
+int init_test_probes(void)
+{
+ int ret;
+
+ target = kprobe_target;
+ target2 = kprobe_target2;
+
+ do {
+ rand1 = prandom_u32();
+ } while (rand1 <= div_factor);
+
+ pr_info("started\n");
+ num_tests++;
+ ret = test_kprobe();
+ if (ret < 0)
+ errors++;
+
+ num_tests++;
+ ret = test_kprobes();
+ if (ret < 0)
+ errors++;
+
+ num_tests++;
+ ret = test_jprobe();
+ if (ret < 0)
+ errors++;
+
+ num_tests++;
+ ret = test_jprobes();
+ if (ret < 0)
+ errors++;
+
+#ifdef CONFIG_KRETPROBES
+ num_tests++;
+ ret = test_kretprobe();
+ if (ret < 0)
+ errors++;
+
+ num_tests++;
+ ret = test_kretprobes();
+ if (ret < 0)
+ errors++;
+#endif /* CONFIG_KRETPROBES */
+
+ if (errors)
+ pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
+ else if (handler_errors)
+ pr_err("BUG: %d error(s) running handlers\n", handler_errors);
+ else
+ pr_info("passed successfully\n");
+
+ return 0;
+}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 000000000..7ceb68656
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,199 @@
+#
+# Timer subsystem related configuration options
+#
+
+# Options selectable by arch Kconfig
+
+# Watchdog function for clocksources to detect instabilities
+config CLOCKSOURCE_WATCHDOG
+ bool
+
+# Architecture has extra clocksource data
+config ARCH_CLOCKSOURCE_DATA
+ bool
+
+# Clocksources require validation of the clocksource against the last
+# cycle update - x86/TSC misfeature
+config CLOCKSOURCE_VALIDATE_LAST_CYCLE
+ bool
+
+# Timekeeping vsyscall support
+config GENERIC_TIME_VSYSCALL
+ bool
+
+# Timekeeping vsyscall support
+config GENERIC_TIME_VSYSCALL_OLD
+ bool
+
+# Old style timekeeping
+config ARCH_USES_GETTIMEOFFSET
+ bool
+
+# The generic clock events infrastructure
+config GENERIC_CLOCKEVENTS
+ bool
+
+# Architecture can handle broadcast in a driver-agnostic way
+config ARCH_HAS_TICK_BROADCAST
+ bool
+
+# Clockevents broadcasting infrastructure
+config GENERIC_CLOCKEVENTS_BROADCAST
+ bool
+ depends on GENERIC_CLOCKEVENTS
+
+# Automatically adjust the min. reprogramming time for
+# clock event device
+config GENERIC_CLOCKEVENTS_MIN_ADJUST
+ bool
+
+# Generic update of CMOS clock
+config GENERIC_CMOS_UPDATE
+ bool
+
+if GENERIC_CLOCKEVENTS
+menu "Timers subsystem"
+
+# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
+# only related to the tick functionality. Oneshot clockevent devices
+# are supported independ of this.
+config TICK_ONESHOT
+ bool
+
+config NO_HZ_COMMON
+ bool
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+
+choice
+ prompt "Timer tick handling"
+ default NO_HZ_IDLE if NO_HZ
+
+config HZ_PERIODIC
+ bool "Periodic timer ticks (constant rate, no dynticks)"
+ help
+ This option keeps the tick running periodically at a constant
+ rate, even when the CPU doesn't need it.
+
+config NO_HZ_IDLE
+ bool "Idle dynticks system (tickless idle)"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select NO_HZ_COMMON
+ help
+ This option enables a tickless idle system: timer interrupts
+ will only trigger on an as-needed basis when the system is idle.
+ This is usually interesting for energy saving.
+
+ Most of the time you want to say Y here.
+
+config NO_HZ_FULL
+ bool "Full dynticks system (tickless)"
+ # NO_HZ_COMMON dependency
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS
+ # We need at least one periodic CPU for timekeeping
+ depends on SMP
+ # RCU_USER_QS dependency
+ depends on HAVE_CONTEXT_TRACKING
+ # VIRT_CPU_ACCOUNTING_GEN dependency
+ depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+ select NO_HZ_COMMON
+ select RCU_USER_QS
+ select RCU_NOCB_CPU
+ select VIRT_CPU_ACCOUNTING_GEN
+ select IRQ_WORK
+ help
+ Adaptively try to shutdown the tick whenever possible, even when
+ the CPU is running tasks. Typically this requires running a single
+ task on the CPU. Chances for running tickless are maximized when
+ the task mostly runs in userspace and has few kernel activity.
+
+ You need to fill up the nohz_full boot parameter with the
+ desired range of dynticks CPUs.
+
+ This is implemented at the expense of some overhead in user <-> kernel
+ transitions: syscalls, exceptions and interrupts. Even when it's
+ dynamically off.
+
+ Say N.
+
+endchoice
+
+config NO_HZ_FULL_ALL
+ bool "Full dynticks system on all CPUs by default (except CPU 0)"
+ depends on NO_HZ_FULL
+ help
+ If the user doesn't pass the nohz_full boot option to
+ define the range of full dynticks CPUs, consider that all
+ CPUs in the system are full dynticks by default.
+ Note the boot CPU will still be kept outside the range to
+ handle the timekeeping duty.
+
+config NO_HZ_FULL_SYSIDLE
+ bool "Detect full-system idle state for full dynticks system"
+ depends on NO_HZ_FULL
+ default n
+ help
+ At least one CPU must keep the scheduling-clock tick running for
+ timekeeping purposes whenever there is a non-idle CPU, where
+ "non-idle" also includes dynticks CPUs as long as they are
+ running non-idle tasks. Because the underlying adaptive-tick
+ support cannot distinguish between all CPUs being idle and
+ all CPUs each running a single task in dynticks mode, the
+ underlying support simply ensures that there is always a CPU
+ handling the scheduling-clock tick, whether or not all CPUs
+ are idle. This Kconfig option enables scalable detection of
+ the all-CPUs-idle state, thus allowing the scheduling-clock
+ tick to be disabled when all CPUs are idle. Note that scalable
+ detection of the all-CPUs-idle state means that larger systems
+ will be slower to declare the all-CPUs-idle state.
+
+ Say Y if you would like to help debug all-CPUs-idle detection.
+
+ Say N if you are unsure.
+
+config NO_HZ_FULL_SYSIDLE_SMALL
+ int "Number of CPUs above which large-system approach is used"
+ depends on NO_HZ_FULL_SYSIDLE
+ range 1 NR_CPUS
+ default 8
+ help
+ The full-system idle detection mechanism takes a lazy approach
+ on large systems, as is required to attain decent scalability.
+ However, on smaller systems, scalability is not anywhere near as
+ large a concern as is energy efficiency. The sysidle subsystem
+ therefore uses a fast but non-scalable algorithm for small
+ systems and a lazier but scalable algorithm for large systems.
+ This Kconfig parameter defines the number of CPUs in the largest
+ system that will be considered to be "small".
+
+ The default value will be fine in most cases. Battery-powered
+ systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
+ numbers of CPUs, and (3) are suffering from battery-lifetime
+ problems due to long sysidle latencies might wish to experiment
+ with larger values for this Kconfig parameter. On the other
+ hand, they might be even better served by disabling NO_HZ_FULL
+ entirely, given that NO_HZ_FULL is intended for HPC and
+ real-time workloads that at present do not tend to be run on
+ battery-powered systems.
+
+ Take the default if you are unsure.
+
+config NO_HZ
+ bool "Old Idle dynticks config"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ help
+ This is the old config entry that enables dynticks idle.
+ We keep it around for a little while to enforce backward
+ compatibility with older config files.
+
+config HIGH_RES_TIMERS
+ bool "High Resolution Timer Support"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select TICK_ONESHOT
+ help
+ This option enables high resolution timer support. If your
+ hardware is not capable then this option only increases
+ the size of the kernel image.
+
+endmenu
+endif
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000..01f031241
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1,31 @@
+obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
+obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o
+
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o
+ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
+ obj-y += tick-broadcast.o
+ obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
+endif
+obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
+obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
+obj-$(CONFIG_TIMER_STATS) += timer_stats.o
+obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
+obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
+targets += timeconst.h
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
+
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000..1b001ed1e
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,873 @@
+/*
+ * Alarmtimer interface
+ *
+ * This interface provides a timer which is similarto hrtimers,
+ * but triggers a RTC alarm if the box is suspend.
+ *
+ * This interface is influenced by the Android RTC Alarm timer
+ * interface.
+ *
+ * Copyright (C) 2010 IBM Corperation
+ *
+ * Author: John Stultz <john.stultz@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/time.h>
+#include <linux/hrtimer.h>
+#include <linux/timerqueue.h>
+#include <linux/rtc.h>
+#include <linux/alarmtimer.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/posix-timers.h>
+#include <linux/workqueue.h>
+#include <linux/freezer.h>
+
+/**
+ * struct alarm_base - Alarm timer bases
+ * @lock: Lock for syncrhonized access to the base
+ * @timerqueue: Timerqueue head managing the list of events
+ * @timer: hrtimer used to schedule events while running
+ * @gettime: Function to read the time correlating to the base
+ * @base_clockid: clockid for the base
+ */
+static struct alarm_base {
+ spinlock_t lock;
+ struct timerqueue_head timerqueue;
+ ktime_t (*gettime)(void);
+ clockid_t base_clockid;
+} alarm_bases[ALARM_NUMTYPE];
+
+/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
+static ktime_t freezer_delta;
+static DEFINE_SPINLOCK(freezer_delta_lock);
+
+static struct wakeup_source *ws;
+
+#ifdef CONFIG_RTC_CLASS
+/* rtc timer and device for setting alarm wakeups at suspend */
+static struct rtc_timer rtctimer;
+static struct rtc_device *rtcdev;
+static DEFINE_SPINLOCK(rtcdev_lock);
+
+/**
+ * alarmtimer_get_rtcdev - Return selected rtcdevice
+ *
+ * This function returns the rtc device to use for wakealarms.
+ * If one has not already been chosen, it checks to see if a
+ * functional rtc device is available.
+ */
+struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+ unsigned long flags;
+ struct rtc_device *ret;
+
+ spin_lock_irqsave(&rtcdev_lock, flags);
+ ret = rtcdev;
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
+
+static int alarmtimer_rtc_add_device(struct device *dev,
+ struct class_interface *class_intf)
+{
+ unsigned long flags;
+ struct rtc_device *rtc = to_rtc_device(dev);
+
+ if (rtcdev)
+ return -EBUSY;
+
+ if (!rtc->ops->set_alarm)
+ return -1;
+ if (!device_may_wakeup(rtc->dev.parent))
+ return -1;
+
+ spin_lock_irqsave(&rtcdev_lock, flags);
+ if (!rtcdev) {
+ rtcdev = rtc;
+ /* hold a reference so it doesn't go away */
+ get_device(dev);
+ }
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+ return 0;
+}
+
+static inline void alarmtimer_rtc_timer_init(void)
+{
+ rtc_timer_init(&rtctimer, NULL, NULL);
+}
+
+static struct class_interface alarmtimer_rtc_interface = {
+ .add_dev = &alarmtimer_rtc_add_device,
+};
+
+static int alarmtimer_rtc_interface_setup(void)
+{
+ alarmtimer_rtc_interface.class = rtc_class;
+ return class_interface_register(&alarmtimer_rtc_interface);
+}
+static void alarmtimer_rtc_interface_remove(void)
+{
+ class_interface_unregister(&alarmtimer_rtc_interface);
+}
+#else
+struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+ return NULL;
+}
+#define rtcdev (NULL)
+static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
+static inline void alarmtimer_rtc_interface_remove(void) { }
+static inline void alarmtimer_rtc_timer_init(void) { }
+#endif
+
+/**
+ * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
+ * @base: pointer to the base where the timer is being run
+ * @alarm: pointer to alarm being enqueued.
+ *
+ * Adds alarm to a alarm_base timerqueue
+ *
+ * Must hold base->lock when calling.
+ */
+static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
+{
+ if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
+ timerqueue_del(&base->timerqueue, &alarm->node);
+
+ timerqueue_add(&base->timerqueue, &alarm->node);
+ alarm->state |= ALARMTIMER_STATE_ENQUEUED;
+}
+
+/**
+ * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
+ * @base: pointer to the base where the timer is running
+ * @alarm: pointer to alarm being removed
+ *
+ * Removes alarm to a alarm_base timerqueue
+ *
+ * Must hold base->lock when calling.
+ */
+static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
+{
+ if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
+ return;
+
+ timerqueue_del(&base->timerqueue, &alarm->node);
+ alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
+}
+
+
+/**
+ * alarmtimer_fired - Handles alarm hrtimer being fired.
+ * @timer: pointer to hrtimer being run
+ *
+ * When a alarm timer fires, this runs through the timerqueue to
+ * see which alarms expired, and runs those. If there are more alarm
+ * timers queued for the future, we set the hrtimer to fire when
+ * when the next future alarm timer expires.
+ */
+static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
+{
+ struct alarm *alarm = container_of(timer, struct alarm, timer);
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+ int ret = HRTIMER_NORESTART;
+ int restart = ALARMTIMER_NORESTART;
+
+ spin_lock_irqsave(&base->lock, flags);
+ alarmtimer_dequeue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ if (alarm->function)
+ restart = alarm->function(alarm, base->gettime());
+
+ spin_lock_irqsave(&base->lock, flags);
+ if (restart != ALARMTIMER_NORESTART) {
+ hrtimer_set_expires(&alarm->timer, alarm->node.expires);
+ alarmtimer_enqueue(base, alarm);
+ ret = HRTIMER_RESTART;
+ }
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ return ret;
+
+}
+
+ktime_t alarm_expires_remaining(const struct alarm *alarm)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ return ktime_sub(alarm->node.expires, base->gettime());
+}
+EXPORT_SYMBOL_GPL(alarm_expires_remaining);
+
+#ifdef CONFIG_RTC_CLASS
+/**
+ * alarmtimer_suspend - Suspend time callback
+ * @dev: unused
+ * @state: unused
+ *
+ * When we are going into suspend, we look through the bases
+ * to see which is the soonest timer to expire. We then
+ * set an rtc timer to fire that far into the future, which
+ * will wake us from suspend.
+ */
+static int alarmtimer_suspend(struct device *dev)
+{
+ struct rtc_time tm;
+ ktime_t min, now;
+ unsigned long flags;
+ struct rtc_device *rtc;
+ int i;
+ int ret;
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ min = freezer_delta;
+ freezer_delta = ktime_set(0, 0);
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+
+ rtc = alarmtimer_get_rtcdev();
+ /* If we have no rtcdev, just return */
+ if (!rtc)
+ return 0;
+
+ /* Find the soonest timer to expire*/
+ for (i = 0; i < ALARM_NUMTYPE; i++) {
+ struct alarm_base *base = &alarm_bases[i];
+ struct timerqueue_node *next;
+ ktime_t delta;
+
+ spin_lock_irqsave(&base->lock, flags);
+ next = timerqueue_getnext(&base->timerqueue);
+ spin_unlock_irqrestore(&base->lock, flags);
+ if (!next)
+ continue;
+ delta = ktime_sub(next->expires, base->gettime());
+ if (!min.tv64 || (delta.tv64 < min.tv64))
+ min = delta;
+ }
+ if (min.tv64 == 0)
+ return 0;
+
+ if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ return -EBUSY;
+ }
+
+ /* Setup an rtc timer to fire that far in the future */
+ rtc_timer_cancel(rtc, &rtctimer);
+ rtc_read_time(rtc, &tm);
+ now = rtc_tm_to_ktime(tm);
+ now = ktime_add(now, min);
+
+ /* Set alarm, if in the past reject suspend briefly to handle */
+ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ if (ret < 0)
+ __pm_wakeup_event(ws, MSEC_PER_SEC);
+ return ret;
+}
+#else
+static int alarmtimer_suspend(struct device *dev)
+{
+ return 0;
+}
+#endif
+
+static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
+{
+ ktime_t delta;
+ unsigned long flags;
+ struct alarm_base *base = &alarm_bases[type];
+
+ delta = ktime_sub(absexp, base->gettime());
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
+ freezer_delta = delta;
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+}
+
+
+/**
+ * alarm_init - Initialize an alarm structure
+ * @alarm: ptr to alarm to be initialized
+ * @type: the type of the alarm
+ * @function: callback that is run when the alarm fires
+ */
+void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
+ enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
+{
+ timerqueue_init(&alarm->node);
+ hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
+ HRTIMER_MODE_ABS);
+ alarm->timer.function = alarmtimer_fired;
+ alarm->function = function;
+ alarm->type = type;
+ alarm->state = ALARMTIMER_STATE_INACTIVE;
+}
+EXPORT_SYMBOL_GPL(alarm_init);
+
+/**
+ * alarm_start - Sets an absolute alarm to fire
+ * @alarm: ptr to alarm to set
+ * @start: time to run the alarm
+ */
+int alarm_start(struct alarm *alarm, ktime_t start)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&base->lock, flags);
+ alarm->node.expires = start;
+ alarmtimer_enqueue(base, alarm);
+ ret = hrtimer_start(&alarm->timer, alarm->node.expires,
+ HRTIMER_MODE_ABS);
+ spin_unlock_irqrestore(&base->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(alarm_start);
+
+/**
+ * alarm_start_relative - Sets a relative alarm to fire
+ * @alarm: ptr to alarm to set
+ * @start: time relative to now to run the alarm
+ */
+int alarm_start_relative(struct alarm *alarm, ktime_t start)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+
+ start = ktime_add(start, base->gettime());
+ return alarm_start(alarm, start);
+}
+EXPORT_SYMBOL_GPL(alarm_start_relative);
+
+void alarm_restart(struct alarm *alarm)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+
+ spin_lock_irqsave(&base->lock, flags);
+ hrtimer_set_expires(&alarm->timer, alarm->node.expires);
+ hrtimer_restart(&alarm->timer);
+ alarmtimer_enqueue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
+}
+EXPORT_SYMBOL_GPL(alarm_restart);
+
+/**
+ * alarm_try_to_cancel - Tries to cancel an alarm timer
+ * @alarm: ptr to alarm to be canceled
+ *
+ * Returns 1 if the timer was canceled, 0 if it was not running,
+ * and -1 if the callback was running
+ */
+int alarm_try_to_cancel(struct alarm *alarm)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&base->lock, flags);
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
+ spin_unlock_irqrestore(&base->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
+
+
+/**
+ * alarm_cancel - Spins trying to cancel an alarm timer until it is done
+ * @alarm: ptr to alarm to be canceled
+ *
+ * Returns 1 if the timer was canceled, 0 if it was not active.
+ */
+int alarm_cancel(struct alarm *alarm)
+{
+ for (;;) {
+ int ret = alarm_try_to_cancel(alarm);
+ if (ret >= 0)
+ return ret;
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(alarm_cancel);
+
+
+u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
+{
+ u64 overrun = 1;
+ ktime_t delta;
+
+ delta = ktime_sub(now, alarm->node.expires);
+
+ if (delta.tv64 < 0)
+ return 0;
+
+ if (unlikely(delta.tv64 >= interval.tv64)) {
+ s64 incr = ktime_to_ns(interval);
+
+ overrun = ktime_divns(delta, incr);
+
+ alarm->node.expires = ktime_add_ns(alarm->node.expires,
+ incr*overrun);
+
+ if (alarm->node.expires.tv64 > now.tv64)
+ return overrun;
+ /*
+ * This (and the ktime_add() below) is the
+ * correction for exact:
+ */
+ overrun++;
+ }
+
+ alarm->node.expires = ktime_add(alarm->node.expires, interval);
+ return overrun;
+}
+EXPORT_SYMBOL_GPL(alarm_forward);
+
+u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+
+ return alarm_forward(alarm, base->gettime(), interval);
+}
+EXPORT_SYMBOL_GPL(alarm_forward_now);
+
+
+/**
+ * clock2alarm - helper that converts from clockid to alarmtypes
+ * @clockid: clockid.
+ */
+static enum alarmtimer_type clock2alarm(clockid_t clockid)
+{
+ if (clockid == CLOCK_REALTIME_ALARM)
+ return ALARM_REALTIME;
+ if (clockid == CLOCK_BOOTTIME_ALARM)
+ return ALARM_BOOTTIME;
+ return -1;
+}
+
+/**
+ * alarm_handle_timer - Callback for posix timers
+ * @alarm: alarm that fired
+ *
+ * Posix timer callback for expired alarm timers.
+ */
+static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
+ ktime_t now)
+{
+ unsigned long flags;
+ struct k_itimer *ptr = container_of(alarm, struct k_itimer,
+ it.alarm.alarmtimer);
+ enum alarmtimer_restart result = ALARMTIMER_NORESTART;
+
+ spin_lock_irqsave(&ptr->it_lock, flags);
+ if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
+ if (posix_timer_event(ptr, 0) != 0)
+ ptr->it_overrun++;
+ }
+
+ /* Re-add periodic timers */
+ if (ptr->it.alarm.interval.tv64) {
+ ptr->it_overrun += alarm_forward(alarm, now,
+ ptr->it.alarm.interval);
+ result = ALARMTIMER_RESTART;
+ }
+ spin_unlock_irqrestore(&ptr->it_lock, flags);
+
+ return result;
+}
+
+/**
+ * alarm_clock_getres - posix getres interface
+ * @which_clock: clockid
+ * @tp: timespec to fill
+ *
+ * Returns the granularity of underlying alarm base clock
+ */
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+{
+ clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
+
+ if (!alarmtimer_get_rtcdev())
+ return -EINVAL;
+
+ return hrtimer_get_res(baseid, tp);
+}
+
+/**
+ * alarm_clock_get - posix clock_get interface
+ * @which_clock: clockid
+ * @tp: timespec to fill.
+ *
+ * Provides the underlying alarm base time.
+ */
+static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+ struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+
+ if (!alarmtimer_get_rtcdev())
+ return -EINVAL;
+
+ *tp = ktime_to_timespec(base->gettime());
+ return 0;
+}
+
+/**
+ * alarm_timer_create - posix timer_create interface
+ * @new_timer: k_itimer pointer to manage
+ *
+ * Initializes the k_itimer structure.
+ */
+static int alarm_timer_create(struct k_itimer *new_timer)
+{
+ enum alarmtimer_type type;
+ struct alarm_base *base;
+
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
+ if (!capable(CAP_WAKE_ALARM))
+ return -EPERM;
+
+ type = clock2alarm(new_timer->it_clock);
+ base = &alarm_bases[type];
+ alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
+ return 0;
+}
+
+/**
+ * alarm_timer_get - posix timer_get interface
+ * @new_timer: k_itimer pointer
+ * @cur_setting: itimerspec data to fill
+ *
+ * Copies out the current itimerspec data
+ */
+static void alarm_timer_get(struct k_itimer *timr,
+ struct itimerspec *cur_setting)
+{
+ ktime_t relative_expiry_time =
+ alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
+
+ if (ktime_to_ns(relative_expiry_time) > 0) {
+ cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
+ } else {
+ cur_setting->it_value.tv_sec = 0;
+ cur_setting->it_value.tv_nsec = 0;
+ }
+
+ cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
+}
+
+/**
+ * alarm_timer_del - posix timer_del interface
+ * @timr: k_itimer pointer to be deleted
+ *
+ * Cancels any programmed alarms for the given timer.
+ */
+static int alarm_timer_del(struct k_itimer *timr)
+{
+ if (!rtcdev)
+ return -ENOTSUPP;
+
+ if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+ return TIMER_RETRY;
+
+ return 0;
+}
+
+/**
+ * alarm_timer_set - posix timer_set interface
+ * @timr: k_itimer pointer to be deleted
+ * @flags: timer flags
+ * @new_setting: itimerspec to be used
+ * @old_setting: itimerspec being replaced
+ *
+ * Sets the timer to new_setting, and starts the timer.
+ */
+static int alarm_timer_set(struct k_itimer *timr, int flags,
+ struct itimerspec *new_setting,
+ struct itimerspec *old_setting)
+{
+ ktime_t exp;
+
+ if (!rtcdev)
+ return -ENOTSUPP;
+
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
+ if (old_setting)
+ alarm_timer_get(timr, old_setting);
+
+ /* If the timer was already set, cancel it */
+ if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+ return TIMER_RETRY;
+
+ /* start the timer */
+ timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+ exp = timespec_to_ktime(new_setting->it_value);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now;
+
+ now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ alarm_start(&timr->it.alarm.alarmtimer, exp);
+ return 0;
+}
+
+/**
+ * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
+ * @alarm: ptr to alarm that fired
+ *
+ * Wakes up the task that set the alarmtimer
+ */
+static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
+ ktime_t now)
+{
+ struct task_struct *task = (struct task_struct *)alarm->data;
+
+ alarm->data = NULL;
+ if (task)
+ wake_up_process(task);
+ return ALARMTIMER_NORESTART;
+}
+
+/**
+ * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
+ * @alarm: ptr to alarmtimer
+ * @absexp: absolute expiration time
+ *
+ * Sets the alarm timer and sleeps until it is fired or interrupted.
+ */
+static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
+{
+ alarm->data = (void *)current;
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ alarm_start(alarm, absexp);
+ if (likely(alarm->data))
+ schedule();
+
+ alarm_cancel(alarm);
+ } while (alarm->data && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+
+ return (alarm->data == NULL);
+}
+
+
+/**
+ * update_rmtp - Update remaining timespec value
+ * @exp: expiration time
+ * @type: timer type
+ * @rmtp: user pointer to remaining timepsec value
+ *
+ * Helper function that fills in rmtp value with time between
+ * now and the exp value
+ */
+static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
+ struct timespec __user *rmtp)
+{
+ struct timespec rmt;
+ ktime_t rem;
+
+ rem = ktime_sub(exp, alarm_bases[type].gettime());
+
+ if (rem.tv64 <= 0)
+ return 0;
+ rmt = ktime_to_timespec(rem);
+
+ if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+ return -EFAULT;
+
+ return 1;
+
+}
+
+/**
+ * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
+ * @restart: ptr to restart block
+ *
+ * Handles restarted clock_nanosleep calls
+ */
+static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
+{
+ enum alarmtimer_type type = restart->nanosleep.clockid;
+ ktime_t exp;
+ struct timespec __user *rmtp;
+ struct alarm alarm;
+ int ret = 0;
+
+ exp.tv64 = restart->nanosleep.expires;
+ alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+
+ if (alarmtimer_do_nsleep(&alarm, exp))
+ goto out;
+
+ if (freezing(current))
+ alarmtimer_freezerset(exp, type);
+
+ rmtp = restart->nanosleep.rmtp;
+ if (rmtp) {
+ ret = update_rmtp(exp, type, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+
+ /* The other values in restart are already filled in */
+ ret = -ERESTART_RESTARTBLOCK;
+out:
+ return ret;
+}
+
+/**
+ * alarm_timer_nsleep - alarmtimer nanosleep
+ * @which_clock: clockid
+ * @flags: determins abstime or relative
+ * @tsreq: requested sleep time (abs or rel)
+ * @rmtp: remaining sleep time saved
+ *
+ * Handles clock_nanosleep calls against _ALARM clockids
+ */
+static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsreq, struct timespec __user *rmtp)
+{
+ enum alarmtimer_type type = clock2alarm(which_clock);
+ struct alarm alarm;
+ ktime_t exp;
+ int ret = 0;
+ struct restart_block *restart;
+
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
+ if (!capable(CAP_WAKE_ALARM))
+ return -EPERM;
+
+ alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+
+ exp = timespec_to_ktime(*tsreq);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now = alarm_bases[type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ if (alarmtimer_do_nsleep(&alarm, exp))
+ goto out;
+
+ if (freezing(current))
+ alarmtimer_freezerset(exp, type);
+
+ /* abs timers don't set remaining time or restart */
+ if (flags == TIMER_ABSTIME) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+
+ if (rmtp) {
+ ret = update_rmtp(exp, type, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+ restart = &current->restart_block;
+ restart->fn = alarm_timer_nsleep_restart;
+ restart->nanosleep.clockid = type;
+ restart->nanosleep.expires = exp.tv64;
+ restart->nanosleep.rmtp = rmtp;
+ ret = -ERESTART_RESTARTBLOCK;
+
+out:
+ return ret;
+}
+
+
+/* Suspend hook structures */
+static const struct dev_pm_ops alarmtimer_pm_ops = {
+ .suspend = alarmtimer_suspend,
+};
+
+static struct platform_driver alarmtimer_driver = {
+ .driver = {
+ .name = "alarmtimer",
+ .pm = &alarmtimer_pm_ops,
+ }
+};
+
+/**
+ * alarmtimer_init - Initialize alarm timer code
+ *
+ * This function initializes the alarm bases and registers
+ * the posix clock ids.
+ */
+static int __init alarmtimer_init(void)
+{
+ struct platform_device *pdev;
+ int error = 0;
+ int i;
+ struct k_clock alarm_clock = {
+ .clock_getres = alarm_clock_getres,
+ .clock_get = alarm_clock_get,
+ .timer_create = alarm_timer_create,
+ .timer_set = alarm_timer_set,
+ .timer_del = alarm_timer_del,
+ .timer_get = alarm_timer_get,
+ .nsleep = alarm_timer_nsleep,
+ };
+
+ alarmtimer_rtc_timer_init();
+
+ posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
+ posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+
+ /* Initialize alarm bases */
+ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
+ alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
+ alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+ for (i = 0; i < ALARM_NUMTYPE; i++) {
+ timerqueue_init_head(&alarm_bases[i].timerqueue);
+ spin_lock_init(&alarm_bases[i].lock);
+ }
+
+ error = alarmtimer_rtc_interface_setup();
+ if (error)
+ return error;
+
+ error = platform_driver_register(&alarmtimer_driver);
+ if (error)
+ goto out_if;
+
+ pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
+ if (IS_ERR(pdev)) {
+ error = PTR_ERR(pdev);
+ goto out_drv;
+ }
+ ws = wakeup_source_register("alarmtimer");
+ return 0;
+
+out_drv:
+ platform_driver_unregister(&alarmtimer_driver);
+out_if:
+ alarmtimer_rtc_interface_remove();
+ return error;
+}
+device_initcall(alarmtimer_init);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 000000000..637a09461
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,792 @@
+/*
+ * linux/kernel/time/clockevents.c
+ *
+ * This file contains functions which manage clock event devices.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/device.h>
+
+#include "tick-internal.h"
+
+/* The registered clock event devices */
+static LIST_HEAD(clockevent_devices);
+static LIST_HEAD(clockevents_released);
+/* Protection for the above */
+static DEFINE_RAW_SPINLOCK(clockevents_lock);
+/* Protection for unbind operations */
+static DEFINE_MUTEX(clockevents_mutex);
+
+struct ce_unbind {
+ struct clock_event_device *ce;
+ int res;
+};
+
+static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
+ bool ismax)
+{
+ u64 clc = (u64) latch << evt->shift;
+ u64 rnd;
+
+ if (unlikely(!evt->mult)) {
+ evt->mult = 1;
+ WARN_ON(1);
+ }
+ rnd = (u64) evt->mult - 1;
+
+ /*
+ * Upper bound sanity check. If the backwards conversion is
+ * not equal latch, we know that the above shift overflowed.
+ */
+ if ((clc >> evt->shift) != (u64)latch)
+ clc = ~0ULL;
+
+ /*
+ * Scaled math oddities:
+ *
+ * For mult <= (1 << shift) we can safely add mult - 1 to
+ * prevent integer rounding loss. So the backwards conversion
+ * from nsec to device ticks will be correct.
+ *
+ * For mult > (1 << shift), i.e. device frequency is > 1GHz we
+ * need to be careful. Adding mult - 1 will result in a value
+ * which when converted back to device ticks can be larger
+ * than latch by up to (mult - 1) >> shift. For the min_delta
+ * calculation we still want to apply this in order to stay
+ * above the minimum device ticks limit. For the upper limit
+ * we would end up with a latch value larger than the upper
+ * limit of the device, so we omit the add to stay below the
+ * device upper boundary.
+ *
+ * Also omit the add if it would overflow the u64 boundary.
+ */
+ if ((~0ULL - clc > rnd) &&
+ (!ismax || evt->mult <= (1ULL << evt->shift)))
+ clc += rnd;
+
+ do_div(clc, evt->mult);
+
+ /* Deltas less than 1usec are pointless noise */
+ return clc > 1000 ? clc : 1000;
+}
+
+/**
+ * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch: value to convert
+ * @evt: pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bound checked)
+ */
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+{
+ return cev_delta2ns(latch, evt, false);
+}
+EXPORT_SYMBOL_GPL(clockevent_delta2ns);
+
+static int __clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ /* Transition with legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* Legacy callback doesn't support new modes */
+ if (state > CLOCK_EVT_STATE_ONESHOT)
+ return -ENOSYS;
+ /*
+ * 'clock_event_state' and 'clock_event_mode' have 1-to-1
+ * mapping until *_ONESHOT, and so a simple cast will work.
+ */
+ dev->set_mode((enum clock_event_mode)state, dev);
+ dev->mode = (enum clock_event_mode)state;
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* Transition with new state-specific callbacks */
+ switch (state) {
+ case CLOCK_EVT_STATE_DETACHED:
+ /* The clockevent device is getting replaced. Shut it down. */
+
+ case CLOCK_EVT_STATE_SHUTDOWN:
+ return dev->set_state_shutdown(dev);
+
+ case CLOCK_EVT_STATE_PERIODIC:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
+ return -ENOSYS;
+ return dev->set_state_periodic(dev);
+
+ case CLOCK_EVT_STATE_ONESHOT:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return -ENOSYS;
+ return dev->set_state_oneshot(dev);
+
+ default:
+ return -ENOSYS;
+ }
+}
+
+/**
+ * clockevents_set_state - set the operating state of a clock event device
+ * @dev: device to modify
+ * @state: new state
+ *
+ * Must be called with interrupts disabled !
+ */
+void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state)
+{
+ if (dev->state != state) {
+ if (__clockevents_set_state(dev, state))
+ return;
+
+ dev->state = state;
+
+ /*
+ * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+ * on it, so fix it up and emit a warning:
+ */
+ if (state == CLOCK_EVT_STATE_ONESHOT) {
+ if (unlikely(!dev->mult)) {
+ dev->mult = 1;
+ WARN_ON(1);
+ }
+ }
+ }
+}
+
+/**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev: device to shutdown
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+ dev->next_event.tv64 = KTIME_MAX;
+}
+
+/**
+ * clockevents_tick_resume - Resume the tick device before using it again
+ * @dev: device to resume
+ */
+int clockevents_tick_resume(struct clock_event_device *dev)
+{
+ int ret = 0;
+
+ if (dev->set_mode) {
+ dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
+ dev->mode = CLOCK_EVT_MODE_RESUME;
+ } else if (dev->tick_resume) {
+ ret = dev->tick_resume(dev);
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
+
+/* Limit min_delta to a jiffie */
+#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
+
+/**
+ * clockevents_increase_min_delta - raise minimum delta of a clock event device
+ * @dev: device to increase the minimum delta
+ *
+ * Returns 0 on success, -ETIME when the minimum delta reached the limit.
+ */
+static int clockevents_increase_min_delta(struct clock_event_device *dev)
+{
+ /* Nothing to do if we already reached the limit */
+ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
+ printk_deferred(KERN_WARNING
+ "CE: Reprogramming failure. Giving up\n");
+ dev->next_event.tv64 = KTIME_MAX;
+ return -ETIME;
+ }
+
+ if (dev->min_delta_ns < 5000)
+ dev->min_delta_ns = 5000;
+ else
+ dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+ if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+ dev->min_delta_ns = MIN_DELTA_LIMIT;
+
+ printk_deferred(KERN_WARNING
+ "CE: %s increased min_delta_ns to %llu nsec\n",
+ dev->name ? dev->name : "?",
+ (unsigned long long) dev->min_delta_ns);
+ return 0;
+}
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev: device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+ unsigned long long clc;
+ int64_t delta;
+ int i;
+
+ for (i = 0;;) {
+ delta = dev->min_delta_ns;
+ dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ return 0;
+
+ dev->retries++;
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ if (dev->set_next_event((unsigned long) clc, dev) == 0)
+ return 0;
+
+ if (++i > 2) {
+ /*
+ * We tried 3 times to program the device with the
+ * given min_delta_ns. Try to increase the minimum
+ * delta, if that fails as well get out of here.
+ */
+ if (clockevents_increase_min_delta(dev))
+ return -ETIME;
+ i = 0;
+ }
+ }
+}
+
+#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev: device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+ unsigned long long clc;
+ int64_t delta;
+
+ delta = dev->min_delta_ns;
+ dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ return 0;
+
+ dev->retries++;
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ return dev->set_next_event((unsigned long) clc, dev);
+}
+
+#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
+/**
+ * clockevents_program_event - Reprogram the clock event device.
+ * @dev: device to program
+ * @expires: absolute expiry time (monotonic clock)
+ * @force: program minimum delay if expires can not be set
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
+ */
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+ bool force)
+{
+ unsigned long long clc;
+ int64_t delta;
+ int rc;
+
+ if (unlikely(expires.tv64 < 0)) {
+ WARN_ON_ONCE(1);
+ return -ETIME;
+ }
+
+ dev->next_event = expires;
+
+ if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+ return 0;
+
+ /* Shortcut for clockevent devices that can deal with ktime. */
+ if (dev->features & CLOCK_EVT_FEAT_KTIME)
+ return dev->set_next_ktime(expires, dev);
+
+ delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
+ if (delta <= 0)
+ return force ? clockevents_program_min_delta(dev) : -ETIME;
+
+ delta = min(delta, (int64_t) dev->max_delta_ns);
+ delta = max(delta, (int64_t) dev->min_delta_ns);
+
+ clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ rc = dev->set_next_event((unsigned long) clc, dev);
+
+ return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+}
+
+/*
+ * Called after a notify add to make devices available which were
+ * released from the notifier call.
+ */
+static void clockevents_notify_released(void)
+{
+ struct clock_event_device *dev;
+
+ while (!list_empty(&clockevents_released)) {
+ dev = list_entry(clockevents_released.next,
+ struct clock_event_device, list);
+ list_del(&dev->list);
+ list_add(&dev->list, &clockevent_devices);
+ tick_check_new_device(dev);
+ }
+}
+
+/*
+ * Try to install a replacement clock event device
+ */
+static int clockevents_replace(struct clock_event_device *ced)
+{
+ struct clock_event_device *dev, *newdev = NULL;
+
+ list_for_each_entry(dev, &clockevent_devices, list) {
+ if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
+ continue;
+
+ if (!tick_check_replacement(newdev, dev))
+ continue;
+
+ if (!try_module_get(dev->owner))
+ continue;
+
+ if (newdev)
+ module_put(newdev->owner);
+ newdev = dev;
+ }
+ if (newdev) {
+ tick_install_replacement(newdev);
+ list_del_init(&ced->list);
+ }
+ return newdev ? 0 : -EBUSY;
+}
+
+/*
+ * Called with clockevents_mutex and clockevents_lock held
+ */
+static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
+{
+ /* Fast track. Device is unused */
+ if (ced->state == CLOCK_EVT_STATE_DETACHED) {
+ list_del_init(&ced->list);
+ return 0;
+ }
+
+ return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY;
+}
+
+/*
+ * SMP function call to unbind a device
+ */
+static void __clockevents_unbind(void *arg)
+{
+ struct ce_unbind *cu = arg;
+ int res;
+
+ raw_spin_lock(&clockevents_lock);
+ res = __clockevents_try_unbind(cu->ce, smp_processor_id());
+ if (res == -EAGAIN)
+ res = clockevents_replace(cu->ce);
+ cu->res = res;
+ raw_spin_unlock(&clockevents_lock);
+}
+
+/*
+ * Issues smp function call to unbind a per cpu device. Called with
+ * clockevents_mutex held.
+ */
+static int clockevents_unbind(struct clock_event_device *ced, int cpu)
+{
+ struct ce_unbind cu = { .ce = ced, .res = -ENODEV };
+
+ smp_call_function_single(cpu, __clockevents_unbind, &cu, 1);
+ return cu.res;
+}
+
+/*
+ * Unbind a clockevents device.
+ */
+int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
+{
+ int ret;
+
+ mutex_lock(&clockevents_mutex);
+ ret = clockevents_unbind(ced, cpu);
+ mutex_unlock(&clockevents_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(clockevents_unbind_device);
+
+/* Sanity check of state transition callbacks */
+static int clockevents_sanity_check(struct clock_event_device *dev)
+{
+ /* Legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* We shouldn't be supporting new modes now */
+ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
+ dev->set_state_shutdown || dev->tick_resume);
+
+ BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* New state-specific callbacks */
+ if (!dev->set_state_shutdown)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+ !dev->set_state_periodic)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+ !dev->set_state_oneshot)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * clockevents_register_device - register a clock event device
+ * @dev: device to register
+ */
+void clockevents_register_device(struct clock_event_device *dev)
+{
+ unsigned long flags;
+
+ BUG_ON(clockevents_sanity_check(dev));
+
+ /* Initialize state to DETACHED */
+ dev->state = CLOCK_EVT_STATE_DETACHED;
+
+ if (!dev->cpumask) {
+ WARN_ON(num_possible_cpus() > 1);
+ dev->cpumask = cpumask_of(smp_processor_id());
+ }
+
+ raw_spin_lock_irqsave(&clockevents_lock, flags);
+
+ list_add(&dev->list, &clockevent_devices);
+ tick_check_new_device(dev);
+ clockevents_notify_released();
+
+ raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+}
+EXPORT_SYMBOL_GPL(clockevents_register_device);
+
+void clockevents_config(struct clock_event_device *dev, u32 freq)
+{
+ u64 sec;
+
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return;
+
+ /*
+ * Calculate the maximum number of seconds we can sleep. Limit
+ * to 10 minutes for hardware which can program more than
+ * 32bit ticks so we still get reasonable conversion values.
+ */
+ sec = dev->max_delta_ticks;
+ do_div(sec, freq);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
+ sec = 600;
+
+ clockevents_calc_mult_shift(dev, freq, sec);
+ dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
+ dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
+}
+
+/**
+ * clockevents_config_and_register - Configure and register a clock event device
+ * @dev: device to register
+ * @freq: The clock frequency
+ * @min_delta: The minimum clock ticks to program in oneshot mode
+ * @max_delta: The maximum clock ticks to program in oneshot mode
+ *
+ * min/max_delta can be 0 for devices which do not support oneshot mode.
+ */
+void clockevents_config_and_register(struct clock_event_device *dev,
+ u32 freq, unsigned long min_delta,
+ unsigned long max_delta)
+{
+ dev->min_delta_ticks = min_delta;
+ dev->max_delta_ticks = max_delta;
+ clockevents_config(dev, freq);
+ clockevents_register_device(dev);
+}
+EXPORT_SYMBOL_GPL(clockevents_config_and_register);
+
+int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ clockevents_config(dev, freq);
+
+ if (dev->state == CLOCK_EVT_STATE_ONESHOT)
+ return clockevents_program_event(dev, dev->next_event, false);
+
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
+ return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+
+ return 0;
+}
+
+/**
+ * clockevents_update_freq - Update frequency and reprogram a clock event device.
+ * @dev: device to modify
+ * @freq: new device frequency
+ *
+ * Reconfigure and reprogram a clock event device in oneshot
+ * mode. Must be called on the cpu for which the device delivers per
+ * cpu timer events. If called for the broadcast device the core takes
+ * care of serialization.
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
+ */
+int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = tick_broadcast_update_freq(dev, freq);
+ if (ret == -ENODEV)
+ ret = __clockevents_update_freq(dev, freq);
+ local_irq_restore(flags);
+ return ret;
+}
+
+/*
+ * Noop handler when we shut down an event device
+ */
+void clockevents_handle_noop(struct clock_event_device *dev)
+{
+}
+
+/**
+ * clockevents_exchange_device - release and request clock devices
+ * @old: device to release (can be NULL)
+ * @new: device to request (can be NULL)
+ *
+ * Called from various tick functions with clockevents_lock held and
+ * interrupts disabled.
+ */
+void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new)
+{
+ /*
+ * Caller releases a clock event device. We queue it into the
+ * released list and do a notify add later.
+ */
+ if (old) {
+ module_put(old->owner);
+ clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
+ list_del(&old->list);
+ list_add(&old->list, &clockevents_released);
+ }
+
+ if (new) {
+ BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
+ clockevents_shutdown(new);
+ }
+}
+
+/**
+ * clockevents_suspend - suspend clock devices
+ */
+void clockevents_suspend(void)
+{
+ struct clock_event_device *dev;
+
+ list_for_each_entry_reverse(dev, &clockevent_devices, list)
+ if (dev->suspend)
+ dev->suspend(dev);
+}
+
+/**
+ * clockevents_resume - resume clock devices
+ */
+void clockevents_resume(void)
+{
+ struct clock_event_device *dev;
+
+ list_for_each_entry(dev, &clockevent_devices, list)
+ if (dev->resume)
+ dev->resume(dev);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/**
+ * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
+ */
+void tick_cleanup_dead_cpu(int cpu)
+{
+ struct clock_event_device *dev, *tmp;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&clockevents_lock, flags);
+
+ tick_shutdown_broadcast_oneshot(cpu);
+ tick_shutdown_broadcast(cpu);
+ tick_shutdown(cpu);
+ /*
+ * Unregister the clock event devices which were
+ * released from the users in the notify chain.
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+ list_del(&dev->list);
+ /*
+ * Now check whether the CPU has left unused per cpu devices
+ */
+ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+ if (cpumask_test_cpu(cpu, dev->cpumask) &&
+ cpumask_weight(dev->cpumask) == 1 &&
+ !tick_is_broadcast_device(dev)) {
+ BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
+ list_del(&dev->list);
+ }
+ }
+ raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+}
+#endif
+
+#ifdef CONFIG_SYSFS
+struct bus_type clockevents_subsys = {
+ .name = "clockevents",
+ .dev_name = "clockevent",
+};
+
+static DEFINE_PER_CPU(struct device, tick_percpu_dev);
+static struct tick_device *tick_get_tick_dev(struct device *dev);
+
+static ssize_t sysfs_show_current_tick_dev(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct tick_device *td;
+ ssize_t count = 0;
+
+ raw_spin_lock_irq(&clockevents_lock);
+ td = tick_get_tick_dev(dev);
+ if (td && td->evtdev)
+ count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
+ raw_spin_unlock_irq(&clockevents_lock);
+ return count;
+}
+static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
+
+/* We don't support the abomination of removable broadcast devices */
+static ssize_t sysfs_unbind_tick_dev(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ char name[CS_NAME_LEN];
+ ssize_t ret = sysfs_get_uname(buf, name, count);
+ struct clock_event_device *ce;
+
+ if (ret < 0)
+ return ret;
+
+ ret = -ENODEV;
+ mutex_lock(&clockevents_mutex);
+ raw_spin_lock_irq(&clockevents_lock);
+ list_for_each_entry(ce, &clockevent_devices, list) {
+ if (!strcmp(ce->name, name)) {
+ ret = __clockevents_try_unbind(ce, dev->id);
+ break;
+ }
+ }
+ raw_spin_unlock_irq(&clockevents_lock);
+ /*
+ * We hold clockevents_mutex, so ce can't go away
+ */
+ if (ret == -EAGAIN)
+ ret = clockevents_unbind(ce, dev->id);
+ mutex_unlock(&clockevents_mutex);
+ return ret ? ret : count;
+}
+static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+static struct device tick_bc_dev = {
+ .init_name = "broadcast",
+ .id = 0,
+ .bus = &clockevents_subsys,
+};
+
+static struct tick_device *tick_get_tick_dev(struct device *dev)
+{
+ return dev == &tick_bc_dev ? tick_get_broadcast_device() :
+ &per_cpu(tick_cpu_device, dev->id);
+}
+
+static __init int tick_broadcast_init_sysfs(void)
+{
+ int err = device_register(&tick_bc_dev);
+
+ if (!err)
+ err = device_create_file(&tick_bc_dev, &dev_attr_current_device);
+ return err;
+}
+#else
+static struct tick_device *tick_get_tick_dev(struct device *dev)
+{
+ return &per_cpu(tick_cpu_device, dev->id);
+}
+static inline int tick_broadcast_init_sysfs(void) { return 0; }
+#endif
+
+static int __init tick_init_sysfs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct device *dev = &per_cpu(tick_percpu_dev, cpu);
+ int err;
+
+ dev->id = cpu;
+ dev->bus = &clockevents_subsys;
+ err = device_register(dev);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_current_device);
+ if (!err)
+ err = device_create_file(dev, &dev_attr_unbind_device);
+ if (err)
+ return err;
+ }
+ return tick_broadcast_init_sysfs();
+}
+
+static int __init clockevents_init_sysfs(void)
+{
+ int err = subsys_system_register(&clockevents_subsys, NULL);
+
+ if (!err)
+ err = tick_init_sysfs();
+ return err;
+}
+device_initcall(clockevents_init_sysfs);
+#endif /* SYSFS */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000..15facb1b9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,1020 @@
+/*
+ * linux/kernel/time/clocksource.c
+ *
+ * This file contains the functions which manage clocksource drivers.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ * o Allow clocksource drivers to be unregistered
+ */
+
+#include <linux/device.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
+#include <linux/tick.h>
+#include <linux/kthread.h>
+
+#include "tick-internal.h"
+#include "timekeeping_internal.h"
+
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult: pointer to mult variable
+ * @shift: pointer to shift variable
+ * @from: frequency to convert from
+ * @to: frequency to convert to
+ * @maxsec: guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
+ * event @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @maxsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by chosing smaller mult and shift
+ * factors.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
+{
+ u64 tmp;
+ u32 sft, sftacc= 32;
+
+ /*
+ * Calculate the shift factor which is limiting the conversion
+ * range:
+ */
+ tmp = ((u64)maxsec * from) >> 32;
+ while (tmp) {
+ tmp >>=1;
+ sftacc--;
+ }
+
+ /*
+ * Find the conversion shift/mult pair which has the best
+ * accuracy and fits the maxsec conversion range:
+ */
+ for (sft = 32; sft > 0; sft--) {
+ tmp = (u64) to << sft;
+ tmp += from / 2;
+ do_div(tmp, from);
+ if ((tmp >> sftacc) == 0)
+ break;
+ }
+ *mult = tmp;
+ *shift = sft;
+}
+
+/*[Clocksource internal variables]---------
+ * curr_clocksource:
+ * currently selected clocksource.
+ * clocksource_list:
+ * linked list with the registered clocksources
+ * clocksource_mutex:
+ * protects manipulations to curr_clocksource and the clocksource_list
+ * override_name:
+ * Name of the user-specified clocksource.
+ */
+static struct clocksource *curr_clocksource;
+static LIST_HEAD(clocksource_list);
+static DEFINE_MUTEX(clocksource_mutex);
+static char override_name[CS_NAME_LEN];
+static int finished_booting;
+
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static void clocksource_watchdog_work(struct work_struct *work);
+static void clocksource_select(void);
+
+static LIST_HEAD(watchdog_list);
+static struct clocksource *watchdog;
+static struct timer_list watchdog_timer;
+static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
+static DEFINE_SPINLOCK(watchdog_lock);
+static int watchdog_running;
+static atomic_t watchdog_reset_pending;
+
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
+/*
+ * Interval: 0.5sec Threshold: 0.0625s
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+ /*
+ * If kthread_run fails the next watchdog scan over the
+ * watchdog_list will find the unstable clock again.
+ */
+ kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+
+static void __clocksource_unstable(struct clocksource *cs)
+{
+ cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+ cs->flags |= CLOCK_SOURCE_UNSTABLE;
+ if (finished_booting)
+ schedule_work(&watchdog_work);
+}
+
+/**
+ * clocksource_mark_unstable - mark clocksource unstable via watchdog
+ * @cs: clocksource to be marked unstable
+ *
+ * This function is called instead of clocksource_change_rating from
+ * cpu hotplug code to avoid a deadlock between the clocksource mutex
+ * and the cpu hotplug mutex. It defers the update of the clocksource
+ * to the watchdog thread.
+ */
+void clocksource_mark_unstable(struct clocksource *cs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
+ if (list_empty(&cs->wd_list))
+ list_add(&cs->wd_list, &watchdog_list);
+ __clocksource_unstable(cs);
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_watchdog(unsigned long data)
+{
+ struct clocksource *cs;
+ cycle_t csnow, wdnow, cslast, wdlast, delta;
+ int64_t wd_nsec, cs_nsec;
+ int next_cpu, reset_pending;
+
+ spin_lock(&watchdog_lock);
+ if (!watchdog_running)
+ goto out;
+
+ reset_pending = atomic_read(&watchdog_reset_pending);
+
+ list_for_each_entry(cs, &watchdog_list, wd_list) {
+
+ /* Clocksource already marked unstable? */
+ if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+ if (finished_booting)
+ schedule_work(&watchdog_work);
+ continue;
+ }
+
+ local_irq_disable();
+ csnow = cs->read(cs);
+ wdnow = watchdog->read(watchdog);
+ local_irq_enable();
+
+ /* Clocksource initialized ? */
+ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
+ atomic_read(&watchdog_reset_pending)) {
+ cs->flags |= CLOCK_SOURCE_WATCHDOG;
+ cs->wd_last = wdnow;
+ cs->cs_last = csnow;
+ continue;
+ }
+
+ delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
+ wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
+ watchdog->shift);
+
+ delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wdlast = cs->wd_last; /* save these in case we print them */
+ cslast = cs->cs_last;
+ cs->cs_last = csnow;
+ cs->wd_last = wdnow;
+
+ if (atomic_read(&watchdog_reset_pending))
+ continue;
+
+ /* Check the deviation from the watchdog clocksource. */
+ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, csnow, cslast, cs->mask);
+ __clocksource_unstable(cs);
+ continue;
+ }
+
+ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+ (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+ (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+ /* Mark it valid for high-res. */
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+
+ /*
+ * clocksource_done_booting() will sort it if
+ * finished_booting is not set yet.
+ */
+ if (!finished_booting)
+ continue;
+
+ /*
+ * If this is not the current clocksource let
+ * the watchdog thread reselect it. Due to the
+ * change to high res this clocksource might
+ * be preferred now. If it is the current
+ * clocksource let the tick code know about
+ * that change.
+ */
+ if (cs != curr_clocksource) {
+ cs->flags |= CLOCK_SOURCE_RESELECT;
+ schedule_work(&watchdog_work);
+ } else {
+ tick_clock_notify();
+ }
+ }
+ }
+
+ /*
+ * We only clear the watchdog_reset_pending, when we did a
+ * full cycle through all clocksources.
+ */
+ if (reset_pending)
+ atomic_dec(&watchdog_reset_pending);
+
+ /*
+ * Cycle through CPUs to check if the CPUs stay synchronized
+ * to each other.
+ */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+out:
+ spin_unlock(&watchdog_lock);
+}
+
+static inline void clocksource_start_watchdog(void)
+{
+ if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+ return;
+ init_timer(&watchdog_timer);
+ watchdog_timer.function = clocksource_watchdog;
+ watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+ watchdog_running = 1;
+}
+
+static inline void clocksource_stop_watchdog(void)
+{
+ if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+ return;
+ del_timer(&watchdog_timer);
+ watchdog_running = 0;
+}
+
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+static void clocksource_resume_watchdog(void)
+{
+ atomic_inc(&watchdog_reset_pending);
+}
+
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+ /* cs is a clocksource to be watched. */
+ list_add(&cs->wd_list, &watchdog_list);
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+ } else {
+ /* cs is a watchdog. */
+ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ /* Pick the best watchdog. */
+ if (!watchdog || cs->rating > watchdog->rating) {
+ watchdog = cs;
+ /* Reset watchdog cycles */
+ clocksource_reset_watchdog();
+ }
+ }
+ /* Check if the watchdog timer needs to be started. */
+ clocksource_start_watchdog();
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_dequeue_watchdog(struct clocksource *cs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ if (cs != watchdog) {
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+ /* cs is a watched clocksource. */
+ list_del_init(&cs->wd_list);
+ /* Check if the watchdog timer needs to be stopped. */
+ clocksource_stop_watchdog();
+ }
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static int __clocksource_watchdog_kthread(void)
+{
+ struct clocksource *cs, *tmp;
+ unsigned long flags;
+ LIST_HEAD(unstable);
+ int select = 0;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
+ if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+ list_del_init(&cs->wd_list);
+ list_add(&cs->wd_list, &unstable);
+ select = 1;
+ }
+ if (cs->flags & CLOCK_SOURCE_RESELECT) {
+ cs->flags &= ~CLOCK_SOURCE_RESELECT;
+ select = 1;
+ }
+ }
+ /* Check if the watchdog timer needs to be stopped. */
+ clocksource_stop_watchdog();
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+
+ /* Needs to be done outside of watchdog lock */
+ list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
+ list_del_init(&cs->wd_list);
+ __clocksource_change_rating(cs, 0);
+ }
+ return select;
+}
+
+static int clocksource_watchdog_kthread(void *data)
+{
+ mutex_lock(&clocksource_mutex);
+ if (__clocksource_watchdog_kthread())
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+ return 0;
+}
+
+static bool clocksource_is_watchdog(struct clocksource *cs)
+{
+ return cs == watchdog;
+}
+
+#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
+{
+ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+}
+
+static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
+static inline void clocksource_resume_watchdog(void) { }
+static inline int __clocksource_watchdog_kthread(void) { return 0; }
+static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
+void clocksource_mark_unstable(struct clocksource *cs) { }
+
+#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+/**
+ * clocksource_suspend - suspend the clocksource(s)
+ */
+void clocksource_suspend(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry_reverse(cs, &clocksource_list, list)
+ if (cs->suspend)
+ cs->suspend(cs);
+}
+
+/**
+ * clocksource_resume - resume the clocksource(s)
+ */
+void clocksource_resume(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &clocksource_list, list)
+ if (cs->resume)
+ cs->resume(cs);
+
+ clocksource_resume_watchdog();
+}
+
+/**
+ * clocksource_touch_watchdog - Update watchdog
+ *
+ * Update the watchdog after exception contexts such as kgdb so as not
+ * to incorrectly trip the watchdog. This might fail when the kernel
+ * was stopped in code which holds watchdog_lock.
+ */
+void clocksource_touch_watchdog(void)
+{
+ clocksource_resume_watchdog();
+}
+
+/**
+ * clocksource_max_adjustment- Returns max adjustment amount
+ * @cs: Pointer to clocksource
+ *
+ */
+static u32 clocksource_max_adjustment(struct clocksource *cs)
+{
+ u64 ret;
+ /*
+ * We won't try to correct for more than 11% adjustments (110,000 ppm),
+ */
+ ret = (u64)cs->mult * 11;
+ do_div(ret,100);
+ return (u32)ret;
+}
+
+/**
+ * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
+ * @mult: cycle to nanosecond multiplier
+ * @shift: cycle to nanosecond divisor (power of two)
+ * @maxadj: maximum adjustment value to mult (~11%)
+ * @mask: bitmask for two's complement subtraction of non 64 bit counters
+ * @max_cyc: maximum cycle value before potential overflow (does not include
+ * any safety margin)
+ *
+ * NOTE: This function includes a safety margin of 50%, in other words, we
+ * return half the number of nanoseconds the hardware counter can technically
+ * cover. This is done so that we can potentially detect problems caused by
+ * delayed timers or bad hardware, which might result in time intervals that
+ * are larger then what the math used can handle without overflows.
+ */
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
+{
+ u64 max_nsecs, max_cycles;
+
+ /*
+ * Calculate the maximum number of cycles that we can pass to the
+ * cyc2ns() function without overflowing a 64-bit result.
+ */
+ max_cycles = ULLONG_MAX;
+ do_div(max_cycles, mult+maxadj);
+
+ /*
+ * The actual maximum number of cycles we can defer the clocksource is
+ * determined by the minimum of max_cycles and mask.
+ * Note: Here we subtract the maxadj to make sure we don't sleep for
+ * too long if there's a large negative adjustment.
+ */
+ max_cycles = min(max_cycles, mask);
+ max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+
+ /* return the max_cycles value as well if requested */
+ if (max_cyc)
+ *max_cyc = max_cycles;
+
+ /* Return 50% of the actual maximum, so we can detect bad values */
+ max_nsecs >>= 1;
+
+ return max_nsecs;
+}
+
+/**
+ * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
+ * @cs: Pointer to clocksource to be updated
+ *
+ */
+static inline void clocksource_update_max_deferment(struct clocksource *cs)
+{
+ cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
+ cs->maxadj, cs->mask,
+ &cs->max_cycles);
+}
+
+#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
+
+static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
+{
+ struct clocksource *cs;
+
+ if (!finished_booting || list_empty(&clocksource_list))
+ return NULL;
+
+ /*
+ * We pick the clocksource with the highest rating. If oneshot
+ * mode is active, we pick the highres valid clocksource with
+ * the best rating.
+ */
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (skipcur && cs == curr_clocksource)
+ continue;
+ if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+ continue;
+ return cs;
+ }
+ return NULL;
+}
+
+static void __clocksource_select(bool skipcur)
+{
+ bool oneshot = tick_oneshot_mode_active();
+ struct clocksource *best, *cs;
+
+ /* Find the best suitable clocksource */
+ best = clocksource_find_best(oneshot, skipcur);
+ if (!best)
+ return;
+
+ /* Check for the override clocksource. */
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (skipcur && cs == curr_clocksource)
+ continue;
+ if (strcmp(cs->name, override_name) != 0)
+ continue;
+ /*
+ * Check to make sure we don't switch to a non-highres
+ * capable clocksource if the tick code is in oneshot
+ * mode (highres or nohz)
+ */
+ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
+ /* Override clocksource cannot be used. */
+ printk(KERN_WARNING "Override clocksource %s is not "
+ "HRT compatible. Cannot switch while in "
+ "HRT/NOHZ mode\n", cs->name);
+ override_name[0] = 0;
+ } else
+ /* Override clocksource can be used. */
+ best = cs;
+ break;
+ }
+
+ if (curr_clocksource != best && !timekeeping_notify(best)) {
+ pr_info("Switched to clocksource %s\n", best->name);
+ curr_clocksource = best;
+ }
+}
+
+/**
+ * clocksource_select - Select the best clocksource available
+ *
+ * Private function. Must hold clocksource_mutex when called.
+ *
+ * Select the clocksource with the best rating, or the clocksource,
+ * which is selected by userspace override.
+ */
+static void clocksource_select(void)
+{
+ return __clocksource_select(false);
+}
+
+static void clocksource_select_fallback(void)
+{
+ return __clocksource_select(true);
+}
+
+#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
+
+static inline void clocksource_select(void) { }
+static inline void clocksource_select_fallback(void) { }
+
+#endif
+
+/*
+ * clocksource_done_booting - Called near the end of core bootup
+ *
+ * Hack to avoid lots of clocksource churn at boot time.
+ * We use fs_initcall because we want this to start before
+ * device_initcall but after subsys_initcall.
+ */
+static int __init clocksource_done_booting(void)
+{
+ mutex_lock(&clocksource_mutex);
+ curr_clocksource = clocksource_default_clock();
+ finished_booting = 1;
+ /*
+ * Run the watchdog first to eliminate unstable clock sources
+ */
+ __clocksource_watchdog_kthread();
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+ return 0;
+}
+fs_initcall(clocksource_done_booting);
+
+/*
+ * Enqueue the clocksource sorted by rating
+ */
+static void clocksource_enqueue(struct clocksource *cs)
+{
+ struct list_head *entry = &clocksource_list;
+ struct clocksource *tmp;
+
+ list_for_each_entry(tmp, &clocksource_list, list)
+ /* Keep track of the place, where to insert */
+ if (tmp->rating >= cs->rating)
+ entry = &tmp->list;
+ list_add(&cs->list, entry);
+}
+
+/**
+ * __clocksource_update_freq_scale - Used update clocksource with new freq
+ * @cs: clocksource to be registered
+ * @scale: Scale factor multiplied against freq to get clocksource hz
+ * @freq: clocksource frequency (cycles per second) divided by scale
+ *
+ * This should only be called from the clocksource->enable() method.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
+ * functions.
+ */
+void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+ u64 sec;
+
+ /*
+ * Default clocksources are *special* and self-define their mult/shift.
+ * But, you're not special, so you should specify a freq value.
+ */
+ if (freq) {
+ /*
+ * Calc the maximum number of seconds which we can run before
+ * wrapping around. For clocksources which have a mask > 32-bit
+ * we need to limit the max sleep time to have a good
+ * conversion precision. 10 minutes is still a reasonable
+ * amount. That results in a shift value of 24 for a
+ * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
+ * ~ 0.06ppm granularity for NTP.
+ */
+ sec = cs->mask;
+ do_div(sec, freq);
+ do_div(sec, scale);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && cs->mask > UINT_MAX)
+ sec = 600;
+
+ clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+ NSEC_PER_SEC / scale, sec * scale);
+ }
+ /*
+ * Ensure clocksources that have large 'mult' values don't overflow
+ * when adjusted.
+ */
+ cs->maxadj = clocksource_max_adjustment(cs);
+ while (freq && ((cs->mult + cs->maxadj < cs->mult)
+ || (cs->mult - cs->maxadj > cs->mult))) {
+ cs->mult >>= 1;
+ cs->shift--;
+ cs->maxadj = clocksource_max_adjustment(cs);
+ }
+
+ /*
+ * Only warn for *special* clocksources that self-define
+ * their mult/shift values and don't specify a freq.
+ */
+ WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+ "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
+ cs->name);
+
+ clocksource_update_max_deferment(cs);
+
+ pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+ cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
+}
+EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
+
+/**
+ * __clocksource_register_scale - Used to install new clocksources
+ * @cs: clocksource to be registered
+ * @scale: Scale factor multiplied against freq to get clocksource hz
+ * @freq: clocksource frequency (cycles per second) divided by scale
+ *
+ * Returns -EBUSY if registration fails, zero otherwise.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_register_hz() or clocksource_register_khz helper functions.
+ */
+int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+
+ /* Initialize mult/shift and max_idle_ns */
+ __clocksource_update_freq_scale(cs, scale, freq);
+
+ /* Add clocksource to the clocksource list */
+ mutex_lock(&clocksource_mutex);
+ clocksource_enqueue(cs);
+ clocksource_enqueue_watchdog(cs);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__clocksource_register_scale);
+
+static void __clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ list_del(&cs->list);
+ cs->rating = rating;
+ clocksource_enqueue(cs);
+}
+
+/**
+ * clocksource_change_rating - Change the rating of a registered clocksource
+ * @cs: clocksource to be changed
+ * @rating: new rating
+ */
+void clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ mutex_lock(&clocksource_mutex);
+ __clocksource_change_rating(cs, rating);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+}
+EXPORT_SYMBOL(clocksource_change_rating);
+
+/*
+ * Unbind clocksource @cs. Called with clocksource_mutex held
+ */
+static int clocksource_unbind(struct clocksource *cs)
+{
+ /*
+ * I really can't convince myself to support this on hardware
+ * designed by lobotomized monkeys.
+ */
+ if (clocksource_is_watchdog(cs))
+ return -EBUSY;
+
+ if (cs == curr_clocksource) {
+ /* Select and try to install a replacement clock source */
+ clocksource_select_fallback();
+ if (curr_clocksource == cs)
+ return -EBUSY;
+ }
+ clocksource_dequeue_watchdog(cs);
+ list_del_init(&cs->list);
+ return 0;
+}
+
+/**
+ * clocksource_unregister - remove a registered clocksource
+ * @cs: clocksource to be unregistered
+ */
+int clocksource_unregister(struct clocksource *cs)
+{
+ int ret = 0;
+
+ mutex_lock(&clocksource_mutex);
+ if (!list_empty(&cs->list))
+ ret = clocksource_unbind(cs);
+ mutex_unlock(&clocksource_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(clocksource_unregister);
+
+#ifdef CONFIG_SYSFS
+/**
+ * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * @dev: unused
+ * @attr: unused
+ * @buf: char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing current clocksource.
+ */
+static ssize_t
+sysfs_show_current_clocksources(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t count = 0;
+
+ mutex_lock(&clocksource_mutex);
+ count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
+ mutex_unlock(&clocksource_mutex);
+
+ return count;
+}
+
+ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
+{
+ size_t ret = cnt;
+
+ /* strings from sysfs write are not 0 terminated! */
+ if (!cnt || cnt >= CS_NAME_LEN)
+ return -EINVAL;
+
+ /* strip of \n: */
+ if (buf[cnt-1] == '\n')
+ cnt--;
+ if (cnt > 0)
+ memcpy(dst, buf, cnt);
+ dst[cnt] = 0;
+ return ret;
+}
+
+/**
+ * sysfs_override_clocksource - interface for manually overriding clocksource
+ * @dev: unused
+ * @attr: unused
+ * @buf: name of override clocksource
+ * @count: length of buffer
+ *
+ * Takes input from sysfs interface for manually overriding the default
+ * clocksource selection.
+ */
+static ssize_t sysfs_override_clocksource(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ mutex_lock(&clocksource_mutex);
+
+ ret = sysfs_get_uname(buf, override_name, count);
+ if (ret >= 0)
+ clocksource_select();
+
+ mutex_unlock(&clocksource_mutex);
+
+ return ret;
+}
+
+/**
+ * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
+ * @dev: unused
+ * @attr: unused
+ * @buf: unused
+ * @count: length of buffer
+ *
+ * Takes input from sysfs interface for manually unbinding a clocksource.
+ */
+static ssize_t sysfs_unbind_clocksource(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct clocksource *cs;
+ char name[CS_NAME_LEN];
+ ssize_t ret;
+
+ ret = sysfs_get_uname(buf, name, count);
+ if (ret < 0)
+ return ret;
+
+ ret = -ENODEV;
+ mutex_lock(&clocksource_mutex);
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (strcmp(cs->name, name))
+ continue;
+ ret = clocksource_unbind(cs);
+ break;
+ }
+ mutex_unlock(&clocksource_mutex);
+
+ return ret ? ret : count;
+}
+
+/**
+ * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * @dev: unused
+ * @attr: unused
+ * @buf: char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing registered clocksources
+ */
+static ssize_t
+sysfs_show_available_clocksources(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct clocksource *src;
+ ssize_t count = 0;
+
+ mutex_lock(&clocksource_mutex);
+ list_for_each_entry(src, &clocksource_list, list) {
+ /*
+ * Don't show non-HRES clocksource if the tick code is
+ * in one shot mode (highres=on or nohz=on)
+ */
+ if (!tick_oneshot_mode_active() ||
+ (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
+ count += snprintf(buf + count,
+ max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
+ "%s ", src->name);
+ }
+ mutex_unlock(&clocksource_mutex);
+
+ count += snprintf(buf + count,
+ max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
+
+ return count;
+}
+
+/*
+ * Sysfs setup bits:
+ */
+static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+ sysfs_override_clocksource);
+
+static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
+
+static DEVICE_ATTR(available_clocksource, 0444,
+ sysfs_show_available_clocksources, NULL);
+
+static struct bus_type clocksource_subsys = {
+ .name = "clocksource",
+ .dev_name = "clocksource",
+};
+
+static struct device device_clocksource = {
+ .id = 0,
+ .bus = &clocksource_subsys,
+};
+
+static int __init init_clocksource_sysfs(void)
+{
+ int error = subsys_system_register(&clocksource_subsys, NULL);
+
+ if (!error)
+ error = device_register(&device_clocksource);
+ if (!error)
+ error = device_create_file(
+ &device_clocksource,
+ &dev_attr_current_clocksource);
+ if (!error)
+ error = device_create_file(&device_clocksource,
+ &dev_attr_unbind_clocksource);
+ if (!error)
+ error = device_create_file(
+ &device_clocksource,
+ &dev_attr_available_clocksource);
+ return error;
+}
+
+device_initcall(init_clocksource_sysfs);
+#endif /* CONFIG_SYSFS */
+
+/**
+ * boot_override_clocksource - boot clock override
+ * @str: override name
+ *
+ * Takes a clocksource= boot argument and uses it
+ * as the clocksource override name.
+ */
+static int __init boot_override_clocksource(char* str)
+{
+ mutex_lock(&clocksource_mutex);
+ if (str)
+ strlcpy(override_name, str, sizeof(override_name));
+ mutex_unlock(&clocksource_mutex);
+ return 1;
+}
+
+__setup("clocksource=", boot_override_clocksource);
+
+/**
+ * boot_override_clock - Compatibility layer for deprecated boot option
+ * @str: override name
+ *
+ * DEPRECATED! Takes a clock= boot argument and uses it
+ * as the clocksource override name
+ */
+static int __init boot_override_clock(char* str)
+{
+ if (!strcmp(str, "pmtmr")) {
+ printk("Warning: clock=pmtmr is deprecated. "
+ "Use clocksource=acpi_pm.\n");
+ return boot_override_clocksource("acpi_pm");
+ }
+ printk("Warning! clock= boot option is deprecated. "
+ "Use clocksource=xyz\n");
+ return boot_override_clocksource(str);
+}
+
+__setup("clock=", boot_override_clock);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
new file mode 100644
index 000000000..93ef7190b
--- /dev/null
+++ b/kernel/time/hrtimer.c
@@ -0,0 +1,1852 @@
+/*
+ * linux/kernel/hrtimer.c
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
+ *
+ * High-resolution kernel timers
+ *
+ * In contrast to the low-resolution timeout API implemented in
+ * kernel/timer.c, hrtimers provide finer resolution and accuracy
+ * depending on system configuration and capabilities.
+ *
+ * These timers are currently used for:
+ * - itimers
+ * - POSIX timers
+ * - nanosleep
+ * - precise in-kernel timing
+ *
+ * Started by: Thomas Gleixner and Ingo Molnar
+ *
+ * Credits:
+ * based on kernel/timer.c
+ *
+ * Help, testing, suggestions, bugfixes, improvements were
+ * provided by:
+ *
+ * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
+ * et. al.
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/tick.h>
+#include <linux/seq_file.h>
+#include <linux/err.h>
+#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
+#include <linux/timer.h>
+#include <linux/freezer.h>
+
+#include <asm/uaccess.h>
+
+#include <trace/events/timer.h>
+
+#include "tick-internal.h"
+
+/*
+ * The timer bases:
+ *
+ * There are more clockids then hrtimer bases. Thus, we index
+ * into the timer bases by the hrtimer_base_type enum. When trying
+ * to reach a base using a clockid, hrtimer_clockid_to_base()
+ * is used to convert from clockid to the proper hrtimer_base_type.
+ */
+DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
+{
+
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+ .clock_base =
+ {
+ {
+ .index = HRTIMER_BASE_MONOTONIC,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = HRTIMER_BASE_REALTIME,
+ .clockid = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = HRTIMER_BASE_BOOTTIME,
+ .clockid = CLOCK_BOOTTIME,
+ .get_time = &ktime_get_boottime,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = HRTIMER_BASE_TAI,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ .resolution = KTIME_LOW_RES,
+ },
+ }
+};
+
+static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
+ [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
+ [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
+ [CLOCK_TAI] = HRTIMER_BASE_TAI,
+};
+
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+ return hrtimer_clock_to_base_table[clock_id];
+}
+
+
+/*
+ * Get the coarse grained time at the softirq based on xtime and
+ * wall_to_monotonic.
+ */
+static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
+{
+ ktime_t xtim, mono, boot, tai;
+ ktime_t off_real, off_boot, off_tai;
+
+ mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
+ boot = ktime_add(mono, off_boot);
+ xtim = ktime_add(mono, off_real);
+ tai = ktime_add(mono, off_tai);
+
+ base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
+ base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
+ base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
+ base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
+}
+
+/*
+ * Functions and macros which are different for UP/SMP systems are kept in a
+ * single place
+ */
+#ifdef CONFIG_SMP
+
+/*
+ * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on the lists/queues.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static
+struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
+{
+ struct hrtimer_clock_base *base;
+
+ for (;;) {
+ base = timer->base;
+ if (likely(base != NULL)) {
+ raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
+ if (likely(base == timer->base))
+ return base;
+ /* The timer has migrated to another CPU: */
+ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
+ }
+ cpu_relax();
+ }
+}
+
+/*
+ * With HIGHRES=y we do not migrate the timer when it is expiring
+ * before the next event on the target cpu because we cannot reprogram
+ * the target cpu hardware and we would cause it to fire late.
+ *
+ * Called with cpu_base->lock of target cpu held.
+ */
+static int
+hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ ktime_t expires;
+
+ if (!new_base->cpu_base->hres_active)
+ return 0;
+
+ expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
+ return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
+#else
+ return 0;
+#endif
+}
+
+/*
+ * Switch the timer base to the current CPU when possible.
+ */
+static inline struct hrtimer_clock_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ int pinned)
+{
+ struct hrtimer_clock_base *new_base;
+ struct hrtimer_cpu_base *new_cpu_base;
+ int this_cpu = smp_processor_id();
+ int cpu = get_nohz_timer_target(pinned);
+ int basenum = base->index;
+
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
+ new_base = &new_cpu_base->clock_base[basenum];
+
+ if (base != new_base) {
+ /*
+ * We are trying to move timer to new_base.
+ * However we can't change timer's base while it is running,
+ * so we keep it on the same CPU. No hassle vs. reprogramming
+ * the event source in the high resolution case. The softirq
+ * code will take care of this when the timer function has
+ * completed. There is no conflict as we hold the lock until
+ * the timer is enqueued.
+ */
+ if (unlikely(hrtimer_callback_running(timer)))
+ return base;
+
+ /* See the comment in lock_timer_base() */
+ timer->base = NULL;
+ raw_spin_unlock(&base->cpu_base->lock);
+ raw_spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raw_spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
+ }
+ timer->base = new_base;
+ } else {
+ if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
+ cpu = this_cpu;
+ goto again;
+ }
+ }
+ return new_base;
+}
+
+#else /* CONFIG_SMP */
+
+static inline struct hrtimer_clock_base *
+lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+{
+ struct hrtimer_clock_base *base = timer->base;
+
+ raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
+
+ return base;
+}
+
+# define switch_hrtimer_base(t, b, p) (b)
+
+#endif /* !CONFIG_SMP */
+
+/*
+ * Functions for the union type storage format of ktime_t which are
+ * too large for inlining:
+ */
+#if BITS_PER_LONG < 64
+/*
+ * Divide a ktime value by a nanosecond value
+ */
+s64 __ktime_divns(const ktime_t kt, s64 div)
+{
+ int sft = 0;
+ s64 dclc;
+ u64 tmp;
+
+ dclc = ktime_to_ns(kt);
+ tmp = dclc < 0 ? -dclc : dclc;
+
+ /* Make sure the divisor is less than 2^32: */
+ while (div >> 32) {
+ sft++;
+ div >>= 1;
+ }
+ tmp >>= sft;
+ do_div(tmp, (unsigned long) div);
+ return dclc < 0 ? -tmp : tmp;
+}
+EXPORT_SYMBOL_GPL(__ktime_divns);
+#endif /* BITS_PER_LONG >= 64 */
+
+/*
+ * Add two ktime values and do a safety check for overflow:
+ */
+ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
+{
+ ktime_t res = ktime_add(lhs, rhs);
+
+ /*
+ * We use KTIME_SEC_MAX here, the maximum timeout which we can
+ * return to user space in a timespec:
+ */
+ if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
+ res = ktime_set(KTIME_SEC_MAX, 0);
+
+ return res;
+}
+
+EXPORT_SYMBOL_GPL(ktime_add_safe);
+
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+
+static struct debug_obj_descr hrtimer_debug_descr;
+
+static void *hrtimer_debug_hint(void *addr)
+{
+ return ((struct hrtimer *) addr)->function;
+}
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+{
+ struct hrtimer *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ hrtimer_cancel(timer);
+ debug_object_init(timer, &hrtimer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ WARN_ON_ONCE(1);
+ return 0;
+
+ case ODEBUG_STATE_ACTIVE:
+ WARN_ON(1);
+
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+{
+ struct hrtimer *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ hrtimer_cancel(timer);
+ debug_object_free(timer, &hrtimer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static struct debug_obj_descr hrtimer_debug_descr = {
+ .name = "hrtimer",
+ .debug_hint = hrtimer_debug_hint,
+ .fixup_init = hrtimer_fixup_init,
+ .fixup_activate = hrtimer_fixup_activate,
+ .fixup_free = hrtimer_fixup_free,
+};
+
+static inline void debug_hrtimer_init(struct hrtimer *timer)
+{
+ debug_object_init(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_activate(struct hrtimer *timer)
+{
+ debug_object_activate(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
+{
+ debug_object_deactivate(timer, &hrtimer_debug_descr);
+}
+
+static inline void debug_hrtimer_free(struct hrtimer *timer)
+{
+ debug_object_free(timer, &hrtimer_debug_descr);
+}
+
+static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode);
+
+void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ debug_object_init_on_stack(timer, &hrtimer_debug_descr);
+ __hrtimer_init(timer, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
+
+void destroy_hrtimer_on_stack(struct hrtimer *timer)
+{
+ debug_object_free(timer, &hrtimer_debug_descr);
+}
+
+#else
+static inline void debug_hrtimer_init(struct hrtimer *timer) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
+#endif
+
+static inline void
+debug_init(struct hrtimer *timer, clockid_t clockid,
+ enum hrtimer_mode mode)
+{
+ debug_hrtimer_init(timer);
+ trace_hrtimer_init(timer, clockid, mode);
+}
+
+static inline void debug_activate(struct hrtimer *timer)
+{
+ debug_hrtimer_activate(timer);
+ trace_hrtimer_start(timer);
+}
+
+static inline void debug_deactivate(struct hrtimer *timer)
+{
+ debug_hrtimer_deactivate(timer);
+ trace_hrtimer_cancel(timer);
+}
+
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ struct hrtimer_clock_base *base = cpu_base->clock_base;
+ ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
+ int i;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+ struct timerqueue_node *next;
+ struct hrtimer *timer;
+
+ next = timerqueue_getnext(&base->active);
+ if (!next)
+ continue;
+
+ timer = container_of(next, struct hrtimer, node);
+ expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ if (expires.tv64 < expires_next.tv64)
+ expires_next = expires;
+ }
+ /*
+ * clock_was_set() might have changed base->offset of any of
+ * the clock bases so the result might be negative. Fix it up
+ * to prevent a false positive in clockevents_program_event().
+ */
+ if (expires_next.tv64 < 0)
+ expires_next.tv64 = 0;
+ return expires_next;
+}
+#endif
+
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer enabled ?
+ */
+static int hrtimer_hres_enabled __read_mostly = 1;
+
+/*
+ * Enable / Disable high resolution mode
+ */
+static int __init setup_hrtimer_hres(char *str)
+{
+ if (!strcmp(str, "off"))
+ hrtimer_hres_enabled = 0;
+ else if (!strcmp(str, "on"))
+ hrtimer_hres_enabled = 1;
+ else
+ return 0;
+ return 1;
+}
+
+__setup("highres=", setup_hrtimer_hres);
+
+/*
+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
+ */
+static inline int hrtimer_is_hres_enabled(void)
+{
+ return hrtimer_hres_enabled;
+}
+
+/*
+ * Is the high resolution mode active ?
+ */
+static inline int hrtimer_hres_active(void)
+{
+ return __this_cpu_read(hrtimer_bases.hres_active);
+}
+
+/*
+ * Reprogram the event source with checking both queues for the
+ * next event
+ * Called with interrupts disabled and base->lock held
+ */
+static void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+{
+ ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+
+ if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
+ return;
+
+ cpu_base->expires_next.tv64 = expires_next.tv64;
+
+ /*
+ * If a hang was detected in the last timer interrupt then we
+ * leave the hang delay active in the hardware. We want the
+ * system to make progress. That also prevents the following
+ * scenario:
+ * T1 expires 50ms from now
+ * T2 expires 5s from now
+ *
+ * T1 is removed, so this code is called and would reprogram
+ * the hardware to 5s from now. Any hrtimer_start after that
+ * will not reprogram the hardware due to hang_detected being
+ * set. So we'd effectivly block all timers until the T2 event
+ * fires.
+ */
+ if (cpu_base->hang_detected)
+ return;
+
+ if (cpu_base->expires_next.tv64 != KTIME_MAX)
+ tick_program_event(cpu_base->expires_next, 1);
+}
+
+/*
+ * Shared reprogramming for clock_realtime and clock_monotonic
+ *
+ * When a timer is enqueued and expires earlier than the already enqueued
+ * timers, we have to check, whether it expires earlier than the timer for
+ * which the clock event device was armed.
+ *
+ * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
+ * and no expiry check happens. The timer gets enqueued into the rbtree. The
+ * reprogramming and expiry check is done in the hrtimer_interrupt or in the
+ * softirq.
+ *
+ * Called with interrupts disabled and base->cpu_base.lock held
+ */
+static int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ int res;
+
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+
+ /*
+ * When the callback is running, we do not reprogram the clock event
+ * device. The timer callback is either running on a different CPU or
+ * the callback is executed in the hrtimer_interrupt context. The
+ * reprogramming is handled either by the softirq, which called the
+ * callback or at the end of the hrtimer_interrupt.
+ */
+ if (hrtimer_callback_running(timer))
+ return 0;
+
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Nothing wrong
+ * about that, just avoid to call into the tick code, which
+ * has now objections against negative expiry values.
+ */
+ if (expires.tv64 < 0)
+ return -ETIME;
+
+ if (expires.tv64 >= cpu_base->expires_next.tv64)
+ return 0;
+
+ /*
+ * When the target cpu of the timer is currently executing
+ * hrtimer_interrupt(), then we do not touch the clock event
+ * device. hrtimer_interrupt() will reevaluate all clock bases
+ * before reprogramming the device.
+ */
+ if (cpu_base->in_hrtirq)
+ return 0;
+
+ /*
+ * If a hang was detected in the last timer interrupt then we
+ * do not schedule a timer which is earlier than the expiry
+ * which we enforced in the hang detection. We want the system
+ * to make progress.
+ */
+ if (cpu_base->hang_detected)
+ return 0;
+
+ /*
+ * Clockevents returns -ETIME, when the event was in the past.
+ */
+ res = tick_program_event(expires, 0);
+ if (!IS_ERR_VALUE(res))
+ cpu_base->expires_next = expires;
+ return res;
+}
+
+/*
+ * Initialize the high resolution related parts of cpu_base
+ */
+static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
+{
+ base->expires_next.tv64 = KTIME_MAX;
+ base->hres_active = 0;
+}
+
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+ ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+ return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
+}
+
+/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+ struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
+
+ if (!hrtimer_hres_active())
+ return;
+
+ raw_spin_lock(&base->lock);
+ hrtimer_update_base(base);
+ hrtimer_force_reprogram(base, 0);
+ raw_spin_unlock(&base->lock);
+}
+
+/*
+ * Switch to high resolution mode
+ */
+static int hrtimer_switch_to_hres(void)
+{
+ int i, cpu = smp_processor_id();
+ struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
+ unsigned long flags;
+
+ if (base->hres_active)
+ return 1;
+
+ local_irq_save(flags);
+
+ if (tick_init_highres()) {
+ local_irq_restore(flags);
+ printk(KERN_WARNING "Could not switch to high resolution "
+ "mode on CPU %d\n", cpu);
+ return 0;
+ }
+ base->hres_active = 1;
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ base->clock_base[i].resolution = KTIME_HIGH_RES;
+
+ tick_setup_sched_timer();
+ /* "Retrigger" the interrupt to get things going */
+ retrigger_next_event(NULL);
+ local_irq_restore(flags);
+ return 1;
+}
+
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set();
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
+/*
+ * Called from timekeeping and resume code to reprogramm the hrtimer
+ * interrupt device on all cpus.
+ */
+void clock_was_set_delayed(void)
+{
+ schedule_work(&hrtimer_work);
+}
+
+#else
+
+static inline int hrtimer_hres_active(void) { return 0; }
+static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ return 0;
+}
+static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
+static inline void retrigger_next_event(void *arg) { }
+
+#endif /* CONFIG_HIGH_RES_TIMERS */
+
+/*
+ * Clock realtime was set
+ *
+ * Change the offset of the realtime clock vs. the monotonic
+ * clock.
+ *
+ * We might have to reprogram the high resolution timer interrupt. On
+ * SMP we call the architecture specific code to retrigger _all_ high
+ * resolution timer interrupts. On UP we just disable interrupts and
+ * call the high resolution interrupt code.
+ */
+void clock_was_set(void)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ /* Retrigger the CPU local events everywhere */
+ on_each_cpu(retrigger_next_event, NULL, 1);
+#endif
+ timerfd_clock_was_set();
+}
+
+/*
+ * During resume we might have to reprogram the high resolution timer
+ * interrupt on all online CPUs. However, all other CPUs will be
+ * stopped with IRQs interrupts disabled so the clock_was_set() call
+ * must be deferred.
+ */
+void hrtimers_resume(void)
+{
+ WARN_ONCE(!irqs_disabled(),
+ KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+
+ /* Retrigger on the local CPU */
+ retrigger_next_event(NULL);
+ /* And schedule a retrigger for all others */
+ clock_was_set_delayed();
+}
+
+static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+ if (timer->start_site)
+ return;
+ timer->start_site = __builtin_return_address(0);
+ memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+ timer->start_pid = current->pid;
+#endif
+}
+
+static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+ timer->start_site = NULL;
+#endif
+}
+
+static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+ if (likely(!timer_stats_active))
+ return;
+ timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+ timer->function, timer->start_comm, 0);
+#endif
+}
+
+/*
+ * Counterpart to lock_hrtimer_base above:
+ */
+static inline
+void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+{
+ raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
+}
+
+/**
+ * hrtimer_forward - forward the timer expiry
+ * @timer: hrtimer to forward
+ * @now: forward past this time
+ * @interval: the interval to forward
+ *
+ * Forward the timer expiry so it will expire in the future.
+ * Returns the number of overruns.
+ */
+u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
+{
+ u64 orun = 1;
+ ktime_t delta;
+
+ delta = ktime_sub(now, hrtimer_get_expires(timer));
+
+ if (delta.tv64 < 0)
+ return 0;
+
+ if (interval.tv64 < timer->base->resolution.tv64)
+ interval.tv64 = timer->base->resolution.tv64;
+
+ if (unlikely(delta.tv64 >= interval.tv64)) {
+ s64 incr = ktime_to_ns(interval);
+
+ orun = ktime_divns(delta, incr);
+ hrtimer_add_expires_ns(timer, incr * orun);
+ if (hrtimer_get_expires_tv64(timer) > now.tv64)
+ return orun;
+ /*
+ * This (and the ktime_add() below) is the
+ * correction for exact:
+ */
+ orun++;
+ }
+ hrtimer_add_expires(timer, interval);
+
+ return orun;
+}
+EXPORT_SYMBOL_GPL(hrtimer_forward);
+
+/*
+ * enqueue_hrtimer - internal function to (re)start a timer
+ *
+ * The timer is inserted in expiry order. Insertion into the
+ * red black tree is O(log(n)). Must hold the base lock.
+ *
+ * Returns 1 when the new timer is the leftmost timer in the tree.
+ */
+static int enqueue_hrtimer(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ debug_activate(timer);
+
+ timerqueue_add(&base->active, &timer->node);
+ base->cpu_base->active_bases |= 1 << base->index;
+
+ /*
+ * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
+ * state of a possibly running callback.
+ */
+ timer->state |= HRTIMER_STATE_ENQUEUED;
+
+ return (&timer->node == base->active.next);
+}
+
+/*
+ * __remove_hrtimer - internal function to remove a timer
+ *
+ * Caller must hold the base lock.
+ *
+ * High resolution timer mode reprograms the clock event device when the
+ * timer is the one which expires next. The caller can disable this by setting
+ * reprogram to zero. This is useful, when the context does a reprogramming
+ * anyway (e.g. timer interrupt)
+ */
+static void __remove_hrtimer(struct hrtimer *timer,
+ struct hrtimer_clock_base *base,
+ unsigned long newstate, int reprogram)
+{
+ struct timerqueue_node *next_timer;
+ if (!(timer->state & HRTIMER_STATE_ENQUEUED))
+ goto out;
+
+ next_timer = timerqueue_getnext(&base->active);
+ timerqueue_del(&base->active, &timer->node);
+ if (&timer->node == next_timer) {
+#ifdef CONFIG_HIGH_RES_TIMERS
+ /* Reprogram the clock event device. if enabled */
+ if (reprogram && hrtimer_hres_active()) {
+ ktime_t expires;
+
+ expires = ktime_sub(hrtimer_get_expires(timer),
+ base->offset);
+ if (base->cpu_base->expires_next.tv64 == expires.tv64)
+ hrtimer_force_reprogram(base->cpu_base, 1);
+ }
+#endif
+ }
+ if (!timerqueue_getnext(&base->active))
+ base->cpu_base->active_bases &= ~(1 << base->index);
+out:
+ timer->state = newstate;
+}
+
+/*
+ * remove hrtimer, called with base lock held
+ */
+static inline int
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+{
+ if (hrtimer_is_queued(timer)) {
+ unsigned long state;
+ int reprogram;
+
+ /*
+ * Remove the timer and force reprogramming when high
+ * resolution mode is active and the timer is on the current
+ * CPU. If we remove a timer on another CPU, reprogramming is
+ * skipped. The interrupt event on this CPU is fired and
+ * reprogramming happens in the interrupt handler. This is a
+ * rare case and less expensive than a smp call.
+ */
+ debug_deactivate(timer);
+ timer_stats_hrtimer_clear_start_info(timer);
+ reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ /*
+ * We must preserve the CALLBACK state flag here,
+ * otherwise we could move the timer base in
+ * switch_hrtimer_base.
+ */
+ state = timer->state & HRTIMER_STATE_CALLBACK;
+ __remove_hrtimer(timer, base, state, reprogram);
+ return 1;
+ }
+ return 0;
+}
+
+int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode,
+ int wakeup)
+{
+ struct hrtimer_clock_base *base, *new_base;
+ unsigned long flags;
+ int ret, leftmost;
+
+ base = lock_hrtimer_base(timer, &flags);
+
+ /* Remove an active timer from the queue: */
+ ret = remove_hrtimer(timer, base);
+
+ if (mode & HRTIMER_MODE_REL) {
+ tim = ktime_add_safe(tim, base->get_time());
+ /*
+ * CONFIG_TIME_LOW_RES is a temporary way for architectures
+ * to signal that they simply return xtime in
+ * do_gettimeoffset(). In this case we want to round up by
+ * resolution when starting a relative timer, to avoid short
+ * timeouts. This will go away with the GTOD framework.
+ */
+#ifdef CONFIG_TIME_LOW_RES
+ tim = ktime_add_safe(tim, base->resolution);
+#endif
+ }
+
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+
+ /* Switch the timer base, if necessary: */
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
+ timer_stats_hrtimer_set_start_info(timer);
+
+ leftmost = enqueue_hrtimer(timer, new_base);
+
+ if (!leftmost) {
+ unlock_hrtimer_base(timer, &flags);
+ return ret;
+ }
+
+ if (!hrtimer_is_hres_active(timer)) {
+ /*
+ * Kick to reschedule the next tick to handle the new timer
+ * on dynticks target.
+ */
+ wake_up_nohz_cpu(new_base->cpu_base->cpu);
+ } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
+ hrtimer_reprogram(timer, new_base)) {
+ /*
+ * Only allow reprogramming if the new base is on this CPU.
+ * (it might still be on another CPU if the timer was pending)
+ *
+ * XXX send_remote_softirq() ?
+ */
+ if (wakeup) {
+ /*
+ * We need to drop cpu_base->lock to avoid a
+ * lock ordering issue vs. rq->lock.
+ */
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ local_irq_restore(flags);
+ return ret;
+ } else {
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+ }
+
+ unlock_hrtimer_base(timer, &flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @delta_ns: "slack" range for the timer
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns, const enum hrtimer_mode mode)
+{
+ return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
+}
+EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
+
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer: the timer to be added
+ * @tim: expiry time
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ * 0 on success
+ * 1 when the timer was active
+ */
+int
+hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+{
+ return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
+}
+EXPORT_SYMBOL_GPL(hrtimer_start);
+
+
+/**
+ * hrtimer_try_to_cancel - try to deactivate a timer
+ * @timer: hrtimer to stop
+ *
+ * Returns:
+ * 0 when the timer was not active
+ * 1 when the timer was active
+ * -1 when the timer is currently excuting the callback function and
+ * cannot be stopped
+ */
+int hrtimer_try_to_cancel(struct hrtimer *timer)
+{
+ struct hrtimer_clock_base *base;
+ unsigned long flags;
+ int ret = -1;
+
+ base = lock_hrtimer_base(timer, &flags);
+
+ if (!hrtimer_callback_running(timer))
+ ret = remove_hrtimer(timer, base);
+
+ unlock_hrtimer_base(timer, &flags);
+
+ return ret;
+
+}
+EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
+
+/**
+ * hrtimer_cancel - cancel a timer and wait for the handler to finish.
+ * @timer: the timer to be cancelled
+ *
+ * Returns:
+ * 0 when the timer was not active
+ * 1 when the timer was active
+ */
+int hrtimer_cancel(struct hrtimer *timer)
+{
+ for (;;) {
+ int ret = hrtimer_try_to_cancel(timer);
+
+ if (ret >= 0)
+ return ret;
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(hrtimer_cancel);
+
+/**
+ * hrtimer_get_remaining - get remaining time for the timer
+ * @timer: the timer to read
+ */
+ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+{
+ unsigned long flags;
+ ktime_t rem;
+
+ lock_hrtimer_base(timer, &flags);
+ rem = hrtimer_expires_remaining(timer);
+ unlock_hrtimer_base(timer, &flags);
+
+ return rem;
+}
+EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+
+#ifdef CONFIG_NO_HZ_COMMON
+/**
+ * hrtimer_get_next_event - get the time until next expiry event
+ *
+ * Returns the delta to the next expiry event or KTIME_MAX if no timer
+ * is pending.
+ */
+ktime_t hrtimer_get_next_event(void)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t mindelta = { .tv64 = KTIME_MAX };
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ if (!hrtimer_hres_active())
+ mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
+ ktime_get());
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
+ if (mindelta.tv64 < 0)
+ mindelta.tv64 = 0;
+ return mindelta;
+}
+#endif
+
+static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ struct hrtimer_cpu_base *cpu_base;
+ int base;
+
+ memset(timer, 0, sizeof(struct hrtimer));
+
+ cpu_base = raw_cpu_ptr(&hrtimer_bases);
+
+ if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
+ clock_id = CLOCK_MONOTONIC;
+
+ base = hrtimer_clockid_to_base(clock_id);
+ timer->base = &cpu_base->clock_base[base];
+ timerqueue_init(&timer->node);
+
+#ifdef CONFIG_TIMER_STATS
+ timer->start_site = NULL;
+ timer->start_pid = -1;
+ memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
+}
+
+/**
+ * hrtimer_init - initialize a timer to the given clock
+ * @timer: the timer to be initialized
+ * @clock_id: the clock to be used
+ * @mode: timer mode abs/rel
+ */
+void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ debug_init(timer, clock_id, mode);
+ __hrtimer_init(timer, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init);
+
+/**
+ * hrtimer_get_res - get the timer resolution for a clock
+ * @which_clock: which clock to query
+ * @tp: pointer to timespec variable to store the resolution
+ *
+ * Store the resolution of the clock selected by @which_clock in the
+ * variable pointed to by @tp.
+ */
+int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
+{
+ struct hrtimer_cpu_base *cpu_base;
+ int base = hrtimer_clockid_to_base(which_clock);
+
+ cpu_base = raw_cpu_ptr(&hrtimer_bases);
+ *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hrtimer_get_res);
+
+static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+{
+ struct hrtimer_clock_base *base = timer->base;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+ enum hrtimer_restart (*fn)(struct hrtimer *);
+ int restart;
+
+ WARN_ON(!irqs_disabled());
+
+ debug_deactivate(timer);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ timer_stats_account_hrtimer(timer);
+ fn = timer->function;
+
+ /*
+ * Because we run timers from hardirq context, there is no chance
+ * they get migrated to another cpu, therefore its safe to unlock
+ * the timer base.
+ */
+ raw_spin_unlock(&cpu_base->lock);
+ trace_hrtimer_expire_entry(timer, now);
+ restart = fn(timer);
+ trace_hrtimer_expire_exit(timer);
+ raw_spin_lock(&cpu_base->lock);
+
+ /*
+ * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+ * we do not reprogramm the event hardware. Happens either in
+ * hrtimer_start_range_ns() or in hrtimer_interrupt()
+ */
+ if (restart != HRTIMER_NORESTART) {
+ BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+ enqueue_hrtimer(timer, base);
+ }
+
+ WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
+ timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires_next, now, entry_time, delta;
+ int i, retries = 0;
+
+ BUG_ON(!cpu_base->hres_active);
+ cpu_base->nr_events++;
+ dev->next_event.tv64 = KTIME_MAX;
+
+ raw_spin_lock(&cpu_base->lock);
+ entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+ cpu_base->in_hrtirq = 1;
+ /*
+ * We set expires_next to KTIME_MAX here with cpu_base->lock
+ * held to prevent that a timer is enqueued in our queue via
+ * the migration code. This does not affect enqueueing of
+ * timers which run their callback and need to be requeued on
+ * this CPU.
+ */
+ cpu_base->expires_next.tv64 = KTIME_MAX;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ struct hrtimer_clock_base *base;
+ struct timerqueue_node *node;
+ ktime_t basenow;
+
+ if (!(cpu_base->active_bases & (1 << i)))
+ continue;
+
+ base = cpu_base->clock_base + i;
+ basenow = ktime_add(now, base->offset);
+
+ while ((node = timerqueue_getnext(&base->active))) {
+ struct hrtimer *timer;
+
+ timer = container_of(node, struct hrtimer, node);
+
+ /*
+ * The immediate goal for using the softexpires is
+ * minimizing wakeups, not running timers at the
+ * earliest interrupt after their soft expiration.
+ * This allows us to avoid using a Priority Search
+ * Tree, which can answer a stabbing querry for
+ * overlapping intervals and instead use the simple
+ * BST we already have.
+ * We don't add extra wakeups by delaying timers that
+ * are right-of a not yet expired timer, because that
+ * timer will have to trigger a wakeup anyway.
+ */
+ if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
+ break;
+
+ __run_hrtimer(timer, &basenow);
+ }
+ }
+ /* Reevaluate the clock bases for the next expiry */
+ expires_next = __hrtimer_get_next_event(cpu_base);
+ /*
+ * Store the new expiry value so the migration code can verify
+ * against it.
+ */
+ cpu_base->expires_next = expires_next;
+ cpu_base->in_hrtirq = 0;
+ raw_spin_unlock(&cpu_base->lock);
+
+ /* Reprogramming necessary ? */
+ if (expires_next.tv64 == KTIME_MAX ||
+ !tick_program_event(expires_next, 0)) {
+ cpu_base->hang_detected = 0;
+ return;
+ }
+
+ /*
+ * The next timer was already expired due to:
+ * - tracing
+ * - long lasting callbacks
+ * - being scheduled away when running in a VM
+ *
+ * We need to prevent that we loop forever in the hrtimer
+ * interrupt routine. We give it 3 attempts to avoid
+ * overreacting on some spurious event.
+ *
+ * Acquire base lock for updating the offsets and retrieving
+ * the current time.
+ */
+ raw_spin_lock(&cpu_base->lock);
+ now = hrtimer_update_base(cpu_base);
+ cpu_base->nr_retries++;
+ if (++retries < 3)
+ goto retry;
+ /*
+ * Give the system a chance to do something else than looping
+ * here. We stored the entry time, so we know exactly how long
+ * we spent here. We schedule the next event this amount of
+ * time away.
+ */
+ cpu_base->nr_hangs++;
+ cpu_base->hang_detected = 1;
+ raw_spin_unlock(&cpu_base->lock);
+ delta = ktime_sub(now, entry_time);
+ if (delta.tv64 > cpu_base->max_hang_time.tv64)
+ cpu_base->max_hang_time = delta;
+ /*
+ * Limit it to a sensible value as we enforce a longer
+ * delay. Give the CPU at least 100ms to catch up.
+ */
+ if (delta.tv64 > 100 * NSEC_PER_MSEC)
+ expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
+ else
+ expires_next = ktime_add(now, delta);
+ tick_program_event(expires_next, 1);
+ printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
+ ktime_to_ns(delta));
+}
+
+/*
+ * local version of hrtimer_peek_ahead_timers() called with interrupts
+ * disabled.
+ */
+static void __hrtimer_peek_ahead_timers(void)
+{
+ struct tick_device *td;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ td = this_cpu_ptr(&tick_cpu_device);
+ if (td && td->evtdev)
+ hrtimer_interrupt(td->evtdev);
+}
+
+/**
+ * hrtimer_peek_ahead_timers -- run soft-expired timers now
+ *
+ * hrtimer_peek_ahead_timers will peek at the timer queue of
+ * the current cpu and check if there are any timers for which
+ * the soft expires time has passed. If any such timers exist,
+ * they are run immediately and then removed from the timer queue.
+ *
+ */
+void hrtimer_peek_ahead_timers(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __hrtimer_peek_ahead_timers();
+ local_irq_restore(flags);
+}
+
+static void run_hrtimer_softirq(struct softirq_action *h)
+{
+ hrtimer_peek_ahead_timers();
+}
+
+#else /* CONFIG_HIGH_RES_TIMERS */
+
+static inline void __hrtimer_peek_ahead_timers(void) { }
+
+#endif /* !CONFIG_HIGH_RES_TIMERS */
+
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT its the fall back code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+ if (hrtimer_hres_active())
+ return;
+
+ /*
+ * This _is_ ugly: We have to check in the softirq context,
+ * whether we can switch to highres and / or nohz mode. The
+ * clocksource switch happens in the timer interrupt with
+ * xtime_lock held. Notification from there only sets the
+ * check bit in the tick_oneshot code, otherwise we might
+ * deadlock vs. xtime_lock.
+ */
+ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+ hrtimer_switch_to_hres();
+}
+
+/*
+ * Called from hardirq context every jiffy
+ */
+void hrtimer_run_queues(void)
+{
+ struct timerqueue_node *node;
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ struct hrtimer_clock_base *base;
+ int index, gettime = 1;
+
+ if (hrtimer_hres_active())
+ return;
+
+ for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
+ base = &cpu_base->clock_base[index];
+ if (!timerqueue_getnext(&base->active))
+ continue;
+
+ if (gettime) {
+ hrtimer_get_softirq_time(cpu_base);
+ gettime = 0;
+ }
+
+ raw_spin_lock(&cpu_base->lock);
+
+ while ((node = timerqueue_getnext(&base->active))) {
+ struct hrtimer *timer;
+
+ timer = container_of(node, struct hrtimer, node);
+ if (base->softirq_time.tv64 <=
+ hrtimer_get_expires_tv64(timer))
+ break;
+
+ __run_hrtimer(timer, &base->softirq_time);
+ }
+ raw_spin_unlock(&cpu_base->lock);
+ }
+}
+
+/*
+ * Sleep related functions:
+ */
+static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
+{
+ struct hrtimer_sleeper *t =
+ container_of(timer, struct hrtimer_sleeper, timer);
+ struct task_struct *task = t->task;
+
+ t->task = NULL;
+ if (task)
+ wake_up_process(task);
+
+ return HRTIMER_NORESTART;
+}
+
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
+{
+ sl->timer.function = hrtimer_wakeup;
+ sl->task = task;
+}
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
+
+static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
+{
+ hrtimer_init_sleeper(t, current);
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_start_expires(&t->timer, mode);
+ if (!hrtimer_active(&t->timer))
+ t->task = NULL;
+
+ if (likely(t->task))
+ freezable_schedule();
+
+ hrtimer_cancel(&t->timer);
+ mode = HRTIMER_MODE_ABS;
+
+ } while (t->task && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+
+ return t->task == NULL;
+}
+
+static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
+{
+ struct timespec rmt;
+ ktime_t rem;
+
+ rem = hrtimer_expires_remaining(timer);
+ if (rem.tv64 <= 0)
+ return 0;
+ rmt = ktime_to_timespec(rem);
+
+ if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+ return -EFAULT;
+
+ return 1;
+}
+
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
+{
+ struct hrtimer_sleeper t;
+ struct timespec __user *rmtp;
+ int ret = 0;
+
+ hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
+ HRTIMER_MODE_ABS);
+ hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
+
+ if (do_nanosleep(&t, HRTIMER_MODE_ABS))
+ goto out;
+
+ rmtp = restart->nanosleep.rmtp;
+ if (rmtp) {
+ ret = update_rmtp(&t.timer, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+ /* The other values in restart are already filled in */
+ ret = -ERESTART_RESTARTBLOCK;
+out:
+ destroy_hrtimer_on_stack(&t.timer);
+ return ret;
+}
+
+long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+ const enum hrtimer_mode mode, const clockid_t clockid)
+{
+ struct restart_block *restart;
+ struct hrtimer_sleeper t;
+ int ret = 0;
+ unsigned long slack;
+
+ slack = current->timer_slack_ns;
+ if (dl_task(current) || rt_task(current))
+ slack = 0;
+
+ hrtimer_init_on_stack(&t.timer, clockid, mode);
+ hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
+ if (do_nanosleep(&t, mode))
+ goto out;
+
+ /* Absolute timers do not update the rmtp value and restart: */
+ if (mode == HRTIMER_MODE_ABS) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+
+ if (rmtp) {
+ ret = update_rmtp(&t.timer, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+ restart = &current->restart_block;
+ restart->fn = hrtimer_nanosleep_restart;
+ restart->nanosleep.clockid = t.timer.base->clockid;
+ restart->nanosleep.rmtp = rmtp;
+ restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+
+ ret = -ERESTART_RESTARTBLOCK;
+out:
+ destroy_hrtimer_on_stack(&t.timer);
+ return ret;
+}
+
+SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
+ struct timespec __user *, rmtp)
+{
+ struct timespec tu;
+
+ if (copy_from_user(&tu, rqtp, sizeof(tu)))
+ return -EFAULT;
+
+ if (!timespec_valid(&tu))
+ return -EINVAL;
+
+ return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+}
+
+/*
+ * Functions related to boot-time initialization:
+ */
+static void init_hrtimers_cpu(int cpu)
+{
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+ int i;
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ cpu_base->clock_base[i].cpu_base = cpu_base;
+ timerqueue_init_head(&cpu_base->clock_base[i].active);
+ }
+
+ cpu_base->cpu = cpu;
+ hrtimer_init_hres(cpu_base);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+ struct hrtimer_clock_base *new_base)
+{
+ struct hrtimer *timer;
+ struct timerqueue_node *node;
+
+ while ((node = timerqueue_getnext(&old_base->active))) {
+ timer = container_of(node, struct hrtimer, node);
+ BUG_ON(hrtimer_callback_running(timer));
+ debug_deactivate(timer);
+
+ /*
+ * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * timer could be seen as !active and just vanish away
+ * under us on another CPU
+ */
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ timer->base = new_base;
+ /*
+ * Enqueue the timers on the new cpu. This does not
+ * reprogram the event device in case the timer
+ * expires before the earliest on this CPU, but we run
+ * hrtimer_interrupt after we migrated everything to
+ * sort out already expired timers and reprogram the
+ * event device.
+ */
+ enqueue_hrtimer(timer, new_base);
+
+ /* Clear the migration state bit */
+ timer->state &= ~HRTIMER_STATE_MIGRATE;
+ }
+}
+
+static void migrate_hrtimers(int scpu)
+{
+ struct hrtimer_cpu_base *old_base, *new_base;
+ int i;
+
+ BUG_ON(cpu_online(scpu));
+ tick_cancel_sched_timer(scpu);
+
+ local_irq_disable();
+ old_base = &per_cpu(hrtimer_bases, scpu);
+ new_base = this_cpu_ptr(&hrtimer_bases);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, deadlock is not possible.
+ */
+ raw_spin_lock(&new_base->lock);
+ raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ migrate_hrtimer_list(&old_base->clock_base[i],
+ &new_base->clock_base[i]);
+ }
+
+ raw_spin_unlock(&old_base->lock);
+ raw_spin_unlock(&new_base->lock);
+
+ /* Check, if we got expired work to do */
+ __hrtimer_peek_ahead_timers();
+ local_irq_enable();
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int hrtimer_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int scpu = (long)hcpu;
+
+ switch (action) {
+
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ init_hrtimers_cpu(scpu);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ migrate_hrtimers(scpu);
+ break;
+#endif
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block hrtimers_nb = {
+ .notifier_call = hrtimer_cpu_notify,
+};
+
+void __init hrtimers_init(void)
+{
+ hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
+ register_cpu_notifier(&hrtimers_nb);
+#ifdef CONFIG_HIGH_RES_TIMERS
+ open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
+#endif
+}
+
+/**
+ * schedule_hrtimeout_range_clock - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
+ */
+int __sched
+schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
+ const enum hrtimer_mode mode, int clock)
+{
+ struct hrtimer_sleeper t;
+
+ /*
+ * Optimize when a zero timeout value is given. It does not
+ * matter whether this is an absolute or a relative time.
+ */
+ if (expires && !expires->tv64) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ /*
+ * A NULL parameter means "infinite"
+ */
+ if (!expires) {
+ schedule();
+ return -EINTR;
+ }
+
+ hrtimer_init_on_stack(&t.timer, clock, mode);
+ hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
+
+ hrtimer_init_sleeper(&t, current);
+
+ hrtimer_start_expires(&t.timer, mode);
+ if (!hrtimer_active(&t.timer))
+ t.task = NULL;
+
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
+ __set_current_state(TASK_RUNNING);
+
+ return !t.task ? 0 : -EINTR;
+}
+
+/**
+ * schedule_hrtimeout_range - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @delta: slack in expires timeout (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * The @delta argument gives the kernel the freedom to schedule the
+ * actual wakeup to a time that is both power and performance friendly.
+ * The kernel give the normal best effort behavior for "@expires+@delta",
+ * but may decide to fire the timer earlier, but no earlier than @expires.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+ const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range_clock(expires, delta, mode,
+ CLOCK_MONOTONIC);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
+
+/**
+ * schedule_hrtimeout - sleep until timeout
+ * @expires: timeout value (ktime_t)
+ * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ *
+ * Make the current task sleep until the given expiry time has
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
+ * pass before the routine returns.
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task.
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Returns 0 when the timer has expired otherwise -EINTR
+ */
+int __sched schedule_hrtimeout(ktime_t *expires,
+ const enum hrtimer_mode mode)
+{
+ return schedule_hrtimeout_range(expires, 0, mode);
+}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout);
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
new file mode 100644
index 000000000..8d262b467
--- /dev/null
+++ b/kernel/time/itimer.c
@@ -0,0 +1,301 @@
+/*
+ * linux/kernel/itimer.c
+ *
+ * Copyright (C) 1992 Darren Senn
+ */
+
+/* These are all the functions necessary to implement itimers */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/syscalls.h>
+#include <linux/time.h>
+#include <linux/posix-timers.h>
+#include <linux/hrtimer.h>
+#include <trace/events/timer.h>
+
+#include <asm/uaccess.h>
+
+/**
+ * itimer_get_remtime - get remaining time for the timer
+ *
+ * @timer: the timer to read
+ *
+ * Returns the delta between the expiry time and now, which can be
+ * less than zero or 1usec for an pending expired timer
+ */
+static struct timeval itimer_get_remtime(struct hrtimer *timer)
+{
+ ktime_t rem = hrtimer_get_remaining(timer);
+
+ /*
+ * Racy but safe: if the itimer expires after the above
+ * hrtimer_get_remtime() call but before this condition
+ * then we return 0 - which is correct.
+ */
+ if (hrtimer_active(timer)) {
+ if (rem.tv64 <= 0)
+ rem.tv64 = NSEC_PER_USEC;
+ } else
+ rem.tv64 = 0;
+
+ return ktime_to_timeval(rem);
+}
+
+static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
+ struct itimerval *const value)
+{
+ cputime_t cval, cinterval;
+ struct cpu_itimer *it = &tsk->signal->it[clock_id];
+
+ spin_lock_irq(&tsk->sighand->siglock);
+
+ cval = it->expires;
+ cinterval = it->incr;
+ if (cval) {
+ struct task_cputime cputime;
+ cputime_t t;
+
+ thread_group_cputimer(tsk, &cputime);
+ if (clock_id == CPUCLOCK_PROF)
+ t = cputime.utime + cputime.stime;
+ else
+ /* CPUCLOCK_VIRT */
+ t = cputime.utime;
+
+ if (cval < t)
+ /* about to fire */
+ cval = cputime_one_jiffy;
+ else
+ cval = cval - t;
+ }
+
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ cputime_to_timeval(cval, &value->it_value);
+ cputime_to_timeval(cinterval, &value->it_interval);
+}
+
+int do_getitimer(int which, struct itimerval *value)
+{
+ struct task_struct *tsk = current;
+
+ switch (which) {
+ case ITIMER_REAL:
+ spin_lock_irq(&tsk->sighand->siglock);
+ value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
+ value->it_interval =
+ ktime_to_timeval(tsk->signal->it_real_incr);
+ spin_unlock_irq(&tsk->sighand->siglock);
+ break;
+ case ITIMER_VIRTUAL:
+ get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
+ break;
+ case ITIMER_PROF:
+ get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
+ break;
+ default:
+ return(-EINVAL);
+ }
+ return 0;
+}
+
+SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
+{
+ int error = -EFAULT;
+ struct itimerval get_buffer;
+
+ if (value) {
+ error = do_getitimer(which, &get_buffer);
+ if (!error &&
+ copy_to_user(value, &get_buffer, sizeof(get_buffer)))
+ error = -EFAULT;
+ }
+ return error;
+}
+
+
+/*
+ * The timer is automagically restarted, when interval != 0
+ */
+enum hrtimer_restart it_real_fn(struct hrtimer *timer)
+{
+ struct signal_struct *sig =
+ container_of(timer, struct signal_struct, real_timer);
+
+ trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
+ kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
+
+ return HRTIMER_NORESTART;
+}
+
+static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
+{
+ struct timespec ts;
+ s64 cpu_ns;
+
+ cputime_to_timespec(ct, &ts);
+ cpu_ns = timespec_to_ns(&ts);
+
+ return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
+}
+
+static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
+ const struct itimerval *const value,
+ struct itimerval *const ovalue)
+{
+ cputime_t cval, nval, cinterval, ninterval;
+ s64 ns_ninterval, ns_nval;
+ u32 error, incr_error;
+ struct cpu_itimer *it = &tsk->signal->it[clock_id];
+
+ nval = timeval_to_cputime(&value->it_value);
+ ns_nval = timeval_to_ns(&value->it_value);
+ ninterval = timeval_to_cputime(&value->it_interval);
+ ns_ninterval = timeval_to_ns(&value->it_interval);
+
+ error = cputime_sub_ns(nval, ns_nval);
+ incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+
+ cval = it->expires;
+ cinterval = it->incr;
+ if (cval || nval) {
+ if (nval > 0)
+ nval += cputime_one_jiffy;
+ set_process_cpu_timer(tsk, clock_id, &nval, &cval);
+ }
+ it->expires = nval;
+ it->incr = ninterval;
+ it->error = error;
+ it->incr_error = incr_error;
+ trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
+ ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
+
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (ovalue) {
+ cputime_to_timeval(cval, &ovalue->it_value);
+ cputime_to_timeval(cinterval, &ovalue->it_interval);
+ }
+}
+
+/*
+ * Returns true if the timeval is in canonical form
+ */
+#define timeval_valid(t) \
+ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
+
+int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
+{
+ struct task_struct *tsk = current;
+ struct hrtimer *timer;
+ ktime_t expires;
+
+ /*
+ * Validate the timevals in value.
+ */
+ if (!timeval_valid(&value->it_value) ||
+ !timeval_valid(&value->it_interval))
+ return -EINVAL;
+
+ switch (which) {
+ case ITIMER_REAL:
+again:
+ spin_lock_irq(&tsk->sighand->siglock);
+ timer = &tsk->signal->real_timer;
+ if (ovalue) {
+ ovalue->it_value = itimer_get_remtime(timer);
+ ovalue->it_interval
+ = ktime_to_timeval(tsk->signal->it_real_incr);
+ }
+ /* We are sharing ->siglock with it_real_fn() */
+ if (hrtimer_try_to_cancel(timer) < 0) {
+ spin_unlock_irq(&tsk->sighand->siglock);
+ goto again;
+ }
+ expires = timeval_to_ktime(value->it_value);
+ if (expires.tv64 != 0) {
+ tsk->signal->it_real_incr =
+ timeval_to_ktime(value->it_interval);
+ hrtimer_start(timer, expires, HRTIMER_MODE_REL);
+ } else
+ tsk->signal->it_real_incr.tv64 = 0;
+
+ trace_itimer_state(ITIMER_REAL, value, 0);
+ spin_unlock_irq(&tsk->sighand->siglock);
+ break;
+ case ITIMER_VIRTUAL:
+ set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
+ break;
+ case ITIMER_PROF:
+ set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * alarm_setitimer - set alarm in seconds
+ *
+ * @seconds: number of seconds until alarm
+ * 0 disables the alarm
+ *
+ * Returns the remaining time in seconds of a pending timer or 0 when
+ * the timer is not active.
+ *
+ * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
+ * negative timeval settings which would cause immediate expiry.
+ */
+unsigned int alarm_setitimer(unsigned int seconds)
+{
+ struct itimerval it_new, it_old;
+
+#if BITS_PER_LONG < 64
+ if (seconds > INT_MAX)
+ seconds = INT_MAX;
+#endif
+ it_new.it_value.tv_sec = seconds;
+ it_new.it_value.tv_usec = 0;
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+
+ do_setitimer(ITIMER_REAL, &it_new, &it_old);
+
+ /*
+ * We can't return 0 if we have an alarm pending ... And we'd
+ * better return too much than too little anyway
+ */
+ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
+ it_old.it_value.tv_usec >= 500000)
+ it_old.it_value.tv_sec++;
+
+ return it_old.it_value.tv_sec;
+}
+
+SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
+ struct itimerval __user *, ovalue)
+{
+ struct itimerval set_buffer, get_buffer;
+ int error;
+
+ if (value) {
+ if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
+ return -EFAULT;
+ } else {
+ memset(&set_buffer, 0, sizeof(set_buffer));
+ printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
+ " Misfeature support will be removed\n",
+ current->comm);
+ }
+
+ error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
+ if (error || !ovalue)
+ return error;
+
+ if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer)))
+ return -EFAULT;
+ return 0;
+}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000..347fecf86
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,136 @@
+/***********************************************************************
+* linux/kernel/time/jiffies.c
+*
+* This file contains the jiffies based clocksource.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+************************************************************************/
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "timekeeping.h"
+
+/* The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accuratly tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
+#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
+
+/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the nominator and denominator the same
+ * amount, and give ntp adjustments in units of 1/2^8
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. NSEC_PER_JIFFY grows as
+ * HZ shrinks, so values greater than 8 overflow 32bits when
+ * HZ=100.
+ */
+#if HZ < 34
+#define JIFFIES_SHIFT 6
+#elif HZ < 67
+#define JIFFIES_SHIFT 7
+#else
+#define JIFFIES_SHIFT 8
+#endif
+
+static cycle_t jiffies_read(struct clocksource *cs)
+{
+ return (cycle_t) jiffies;
+}
+
+static struct clocksource clocksource_jiffies = {
+ .name = "jiffies",
+ .rating = 1, /* lowest valid rating*/
+ .read = jiffies_read,
+ .mask = 0xffffffff, /*32bits*/
+ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+ .shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
+};
+
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
+
+#if (BITS_PER_LONG < 64)
+u64 get_jiffies_64(void)
+{
+ unsigned long seq;
+ u64 ret;
+
+ do {
+ seq = read_seqbegin(&jiffies_lock);
+ ret = jiffies_64;
+ } while (read_seqretry(&jiffies_lock, seq));
+ return ret;
+}
+EXPORT_SYMBOL(get_jiffies_64);
+#endif
+
+EXPORT_SYMBOL(jiffies);
+
+static int __init init_jiffies_clocksource(void)
+{
+ return __clocksource_register(&clocksource_jiffies);
+}
+
+core_initcall(init_jiffies_clocksource);
+
+struct clocksource * __init __weak clocksource_default_clock(void)
+{
+ return &clocksource_jiffies;
+}
+
+struct clocksource refined_jiffies;
+
+int register_refined_jiffies(long cycles_per_second)
+{
+ u64 nsec_per_tick, shift_hz;
+ long cycles_per_tick;
+
+
+
+ refined_jiffies = clocksource_jiffies;
+ refined_jiffies.name = "refined-jiffies";
+ refined_jiffies.rating++;
+
+ /* Calc cycles per tick */
+ cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
+ /* shift_hz stores hz<<8 for extra accuracy */
+ shift_hz = (u64)cycles_per_second << 8;
+ shift_hz += cycles_per_tick/2;
+ do_div(shift_hz, cycles_per_tick);
+ /* Calculate nsec_per_tick using shift_hz */
+ nsec_per_tick = (u64)NSEC_PER_SEC << 8;
+ nsec_per_tick += (u32)shift_hz/2;
+ do_div(nsec_per_tick, (u32)shift_hz);
+
+ refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
+
+ __clocksource_register(&refined_jiffies);
+ return 0;
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
new file mode 100644
index 000000000..7a6810030
--- /dev/null
+++ b/kernel/time/ntp.c
@@ -0,0 +1,965 @@
+/*
+ * NTP state machine interfaces and logic.
+ *
+ * This code was mainly moved from kernel/timer.c and kernel/time.c
+ * Please see those files for relevant copyright info and historical
+ * changelogs.
+ */
+#include <linux/capability.h>
+#include <linux/clocksource.h>
+#include <linux/workqueue.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/timex.h>
+#include <linux/time.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/rtc.h>
+
+#include "ntp_internal.h"
+
+/*
+ * NTP timekeeping variables:
+ *
+ * Note: All of the NTP state is protected by the timekeeping locks.
+ */
+
+
+/* USER_HZ period (usecs): */
+unsigned long tick_usec = TICK_USEC;
+
+/* SHIFTED_HZ period (nsecs): */
+unsigned long tick_nsec;
+
+static u64 tick_length;
+static u64 tick_length_base;
+
+#define MAX_TICKADJ 500LL /* usecs */
+#define MAX_TICKADJ_SCALED \
+ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+
+/*
+ * phase-lock loop variables
+ */
+
+/*
+ * clock synchronization status
+ *
+ * (TIME_ERROR prevents overwriting the CMOS clock)
+ */
+static int time_state = TIME_OK;
+
+/* clock status bits: */
+static int time_status = STA_UNSYNC;
+
+/* time adjustment (nsecs): */
+static s64 time_offset;
+
+/* pll time constant: */
+static long time_constant = 2;
+
+/* maximum error (usecs): */
+static long time_maxerror = NTP_PHASE_LIMIT;
+
+/* estimated error (usecs): */
+static long time_esterror = NTP_PHASE_LIMIT;
+
+/* frequency offset (scaled nsecs/secs): */
+static s64 time_freq;
+
+/* time at last adjustment (secs): */
+static long time_reftime;
+
+static long time_adjust;
+
+/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
+static s64 ntp_tick_adj;
+
+#ifdef CONFIG_NTP_PPS
+
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available. They establish the engineering parameters of the clock
+ * discipline loop when controlled by the PPS signal.
+ */
+#define PPS_VALID 10 /* PPS signal watchdog max (s) */
+#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
+#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
+#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
+#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
+ increase pps_shift or consecutive bad
+ intervals to decrease it */
+#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
+
+static int pps_valid; /* signal watchdog counter */
+static long pps_tf[3]; /* phase median filter */
+static long pps_jitter; /* current jitter (ns) */
+static struct timespec pps_fbase; /* beginning of the last freq interval */
+static int pps_shift; /* current interval duration (s) (shift) */
+static int pps_intcnt; /* interval counter */
+static s64 pps_freq; /* frequency offset (scaled ns/s) */
+static long pps_stabil; /* current stability (scaled ns/s) */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt; /* calibration intervals */
+static long pps_jitcnt; /* jitter limit exceeded */
+static long pps_stbcnt; /* stability limit exceeded */
+static long pps_errcnt; /* calibration errors */
+
+
+/* PPS kernel consumer compensates the whole phase error immediately.
+ * Otherwise, reduce the offset by a fixed factor times the time constant.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+ return offset;
+ else
+ return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+static inline void pps_reset_freq_interval(void)
+{
+ /* the PPS calibration interval may end
+ surprisingly early */
+ pps_shift = PPS_INTMIN;
+ pps_intcnt = 0;
+}
+
+/**
+ * pps_clear - Clears the PPS state variables
+ */
+static inline void pps_clear(void)
+{
+ pps_reset_freq_interval();
+ pps_tf[0] = 0;
+ pps_tf[1] = 0;
+ pps_tf[2] = 0;
+ pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
+ pps_freq = 0;
+}
+
+/* Decrease pps_valid to indicate that another second has passed since
+ * the last PPS signal. When it reaches 0, indicate that PPS signal is
+ * missing.
+ */
+static inline void pps_dec_valid(void)
+{
+ if (pps_valid > 0)
+ pps_valid--;
+ else {
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ pps_clear();
+ }
+}
+
+static inline void pps_set_freq(s64 freq)
+{
+ pps_freq = freq;
+}
+
+static inline int is_error_status(int status)
+{
+ return (status & (STA_UNSYNC|STA_CLOCKERR))
+ /* PPS signal lost when either PPS time or
+ * PPS frequency synchronization requested
+ */
+ || ((status & (STA_PPSFREQ|STA_PPSTIME))
+ && !(status & STA_PPSSIGNAL))
+ /* PPS jitter exceeded when
+ * PPS time synchronization requested */
+ || ((status & (STA_PPSTIME|STA_PPSJITTER))
+ == (STA_PPSTIME|STA_PPSJITTER))
+ /* PPS wander exceeded or calibration error when
+ * PPS frequency synchronization requested
+ */
+ || ((status & STA_PPSFREQ)
+ && (status & (STA_PPSWANDER|STA_PPSERROR)));
+}
+
+static inline void pps_fill_timex(struct timex *txc)
+{
+ txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
+ PPM_SCALE_INV, NTP_SCALE_SHIFT);
+ txc->jitter = pps_jitter;
+ if (!(time_status & STA_NANO))
+ txc->jitter /= NSEC_PER_USEC;
+ txc->shift = pps_shift;
+ txc->stabil = pps_stabil;
+ txc->jitcnt = pps_jitcnt;
+ txc->calcnt = pps_calcnt;
+ txc->errcnt = pps_errcnt;
+ txc->stbcnt = pps_stbcnt;
+}
+
+#else /* !CONFIG_NTP_PPS */
+
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+ return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+static inline void pps_reset_freq_interval(void) {}
+static inline void pps_clear(void) {}
+static inline void pps_dec_valid(void) {}
+static inline void pps_set_freq(s64 freq) {}
+
+static inline int is_error_status(int status)
+{
+ return status & (STA_UNSYNC|STA_CLOCKERR);
+}
+
+static inline void pps_fill_timex(struct timex *txc)
+{
+ /* PPS is not implemented, so these are zero */
+ txc->ppsfreq = 0;
+ txc->jitter = 0;
+ txc->shift = 0;
+ txc->stabil = 0;
+ txc->jitcnt = 0;
+ txc->calcnt = 0;
+ txc->errcnt = 0;
+ txc->stbcnt = 0;
+}
+
+#endif /* CONFIG_NTP_PPS */
+
+
+/**
+ * ntp_synced - Returns 1 if the NTP status is not UNSYNC
+ *
+ */
+static inline int ntp_synced(void)
+{
+ return !(time_status & STA_UNSYNC);
+}
+
+
+/*
+ * NTP methods:
+ */
+
+/*
+ * Update (tick_length, tick_length_base, tick_nsec), based
+ * on (tick_usec, ntp_tick_adj, time_freq):
+ */
+static void ntp_update_frequency(void)
+{
+ u64 second_length;
+ u64 new_base;
+
+ second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
+ << NTP_SCALE_SHIFT;
+
+ second_length += ntp_tick_adj;
+ second_length += time_freq;
+
+ tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
+ new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
+
+ /*
+ * Don't wait for the next second_overflow, apply
+ * the change to the tick length immediately:
+ */
+ tick_length += new_base - tick_length_base;
+ tick_length_base = new_base;
+}
+
+static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
+{
+ time_status &= ~STA_MODE;
+
+ if (secs < MINSEC)
+ return 0;
+
+ if (!(time_status & STA_FLL) && (secs <= MAXSEC))
+ return 0;
+
+ time_status |= STA_MODE;
+
+ return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
+}
+
+static void ntp_update_offset(long offset)
+{
+ s64 freq_adj;
+ s64 offset64;
+ long secs;
+
+ if (!(time_status & STA_PLL))
+ return;
+
+ if (!(time_status & STA_NANO))
+ offset *= NSEC_PER_USEC;
+
+ /*
+ * Scale the phase adjustment and
+ * clamp to the operating range.
+ */
+ offset = min(offset, MAXPHASE);
+ offset = max(offset, -MAXPHASE);
+
+ /*
+ * Select how the frequency is to be controlled
+ * and in which mode (PLL or FLL).
+ */
+ secs = get_seconds() - time_reftime;
+ if (unlikely(time_status & STA_FREQHOLD))
+ secs = 0;
+
+ time_reftime = get_seconds();
+
+ offset64 = offset;
+ freq_adj = ntp_update_offset_fll(offset64, secs);
+
+ /*
+ * Clamp update interval to reduce PLL gain with low
+ * sampling rate (e.g. intermittent network connection)
+ * to avoid instability.
+ */
+ if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
+ secs = 1 << (SHIFT_PLL + 1 + time_constant);
+
+ freq_adj += (offset64 * secs) <<
+ (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
+
+ freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
+
+ time_freq = max(freq_adj, -MAXFREQ_SCALED);
+
+ time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
+}
+
+/**
+ * ntp_clear - Clears the NTP state variables
+ */
+void ntp_clear(void)
+{
+ time_adjust = 0; /* stop active adjtime() */
+ time_status |= STA_UNSYNC;
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_esterror = NTP_PHASE_LIMIT;
+
+ ntp_update_frequency();
+
+ tick_length = tick_length_base;
+ time_offset = 0;
+
+ /* Clear PPS state variables */
+ pps_clear();
+}
+
+
+u64 ntp_tick_length(void)
+{
+ return tick_length;
+}
+
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ * Also handles leap second processing, and returns leap offset
+ */
+int second_overflow(unsigned long secs)
+{
+ s64 delta;
+ int leap = 0;
+
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
+ */
+ switch (time_state) {
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+ case TIME_INS:
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if (secs % 86400 == 0) {
+ leap = -1;
+ time_state = TIME_OOP;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ }
+ break;
+ case TIME_DEL:
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if ((secs + 1) % 86400 == 0) {
+ leap = 1;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ }
+ break;
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ break;
+ }
+
+
+ /* Bump the maxerror field */
+ time_maxerror += MAXFREQ / NSEC_PER_USEC;
+ if (time_maxerror > NTP_PHASE_LIMIT) {
+ time_maxerror = NTP_PHASE_LIMIT;
+ time_status |= STA_UNSYNC;
+ }
+
+ /* Compute the phase adjustment for the next second */
+ tick_length = tick_length_base;
+
+ delta = ntp_offset_chunk(time_offset);
+ time_offset -= delta;
+ tick_length += delta;
+
+ /* Check PPS signal */
+ pps_dec_valid();
+
+ if (!time_adjust)
+ goto out;
+
+ if (time_adjust > MAX_TICKADJ) {
+ time_adjust -= MAX_TICKADJ;
+ tick_length += MAX_TICKADJ_SCALED;
+ goto out;
+ }
+
+ if (time_adjust < -MAX_TICKADJ) {
+ time_adjust += MAX_TICKADJ;
+ tick_length -= MAX_TICKADJ_SCALED;
+ goto out;
+ }
+
+ tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
+ << NTP_SCALE_SHIFT;
+ time_adjust = 0;
+
+out:
+ return leap;
+}
+
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock64(struct timespec64 now64)
+{
+ struct timespec now;
+
+ now = timespec64_to_timespec(now64);
+ return update_persistent_clock(now);
+}
+#endif
+
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
+static void sync_cmos_clock(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
+
+static void sync_cmos_clock(struct work_struct *work)
+{
+ struct timespec64 now;
+ struct timespec next;
+ int fail = 1;
+
+ /*
+ * If we have an externally synchronized Linux clock, then update
+ * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+ * called as close as possible to 500 ms before the new second starts.
+ * This code is run on a timer. If the clock is set, that timer
+ * may not expire at the correct time. Thus, we adjust...
+ * We want the clock to be within a couple of ticks from the target.
+ */
+ if (!ntp_synced()) {
+ /*
+ * Not synced, exit, do not restart a timer (if one is
+ * running, let it run out).
+ */
+ return;
+ }
+
+ getnstimeofday64(&now);
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
+ struct timespec64 adjust = now;
+
+ fail = -ENODEV;
+ if (persistent_clock_is_local)
+ adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+ fail = update_persistent_clock64(adjust);
+#endif
+
+#ifdef CONFIG_RTC_SYSTOHC
+ if (fail == -ENODEV)
+ fail = rtc_set_ntp_time(adjust);
+#endif
+ }
+
+ next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
+ if (next.tv_nsec <= 0)
+ next.tv_nsec += NSEC_PER_SEC;
+
+ if (!fail || fail == -ENODEV)
+ next.tv_sec = 659;
+ else
+ next.tv_sec = 0;
+
+ if (next.tv_nsec >= NSEC_PER_SEC) {
+ next.tv_sec++;
+ next.tv_nsec -= NSEC_PER_SEC;
+ }
+ queue_delayed_work(system_power_efficient_wq,
+ &sync_cmos_work, timespec_to_jiffies(&next));
+}
+
+void ntp_notify_cmos_timer(void)
+{
+ queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
+}
+
+#else
+void ntp_notify_cmos_timer(void) { }
+#endif
+
+
+/*
+ * Propagate a new txc->status value into the NTP state:
+ */
+static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
+{
+ if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
+ time_state = TIME_OK;
+ time_status = STA_UNSYNC;
+ /* restart PPS frequency calibration */
+ pps_reset_freq_interval();
+ }
+
+ /*
+ * If we turn on PLL adjustments then reset the
+ * reference time to current time.
+ */
+ if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
+ time_reftime = get_seconds();
+
+ /* only set allowed bits */
+ time_status &= STA_RONLY;
+ time_status |= txc->status & ~STA_RONLY;
+}
+
+
+static inline void process_adjtimex_modes(struct timex *txc,
+ struct timespec64 *ts,
+ s32 *time_tai)
+{
+ if (txc->modes & ADJ_STATUS)
+ process_adj_status(txc, ts);
+
+ if (txc->modes & ADJ_NANO)
+ time_status |= STA_NANO;
+
+ if (txc->modes & ADJ_MICRO)
+ time_status &= ~STA_NANO;
+
+ if (txc->modes & ADJ_FREQUENCY) {
+ time_freq = txc->freq * PPM_SCALE;
+ time_freq = min(time_freq, MAXFREQ_SCALED);
+ time_freq = max(time_freq, -MAXFREQ_SCALED);
+ /* update pps_freq */
+ pps_set_freq(time_freq);
+ }
+
+ if (txc->modes & ADJ_MAXERROR)
+ time_maxerror = txc->maxerror;
+
+ if (txc->modes & ADJ_ESTERROR)
+ time_esterror = txc->esterror;
+
+ if (txc->modes & ADJ_TIMECONST) {
+ time_constant = txc->constant;
+ if (!(time_status & STA_NANO))
+ time_constant += 4;
+ time_constant = min(time_constant, (long)MAXTC);
+ time_constant = max(time_constant, 0l);
+ }
+
+ if (txc->modes & ADJ_TAI && txc->constant > 0)
+ *time_tai = txc->constant;
+
+ if (txc->modes & ADJ_OFFSET)
+ ntp_update_offset(txc->offset);
+
+ if (txc->modes & ADJ_TICK)
+ tick_usec = txc->tick;
+
+ if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+ ntp_update_frequency();
+}
+
+
+
+/**
+ * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
+ */
+int ntp_validate_timex(struct timex *txc)
+{
+ if (txc->modes & ADJ_ADJTIME) {
+ /* singleshot must not be used with any other mode bits */
+ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
+ return -EINVAL;
+ if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+ !capable(CAP_SYS_TIME))
+ return -EPERM;
+ } else {
+ /* In order to modify anything, you gotta be super-user! */
+ if (txc->modes && !capable(CAP_SYS_TIME))
+ return -EPERM;
+ /*
+ * if the quartz is off by more than 10% then
+ * something is VERY wrong!
+ */
+ if (txc->modes & ADJ_TICK &&
+ (txc->tick < 900000/USER_HZ ||
+ txc->tick > 1100000/USER_HZ))
+ return -EINVAL;
+ }
+
+ if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
+ return -EPERM;
+
+ /*
+ * Check for potential multiplication overflows that can
+ * only happen on 64-bit systems:
+ */
+ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+ if (LLONG_MIN / PPM_SCALE > txc->freq)
+ return -EINVAL;
+ if (LLONG_MAX / PPM_SCALE < txc->freq)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+/*
+ * adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
+{
+ int result;
+
+ if (txc->modes & ADJ_ADJTIME) {
+ long save_adjust = time_adjust;
+
+ if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+ /* adjtime() is independent from ntp_adjtime() */
+ time_adjust = txc->offset;
+ ntp_update_frequency();
+ }
+ txc->offset = save_adjust;
+ } else {
+
+ /* If there are input parameters, then process them: */
+ if (txc->modes)
+ process_adjtimex_modes(txc, ts, time_tai);
+
+ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+ NTP_SCALE_SHIFT);
+ if (!(time_status & STA_NANO))
+ txc->offset /= NSEC_PER_USEC;
+ }
+
+ result = time_state; /* mostly `TIME_OK' */
+ /* check for errors */
+ if (is_error_status(time_status))
+ result = TIME_ERROR;
+
+ txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+ PPM_SCALE_INV, NTP_SCALE_SHIFT);
+ txc->maxerror = time_maxerror;
+ txc->esterror = time_esterror;
+ txc->status = time_status;
+ txc->constant = time_constant;
+ txc->precision = 1;
+ txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
+ txc->tick = tick_usec;
+ txc->tai = *time_tai;
+
+ /* fill PPS status fields */
+ pps_fill_timex(txc);
+
+ txc->time.tv_sec = (time_t)ts->tv_sec;
+ txc->time.tv_usec = ts->tv_nsec;
+ if (!(time_status & STA_NANO))
+ txc->time.tv_usec /= NSEC_PER_USEC;
+
+ return result;
+}
+
+#ifdef CONFIG_NTP_PPS
+
+/* actually struct pps_normtime is good old struct timespec, but it is
+ * semantically different (and it is the reason why it was invented):
+ * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
+ * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
+struct pps_normtime {
+ __kernel_time_t sec; /* seconds */
+ long nsec; /* nanoseconds */
+};
+
+/* normalize the timestamp so that nsec is in the
+ ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
+static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+{
+ struct pps_normtime norm = {
+ .sec = ts.tv_sec,
+ .nsec = ts.tv_nsec
+ };
+
+ if (norm.nsec > (NSEC_PER_SEC >> 1)) {
+ norm.nsec -= NSEC_PER_SEC;
+ norm.sec++;
+ }
+
+ return norm;
+}
+
+/* get current phase correction and jitter */
+static inline long pps_phase_filter_get(long *jitter)
+{
+ *jitter = pps_tf[0] - pps_tf[1];
+ if (*jitter < 0)
+ *jitter = -*jitter;
+
+ /* TODO: test various filters */
+ return pps_tf[0];
+}
+
+/* add the sample to the phase filter */
+static inline void pps_phase_filter_add(long err)
+{
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0] = err;
+}
+
+/* decrease frequency calibration interval length.
+ * It is halved after four consecutive unstable intervals.
+ */
+static inline void pps_dec_freq_interval(void)
+{
+ if (--pps_intcnt <= -PPS_INTCOUNT) {
+ pps_intcnt = -PPS_INTCOUNT;
+ if (pps_shift > PPS_INTMIN) {
+ pps_shift--;
+ pps_intcnt = 0;
+ }
+ }
+}
+
+/* increase frequency calibration interval length.
+ * It is doubled after four consecutive stable intervals.
+ */
+static inline void pps_inc_freq_interval(void)
+{
+ if (++pps_intcnt >= PPS_INTCOUNT) {
+ pps_intcnt = PPS_INTCOUNT;
+ if (pps_shift < PPS_INTMAX) {
+ pps_shift++;
+ pps_intcnt = 0;
+ }
+ }
+}
+
+/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+ * timestamps
+ *
+ * At the end of the calibration interval the difference between the
+ * first and last MONOTONIC_RAW clock timestamps divided by the length
+ * of the interval becomes the frequency update. If the interval was
+ * too long, the data are discarded.
+ * Returns the difference between old and new frequency values.
+ */
+static long hardpps_update_freq(struct pps_normtime freq_norm)
+{
+ long delta, delta_mod;
+ s64 ftemp;
+
+ /* check if the frequency interval was too long */
+ if (freq_norm.sec > (2 << pps_shift)) {
+ time_status |= STA_PPSERROR;
+ pps_errcnt++;
+ pps_dec_freq_interval();
+ printk_deferred(KERN_ERR
+ "hardpps: PPSERROR: interval too long - %ld s\n",
+ freq_norm.sec);
+ return 0;
+ }
+
+ /* here the raw frequency offset and wander (stability) is
+ * calculated. If the wander is less than the wander threshold
+ * the interval is increased; otherwise it is decreased.
+ */
+ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
+ freq_norm.sec);
+ delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
+ pps_freq = ftemp;
+ if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
+ printk_deferred(KERN_WARNING
+ "hardpps: PPSWANDER: change=%ld\n", delta);
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ pps_dec_freq_interval();
+ } else { /* good sample */
+ pps_inc_freq_interval();
+ }
+
+ /* the stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance
+ * monitoring
+ */
+ delta_mod = delta;
+ if (delta_mod < 0)
+ delta_mod = -delta_mod;
+ pps_stabil += (div_s64(((s64)delta_mod) <<
+ (NTP_SCALE_SHIFT - SHIFT_USEC),
+ NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
+
+ /* if enabled, the system clock frequency is updated */
+ if ((time_status & STA_PPSFREQ) != 0 &&
+ (time_status & STA_FREQHOLD) == 0) {
+ time_freq = pps_freq;
+ ntp_update_frequency();
+ }
+
+ return delta;
+}
+
+/* correct REALTIME clock phase error against PPS signal */
+static void hardpps_update_phase(long error)
+{
+ long correction = -error;
+ long jitter;
+
+ /* add the sample to the median filter */
+ pps_phase_filter_add(correction);
+ correction = pps_phase_filter_get(&jitter);
+
+ /* Nominal jitter is due to PPS signal noise. If it exceeds the
+ * threshold, the sample is discarded; otherwise, if so enabled,
+ * the time offset is updated.
+ */
+ if (jitter > (pps_jitter << PPS_POPCORN)) {
+ printk_deferred(KERN_WARNING
+ "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+ jitter, (pps_jitter << PPS_POPCORN));
+ time_status |= STA_PPSJITTER;
+ pps_jitcnt++;
+ } else if (time_status & STA_PPSTIME) {
+ /* correct the time using the phase offset */
+ time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+ NTP_INTERVAL_FREQ);
+ /* cancel running adjtime() */
+ time_adjust = 0;
+ }
+ /* update jitter */
+ pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+}
+
+/*
+ * __hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS signal arrival in order to
+ * discipline the CPU clock oscillator to the PPS signal. It takes two
+ * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
+ * is used to correct clock phase error and the latter is used to
+ * correct the frequency.
+ *
+ * This code is based on David Mills's reference nanokernel
+ * implementation. It was mostly rewritten but keeps the same idea.
+ */
+void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+ struct pps_normtime pts_norm, freq_norm;
+
+ pts_norm = pps_normalize_ts(*phase_ts);
+
+ /* clear the error bits, they will be set again if needed */
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+
+ /* indicate signal presence */
+ time_status |= STA_PPSSIGNAL;
+ pps_valid = PPS_VALID;
+
+ /* when called for the first time,
+ * just start the frequency interval */
+ if (unlikely(pps_fbase.tv_sec == 0)) {
+ pps_fbase = *raw_ts;
+ return;
+ }
+
+ /* ok, now we have a base for frequency calculation */
+ freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+
+ /* check that the signal is in the range
+ * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+ if ((freq_norm.sec == 0) ||
+ (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+ (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+ time_status |= STA_PPSJITTER;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
+ return;
+ }
+
+ /* signal is ok */
+
+ /* check if the current frequency interval is finished */
+ if (freq_norm.sec >= (1 << pps_shift)) {
+ pps_calcnt++;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ hardpps_update_freq(freq_norm);
+ }
+
+ hardpps_update_phase(pts_norm.nsec);
+
+}
+#endif /* CONFIG_NTP_PPS */
+
+static int __init ntp_tick_adj_setup(char *str)
+{
+ int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
+
+ if (rc)
+ return rc;
+ ntp_tick_adj <<= NTP_SCALE_SHIFT;
+
+ return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
+
+void __init ntp_init(void)
+{
+ ntp_clear();
+}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
new file mode 100644
index 000000000..bbd102ad9
--- /dev/null
+++ b/kernel/time/ntp_internal.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_NTP_INTERNAL_H
+#define _LINUX_NTP_INTERNAL_H
+
+extern void ntp_init(void);
+extern void ntp_clear(void);
+/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
+extern u64 ntp_tick_length(void);
+extern int second_overflow(unsigned long secs);
+extern int ntp_validate_timex(struct timex *);
+extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
+extern void __hardpps(const struct timespec *, const struct timespec *);
+#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000..ce033c7aa
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,446 @@
+/*
+ * posix-clock.c - support for dynamic clock devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/file.h>
+#include <linux/posix-clock.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+static void delete_clock(struct kref *kref);
+
+/*
+ * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
+ */
+static struct posix_clock *get_posix_clock(struct file *fp)
+{
+ struct posix_clock *clk = fp->private_data;
+
+ down_read(&clk->rwsem);
+
+ if (!clk->zombie)
+ return clk;
+
+ up_read(&clk->rwsem);
+
+ return NULL;
+}
+
+static void put_posix_clock(struct posix_clock *clk)
+{
+ up_read(&clk->rwsem);
+}
+
+static ssize_t posix_clock_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -EINVAL;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.read)
+ err = clk->ops.read(clk, fp->f_flags, buf, count);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int result = 0;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.poll)
+ result = clk->ops.poll(clk, fp, wait);
+
+ put_posix_clock(clk);
+
+ return result;
+}
+
+static int posix_clock_fasync(int fd, struct file *fp, int on)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = 0;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.fasync)
+ err = clk->ops.fasync(clk, fd, fp, on);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENODEV;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.mmap)
+ err = clk->ops.mmap(clk, vma);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static long posix_clock_ioctl(struct file *fp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENOTTY;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.ioctl)
+ err = clk->ops.ioctl(clk, cmd, arg);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long posix_clock_compat_ioctl(struct file *fp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENOTTY;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.ioctl)
+ err = clk->ops.ioctl(clk, cmd, arg);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+#endif
+
+static int posix_clock_open(struct inode *inode, struct file *fp)
+{
+ int err;
+ struct posix_clock *clk =
+ container_of(inode->i_cdev, struct posix_clock, cdev);
+
+ down_read(&clk->rwsem);
+
+ if (clk->zombie) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (clk->ops.open)
+ err = clk->ops.open(clk, fp->f_mode);
+ else
+ err = 0;
+
+ if (!err) {
+ kref_get(&clk->kref);
+ fp->private_data = clk;
+ }
+out:
+ up_read(&clk->rwsem);
+ return err;
+}
+
+static int posix_clock_release(struct inode *inode, struct file *fp)
+{
+ struct posix_clock *clk = fp->private_data;
+ int err = 0;
+
+ if (clk->ops.release)
+ err = clk->ops.release(clk);
+
+ kref_put(&clk->kref, delete_clock);
+
+ fp->private_data = NULL;
+
+ return err;
+}
+
+static const struct file_operations posix_clock_file_operations = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = posix_clock_read,
+ .poll = posix_clock_poll,
+ .unlocked_ioctl = posix_clock_ioctl,
+ .open = posix_clock_open,
+ .release = posix_clock_release,
+ .fasync = posix_clock_fasync,
+ .mmap = posix_clock_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = posix_clock_compat_ioctl,
+#endif
+};
+
+int posix_clock_register(struct posix_clock *clk, dev_t devid)
+{
+ int err;
+
+ kref_init(&clk->kref);
+ init_rwsem(&clk->rwsem);
+
+ cdev_init(&clk->cdev, &posix_clock_file_operations);
+ clk->cdev.owner = clk->ops.owner;
+ err = cdev_add(&clk->cdev, devid, 1);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(posix_clock_register);
+
+static void delete_clock(struct kref *kref)
+{
+ struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
+
+ if (clk->release)
+ clk->release(clk);
+}
+
+void posix_clock_unregister(struct posix_clock *clk)
+{
+ cdev_del(&clk->cdev);
+
+ down_write(&clk->rwsem);
+ clk->zombie = true;
+ up_write(&clk->rwsem);
+
+ kref_put(&clk->kref, delete_clock);
+}
+EXPORT_SYMBOL_GPL(posix_clock_unregister);
+
+struct posix_clock_desc {
+ struct file *fp;
+ struct posix_clock *clk;
+};
+
+static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
+{
+ struct file *fp = fget(CLOCKID_TO_FD(id));
+ int err = -EINVAL;
+
+ if (!fp)
+ return err;
+
+ if (fp->f_op->open != posix_clock_open || !fp->private_data)
+ goto out;
+
+ cd->fp = fp;
+ cd->clk = get_posix_clock(fp);
+
+ err = cd->clk ? 0 : -ENODEV;
+out:
+ if (err)
+ fput(fp);
+ return err;
+}
+
+static void put_clock_desc(struct posix_clock_desc *cd)
+{
+ put_posix_clock(cd->clk);
+ fput(cd->fp);
+}
+
+static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ err = -EACCES;
+ goto out;
+ }
+
+ if (cd.clk->ops.clock_adjtime)
+ err = cd.clk->ops.clock_adjtime(cd.clk, tx);
+ else
+ err = -EOPNOTSUPP;
+out:
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.clock_gettime)
+ err = cd.clk->ops.clock_gettime(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_getres(clockid_t id, struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.clock_getres)
+ err = cd.clk->ops.clock_getres(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ err = -EACCES;
+ goto out;
+ }
+
+ if (cd.clk->ops.clock_settime)
+ err = cd.clk->ops.clock_settime(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+out:
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_timer_create(struct k_itimer *kit)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_create)
+ err = cd.clk->ops.timer_create(cd.clk, kit);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_timer_delete(struct k_itimer *kit)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_delete)
+ err = cd.clk->ops.timer_delete(cd.clk, kit);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+
+ if (get_clock_desc(id, &cd))
+ return;
+
+ if (cd.clk->ops.timer_gettime)
+ cd.clk->ops.timer_gettime(cd.clk, kit, ts);
+
+ put_clock_desc(&cd);
+}
+
+static int pc_timer_settime(struct k_itimer *kit, int flags,
+ struct itimerspec *ts, struct itimerspec *old)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_settime)
+ err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+struct k_clock clock_posix_dynamic = {
+ .clock_getres = pc_clock_getres,
+ .clock_set = pc_clock_settime,
+ .clock_get = pc_clock_gettime,
+ .clock_adj = pc_clock_adjtime,
+ .timer_create = pc_timer_create,
+ .timer_set = pc_timer_settime,
+ .timer_del = pc_timer_delete,
+ .timer_get = pc_timer_gettime,
+};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
new file mode 100644
index 000000000..0ac829b48
--- /dev/null
+++ b/kernel/time/posix-cpu-timers.c
@@ -0,0 +1,1475 @@
+/*
+ * Implement CPU time clocks for the POSIX clock interface.
+ */
+
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <linux/errno.h>
+#include <linux/math64.h>
+#include <asm/uaccess.h>
+#include <linux/kernel_stat.h>
+#include <trace/events/timer.h>
+#include <linux/random.h>
+#include <linux/tick.h>
+#include <linux/workqueue.h>
+
+/*
+ * Called after updating RLIMIT_CPU to run cpu timer and update
+ * tsk->signal->cputime_expires expiration cache if necessary. Needs
+ * siglock protection since other code may update expiration cache as
+ * well.
+ */
+void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
+{
+ cputime_t cputime = secs_to_cputime(rlim_new);
+
+ spin_lock_irq(&task->sighand->siglock);
+ set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
+ spin_unlock_irq(&task->sighand->siglock);
+}
+
+static int check_clock(const clockid_t which_clock)
+{
+ int error = 0;
+ struct task_struct *p;
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+
+ if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
+ return -EINVAL;
+
+ if (pid == 0)
+ return 0;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
+ same_thread_group(p, current) : has_group_leader_pid(p))) {
+ error = -EINVAL;
+ }
+ rcu_read_unlock();
+
+ return error;
+}
+
+static inline unsigned long long
+timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
+{
+ unsigned long long ret;
+
+ ret = 0; /* high half always zero when .cpu used */
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+ } else {
+ ret = cputime_to_expires(timespec_to_cputime(tp));
+ }
+ return ret;
+}
+
+static void sample_to_timespec(const clockid_t which_clock,
+ unsigned long long expires,
+ struct timespec *tp)
+{
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
+ *tp = ns_to_timespec(expires);
+ else
+ cputime_to_timespec((__force cputime_t)expires, tp);
+}
+
+/*
+ * Update expiry time from increment, and increase overrun count,
+ * given the current clock sample.
+ */
+static void bump_cpu_timer(struct k_itimer *timer,
+ unsigned long long now)
+{
+ int i;
+ unsigned long long delta, incr;
+
+ if (timer->it.cpu.incr == 0)
+ return;
+
+ if (now < timer->it.cpu.expires)
+ return;
+
+ incr = timer->it.cpu.incr;
+ delta = now + incr - timer->it.cpu.expires;
+
+ /* Don't use (incr*2 < delta), incr*2 might overflow. */
+ for (i = 0; incr < delta - incr; i++)
+ incr = incr << 1;
+
+ for (; i >= 0; incr >>= 1, i--) {
+ if (delta < incr)
+ continue;
+
+ timer->it.cpu.expires += incr;
+ timer->it_overrun += 1 << i;
+ delta -= incr;
+ }
+}
+
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime: The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero. Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+ if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
+ return 1;
+ return 0;
+}
+
+static inline unsigned long long prof_ticks(struct task_struct *p)
+{
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+
+ return cputime_to_expires(utime + stime);
+}
+static inline unsigned long long virt_ticks(struct task_struct *p)
+{
+ cputime_t utime;
+
+ task_cputime(p, &utime, NULL);
+
+ return cputime_to_expires(utime);
+}
+
+static int
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+{
+ int error = check_clock(which_clock);
+ if (!error) {
+ tp->tv_sec = 0;
+ tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
+ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
+ /*
+ * If sched_clock is using a cycle counter, we
+ * don't have any idea of its true resolution
+ * exported, but it is much more than 1s/HZ.
+ */
+ tp->tv_nsec = 1;
+ }
+ }
+ return error;
+}
+
+static int
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+{
+ /*
+ * You can never reset a CPU clock, but we check for other errors
+ * in the call before failing with EPERM.
+ */
+ int error = check_clock(which_clock);
+ if (error == 0) {
+ error = -EPERM;
+ }
+ return error;
+}
+
+
+/*
+ * Sample a per-thread clock for the given task.
+ */
+static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
+ unsigned long long *sample)
+{
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ *sample = prof_ticks(p);
+ break;
+ case CPUCLOCK_VIRT:
+ *sample = virt_ticks(p);
+ break;
+ case CPUCLOCK_SCHED:
+ *sample = task_sched_runtime(p);
+ break;
+ }
+ return 0;
+}
+
+static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+{
+ if (b->utime > a->utime)
+ a->utime = b->utime;
+
+ if (b->stime > a->stime)
+ a->stime = b->stime;
+
+ if (b->sum_exec_runtime > a->sum_exec_runtime)
+ a->sum_exec_runtime = b->sum_exec_runtime;
+}
+
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+{
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct task_cputime sum;
+ unsigned long flags;
+
+ if (!cputimer->running) {
+ /*
+ * The POSIX timer interface allows for absolute time expiry
+ * values through the TIMER_ABSTIME flag, therefore we have
+ * to synchronize the timer to the clock every time we start
+ * it.
+ */
+ thread_group_cputime(tsk, &sum);
+ raw_spin_lock_irqsave(&cputimer->lock, flags);
+ cputimer->running = 1;
+ update_gt_cputime(&cputimer->cputime, &sum);
+ } else
+ raw_spin_lock_irqsave(&cputimer->lock, flags);
+ *times = cputimer->cputime;
+ raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
+/*
+ * Sample a process (thread group) clock for the given group_leader task.
+ * Must be called with task sighand lock held for safe while_each_thread()
+ * traversal.
+ */
+static int cpu_clock_sample_group(const clockid_t which_clock,
+ struct task_struct *p,
+ unsigned long long *sample)
+{
+ struct task_cputime cputime;
+
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ thread_group_cputime(p, &cputime);
+ *sample = cputime_to_expires(cputime.utime + cputime.stime);
+ break;
+ case CPUCLOCK_VIRT:
+ thread_group_cputime(p, &cputime);
+ *sample = cputime_to_expires(cputime.utime);
+ break;
+ case CPUCLOCK_SCHED:
+ thread_group_cputime(p, &cputime);
+ *sample = cputime.sum_exec_runtime;
+ break;
+ }
+ return 0;
+}
+
+static int posix_cpu_clock_get_task(struct task_struct *tsk,
+ const clockid_t which_clock,
+ struct timespec *tp)
+{
+ int err = -EINVAL;
+ unsigned long long rtn;
+
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ if (same_thread_group(tsk, current))
+ err = cpu_clock_sample(which_clock, tsk, &rtn);
+ } else {
+ if (tsk == current || thread_group_leader(tsk))
+ err = cpu_clock_sample_group(which_clock, tsk, &rtn);
+ }
+
+ if (!err)
+ sample_to_timespec(which_clock, rtn, tp);
+
+ return err;
+}
+
+
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+{
+ const pid_t pid = CPUCLOCK_PID(which_clock);
+ int err = -EINVAL;
+
+ if (pid == 0) {
+ /*
+ * Special case constant value for our own clocks.
+ * We don't have to do any lookup to find ourselves.
+ */
+ err = posix_cpu_clock_get_task(current, which_clock, tp);
+ } else {
+ /*
+ * Find the given PID, and validate that the caller
+ * should be able to see it.
+ */
+ struct task_struct *p;
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
+ if (p)
+ err = posix_cpu_clock_get_task(p, which_clock, tp);
+ rcu_read_unlock();
+ }
+
+ return err;
+}
+
+
+/*
+ * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
+ * This is called from sys_timer_create() and do_cpu_nanosleep() with the
+ * new timer already all-zeros initialized.
+ */
+static int posix_cpu_timer_create(struct k_itimer *new_timer)
+{
+ int ret = 0;
+ const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
+ struct task_struct *p;
+
+ if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&new_timer->it.cpu.entry);
+
+ rcu_read_lock();
+ if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
+ if (pid == 0) {
+ p = current;
+ } else {
+ p = find_task_by_vpid(pid);
+ if (p && !same_thread_group(p, current))
+ p = NULL;
+ }
+ } else {
+ if (pid == 0) {
+ p = current->group_leader;
+ } else {
+ p = find_task_by_vpid(pid);
+ if (p && !has_group_leader_pid(p))
+ p = NULL;
+ }
+ }
+ new_timer->it.cpu.task = p;
+ if (p) {
+ get_task_struct(p);
+ } else {
+ ret = -EINVAL;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * Clean up a CPU-clock timer that is about to be destroyed.
+ * This is called from timer deletion with the timer already locked.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again. (This happens when the timer is in the middle of firing.)
+ */
+static int posix_cpu_timer_del(struct k_itimer *timer)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct sighand_struct *sighand;
+ struct task_struct *p = timer->it.cpu.task;
+
+ WARN_ON_ONCE(p == NULL);
+
+ /*
+ * Protect against sighand release/switch in exit/exec and process/
+ * thread timer list entry concurrent read/writes.
+ */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL)) {
+ /*
+ * We raced with the reaping of the task.
+ * The deletion should have cleared us off the list.
+ */
+ WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
+ } else {
+ if (timer->it.cpu.firing)
+ ret = TIMER_RETRY;
+ else
+ list_del(&timer->it.cpu.entry);
+
+ unlock_task_sighand(p, &flags);
+ }
+
+ if (!ret)
+ put_task_struct(p);
+
+ return ret;
+}
+
+static void cleanup_timers_list(struct list_head *head)
+{
+ struct cpu_timer_list *timer, *next;
+
+ list_for_each_entry_safe(timer, next, head, entry)
+ list_del_init(&timer->entry);
+}
+
+/*
+ * Clean out CPU timers still ticking when a thread exited. The task
+ * pointer is cleared, and the expiry time is replaced with the residual
+ * time for later timer_gettime calls to return.
+ * This must be called with the siglock held.
+ */
+static void cleanup_timers(struct list_head *head)
+{
+ cleanup_timers_list(head);
+ cleanup_timers_list(++head);
+ cleanup_timers_list(++head);
+}
+
+/*
+ * These are both called with the siglock held, when the current thread
+ * is being reaped. When the final (leader) thread in the group is reaped,
+ * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit.
+ */
+void posix_cpu_timers_exit(struct task_struct *tsk)
+{
+ add_device_randomness((const void*) &tsk_seruntime(tsk),
+ sizeof(unsigned long long));
+ cleanup_timers(tsk->cpu_timers);
+
+}
+void posix_cpu_timers_exit_group(struct task_struct *tsk)
+{
+ cleanup_timers(tsk->signal->cpu_timers);
+}
+
+static inline int expires_gt(cputime_t expires, cputime_t new_exp)
+{
+ return expires == 0 || expires > new_exp;
+}
+
+/*
+ * Insert the timer on the appropriate list before any timers that
+ * expire later. This must be called with the sighand lock held.
+ */
+static void arm_timer(struct k_itimer *timer)
+{
+ struct task_struct *p = timer->it.cpu.task;
+ struct list_head *head, *listpos;
+ struct task_cputime *cputime_expires;
+ struct cpu_timer_list *const nt = &timer->it.cpu;
+ struct cpu_timer_list *next;
+
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ head = p->cpu_timers;
+ cputime_expires = &p->cputime_expires;
+ } else {
+ head = p->signal->cpu_timers;
+ cputime_expires = &p->signal->cputime_expires;
+ }
+ head += CPUCLOCK_WHICH(timer->it_clock);
+
+ listpos = head;
+ list_for_each_entry(next, head, entry) {
+ if (nt->expires < next->expires)
+ break;
+ listpos = &next->entry;
+ }
+ list_add(&nt->entry, listpos);
+
+ if (listpos == head) {
+ unsigned long long exp = nt->expires;
+
+ /*
+ * We are the new earliest-expiring POSIX 1.b timer, hence
+ * need to update expiration cache. Take into account that
+ * for process timers we share expiration cache with itimers
+ * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
+ */
+
+ switch (CPUCLOCK_WHICH(timer->it_clock)) {
+ case CPUCLOCK_PROF:
+ if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
+ cputime_expires->prof_exp = expires_to_cputime(exp);
+ break;
+ case CPUCLOCK_VIRT:
+ if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
+ cputime_expires->virt_exp = expires_to_cputime(exp);
+ break;
+ case CPUCLOCK_SCHED:
+ if (cputime_expires->sched_exp == 0 ||
+ cputime_expires->sched_exp > exp)
+ cputime_expires->sched_exp = exp;
+ break;
+ }
+ }
+}
+
+/*
+ * The timer is locked, fire it and arrange for its reload.
+ */
+static void cpu_timer_fire(struct k_itimer *timer)
+{
+ if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+ /*
+ * User don't want any signal.
+ */
+ timer->it.cpu.expires = 0;
+ } else if (unlikely(timer->sigq == NULL)) {
+ /*
+ * This a special case for clock_nanosleep,
+ * not a normal timer from sys_timer_create.
+ */
+ wake_up_process(timer->it_process);
+ timer->it.cpu.expires = 0;
+ } else if (timer->it.cpu.incr == 0) {
+ /*
+ * One-shot timer. Clear it as soon as it's fired.
+ */
+ posix_timer_event(timer, 0);
+ timer->it.cpu.expires = 0;
+ } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
+ /*
+ * The signal did not get queued because the signal
+ * was ignored, so we won't get any callback to
+ * reload the timer. But we need to keep it
+ * ticking in case the signal is deliverable next time.
+ */
+ posix_cpu_timer_schedule(timer);
+ }
+}
+
+/*
+ * Sample a process (thread group) timer for the given group_leader task.
+ * Must be called with task sighand lock held for safe while_each_thread()
+ * traversal.
+ */
+static int cpu_timer_sample_group(const clockid_t which_clock,
+ struct task_struct *p,
+ unsigned long long *sample)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputimer(p, &cputime);
+ switch (CPUCLOCK_WHICH(which_clock)) {
+ default:
+ return -EINVAL;
+ case CPUCLOCK_PROF:
+ *sample = cputime_to_expires(cputime.utime + cputime.stime);
+ break;
+ case CPUCLOCK_VIRT:
+ *sample = cputime_to_expires(cputime.utime);
+ break;
+ case CPUCLOCK_SCHED:
+ *sample = cputime.sum_exec_runtime;
+ break;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static void nohz_kick_work_fn(struct work_struct *work)
+{
+ tick_nohz_full_kick_all();
+}
+
+static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
+
+/*
+ * We need the IPIs to be sent from sane process context.
+ * The posix cpu timers are always set with irqs disabled.
+ */
+static void posix_cpu_timer_kick_nohz(void)
+{
+ if (context_tracking_is_enabled())
+ schedule_work(&nohz_kick_work);
+}
+
+bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
+{
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return false;
+
+ if (tsk->signal->cputimer.running)
+ return false;
+
+ return true;
+}
+#else
+static inline void posix_cpu_timer_kick_nohz(void) { }
+#endif
+
+/*
+ * Guts of sys_timer_settime for CPU timers.
+ * This is called with the timer locked and interrupts disabled.
+ * If we return TIMER_RETRY, it's necessary to release the timer's lock
+ * and try again. (This happens when the timer is in the middle of firing.)
+ */
+static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
+ struct itimerspec *new, struct itimerspec *old)
+{
+ unsigned long flags;
+ struct sighand_struct *sighand;
+ struct task_struct *p = timer->it.cpu.task;
+ unsigned long long old_expires, new_expires, old_incr, val;
+ int ret;
+
+ WARN_ON_ONCE(p == NULL);
+
+ new_expires = timespec_to_sample(timer->it_clock, &new->it_value);
+
+ /*
+ * Protect against sighand release/switch in exit/exec and p->cpu_timers
+ * and p->signal->cpu_timers read/write in arm_timer()
+ */
+ sighand = lock_task_sighand(p, &flags);
+ /*
+ * If p has just been reaped, we can no
+ * longer get any information about it at all.
+ */
+ if (unlikely(sighand == NULL)) {
+ return -ESRCH;
+ }
+
+ /*
+ * Disarm any old timer after extracting its expiry time.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+
+ ret = 0;
+ old_incr = timer->it.cpu.incr;
+ old_expires = timer->it.cpu.expires;
+ if (unlikely(timer->it.cpu.firing)) {
+ timer->it.cpu.firing = -1;
+ ret = TIMER_RETRY;
+ } else
+ list_del_init(&timer->it.cpu.entry);
+
+ /*
+ * We need to sample the current value to convert the new
+ * value from to relative and absolute, and to convert the
+ * old value from absolute to relative. To set a process
+ * timer, we need a sample to balance the thread expiry
+ * times (in arm_timer). With an absolute time, we must
+ * check if it's already passed. In short, we need a sample.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &val);
+ } else {
+ cpu_timer_sample_group(timer->it_clock, p, &val);
+ }
+
+ if (old) {
+ if (old_expires == 0) {
+ old->it_value.tv_sec = 0;
+ old->it_value.tv_nsec = 0;
+ } else {
+ /*
+ * Update the timer in case it has
+ * overrun already. If it has,
+ * we'll report it as having overrun
+ * and with the next reloaded timer
+ * already ticking, though we are
+ * swallowing that pending
+ * notification here to install the
+ * new setting.
+ */
+ bump_cpu_timer(timer, val);
+ if (val < timer->it.cpu.expires) {
+ old_expires = timer->it.cpu.expires - val;
+ sample_to_timespec(timer->it_clock,
+ old_expires,
+ &old->it_value);
+ } else {
+ old->it_value.tv_nsec = 1;
+ old->it_value.tv_sec = 0;
+ }
+ }
+ }
+
+ if (unlikely(ret)) {
+ /*
+ * We are colliding with the timer actually firing.
+ * Punt after filling in the timer's old value, and
+ * disable this firing since we are already reporting
+ * it as an overrun (thanks to bump_cpu_timer above).
+ */
+ unlock_task_sighand(p, &flags);
+ goto out;
+ }
+
+ if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) {
+ new_expires += val;
+ }
+
+ /*
+ * Install the new expiry time (or zero).
+ * For a timer with no notification action, we don't actually
+ * arm the timer (we'll just fake it for timer_gettime).
+ */
+ timer->it.cpu.expires = new_expires;
+ if (new_expires != 0 && val < new_expires) {
+ arm_timer(timer);
+ }
+
+ unlock_task_sighand(p, &flags);
+ /*
+ * Install the new reload setting, and
+ * set up the signal and overrun bookkeeping.
+ */
+ timer->it.cpu.incr = timespec_to_sample(timer->it_clock,
+ &new->it_interval);
+
+ /*
+ * This acts as a modification timestamp for the timer,
+ * so any automatic reload attempt will punt on seeing
+ * that we have reset the timer manually.
+ */
+ timer->it_requeue_pending = (timer->it_requeue_pending + 2) &
+ ~REQUEUE_PENDING;
+ timer->it_overrun_last = 0;
+ timer->it_overrun = -1;
+
+ if (new_expires != 0 && !(val < new_expires)) {
+ /*
+ * The designated time already passed, so we notify
+ * immediately, even if the thread never runs to
+ * accumulate more time on this clock.
+ */
+ cpu_timer_fire(timer);
+ }
+
+ ret = 0;
+ out:
+ if (old) {
+ sample_to_timespec(timer->it_clock,
+ old_incr, &old->it_interval);
+ }
+ if (!ret)
+ posix_cpu_timer_kick_nohz();
+ return ret;
+}
+
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+{
+ unsigned long long now;
+ struct task_struct *p = timer->it.cpu.task;
+
+ WARN_ON_ONCE(p == NULL);
+
+ /*
+ * Easy part: convert the reload time.
+ */
+ sample_to_timespec(timer->it_clock,
+ timer->it.cpu.incr, &itp->it_interval);
+
+ if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
+ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+ return;
+ }
+
+ /*
+ * Sample the clock to take the difference with the expiry time.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &now);
+ } else {
+ struct sighand_struct *sighand;
+ unsigned long flags;
+
+ /*
+ * Protect against sighand release/switch in exit/exec and
+ * also make timer sampling safe if it ends up calling
+ * thread_group_cputime().
+ */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL)) {
+ /*
+ * The process has been reaped.
+ * We can't even collect a sample any more.
+ * Call the timer disarmed, nothing else to do.
+ */
+ timer->it.cpu.expires = 0;
+ sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
+ &itp->it_value);
+ } else {
+ cpu_timer_sample_group(timer->it_clock, p, &now);
+ unlock_task_sighand(p, &flags);
+ }
+ }
+
+ if (now < timer->it.cpu.expires) {
+ sample_to_timespec(timer->it_clock,
+ timer->it.cpu.expires - now,
+ &itp->it_value);
+ } else {
+ /*
+ * The timer should have expired already, but the firing
+ * hasn't taken place yet. Say it's just about to expire.
+ */
+ itp->it_value.tv_nsec = 1;
+ itp->it_value.tv_sec = 0;
+ }
+}
+
+static unsigned long long
+check_timers_list(struct list_head *timers,
+ struct list_head *firing,
+ unsigned long long curr)
+{
+ int maxfire = 20;
+
+ while (!list_empty(timers)) {
+ struct cpu_timer_list *t;
+
+ t = list_first_entry(timers, struct cpu_timer_list, entry);
+
+ if (!--maxfire || curr < t->expires)
+ return t->expires;
+
+ t->firing = 1;
+ list_move_tail(&t->entry, firing);
+ }
+
+ return 0;
+}
+
+/*
+ * Check for any per-thread CPU timers that have fired and move them off
+ * the tsk->cpu_timers[N] list onto the firing list. Here we update the
+ * tsk->it_*_expires values to reflect the remaining thread CPU timers.
+ */
+static void check_thread_timers(struct task_struct *tsk,
+ struct list_head *firing)
+{
+ struct list_head *timers = tsk->cpu_timers;
+ struct signal_struct *const sig = tsk->signal;
+ struct task_cputime *tsk_expires = &tsk->cputime_expires;
+ unsigned long long expires;
+ unsigned long soft;
+
+ expires = check_timers_list(timers, firing, prof_ticks(tsk));
+ tsk_expires->prof_exp = expires_to_cputime(expires);
+
+ expires = check_timers_list(++timers, firing, virt_ticks(tsk));
+ tsk_expires->virt_exp = expires_to_cputime(expires);
+
+ tsk_expires->sched_exp = check_timers_list(++timers, firing,
+ tsk_seruntime(tsk));
+
+ /*
+ * Check for the special case thread timers.
+ */
+ soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+ if (soft != RLIM_INFINITY) {
+ unsigned long hard =
+ ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+
+ if (hard != RLIM_INFINITY &&
+ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+ /*
+ * At the hard limit, we just die.
+ * No need to calculate anything else now.
+ */
+ __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+ return;
+ }
+ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+ /*
+ * At the soft limit, send a SIGXCPU every second.
+ */
+ if (soft < hard) {
+ soft += USEC_PER_SEC;
+ sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
+ }
+ printk(KERN_INFO
+ "RT Watchdog Timeout: %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ }
+ }
+}
+
+static void stop_process_timers(struct signal_struct *sig)
+{
+ struct thread_group_cputimer *cputimer = &sig->cputimer;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cputimer->lock, flags);
+ cputimer->running = 0;
+ raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
+static u32 onecputick;
+
+static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
+ unsigned long long *expires,
+ unsigned long long cur_time, int signo)
+{
+ if (!it->expires)
+ return;
+
+ if (cur_time >= it->expires) {
+ if (it->incr) {
+ it->expires += it->incr;
+ it->error += it->incr_error;
+ if (it->error >= onecputick) {
+ it->expires -= cputime_one_jiffy;
+ it->error -= onecputick;
+ }
+ } else {
+ it->expires = 0;
+ }
+
+ trace_itimer_expire(signo == SIGPROF ?
+ ITIMER_PROF : ITIMER_VIRTUAL,
+ tsk->signal->leader_pid, cur_time);
+ __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
+ }
+
+ if (it->expires && (!*expires || it->expires < *expires)) {
+ *expires = it->expires;
+ }
+}
+
+/*
+ * Check for any per-thread CPU timers that have fired and move them
+ * off the tsk->*_timers list onto the firing list. Per-thread timers
+ * have already been taken off.
+ */
+static void check_process_timers(struct task_struct *tsk,
+ struct list_head *firing)
+{
+ struct signal_struct *const sig = tsk->signal;
+ unsigned long long utime, ptime, virt_expires, prof_expires;
+ unsigned long long sum_sched_runtime, sched_expires;
+ struct list_head *timers = sig->cpu_timers;
+ struct task_cputime cputime;
+ unsigned long soft;
+
+ /*
+ * Collect the current process totals.
+ */
+ thread_group_cputimer(tsk, &cputime);
+ utime = cputime_to_expires(cputime.utime);
+ ptime = utime + cputime_to_expires(cputime.stime);
+ sum_sched_runtime = cputime.sum_exec_runtime;
+
+ prof_expires = check_timers_list(timers, firing, ptime);
+ virt_expires = check_timers_list(++timers, firing, utime);
+ sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
+
+ /*
+ * Check for the special case process timers.
+ */
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
+ SIGPROF);
+ check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
+ SIGVTALRM);
+ soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ if (soft != RLIM_INFINITY) {
+ unsigned long psecs = cputime_to_secs(ptime);
+ unsigned long hard =
+ ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+ cputime_t x;
+ if (psecs >= hard) {
+ /*
+ * At the hard limit, we just die.
+ * No need to calculate anything else now.
+ */
+ __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+ return;
+ }
+ if (psecs >= soft) {
+ /*
+ * At the soft limit, send a SIGXCPU every second.
+ */
+ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+ if (soft < hard) {
+ soft++;
+ sig->rlim[RLIMIT_CPU].rlim_cur = soft;
+ }
+ }
+ x = secs_to_cputime(soft);
+ if (!prof_expires || x < prof_expires) {
+ prof_expires = x;
+ }
+ }
+
+ sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
+ sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
+ sig->cputime_expires.sched_exp = sched_expires;
+ if (task_cputime_zero(&sig->cputime_expires))
+ stop_process_timers(sig);
+}
+
+/*
+ * This is called from the signal code (via do_schedule_next_timer)
+ * when the last timer signal was delivered and we have to reload the timer.
+ */
+void posix_cpu_timer_schedule(struct k_itimer *timer)
+{
+ struct sighand_struct *sighand;
+ unsigned long flags;
+ struct task_struct *p = timer->it.cpu.task;
+ unsigned long long now;
+
+ WARN_ON_ONCE(p == NULL);
+
+ /*
+ * Fetch the current sample and update the timer's expiry time.
+ */
+ if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
+ cpu_clock_sample(timer->it_clock, p, &now);
+ bump_cpu_timer(timer, now);
+ if (unlikely(p->exit_state))
+ goto out;
+
+ /* Protect timer list r/w in arm_timer() */
+ sighand = lock_task_sighand(p, &flags);
+ if (!sighand)
+ goto out;
+ } else {
+ /*
+ * Protect arm_timer() and timer sampling in case of call to
+ * thread_group_cputime().
+ */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL)) {
+ /*
+ * The process has been reaped.
+ * We can't even collect a sample any more.
+ */
+ timer->it.cpu.expires = 0;
+ goto out;
+ } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
+ unlock_task_sighand(p, &flags);
+ /* Optimizations: if the process is dying, no need to rearm */
+ goto out;
+ }
+ cpu_timer_sample_group(timer->it_clock, p, &now);
+ bump_cpu_timer(timer, now);
+ /* Leave the sighand locked for the call below. */
+ }
+
+ /*
+ * Now re-arm for the new expiry time.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+ arm_timer(timer);
+ unlock_task_sighand(p, &flags);
+
+ /* Kick full dynticks CPUs in case they need to tick on the new timer */
+ posix_cpu_timer_kick_nohz();
+out:
+ timer->it_overrun_last = timer->it_overrun;
+ timer->it_overrun = -1;
+ ++timer->it_requeue_pending;
+}
+
+/**
+ * task_cputime_expired - Compare two task_cputime entities.
+ *
+ * @sample: The task_cputime structure to be checked for expiration.
+ * @expires: Expiration times, against which @sample will be checked.
+ *
+ * Checks @sample against @expires to see if any field of @sample has expired.
+ * Returns true if any field of the former is greater than the corresponding
+ * field of the latter if the latter field is set. Otherwise returns false.
+ */
+static inline int task_cputime_expired(const struct task_cputime *sample,
+ const struct task_cputime *expires)
+{
+ if (expires->utime && sample->utime >= expires->utime)
+ return 1;
+ if (expires->stime && sample->utime + sample->stime >= expires->stime)
+ return 1;
+ if (expires->sum_exec_runtime != 0 &&
+ sample->sum_exec_runtime >= expires->sum_exec_runtime)
+ return 1;
+ return 0;
+}
+
+/**
+ * fastpath_timer_check - POSIX CPU timers fast path.
+ *
+ * @tsk: The task (thread) being checked.
+ *
+ * Check the task and thread group timers. If both are zero (there are no
+ * timers set) return false. Otherwise snapshot the task and thread group
+ * timers and compare them with the corresponding expiration times. Return
+ * true if a timer has expired, else return false.
+ */
+static inline int fastpath_timer_check(struct task_struct *tsk)
+{
+ struct signal_struct *sig;
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
+
+ if (!task_cputime_zero(&tsk->cputime_expires)) {
+ struct task_cputime task_sample = {
+ .utime = utime,
+ .stime = stime,
+ .sum_exec_runtime = tsk_seruntime(tsk)
+ };
+
+ if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
+ return 1;
+ }
+
+ sig = tsk->signal;
+ if (sig->cputimer.running) {
+ struct task_cputime group_sample;
+
+ raw_spin_lock(&sig->cputimer.lock);
+ group_sample = sig->cputimer.cputime;
+ raw_spin_unlock(&sig->cputimer.lock);
+
+ if (task_cputime_expired(&group_sample, &sig->cputime_expires))
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(struct task_struct *tsk)
+{
+ LIST_HEAD(firing);
+ struct k_itimer *timer, *next;
+ unsigned long flags;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ /*
+ * The fast path checks that there are no expired thread or thread
+ * group timers. If that's so, just return.
+ */
+ if (!fastpath_timer_check(tsk))
+ return;
+
+ if (!lock_task_sighand(tsk, &flags))
+ return;
+ /*
+ * Here we take off tsk->signal->cpu_timers[N] and
+ * tsk->cpu_timers[N] all the timers that are firing, and
+ * put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+ /*
+ * If there are any active process wide timers (POSIX 1.b, itimers,
+ * RLIMIT_CPU) cputimer must be running.
+ */
+ if (tsk->signal->cputimer.running)
+ check_process_timers(tsk, &firing);
+
+ /*
+ * We must release these locks before taking any timer's lock.
+ * There is a potential race with timer deletion here, as the
+ * siglock now protects our private firing list. We have set
+ * the firing flag in each timer, so that a deletion attempt
+ * that gets the timer lock before we do will give it up and
+ * spin until we've taken care of that timer below.
+ */
+ unlock_task_sighand(tsk, &flags);
+
+ /*
+ * Now that all the timers on our list have the firing flag,
+ * no one will touch their list entries but us. We'll take
+ * each timer's lock before clearing its firing flag, so no
+ * timer call will interfere.
+ */
+ list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
+ int cpu_firing;
+
+ spin_lock(&timer->it_lock);
+ list_del_init(&timer->it.cpu.entry);
+ cpu_firing = timer->it.cpu.firing;
+ timer->it.cpu.firing = 0;
+ /*
+ * The firing flag is -1 if we collided with a reset
+ * of the timer, which already reported this
+ * almost-firing as an overrun. So don't generate an event.
+ */
+ if (likely(cpu_firing >= 0))
+ cpu_timer_fire(timer);
+ spin_unlock(&timer->it_lock);
+ }
+}
+
+/*
+ * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
+ * The tsk->sighand->siglock must be held by the caller.
+ */
+void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
+ cputime_t *newval, cputime_t *oldval)
+{
+ unsigned long long now;
+
+ WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
+ cpu_timer_sample_group(clock_idx, tsk, &now);
+
+ if (oldval) {
+ /*
+ * We are setting itimer. The *oldval is absolute and we update
+ * it to be relative, *newval argument is relative and we update
+ * it to be absolute.
+ */
+ if (*oldval) {
+ if (*oldval <= now) {
+ /* Just about to fire. */
+ *oldval = cputime_one_jiffy;
+ } else {
+ *oldval -= now;
+ }
+ }
+
+ if (!*newval)
+ goto out;
+ *newval += now;
+ }
+
+ /*
+ * Update expiration cache if we are the earliest timer, or eventually
+ * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
+ */
+ switch (clock_idx) {
+ case CPUCLOCK_PROF:
+ if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
+ tsk->signal->cputime_expires.prof_exp = *newval;
+ break;
+ case CPUCLOCK_VIRT:
+ if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
+ tsk->signal->cputime_expires.virt_exp = *newval;
+ break;
+ }
+out:
+ posix_cpu_timer_kick_nohz();
+}
+
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct itimerspec *it)
+{
+ struct k_itimer timer;
+ int error;
+
+ /*
+ * Set up a temporary timer and then wait for it to go off.
+ */
+ memset(&timer, 0, sizeof timer);
+ spin_lock_init(&timer.it_lock);
+ timer.it_clock = which_clock;
+ timer.it_overrun = -1;
+ error = posix_cpu_timer_create(&timer);
+ timer.it_process = current;
+ if (!error) {
+ static struct itimerspec zero_it;
+
+ memset(it, 0, sizeof *it);
+ it->it_value = *rqtp;
+
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_set(&timer, flags, it, NULL);
+ if (error) {
+ spin_unlock_irq(&timer.it_lock);
+ return error;
+ }
+
+ while (!signal_pending(current)) {
+ if (timer.it.cpu.expires == 0) {
+ /*
+ * Our timer fired and was reset, below
+ * deletion can not fail.
+ */
+ posix_cpu_timer_del(&timer);
+ spin_unlock_irq(&timer.it_lock);
+ return 0;
+ }
+
+ /*
+ * Block until cpu_timer_fire (or a signal) wakes us.
+ */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&timer.it_lock);
+ schedule();
+ spin_lock_irq(&timer.it_lock);
+ }
+
+ /*
+ * We were interrupted by a signal.
+ */
+ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
+ error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ if (!error) {
+ /*
+ * Timer is now unarmed, deletion can not fail.
+ */
+ posix_cpu_timer_del(&timer);
+ }
+ spin_unlock_irq(&timer.it_lock);
+
+ while (error == TIMER_RETRY) {
+ /*
+ * We need to handle case when timer was or is in the
+ * middle of firing. In other cases we already freed
+ * resources.
+ */
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_del(&timer);
+ spin_unlock_irq(&timer.it_lock);
+ }
+
+ if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
+ /*
+ * It actually did fire already.
+ */
+ return 0;
+ }
+
+ error = -ERESTART_RESTARTBLOCK;
+ }
+
+ return error;
+}
+
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
+
+static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
+{
+ struct restart_block *restart_block = &current->restart_block;
+ struct itimerspec it;
+ int error;
+
+ /*
+ * Diagnose required errors first.
+ */
+ if (CPUCLOCK_PERTHREAD(which_clock) &&
+ (CPUCLOCK_PID(which_clock) == 0 ||
+ CPUCLOCK_PID(which_clock) == current->pid))
+ return -EINVAL;
+
+ error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+
+ if (flags & TIMER_ABSTIME)
+ return -ERESTARTNOHAND;
+ /*
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
+ restart_block->fn = posix_cpu_nsleep_restart;
+ restart_block->nanosleep.clockid = which_clock;
+ restart_block->nanosleep.rmtp = rmtp;
+ restart_block->nanosleep.expires = timespec_to_ns(rqtp);
+ }
+ return error;
+}
+
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->nanosleep.clockid;
+ struct timespec t;
+ struct itimerspec it;
+ int error;
+
+ t = ns_to_timespec(restart_block->nanosleep.expires);
+
+ error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+ if (error == -ERESTART_RESTARTBLOCK) {
+ struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
+ /*
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ return -EFAULT;
+
+ restart_block->nanosleep.expires = timespec_to_ns(&t);
+ }
+ return error;
+
+}
+
+#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
+#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
+
+static int process_cpu_clock_getres(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
+}
+static int process_cpu_clock_get(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_get(PROCESS_CLOCK, tp);
+}
+static int process_cpu_timer_create(struct k_itimer *timer)
+{
+ timer->it_clock = PROCESS_CLOCK;
+ return posix_cpu_timer_create(timer);
+}
+static int process_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp,
+ struct timespec __user *rmtp)
+{
+ return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
+}
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+ return -EINVAL;
+}
+static int thread_cpu_clock_getres(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_getres(THREAD_CLOCK, tp);
+}
+static int thread_cpu_clock_get(const clockid_t which_clock,
+ struct timespec *tp)
+{
+ return posix_cpu_clock_get(THREAD_CLOCK, tp);
+}
+static int thread_cpu_timer_create(struct k_itimer *timer)
+{
+ timer->it_clock = THREAD_CLOCK;
+ return posix_cpu_timer_create(timer);
+}
+
+struct k_clock clock_posix_cpu = {
+ .clock_getres = posix_cpu_clock_getres,
+ .clock_set = posix_cpu_clock_set,
+ .clock_get = posix_cpu_clock_get,
+ .timer_create = posix_cpu_timer_create,
+ .nsleep = posix_cpu_nsleep,
+ .nsleep_restart = posix_cpu_nsleep_restart,
+ .timer_set = posix_cpu_timer_set,
+ .timer_del = posix_cpu_timer_del,
+ .timer_get = posix_cpu_timer_get,
+};
+
+static __init int init_posix_cpu_timers(void)
+{
+ struct k_clock process = {
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
+ };
+ struct k_clock thread = {
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
+ };
+ struct timespec ts;
+
+ posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+ posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+
+ cputime_to_timespec(cputime_one_jiffy, &ts);
+ onecputick = ts.tv_nsec;
+ WARN_ON(ts.tv_sec != 0);
+
+ return 0;
+}
+__initcall(init_posix_cpu_timers);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
new file mode 100644
index 000000000..31ea01f42
--- /dev/null
+++ b/kernel/time/posix-timers.c
@@ -0,0 +1,1124 @@
+/*
+ * linux/kernel/posix-timers.c
+ *
+ *
+ * 2002-10-15 Posix Clocks & timers
+ * by George Anzinger george@mvista.com
+ *
+ * Copyright (C) 2002 2003 by MontaVista Software.
+ *
+ * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
+ * Copyright (C) 2004 Boris Hu
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA
+ */
+
+/* These are all the functions necessary to implement
+ * POSIX clocks & timers
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/mutex.h>
+
+#include <asm/uaccess.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/hash.h>
+#include <linux/posix-clock.h>
+#include <linux/posix-timers.h>
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/export.h>
+#include <linux/hashtable.h>
+
+#include "timekeeping.h"
+
+/*
+ * Management arrays for POSIX timers. Timers are now kept in static hash table
+ * with 512 entries.
+ * Timer ids are allocated by local routine, which selects proper hash head by
+ * key, constructed from current->signal address and per signal struct counter.
+ * This keeps timer ids unique per process, but now they can intersect between
+ * processes.
+ */
+
+/*
+ * Lets keep our timers in a slab cache :-)
+ */
+static struct kmem_cache *posix_timers_cache;
+
+static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
+static DEFINE_SPINLOCK(hash_lock);
+
+/*
+ * we assume that the new SIGEV_THREAD_ID shares no bits with the other
+ * SIGEV values. Here we put out an error if this assumption fails.
+ */
+#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
+ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
+#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
+#endif
+
+/*
+ * parisc wants ENOTSUP instead of EOPNOTSUPP
+ */
+#ifndef ENOTSUP
+# define ENANOSLEEP_NOTSUP EOPNOTSUPP
+#else
+# define ENANOSLEEP_NOTSUP ENOTSUP
+#endif
+
+/*
+ * The timer ID is turned into a timer address by idr_find().
+ * Verifying a valid ID consists of:
+ *
+ * a) checking that idr_find() returns other than -1.
+ * b) checking that the timer id matches the one in the timer itself.
+ * c) that the timer owner is in the callers thread group.
+ */
+
+/*
+ * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
+ * to implement others. This structure defines the various
+ * clocks.
+ *
+ * RESOLUTION: Clock resolution is used to round up timer and interval
+ * times, NOT to report clock times, which are reported with as
+ * much resolution as the system can muster. In some cases this
+ * resolution may depend on the underlying clock hardware and
+ * may not be quantifiable until run time, and only then is the
+ * necessary code is written. The standard says we should say
+ * something about this issue in the documentation...
+ *
+ * FUNCTIONS: The CLOCKs structure defines possible functions to
+ * handle various clock functions.
+ *
+ * The standard POSIX timer management code assumes the
+ * following: 1.) The k_itimer struct (sched.h) is used for
+ * the timer. 2.) The list, it_lock, it_clock, it_id and
+ * it_pid fields are not modified by timer code.
+ *
+ * Permissions: It is assumed that the clock_settime() function defined
+ * for each clock will take care of permission checks. Some
+ * clocks may be set able by any user (i.e. local process
+ * clocks) others not. Currently the only set able clock we
+ * have is CLOCK_REALTIME and its high res counter part, both of
+ * which we beg off on and pass to do_sys_settimeofday().
+ */
+
+static struct k_clock posix_clocks[MAX_CLOCKS];
+
+/*
+ * These ones are defined below.
+ */
+static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+ struct timespec __user *rmtp);
+static int common_timer_create(struct k_itimer *new_timer);
+static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static int common_timer_set(struct k_itimer *, int,
+ struct itimerspec *, struct itimerspec *);
+static int common_timer_del(struct k_itimer *timer);
+
+static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
+
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+
+#define lock_timer(tid, flags) \
+({ struct k_itimer *__timr; \
+ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
+ __timr; \
+})
+
+static int hash(struct signal_struct *sig, unsigned int nr)
+{
+ return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+}
+
+static struct k_itimer *__posix_timers_find(struct hlist_head *head,
+ struct signal_struct *sig,
+ timer_t id)
+{
+ struct k_itimer *timer;
+
+ hlist_for_each_entry_rcu(timer, head, t_hash) {
+ if ((timer->it_signal == sig) && (timer->it_id == id))
+ return timer;
+ }
+ return NULL;
+}
+
+static struct k_itimer *posix_timer_by_id(timer_t id)
+{
+ struct signal_struct *sig = current->signal;
+ struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+
+ return __posix_timers_find(head, sig, id);
+}
+
+static int posix_timer_add(struct k_itimer *timer)
+{
+ struct signal_struct *sig = current->signal;
+ int first_free_id = sig->posix_timer_id;
+ struct hlist_head *head;
+ int ret = -ENOENT;
+
+ do {
+ spin_lock(&hash_lock);
+ head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
+ if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ hlist_add_head_rcu(&timer->t_hash, head);
+ ret = sig->posix_timer_id;
+ }
+ if (++sig->posix_timer_id < 0)
+ sig->posix_timer_id = 0;
+ if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
+ /* Loop over all possible ids completed */
+ ret = -EAGAIN;
+ spin_unlock(&hash_lock);
+ } while (ret == -ENOENT);
+ return ret;
+}
+
+static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+{
+ spin_unlock_irqrestore(&timr->it_lock, flags);
+}
+
+/* Get clock_realtime */
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
+{
+ ktime_get_real_ts(tp);
+ return 0;
+}
+
+/* Set clock_realtime */
+static int posix_clock_realtime_set(const clockid_t which_clock,
+ const struct timespec *tp)
+{
+ return do_sys_settimeofday(tp, NULL);
+}
+
+static int posix_clock_realtime_adj(const clockid_t which_clock,
+ struct timex *t)
+{
+ return do_adjtimex(t);
+}
+
+/*
+ * Get monotonic time for posix timers
+ */
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+{
+ ktime_get_ts(tp);
+ return 0;
+}
+
+/*
+ * Get monotonic-raw time for posix timers
+ */
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+{
+ getrawmonotonic(tp);
+ return 0;
+}
+
+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+ *tp = current_kernel_time();
+ return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+ struct timespec *tp)
+{
+ *tp = get_monotonic_coarse();
+ return 0;
+}
+
+static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+ *tp = ktime_to_timespec(KTIME_LOW_RES);
+ return 0;
+}
+
+static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+{
+ get_monotonic_boottime(tp);
+ return 0;
+}
+
+static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+{
+ timekeeping_clocktai(tp);
+ return 0;
+}
+
+/*
+ * Initialize everything, well, just everything in Posix clocks/timers ;)
+ */
+static __init int init_posix_timers(void)
+{
+ struct k_clock clock_realtime = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_clock_realtime_get,
+ .clock_set = posix_clock_realtime_set,
+ .clock_adj = posix_clock_realtime_adj,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+ };
+ struct k_clock clock_monotonic = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_ktime_get_ts,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+ };
+ struct k_clock clock_monotonic_raw = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_monotonic_raw,
+ };
+ struct k_clock clock_realtime_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_realtime_coarse,
+ };
+ struct k_clock clock_monotonic_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_monotonic_coarse,
+ };
+ struct k_clock clock_tai = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_tai,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+ };
+ struct k_clock clock_boottime = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_boottime,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+ };
+
+ posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
+ posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
+ posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+ posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+ posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+ posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
+ posix_timers_register_clock(CLOCK_TAI, &clock_tai);
+
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof (struct k_itimer), 0, SLAB_PANIC,
+ NULL);
+ return 0;
+}
+
+__initcall(init_posix_timers);
+
+static void schedule_next_timer(struct k_itimer *timr)
+{
+ struct hrtimer *timer = &timr->it.real.timer;
+
+ if (timr->it.real.interval.tv64 == 0)
+ return;
+
+ timr->it_overrun += (unsigned int) hrtimer_forward(timer,
+ timer->base->get_time(),
+ timr->it.real.interval);
+
+ timr->it_overrun_last = timr->it_overrun;
+ timr->it_overrun = -1;
+ ++timr->it_requeue_pending;
+ hrtimer_restart(timer);
+}
+
+/*
+ * This function is exported for use by the signal deliver code. It is
+ * called just prior to the info block being released and passes that
+ * block to us. It's function is to update the overrun entry AND to
+ * restart the timer. It should only be called if the timer is to be
+ * restarted (i.e. we have flagged this in the sys_private entry of the
+ * info block).
+ *
+ * To protect against the timer going away while the interrupt is queued,
+ * we require that the it_requeue_pending flag be set.
+ */
+void do_schedule_next_timer(struct siginfo *info)
+{
+ struct k_itimer *timr;
+ unsigned long flags;
+
+ timr = lock_timer(info->si_tid, &flags);
+
+ if (timr && timr->it_requeue_pending == info->si_sys_private) {
+ if (timr->it_clock < 0)
+ posix_cpu_timer_schedule(timr);
+ else
+ schedule_next_timer(timr);
+
+ info->si_overrun += timr->it_overrun_last;
+ }
+
+ if (timr)
+ unlock_timer(timr, flags);
+}
+
+int posix_timer_event(struct k_itimer *timr, int si_private)
+{
+ struct task_struct *task;
+ int shared, ret = -1;
+ /*
+ * FIXME: if ->sigq is queued we can race with
+ * dequeue_signal()->do_schedule_next_timer().
+ *
+ * If dequeue_signal() sees the "right" value of
+ * si_sys_private it calls do_schedule_next_timer().
+ * We re-queue ->sigq and drop ->it_lock().
+ * do_schedule_next_timer() locks the timer
+ * and re-schedules it while ->sigq is pending.
+ * Not really bad, but not that we want.
+ */
+ timr->sigq->info.si_sys_private = si_private;
+
+ rcu_read_lock();
+ task = pid_task(timr->it_pid, PIDTYPE_PID);
+ if (task) {
+ shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
+ ret = send_sigqueue(timr->sigq, task, shared);
+ }
+ rcu_read_unlock();
+ /* If we failed to send the signal the timer stops. */
+ return ret > 0;
+}
+EXPORT_SYMBOL_GPL(posix_timer_event);
+
+/*
+ * This function gets called when a POSIX.1b interval timer expires. It
+ * is used as a callback from the kernel internal timer. The
+ * run_timer_list code ALWAYS calls with interrupts on.
+
+ * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
+ */
+static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
+{
+ struct k_itimer *timr;
+ unsigned long flags;
+ int si_private = 0;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+
+ timr = container_of(timer, struct k_itimer, it.real.timer);
+ spin_lock_irqsave(&timr->it_lock, flags);
+
+ if (timr->it.real.interval.tv64 != 0)
+ si_private = ++timr->it_requeue_pending;
+
+ if (posix_timer_event(timr, si_private)) {
+ /*
+ * signal was not sent because of sig_ignor
+ * we will not get a call back to restart it AND
+ * it should be restarted.
+ */
+ if (timr->it.real.interval.tv64 != 0) {
+ ktime_t now = hrtimer_cb_get_time(timer);
+
+ /*
+ * FIXME: What we really want, is to stop this
+ * timer completely and restart it in case the
+ * SIG_IGN is removed. This is a non trivial
+ * change which involves sighand locking
+ * (sigh !), which we don't want to do late in
+ * the release cycle.
+ *
+ * For now we just let timers with an interval
+ * less than a jiffie expire every jiffie to
+ * avoid softirq starvation in case of SIG_IGN
+ * and a very small interval, which would put
+ * the timer right back on the softirq pending
+ * list. By moving now ahead of time we trick
+ * hrtimer_forward() to expire the timer
+ * later, while we still maintain the overrun
+ * accuracy, but have some inconsistency in
+ * the timer_gettime() case. This is at least
+ * better than a starved softirq. A more
+ * complex fix which solves also another related
+ * inconsistency is already in the pipeline.
+ */
+#ifdef CONFIG_HIGH_RES_TIMERS
+ {
+ ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ);
+
+ if (timr->it.real.interval.tv64 < kj.tv64)
+ now = ktime_add(now, kj);
+ }
+#endif
+ timr->it_overrun += (unsigned int)
+ hrtimer_forward(timer, now,
+ timr->it.real.interval);
+ ret = HRTIMER_RESTART;
+ ++timr->it_requeue_pending;
+ }
+ }
+
+ unlock_timer(timr, flags);
+ return ret;
+}
+
+static struct pid *good_sigevent(sigevent_t * event)
+{
+ struct task_struct *rtn = current->group_leader;
+
+ if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+ (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
+ !same_thread_group(rtn, current) ||
+ (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ return NULL;
+
+ if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
+ ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
+ return NULL;
+
+ return task_pid(rtn);
+}
+
+void posix_timers_register_clock(const clockid_t clock_id,
+ struct k_clock *new_clock)
+{
+ if ((unsigned) clock_id >= MAX_CLOCKS) {
+ printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
+ clock_id);
+ return;
+ }
+
+ if (!new_clock->clock_get) {
+ printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
+ clock_id);
+ return;
+ }
+ if (!new_clock->clock_getres) {
+ printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
+ clock_id);
+ return;
+ }
+
+ posix_clocks[clock_id] = *new_clock;
+}
+EXPORT_SYMBOL_GPL(posix_timers_register_clock);
+
+static struct k_itimer * alloc_posix_timer(void)
+{
+ struct k_itimer *tmr;
+ tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ if (!tmr)
+ return tmr;
+ if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
+ kmem_cache_free(posix_timers_cache, tmr);
+ return NULL;
+ }
+ memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
+ return tmr;
+}
+
+static void k_itimer_rcu_free(struct rcu_head *head)
+{
+ struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+
+ kmem_cache_free(posix_timers_cache, tmr);
+}
+
+#define IT_ID_SET 1
+#define IT_ID_NOT_SET 0
+static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
+{
+ if (it_id_set) {
+ unsigned long flags;
+ spin_lock_irqsave(&hash_lock, flags);
+ hlist_del_rcu(&tmr->t_hash);
+ spin_unlock_irqrestore(&hash_lock, flags);
+ }
+ put_pid(tmr->it_pid);
+ sigqueue_free(tmr->sigq);
+ call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
+}
+
+static struct k_clock *clockid_to_kclock(const clockid_t id)
+{
+ if (id < 0)
+ return (id & CLOCKFD_MASK) == CLOCKFD ?
+ &clock_posix_dynamic : &clock_posix_cpu;
+
+ if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
+ return NULL;
+ return &posix_clocks[id];
+}
+
+static int common_timer_create(struct k_itimer *new_timer)
+{
+ hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ return 0;
+}
+
+/* Create a POSIX.1b interval timer. */
+
+SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
+ struct sigevent __user *, timer_event_spec,
+ timer_t __user *, created_timer_id)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct k_itimer *new_timer;
+ int error, new_timer_id;
+ sigevent_t event;
+ int it_id_set = IT_ID_NOT_SET;
+
+ if (!kc)
+ return -EINVAL;
+ if (!kc->timer_create)
+ return -EOPNOTSUPP;
+
+ new_timer = alloc_posix_timer();
+ if (unlikely(!new_timer))
+ return -EAGAIN;
+
+ spin_lock_init(&new_timer->it_lock);
+ new_timer_id = posix_timer_add(new_timer);
+ if (new_timer_id < 0) {
+ error = new_timer_id;
+ goto out;
+ }
+
+ it_id_set = IT_ID_SET;
+ new_timer->it_id = (timer_t) new_timer_id;
+ new_timer->it_clock = which_clock;
+ new_timer->it_overrun = -1;
+
+ if (timer_event_spec) {
+ if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
+ error = -EFAULT;
+ goto out;
+ }
+ rcu_read_lock();
+ new_timer->it_pid = get_pid(good_sigevent(&event));
+ rcu_read_unlock();
+ if (!new_timer->it_pid) {
+ error = -EINVAL;
+ goto out;
+ }
+ } else {
+ memset(&event.sigev_value, 0, sizeof(event.sigev_value));
+ event.sigev_notify = SIGEV_SIGNAL;
+ event.sigev_signo = SIGALRM;
+ event.sigev_value.sival_int = new_timer->it_id;
+ new_timer->it_pid = get_pid(task_tgid(current));
+ }
+
+ new_timer->it_sigev_notify = event.sigev_notify;
+ new_timer->sigq->info.si_signo = event.sigev_signo;
+ new_timer->sigq->info.si_value = event.sigev_value;
+ new_timer->sigq->info.si_tid = new_timer->it_id;
+ new_timer->sigq->info.si_code = SI_TIMER;
+
+ if (copy_to_user(created_timer_id,
+ &new_timer_id, sizeof (new_timer_id))) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ error = kc->timer_create(new_timer);
+ if (error)
+ goto out;
+
+ spin_lock_irq(&current->sighand->siglock);
+ new_timer->it_signal = current->signal;
+ list_add(&new_timer->list, &current->signal->posix_timers);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return 0;
+ /*
+ * In the case of the timer belonging to another task, after
+ * the task is unlocked, the timer is owned by the other task
+ * and may cease to exist at any time. Don't use or modify
+ * new_timer after the unlock call.
+ */
+out:
+ release_posix_timer(new_timer, it_id_set);
+ return error;
+}
+
+/*
+ * Locking issues: We need to protect the result of the id look up until
+ * we get the timer locked down so it is not deleted under us. The
+ * removal is done under the idr spinlock so we use that here to bridge
+ * the find to the timer lock. To avoid a dead lock, the timer id MUST
+ * be release with out holding the timer lock.
+ */
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
+{
+ struct k_itimer *timr;
+
+ /*
+ * timer_t could be any type >= int and we want to make sure any
+ * @timer_id outside positive int range fails lookup.
+ */
+ if ((unsigned long long)timer_id > INT_MAX)
+ return NULL;
+
+ rcu_read_lock();
+ timr = posix_timer_by_id(timer_id);
+ if (timr) {
+ spin_lock_irqsave(&timr->it_lock, *flags);
+ if (timr->it_signal == current->signal) {
+ rcu_read_unlock();
+ return timr;
+ }
+ spin_unlock_irqrestore(&timr->it_lock, *flags);
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+/*
+ * Get the time remaining on a POSIX.1b interval timer. This function
+ * is ALWAYS called with spin_lock_irq on the timer, thus it must not
+ * mess with irq.
+ *
+ * We have a couple of messes to clean up here. First there is the case
+ * of a timer that has a requeue pending. These timers should appear to
+ * be in the timer list with an expiry as if we were to requeue them
+ * now.
+ *
+ * The second issue is the SIGEV_NONE timer which may be active but is
+ * not really ever put in the timer list (to save system resources).
+ * This timer may be expired, and if so, we will do it here. Otherwise
+ * it is the same as a requeue pending timer WRT to what we should
+ * report.
+ */
+static void
+common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+{
+ ktime_t now, remaining, iv;
+ struct hrtimer *timer = &timr->it.real.timer;
+
+ memset(cur_setting, 0, sizeof(struct itimerspec));
+
+ iv = timr->it.real.interval;
+
+ /* interval timer ? */
+ if (iv.tv64)
+ cur_setting->it_interval = ktime_to_timespec(iv);
+ else if (!hrtimer_active(timer) &&
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+ return;
+
+ now = timer->base->get_time();
+
+ /*
+ * When a requeue is pending or this is a SIGEV_NONE
+ * timer move the expiry time forward by intervals, so
+ * expiry is > now.
+ */
+ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
+ (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+ timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
+
+ remaining = ktime_sub(hrtimer_get_expires(timer), now);
+ /* Return 0 only, when the timer is expired and not pending */
+ if (remaining.tv64 <= 0) {
+ /*
+ * A single shot SIGEV_NONE timer must return 0, when
+ * it is expired !
+ */
+ if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+ cur_setting->it_value.tv_nsec = 1;
+ } else
+ cur_setting->it_value = ktime_to_timespec(remaining);
+}
+
+/* Get the time remaining on a POSIX.1b interval timer. */
+SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+ struct itimerspec __user *, setting)
+{
+ struct itimerspec cur_setting;
+ struct k_itimer *timr;
+ struct k_clock *kc;
+ unsigned long flags;
+ int ret = 0;
+
+ timr = lock_timer(timer_id, &flags);
+ if (!timr)
+ return -EINVAL;
+
+ kc = clockid_to_kclock(timr->it_clock);
+ if (WARN_ON_ONCE(!kc || !kc->timer_get))
+ ret = -EINVAL;
+ else
+ kc->timer_get(timr, &cur_setting);
+
+ unlock_timer(timr, flags);
+
+ if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/*
+ * Get the number of overruns of a POSIX.1b interval timer. This is to
+ * be the overrun of the timer last delivered. At the same time we are
+ * accumulating overruns on the next timer. The overrun is frozen when
+ * the signal is delivered, either at the notify time (if the info block
+ * is not queued) or at the actual delivery time (as we are informed by
+ * the call back to do_schedule_next_timer(). So all we need to do is
+ * to pick up the frozen overrun.
+ */
+SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
+{
+ struct k_itimer *timr;
+ int overrun;
+ unsigned long flags;
+
+ timr = lock_timer(timer_id, &flags);
+ if (!timr)
+ return -EINVAL;
+
+ overrun = timr->it_overrun_last;
+ unlock_timer(timr, flags);
+
+ return overrun;
+}
+
+/* Set a POSIX.1b interval timer. */
+/* timr->it_lock is taken. */
+static int
+common_timer_set(struct k_itimer *timr, int flags,
+ struct itimerspec *new_setting, struct itimerspec *old_setting)
+{
+ struct hrtimer *timer = &timr->it.real.timer;
+ enum hrtimer_mode mode;
+
+ if (old_setting)
+ common_timer_get(timr, old_setting);
+
+ /* disable the timer */
+ timr->it.real.interval.tv64 = 0;
+ /*
+ * careful here. If smp we could be in the "fire" routine which will
+ * be spinning as we hold the lock. But this is ONLY an SMP issue.
+ */
+ if (hrtimer_try_to_cancel(timer) < 0)
+ return TIMER_RETRY;
+
+ timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
+ ~REQUEUE_PENDING;
+ timr->it_overrun_last = 0;
+
+ /* switch off the timer when it_value is zero */
+ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
+ return 0;
+
+ mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+ hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
+ timr->it.real.timer.function = posix_timer_fn;
+
+ hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
+
+ /* Convert interval */
+ timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
+
+ /* SIGEV_NONE timers are not queued ! See common_timer_get */
+ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
+ /* Setup correct expiry time for relative timers */
+ if (mode == HRTIMER_MODE_REL) {
+ hrtimer_add_expires(timer, timer->base->get_time());
+ }
+ return 0;
+ }
+
+ hrtimer_start_expires(timer, mode);
+ return 0;
+}
+
+/* Set a POSIX.1b interval timer */
+SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+ const struct itimerspec __user *, new_setting,
+ struct itimerspec __user *, old_setting)
+{
+ struct k_itimer *timr;
+ struct itimerspec new_spec, old_spec;
+ int error = 0;
+ unsigned long flag;
+ struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+ struct k_clock *kc;
+
+ if (!new_setting)
+ return -EINVAL;
+
+ if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+ return -EFAULT;
+
+ if (!timespec_valid(&new_spec.it_interval) ||
+ !timespec_valid(&new_spec.it_value))
+ return -EINVAL;
+retry:
+ timr = lock_timer(timer_id, &flag);
+ if (!timr)
+ return -EINVAL;
+
+ kc = clockid_to_kclock(timr->it_clock);
+ if (WARN_ON_ONCE(!kc || !kc->timer_set))
+ error = -EINVAL;
+ else
+ error = kc->timer_set(timr, flags, &new_spec, rtn);
+
+ unlock_timer(timr, flag);
+ if (error == TIMER_RETRY) {
+ rtn = NULL; // We already got the old time...
+ goto retry;
+ }
+
+ if (old_setting && !error &&
+ copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+ error = -EFAULT;
+
+ return error;
+}
+
+static int common_timer_del(struct k_itimer *timer)
+{
+ timer->it.real.interval.tv64 = 0;
+
+ if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
+ return TIMER_RETRY;
+ return 0;
+}
+
+static inline int timer_delete_hook(struct k_itimer *timer)
+{
+ struct k_clock *kc = clockid_to_kclock(timer->it_clock);
+
+ if (WARN_ON_ONCE(!kc || !kc->timer_del))
+ return -EINVAL;
+ return kc->timer_del(timer);
+}
+
+/* Delete a POSIX.1b interval timer. */
+SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
+{
+ struct k_itimer *timer;
+ unsigned long flags;
+
+retry_delete:
+ timer = lock_timer(timer_id, &flags);
+ if (!timer)
+ return -EINVAL;
+
+ if (timer_delete_hook(timer) == TIMER_RETRY) {
+ unlock_timer(timer, flags);
+ goto retry_delete;
+ }
+
+ spin_lock(&current->sighand->siglock);
+ list_del(&timer->list);
+ spin_unlock(&current->sighand->siglock);
+ /*
+ * This keeps any tasks waiting on the spin lock from thinking
+ * they got something (see the lock code above).
+ */
+ timer->it_signal = NULL;
+
+ unlock_timer(timer, flags);
+ release_posix_timer(timer, IT_ID_SET);
+ return 0;
+}
+
+/*
+ * return timer owned by the process, used by exit_itimers
+ */
+static void itimer_delete(struct k_itimer *timer)
+{
+ unsigned long flags;
+
+retry_delete:
+ spin_lock_irqsave(&timer->it_lock, flags);
+
+ if (timer_delete_hook(timer) == TIMER_RETRY) {
+ unlock_timer(timer, flags);
+ goto retry_delete;
+ }
+ list_del(&timer->list);
+ /*
+ * This keeps any tasks waiting on the spin lock from thinking
+ * they got something (see the lock code above).
+ */
+ timer->it_signal = NULL;
+
+ unlock_timer(timer, flags);
+ release_posix_timer(timer, IT_ID_SET);
+}
+
+/*
+ * This is called by do_exit or de_thread, only when there are no more
+ * references to the shared signal_struct.
+ */
+void exit_itimers(struct signal_struct *sig)
+{
+ struct k_itimer *tmr;
+
+ while (!list_empty(&sig->posix_timers)) {
+ tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
+ itimer_delete(tmr);
+ }
+}
+
+SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
+ const struct timespec __user *, tp)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec new_tp;
+
+ if (!kc || !kc->clock_set)
+ return -EINVAL;
+
+ if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ return -EFAULT;
+
+ return kc->clock_set(which_clock, &new_tp);
+}
+
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+ struct timespec __user *,tp)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec kernel_tp;
+ int error;
+
+ if (!kc)
+ return -EINVAL;
+
+ error = kc->clock_get(which_clock, &kernel_tp);
+
+ if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ error = -EFAULT;
+
+ return error;
+}
+
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+ struct timex __user *, utx)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timex ktx;
+ int err;
+
+ if (!kc)
+ return -EINVAL;
+ if (!kc->clock_adj)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&ktx, utx, sizeof(ktx)))
+ return -EFAULT;
+
+ err = kc->clock_adj(which_clock, &ktx);
+
+ if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
+ return -EFAULT;
+
+ return err;
+}
+
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+ struct timespec __user *, tp)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec rtn_tp;
+ int error;
+
+ if (!kc)
+ return -EINVAL;
+
+ error = kc->clock_getres(which_clock, &rtn_tp);
+
+ if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+ error = -EFAULT;
+
+ return error;
+}
+
+/*
+ * nanosleep for monotonic and realtime clocks
+ */
+static int common_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsave, struct timespec __user *rmtp)
+{
+ return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
+}
+
+SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
+ const struct timespec __user *, rqtp,
+ struct timespec __user *, rmtp)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timespec t;
+
+ if (!kc)
+ return -EINVAL;
+ if (!kc->nsleep)
+ return -ENANOSLEEP_NOTSUP;
+
+ if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ return -EFAULT;
+
+ if (!timespec_valid(&t))
+ return -EINVAL;
+
+ return kc->nsleep(which_clock, flags, &t, rmtp);
+}
+
+/*
+ * This will restart clock_nanosleep. This is required only by
+ * compat_clock_nanosleep_restart for now.
+ */
+long clock_nanosleep_restart(struct restart_block *restart_block)
+{
+ clockid_t which_clock = restart_block->nanosleep.clockid;
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+
+ if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
+ return -EINVAL;
+
+ return kc->nsleep_restart(restart_block);
+}
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
new file mode 100644
index 000000000..a26036d37
--- /dev/null
+++ b/kernel/time/sched_clock.c
@@ -0,0 +1,303 @@
+/*
+ * sched_clock.c: Generic sched_clock() support, to extend low level
+ * hardware time counters to full 64-bit ns values.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/ktime.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/syscore_ops.h>
+#include <linux/hrtimer.h>
+#include <linux/sched_clock.h>
+#include <linux/seqlock.h>
+#include <linux/bitops.h>
+
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns: sched_clock() value at last update
+ * @epoch_cyc: Clock cycle value at last update.
+ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
+ * clocks.
+ * @read_sched_clock: Current clock source (or dummy source when suspended).
+ * @mult: Multipler for scaled math conversion.
+ * @shift: Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
+ u64 epoch_ns;
+ u64 epoch_cyc;
+ u64 sched_clock_mask;
+ u64 (*read_sched_clock)(void);
+ u32 mult;
+ u32 shift;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ * registration of a new clock source)
+ *
+ * @seq: Sequence counter for protecting updates. The lowest
+ * bit is the index for @read_data.
+ * @read_data: Data required to read from sched_clock.
+ * @wrap_kt: Duration for which clock can run before wrapping.
+ * @rate: Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+ seqcount_t seq;
+ struct clock_read_data read_data[2];
+ ktime_t wrap_kt;
+ unsigned long rate;
+
+ u64 (*actual_read_sched_clock)(void);
+};
+
+static struct hrtimer sched_clock_timer;
+static int irqtime = -1;
+
+core_param(irqtime, irqtime, int, 0400);
+
+static u64 notrace jiffy_sched_clock_read(void)
+{
+ /*
+ * We don't need to use get_jiffies_64 on 32-bit arches here
+ * because we register with BITS_PER_LONG
+ */
+ return (u64)(jiffies - INITIAL_JIFFIES);
+}
+
+static struct clock_data cd ____cacheline_aligned = {
+ .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
+ .read_sched_clock = jiffy_sched_clock_read, },
+ .actual_read_sched_clock = jiffy_sched_clock_read,
+};
+
+static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
+{
+ return (cyc * mult) >> shift;
+}
+
+unsigned long long notrace sched_clock(void)
+{
+ u64 cyc, res;
+ unsigned long seq;
+ struct clock_read_data *rd;
+
+ do {
+ seq = raw_read_seqcount(&cd.seq);
+ rd = cd.read_data + (seq & 1);
+
+ cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
+ rd->sched_clock_mask;
+ res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
+ } while (read_seqcount_retry(&cd.seq, seq));
+
+ return res;
+}
+
+/*
+ * Updating the data required to read the clock.
+ *
+ * sched_clock() will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock() to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock() as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ */
+static void update_clock_read_data(struct clock_read_data *rd)
+{
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ /* steer readers towards the odd copy */
+ raw_write_seqcount_latch(&cd.seq);
+
+ /* now its safe for us to update the normal (even) copy */
+ cd.read_data[0] = *rd;
+
+ /* switch readers back to the even copy */
+ raw_write_seqcount_latch(&cd.seq);
+}
+
+/*
+ * Atomically update the sched_clock() epoch.
+ */
+static void update_sched_clock(void)
+{
+ u64 cyc;
+ u64 ns;
+ struct clock_read_data rd;
+
+ rd = cd.read_data[0];
+
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+
+ rd.epoch_ns = ns;
+ rd.epoch_cyc = cyc;
+
+ update_clock_read_data(&rd);
+}
+
+static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
+{
+ update_sched_clock();
+ hrtimer_forward_now(hrt, cd.wrap_kt);
+
+ return HRTIMER_RESTART;
+}
+
+void __init
+sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
+{
+ u64 res, wrap, new_mask, new_epoch, cyc, ns;
+ u32 new_mult, new_shift;
+ unsigned long r;
+ char r_unit;
+ struct clock_read_data rd;
+
+ if (cd.rate > rate)
+ return;
+
+ WARN_ON(!irqs_disabled());
+
+ /* Calculate the mult/shift to convert counter ticks to ns. */
+ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
+
+ new_mask = CLOCKSOURCE_MASK(bits);
+ cd.rate = rate;
+
+ /* Calculate how many nanosecs until we risk wrapping */
+ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
+ cd.wrap_kt = ns_to_ktime(wrap);
+
+ rd = cd.read_data[0];
+
+ /* Update epoch for new counter and update 'epoch_ns' from old counter*/
+ new_epoch = read();
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+ cd.actual_read_sched_clock = read;
+
+ rd.read_sched_clock = read;
+ rd.sched_clock_mask = new_mask;
+ rd.mult = new_mult;
+ rd.shift = new_shift;
+ rd.epoch_cyc = new_epoch;
+ rd.epoch_ns = ns;
+
+ update_clock_read_data(&rd);
+
+ r = rate;
+ if (r >= 4000000) {
+ r /= 1000000;
+ r_unit = 'M';
+ } else {
+ if (r >= 1000) {
+ r /= 1000;
+ r_unit = 'k';
+ } else {
+ r_unit = ' ';
+ }
+ }
+
+ /* Calculate the ns resolution of this counter */
+ res = cyc_to_ns(1ULL, new_mult, new_shift);
+
+ pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
+ bits, r, r_unit, res, wrap);
+
+ /* Enable IRQ time accounting if we have a fast enough sched_clock() */
+ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
+ enable_sched_clock_irqtime();
+
+ pr_debug("Registered %pF as sched_clock source\n", read);
+}
+
+void __init sched_clock_postinit(void)
+{
+ /*
+ * If no sched_clock() function has been provided at that point,
+ * make it the final one one.
+ */
+ if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
+ sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
+
+ update_sched_clock();
+
+ /*
+ * Start the timer to keep sched_clock() properly updated and
+ * sets the initial epoch.
+ */
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ sched_clock_timer.function = sched_clock_poll;
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+}
+
+/*
+ * Clock read function for use when the clock is suspended.
+ *
+ * This function makes it appear to sched_clock() as if the clock
+ * stopped counting at its last update.
+ *
+ * This function must only be called from the critical
+ * section in sched_clock(). It relies on the read_seqcount_retry()
+ * at the end of the critical section to be sure we observe the
+ * correct copy of 'epoch_cyc'.
+ */
+static u64 notrace suspended_sched_clock_read(void)
+{
+ unsigned long seq = raw_read_seqcount(&cd.seq);
+
+ return cd.read_data[seq & 1].epoch_cyc;
+}
+
+static int sched_clock_suspend(void)
+{
+ struct clock_read_data *rd = &cd.read_data[0];
+
+ update_sched_clock();
+ hrtimer_cancel(&sched_clock_timer);
+ rd->read_sched_clock = suspended_sched_clock_read;
+
+ return 0;
+}
+
+static void sched_clock_resume(void)
+{
+ struct clock_read_data *rd = &cd.read_data[0];
+
+ rd->epoch_cyc = cd.actual_read_sched_clock();
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ rd->read_sched_clock = cd.actual_read_sched_clock;
+}
+
+static struct syscore_ops sched_clock_ops = {
+ .suspend = sched_clock_suspend,
+ .resume = sched_clock_resume,
+};
+
+static int __init sched_clock_syscore_init(void)
+{
+ register_syscore_ops(&sched_clock_ops);
+
+ return 0;
+}
+device_initcall(sched_clock_syscore_init);
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
new file mode 100644
index 000000000..e622ba365
--- /dev/null
+++ b/kernel/time/test_udelay.c
@@ -0,0 +1,168 @@
+/*
+ * udelay() test kernel module
+ *
+ * Test is executed by writing and reading to /sys/kernel/debug/udelay_test
+ * Tests are configured by writing: USECS ITERATIONS
+ * Tests are executed by reading from the same file.
+ * Specifying usecs of 0 or negative values will run multiples tests.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#define DEFAULT_ITERATIONS 100
+
+#define DEBUGFS_FILENAME "udelay_test"
+
+static DEFINE_MUTEX(udelay_test_lock);
+static struct dentry *udelay_test_debugfs_file;
+static int udelay_test_usecs;
+static int udelay_test_iterations = DEFAULT_ITERATIONS;
+
+static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters)
+{
+ int min = 0, max = 0, fail_count = 0;
+ uint64_t sum = 0;
+ uint64_t avg;
+ int i;
+ /* Allow udelay to be up to 0.5% fast */
+ int allowed_error_ns = usecs * 5;
+
+ for (i = 0; i < iters; ++i) {
+ struct timespec ts1, ts2;
+ int time_passed;
+
+ ktime_get_ts(&ts1);
+ udelay(usecs);
+ ktime_get_ts(&ts2);
+ time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1);
+
+ if (i == 0 || time_passed < min)
+ min = time_passed;
+ if (i == 0 || time_passed > max)
+ max = time_passed;
+ if ((time_passed + allowed_error_ns) / 1000 < usecs)
+ ++fail_count;
+ WARN_ON(time_passed < 0);
+ sum += time_passed;
+ }
+
+ avg = sum;
+ do_div(avg, iters);
+ seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d",
+ usecs, iters, usecs * 1000,
+ (usecs * 1000) - allowed_error_ns, min, avg, max);
+ if (fail_count)
+ seq_printf(s, " FAIL=%d", fail_count);
+ seq_puts(s, "\n");
+
+ return 0;
+}
+
+static int udelay_test_show(struct seq_file *s, void *v)
+{
+ int usecs;
+ int iters;
+ int ret = 0;
+
+ mutex_lock(&udelay_test_lock);
+ usecs = udelay_test_usecs;
+ iters = udelay_test_iterations;
+ mutex_unlock(&udelay_test_lock);
+
+ if (usecs > 0 && iters > 0) {
+ return udelay_test_single(s, usecs, iters);
+ } else if (usecs == 0) {
+ struct timespec ts;
+
+ ktime_get_ts(&ts);
+ seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n",
+ loops_per_jiffy, ts.tv_sec, ts.tv_nsec);
+ seq_puts(s, "usage:\n");
+ seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n");
+ seq_puts(s, "cat " DEBUGFS_FILENAME "\n");
+ }
+
+ return ret;
+}
+
+static int udelay_test_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, udelay_test_show, inode->i_private);
+}
+
+static ssize_t udelay_test_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ char lbuf[32];
+ int ret;
+ int usecs;
+ int iters;
+
+ if (count >= sizeof(lbuf))
+ return -EINVAL;
+
+ if (copy_from_user(lbuf, buf, count))
+ return -EFAULT;
+ lbuf[count] = '\0';
+
+ ret = sscanf(lbuf, "%d %d", &usecs, &iters);
+ if (ret < 1)
+ return -EINVAL;
+ else if (ret < 2)
+ iters = DEFAULT_ITERATIONS;
+
+ mutex_lock(&udelay_test_lock);
+ udelay_test_usecs = usecs;
+ udelay_test_iterations = iters;
+ mutex_unlock(&udelay_test_lock);
+
+ return count;
+}
+
+static const struct file_operations udelay_test_debugfs_ops = {
+ .owner = THIS_MODULE,
+ .open = udelay_test_open,
+ .read = seq_read,
+ .write = udelay_test_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init udelay_test_init(void)
+{
+ mutex_lock(&udelay_test_lock);
+ udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME,
+ S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops);
+ mutex_unlock(&udelay_test_lock);
+
+ return 0;
+}
+
+module_init(udelay_test_init);
+
+static void __exit udelay_test_exit(void)
+{
+ mutex_lock(&udelay_test_lock);
+ debugfs_remove(udelay_test_debugfs_file);
+ mutex_unlock(&udelay_test_lock);
+}
+
+module_exit(udelay_test_exit);
+
+MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
+MODULE_LICENSE("GPL");
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
new file mode 100644
index 000000000..6aac4beed
--- /dev/null
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,113 @@
+/*
+ * linux/kernel/time/tick-broadcast-hrtimer.c
+ * This file emulates a local clock event device
+ * via a pseudo clock device.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/clockchips.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+static struct hrtimer bctimer;
+
+static void bc_set_mode(enum clock_event_mode mode,
+ struct clock_event_device *bc)
+{
+ switch (mode) {
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * This is called from the guts of the broadcast code when the cpu
+ * which is about to enter idle has the earliest broadcast timer event.
+ */
+static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
+{
+ int bc_moved;
+ /*
+ * We try to cancel the timer first. If the callback is on
+ * flight on some other cpu then we let it handle it. If we
+ * were able to cancel the timer nothing can rearm it as we
+ * own broadcast_lock.
+ *
+ * However we can also be called from the event handler of
+ * ce_broadcast_hrtimer itself when it expires. We cannot
+ * restart the timer because we are in the callback, but we
+ * can set the expiry time and let the callback return
+ * HRTIMER_RESTART.
+ *
+ * Since we are in the idle loop at this point and because
+ * hrtimer_{start/cancel} functions call into tracing,
+ * calls to these functions must be bound within RCU_NONIDLE.
+ */
+ RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
+ !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
+ 0);
+ if (bc_moved) {
+ /* Bind the "device" to the cpu */
+ bc->bound_on = smp_processor_id();
+ } else if (bc->bound_on == smp_processor_id()) {
+ hrtimer_set_expires(&bctimer, expires);
+ }
+ return 0;
+}
+
+static struct clock_event_device ce_broadcast_hrtimer = {
+ .set_mode = bc_set_mode,
+ .set_next_ktime = bc_set_next,
+ .features = CLOCK_EVT_FEAT_ONESHOT |
+ CLOCK_EVT_FEAT_KTIME |
+ CLOCK_EVT_FEAT_HRTIMER,
+ .rating = 0,
+ .bound_on = -1,
+ .min_delta_ns = 1,
+ .max_delta_ns = KTIME_MAX,
+ .min_delta_ticks = 1,
+ .max_delta_ticks = ULONG_MAX,
+ .mult = 1,
+ .shift = 0,
+ .cpumask = cpu_all_mask,
+};
+
+static enum hrtimer_restart bc_handler(struct hrtimer *t)
+{
+ ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
+
+ if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+ return HRTIMER_NORESTART;
+
+ return HRTIMER_RESTART;
+}
+
+void tick_setup_hrtimer_broadcast(void)
+{
+ hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ bctimer.function = bc_handler;
+ clockevents_register_device(&ce_broadcast_hrtimer);
+}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 000000000..7e8ca4f44
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,964 @@
+/*
+ * linux/kernel/time/tick-broadcast.c
+ *
+ * This file contains functions which emulate a local clock-event
+ * device via a broadcast event source.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+/*
+ * Broadcast support for broken x86 hardware, where the local apic
+ * timer stops in C3 state.
+ */
+
+static struct tick_device tick_broadcast_device;
+static cpumask_var_t tick_broadcast_mask;
+static cpumask_var_t tick_broadcast_on;
+static cpumask_var_t tmpmask;
+static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+static int tick_broadcast_forced;
+
+#ifdef CONFIG_TICK_ONESHOT
+static void tick_broadcast_clear_oneshot(int cpu);
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+#else
+static inline void tick_broadcast_clear_oneshot(int cpu) { }
+static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
+#endif
+
+/*
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_broadcast_device(void)
+{
+ return &tick_broadcast_device;
+}
+
+struct cpumask *tick_get_broadcast_mask(void)
+{
+ return tick_broadcast_mask;
+}
+
+/*
+ * Start the device in periodic mode
+ */
+static void tick_broadcast_start_periodic(struct clock_event_device *bc)
+{
+ if (bc)
+ tick_setup_periodic(bc, 1);
+}
+
+/*
+ * Check, if the device can be utilized as broadcast device:
+ */
+static bool tick_check_broadcast_device(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
+{
+ if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
+ (newdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return false;
+
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
+
+ return !curdev || newdev->rating > curdev->rating;
+}
+
+/*
+ * Conditionally install/replace broadcast device
+ */
+void tick_install_broadcast_device(struct clock_event_device *dev)
+{
+ struct clock_event_device *cur = tick_broadcast_device.evtdev;
+
+ if (!tick_check_broadcast_device(cur, dev))
+ return;
+
+ if (!try_module_get(dev->owner))
+ return;
+
+ clockevents_exchange_device(cur, dev);
+ if (cur)
+ cur->event_handler = clockevents_handle_noop;
+ tick_broadcast_device.evtdev = dev;
+ if (!cpumask_empty(tick_broadcast_mask))
+ tick_broadcast_start_periodic(dev);
+ /*
+ * Inform all cpus about this. We might be in a situation
+ * where we did not switch to oneshot mode because the per cpu
+ * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
+ * of a oneshot capable broadcast device. Without that
+ * notification the systems stays stuck in periodic mode
+ * forever.
+ */
+ if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_clock_notify();
+}
+
+/*
+ * Check, if the device is the broadcast device
+ */
+int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+ return (dev && tick_broadcast_device.evtdev == dev);
+}
+
+int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ int ret = -ENODEV;
+
+ if (tick_is_broadcast_device(dev)) {
+ raw_spin_lock(&tick_broadcast_lock);
+ ret = __clockevents_update_freq(dev, freq);
+ raw_spin_unlock(&tick_broadcast_lock);
+ }
+ return ret;
+}
+
+
+static void err_broadcast(const struct cpumask *mask)
+{
+ pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
+}
+
+static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
+{
+ if (!dev->broadcast)
+ dev->broadcast = tick_broadcast;
+ if (!dev->broadcast) {
+ pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
+ dev->name);
+ dev->broadcast = err_broadcast;
+ }
+}
+
+/*
+ * Check, if the device is disfunctional and a place holder, which
+ * needs to be handled by the broadcast device.
+ */
+int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ /*
+ * Devices might be registered with both periodic and oneshot
+ * mode disabled. This signals, that the device needs to be
+ * operated from the broadcast device and is a placeholder for
+ * the cpu local device.
+ */
+ if (!tick_device_is_functional(dev)) {
+ dev->event_handler = tick_handle_periodic;
+ tick_device_setup_broadcast_func(dev);
+ cpumask_set_cpu(cpu, tick_broadcast_mask);
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ ret = 1;
+ } else {
+ /*
+ * Clear the broadcast bit for this cpu if the
+ * device is not power state affected.
+ */
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+ else
+ tick_device_setup_broadcast_func(dev);
+
+ /*
+ * Clear the broadcast bit if the CPU is not in
+ * periodic broadcast on state.
+ */
+ if (!cpumask_test_cpu(cpu, tick_broadcast_on))
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+
+ switch (tick_broadcast_device.mode) {
+ case TICKDEV_MODE_ONESHOT:
+ /*
+ * If the system is in oneshot mode we can
+ * unconditionally clear the oneshot mask bit,
+ * because the CPU is running and therefore
+ * not in an idle state which causes the power
+ * state affected device to stop. Let the
+ * caller initialize the device.
+ */
+ tick_broadcast_clear_oneshot(cpu);
+ ret = 0;
+ break;
+
+ case TICKDEV_MODE_PERIODIC:
+ /*
+ * If the system is in periodic mode, check
+ * whether the broadcast device can be
+ * switched off now.
+ */
+ if (cpumask_empty(tick_broadcast_mask) && bc)
+ clockevents_shutdown(bc);
+ /*
+ * If we kept the cpu in the broadcast mask,
+ * tell the caller to leave the per cpu device
+ * in shutdown state. The periodic interrupt
+ * is delivered by the broadcast device.
+ */
+ ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
+ break;
+ default:
+ /* Nothing to do */
+ ret = 0;
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+ return ret;
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+int tick_receive_broadcast(void)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ struct clock_event_device *evt = td->evtdev;
+
+ if (!evt)
+ return -ENODEV;
+
+ if (!evt->event_handler)
+ return -EINVAL;
+
+ evt->event_handler(evt);
+ return 0;
+}
+#endif
+
+/*
+ * Broadcast the event to the cpus, which are set in the mask (mangled).
+ */
+static void tick_do_broadcast(struct cpumask *mask)
+{
+ int cpu = smp_processor_id();
+ struct tick_device *td;
+
+ /*
+ * Check, if the current cpu is in the mask
+ */
+ if (cpumask_test_cpu(cpu, mask)) {
+ cpumask_clear_cpu(cpu, mask);
+ td = &per_cpu(tick_cpu_device, cpu);
+ td->evtdev->event_handler(td->evtdev);
+ }
+
+ if (!cpumask_empty(mask)) {
+ /*
+ * It might be necessary to actually check whether the devices
+ * have different broadcast functions. For now, just use the
+ * one of the first device. This works as long as we have this
+ * misfeature only on x86 (lapic)
+ */
+ td = &per_cpu(tick_cpu_device, cpumask_first(mask));
+ td->evtdev->broadcast(mask);
+ }
+}
+
+/*
+ * Periodic broadcast:
+ * - invoke the broadcast handlers
+ */
+static void tick_do_periodic_broadcast(void)
+{
+ cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
+ tick_do_broadcast(tmpmask);
+}
+
+/*
+ * Event handler for periodic broadcast ticks
+ */
+static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
+{
+ ktime_t next;
+
+ raw_spin_lock(&tick_broadcast_lock);
+
+ tick_do_periodic_broadcast();
+
+ /*
+ * The device is in periodic mode. No reprogramming necessary:
+ */
+ if (dev->state == CLOCK_EVT_STATE_PERIODIC)
+ goto unlock;
+
+ /*
+ * Setup the next period for devices, which do not have
+ * periodic mode. We read dev->next_event first and add to it
+ * when the event already expired. clockevents_program_event()
+ * sets dev->next_event only when the event is really
+ * programmed to the device.
+ */
+ for (next = dev->next_event; ;) {
+ next = ktime_add(next, tick_period);
+
+ if (!clockevents_program_event(dev, next, false))
+ goto unlock;
+ tick_do_periodic_broadcast();
+ }
+unlock:
+ raw_spin_unlock(&tick_broadcast_lock);
+}
+
+/**
+ * tick_broadcast_control - Enable/disable or force broadcast mode
+ * @mode: The selected broadcast mode
+ *
+ * Called when the system enters a state where affected tick devices
+ * might stop. Note: TICK_BROADCAST_FORCE cannot be undone.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+void tick_broadcast_control(enum tick_broadcast_mode mode)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ int cpu, bc_stopped;
+
+ td = this_cpu_ptr(&tick_cpu_device);
+ dev = td->evtdev;
+
+ /*
+ * Is the device not affected by the powerstate ?
+ */
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return;
+
+ if (!tick_device_is_functional(dev))
+ return;
+
+ raw_spin_lock(&tick_broadcast_lock);
+ cpu = smp_processor_id();
+ bc = tick_broadcast_device.evtdev;
+ bc_stopped = cpumask_empty(tick_broadcast_mask);
+
+ switch (mode) {
+ case TICK_BROADCAST_FORCE:
+ tick_broadcast_forced = 1;
+ case TICK_BROADCAST_ON:
+ cpumask_set_cpu(cpu, tick_broadcast_on);
+ if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
+ clockevents_shutdown(dev);
+ }
+ break;
+
+ case TICK_BROADCAST_OFF:
+ if (tick_broadcast_forced)
+ break;
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
+ if (!tick_device_is_functional(dev))
+ break;
+ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(dev, 0);
+ }
+ break;
+ }
+
+ if (cpumask_empty(tick_broadcast_mask)) {
+ if (!bc_stopped)
+ clockevents_shutdown(bc);
+ } else if (bc_stopped) {
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ tick_broadcast_start_periodic(bc);
+ else
+ tick_broadcast_setup_oneshot(bc);
+ }
+ raw_spin_unlock(&tick_broadcast_lock);
+}
+EXPORT_SYMBOL_GPL(tick_broadcast_control);
+
+/*
+ * Set the periodic handler depending on broadcast on/off
+ */
+void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ if (!broadcast)
+ dev->event_handler = tick_handle_periodic;
+ else
+ dev->event_handler = tick_handle_periodic_broadcast;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Remove a CPU from broadcasting
+ */
+void tick_shutdown_broadcast(unsigned int cpu)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
+
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+ if (bc && cpumask_empty(tick_broadcast_mask))
+ clockevents_shutdown(bc);
+ }
+
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+#endif
+
+void tick_suspend_broadcast(void)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+ if (bc)
+ clockevents_shutdown(bc);
+
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * This is called from tick_resume_local() on a resuming CPU. That's
+ * called from the core resume function, tick_unfreeze() and the magic XEN
+ * resume hackery.
+ *
+ * In none of these cases the broadcast device mode can change and the
+ * bit of the resuming CPU in the broadcast mask is safe as well.
+ */
+bool tick_resume_check_broadcast(void)
+{
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
+ return false;
+ else
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
+}
+
+void tick_resume_broadcast(void)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc) {
+ clockevents_tick_resume(bc);
+
+ switch (tick_broadcast_device.mode) {
+ case TICKDEV_MODE_PERIODIC:
+ if (!cpumask_empty(tick_broadcast_mask))
+ tick_broadcast_start_periodic(bc);
+ break;
+ case TICKDEV_MODE_ONESHOT:
+ if (!cpumask_empty(tick_broadcast_mask))
+ tick_resume_broadcast_oneshot(bc);
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+#ifdef CONFIG_TICK_ONESHOT
+
+static cpumask_var_t tick_broadcast_oneshot_mask;
+static cpumask_var_t tick_broadcast_pending_mask;
+static cpumask_var_t tick_broadcast_force_mask;
+
+/*
+ * Exposed for debugging: see timer_list.c
+ */
+struct cpumask *tick_get_broadcast_oneshot_mask(void)
+{
+ return tick_broadcast_oneshot_mask;
+}
+
+/*
+ * Called before going idle with interrupts disabled. Checks whether a
+ * broadcast event from the other core is about to happen. We detected
+ * that in tick_broadcast_oneshot_control(). The callsite can use this
+ * to avoid a deep idle transition as we are about to get the
+ * broadcast IPI right away.
+ */
+int tick_check_broadcast_expired(void)
+{
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+}
+
+/*
+ * Set broadcast interrupt affinity
+ */
+static void tick_broadcast_set_affinity(struct clock_event_device *bc,
+ const struct cpumask *cpumask)
+{
+ if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
+ return;
+
+ if (cpumask_equal(bc->cpumask, cpumask))
+ return;
+
+ bc->cpumask = cpumask;
+ irq_set_affinity(bc->irq, bc->cpumask);
+}
+
+static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+ ktime_t expires, int force)
+{
+ int ret;
+
+ if (bc->state != CLOCK_EVT_STATE_ONESHOT)
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+
+ ret = clockevents_program_event(bc, expires, force);
+ if (!ret)
+ tick_broadcast_set_affinity(bc, cpumask_of(cpu));
+ return ret;
+}
+
+static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+}
+
+/*
+ * Called from irq_enter() when idle was interrupted to reenable the
+ * per cpu device.
+ */
+void tick_check_oneshot_broadcast_this_cpu(void)
+{
+ if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+ /*
+ * We might be in the middle of switching over from
+ * periodic to oneshot. If the CPU has not yet
+ * switched over, leave the device alone.
+ */
+ if (td->mode == TICKDEV_MODE_ONESHOT) {
+ clockevents_set_state(td->evtdev,
+ CLOCK_EVT_STATE_ONESHOT);
+ }
+ }
+}
+
+/*
+ * Handle oneshot mode broadcasting
+ */
+static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
+{
+ struct tick_device *td;
+ ktime_t now, next_event;
+ int cpu, next_cpu = 0;
+
+ raw_spin_lock(&tick_broadcast_lock);
+again:
+ dev->next_event.tv64 = KTIME_MAX;
+ next_event.tv64 = KTIME_MAX;
+ cpumask_clear(tmpmask);
+ now = ktime_get();
+ /* Find all expired events */
+ for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev->next_event.tv64 <= now.tv64) {
+ cpumask_set_cpu(cpu, tmpmask);
+ /*
+ * Mark the remote cpu in the pending mask, so
+ * it can avoid reprogramming the cpu local
+ * timer in tick_broadcast_oneshot_control().
+ */
+ cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
+ } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
+ next_event.tv64 = td->evtdev->next_event.tv64;
+ next_cpu = cpu;
+ }
+ }
+
+ /*
+ * Remove the current cpu from the pending mask. The event is
+ * delivered immediately in tick_do_broadcast() !
+ */
+ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
+
+ /* Take care of enforced broadcast requests */
+ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
+ cpumask_clear(tick_broadcast_force_mask);
+
+ /*
+ * Sanity check. Catch the case where we try to broadcast to
+ * offline cpus.
+ */
+ if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
+ cpumask_and(tmpmask, tmpmask, cpu_online_mask);
+
+ /*
+ * Wakeup the cpus which have an expired event.
+ */
+ tick_do_broadcast(tmpmask);
+
+ /*
+ * Two reasons for reprogram:
+ *
+ * - The global event did not expire any CPU local
+ * events. This happens in dyntick mode, as the maximum PIT
+ * delta is quite small.
+ *
+ * - There are pending events on sleeping CPUs which were not
+ * in the event mask
+ */
+ if (next_event.tv64 != KTIME_MAX) {
+ /*
+ * Rearm the broadcast device. If event expired,
+ * repeat the above
+ */
+ if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
+ goto again;
+ }
+ raw_spin_unlock(&tick_broadcast_lock);
+}
+
+static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
+{
+ if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+ return 0;
+ if (bc->next_event.tv64 == KTIME_MAX)
+ return 0;
+ return bc->bound_on == cpu ? -EBUSY : 0;
+}
+
+static void broadcast_shutdown_local(struct clock_event_device *bc,
+ struct clock_event_device *dev)
+{
+ /*
+ * For hrtimer based broadcasting we cannot shutdown the cpu
+ * local device if our own event is the first one to expire or
+ * if we own the broadcast timer.
+ */
+ if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
+ if (broadcast_needs_cpu(bc, smp_processor_id()))
+ return;
+ if (dev->next_event.tv64 < bc->next_event.tv64)
+ return;
+ }
+ clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+}
+
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state: The target state (enter/exit)
+ *
+ * The system enters/leaves a state, where affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+ struct clock_event_device *bc, *dev;
+ struct tick_device *td;
+ int cpu, ret = 0;
+ ktime_t now;
+
+ /*
+ * Periodic mode does not care about the enter/exit of power
+ * states
+ */
+ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+ return 0;
+
+ /*
+ * We are called with preemtion disabled from the depth of the
+ * idle code, so we can't be moved away.
+ */
+ td = this_cpu_ptr(&tick_cpu_device);
+ dev = td->evtdev;
+
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 0;
+
+ raw_spin_lock(&tick_broadcast_lock);
+ bc = tick_broadcast_device.evtdev;
+ cpu = smp_processor_id();
+
+ if (state == TICK_BROADCAST_ENTER) {
+ if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
+ WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
+ broadcast_shutdown_local(bc, dev);
+ /*
+ * We only reprogram the broadcast timer if we
+ * did not mark ourself in the force mask and
+ * if the cpu local event is earlier than the
+ * broadcast event. If the current CPU is in
+ * the force mask, then we are going to be
+ * woken by the IPI right away.
+ */
+ if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
+ dev->next_event.tv64 < bc->next_event.tv64)
+ tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
+ }
+ /*
+ * If the current CPU owns the hrtimer broadcast
+ * mechanism, it cannot go deep idle and we remove the
+ * CPU from the broadcast mask. We don't have to go
+ * through the EXIT path as the local timer is not
+ * shutdown.
+ */
+ ret = broadcast_needs_cpu(bc, cpu);
+ if (ret)
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ /*
+ * The cpu which was handling the broadcast
+ * timer marked this cpu in the broadcast
+ * pending mask and fired the broadcast
+ * IPI. So we are going to handle the expired
+ * event anyway via the broadcast IPI
+ * handler. No need to reprogram the timer
+ * with an already expired event.
+ */
+ if (cpumask_test_and_clear_cpu(cpu,
+ tick_broadcast_pending_mask))
+ goto out;
+
+ /*
+ * Bail out if there is no next event.
+ */
+ if (dev->next_event.tv64 == KTIME_MAX)
+ goto out;
+ /*
+ * If the pending bit is not set, then we are
+ * either the CPU handling the broadcast
+ * interrupt or we got woken by something else.
+ *
+ * We are not longer in the broadcast mask, so
+ * if the cpu local expiry time is already
+ * reached, we would reprogram the cpu local
+ * timer with an already expired event.
+ *
+ * This can lead to a ping-pong when we return
+ * to idle and therefor rearm the broadcast
+ * timer before the cpu local timer was able
+ * to fire. This happens because the forced
+ * reprogramming makes sure that the event
+ * will happen in the future and depending on
+ * the min_delta setting this might be far
+ * enough out that the ping-pong starts.
+ *
+ * If the cpu local next_event has expired
+ * then we know that the broadcast timer
+ * next_event has expired as well and
+ * broadcast is about to be handled. So we
+ * avoid reprogramming and enforce that the
+ * broadcast handler, which did not run yet,
+ * will invoke the cpu local handler.
+ *
+ * We cannot call the handler directly from
+ * here, because we might be in a NOHZ phase
+ * and we did not go through the irq_enter()
+ * nohz fixups.
+ */
+ now = ktime_get();
+ if (dev->next_event.tv64 <= now.tv64) {
+ cpumask_set_cpu(cpu, tick_broadcast_force_mask);
+ goto out;
+ }
+ /*
+ * We got woken by something else. Reprogram
+ * the cpu local timer device.
+ */
+ tick_program_event(dev->next_event, 1);
+ }
+ }
+out:
+ raw_spin_unlock(&tick_broadcast_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
+
+/*
+ * Reset the one shot broadcast for a cpu
+ *
+ * Called with tick_broadcast_lock held
+ */
+static void tick_broadcast_clear_oneshot(int cpu)
+{
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
+}
+
+static void tick_broadcast_init_next_event(struct cpumask *mask,
+ ktime_t expires)
+{
+ struct tick_device *td;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev)
+ td->evtdev->next_event = expires;
+ }
+}
+
+/**
+ * tick_broadcast_setup_oneshot - setup the broadcast device
+ */
+void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
+{
+ int cpu = smp_processor_id();
+
+ /* Set it up only once ! */
+ if (bc->event_handler != tick_handle_oneshot_broadcast) {
+ int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
+
+ bc->event_handler = tick_handle_oneshot_broadcast;
+
+ /*
+ * We must be careful here. There might be other CPUs
+ * waiting for periodic broadcast. We need to set the
+ * oneshot_mask bits for those and program the
+ * broadcast device to fire.
+ */
+ cpumask_copy(tmpmask, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tmpmask);
+ cpumask_or(tick_broadcast_oneshot_mask,
+ tick_broadcast_oneshot_mask, tmpmask);
+
+ if (was_periodic && !cpumask_empty(tmpmask)) {
+ clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+ tick_broadcast_init_next_event(tmpmask,
+ tick_next_period);
+ tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
+ } else
+ bc->next_event.tv64 = KTIME_MAX;
+ } else {
+ /*
+ * The first cpu which switches to oneshot mode sets
+ * the bit for all other cpus which are in the general
+ * (periodic) broadcast mask. So the bit is set and
+ * would prevent the first broadcast enter after this
+ * to program the bc device.
+ */
+ tick_broadcast_clear_oneshot(cpu);
+ }
+}
+
+/*
+ * Select oneshot operating mode for the broadcast device
+ */
+void tick_broadcast_switch_to_oneshot(void)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
+ bc = tick_broadcast_device.evtdev;
+ if (bc)
+ tick_broadcast_setup_oneshot(bc);
+
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void hotplug_cpu__broadcast_tick_pull(int deadcpu)
+{
+ struct clock_event_device *bc;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this CPU: */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Remove a dead CPU from broadcasting
+ */
+void tick_shutdown_broadcast_oneshot(unsigned int cpu)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+ /*
+ * Clear the broadcast masks for the dead cpu, but do not stop
+ * the broadcast device!
+ */
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
+
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+#endif
+
+/*
+ * Check, whether the broadcast device is in one shot mode
+ */
+int tick_broadcast_oneshot_active(void)
+{
+ return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
+}
+
+/*
+ * Check whether the broadcast device supports oneshot.
+ */
+bool tick_broadcast_oneshot_available(void)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
+}
+
+#endif
+
+void __init tick_broadcast_init(void)
+{
+ zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
+ zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
+#ifdef CONFIG_TICK_ONESHOT
+ zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
+#endif
+}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 000000000..3ae6afa1e
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,498 @@
+/*
+ * linux/kernel/time/tick-common.c
+ *
+ * This file contains the base functions to manage periodic tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#include <asm/irq_regs.h>
+
+#include "tick-internal.h"
+
+/*
+ * Tick devices
+ */
+DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
+/*
+ * Tick next event: keeps track of the tick time
+ */
+ktime_t tick_next_period;
+ktime_t tick_period;
+
+/*
+ * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
+ * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This
+ * variable has two functions:
+ *
+ * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the
+ * timekeeping lock all at once. Only the CPU which is assigned to do the
+ * update is handling it.
+ *
+ * 2) Hand off the duty in the NOHZ idle case by setting the value to
+ * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks
+ * at it will take over and keep the time keeping alive. The handover
+ * procedure also covers cpu hotplug.
+ */
+int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+
+/*
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_device(int cpu)
+{
+ return &per_cpu(tick_cpu_device, cpu);
+}
+
+/**
+ * tick_is_oneshot_available - check for a oneshot capable event device
+ */
+int tick_is_oneshot_available(void)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return 0;
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 1;
+ return tick_broadcast_oneshot_available();
+}
+
+/*
+ * Periodic tick
+ */
+static void tick_periodic(int cpu)
+{
+ if (tick_do_timer_cpu == cpu) {
+ write_seqlock(&jiffies_lock);
+
+ /* Keep track of the next tick event */
+ tick_next_period = ktime_add(tick_next_period, tick_period);
+
+ do_timer(1);
+ write_sequnlock(&jiffies_lock);
+ update_wall_time();
+ }
+
+ update_process_times(user_mode(get_irq_regs()));
+ profile_tick(CPU_PROFILING);
+}
+
+/*
+ * Event handler for periodic ticks
+ */
+void tick_handle_periodic(struct clock_event_device *dev)
+{
+ int cpu = smp_processor_id();
+ ktime_t next = dev->next_event;
+
+ tick_periodic(cpu);
+
+ if (dev->state != CLOCK_EVT_STATE_ONESHOT)
+ return;
+ for (;;) {
+ /*
+ * Setup the next period for devices, which do not have
+ * periodic mode:
+ */
+ next = ktime_add(next, tick_period);
+
+ if (!clockevents_program_event(dev, next, false))
+ return;
+ /*
+ * Have to be careful here. If we're in oneshot mode,
+ * before we call tick_periodic() in a loop, we need
+ * to be sure we're using a real hardware clocksource.
+ * Otherwise we could get trapped in an infinite
+ * loop, as the tick_periodic() increments jiffies,
+ * which then will increment time, possibly causing
+ * the loop to trigger again and again.
+ */
+ if (timekeeping_valid_for_hres())
+ tick_periodic(cpu);
+ }
+}
+
+/*
+ * Setup the device for a periodic tick
+ */
+void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
+{
+ tick_set_periodic_handler(dev, broadcast);
+
+ /* Broadcast setup ? */
+ if (!tick_device_is_functional(dev))
+ return;
+
+ if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+ !tick_broadcast_oneshot_active()) {
+ clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+ } else {
+ unsigned long seq;
+ ktime_t next;
+
+ do {
+ seq = read_seqbegin(&jiffies_lock);
+ next = tick_next_period;
+ } while (read_seqretry(&jiffies_lock, seq));
+
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+
+ for (;;) {
+ if (!clockevents_program_event(dev, next, false))
+ return;
+ next = ktime_add(next, tick_period);
+ }
+ }
+}
+
+/*
+ * Setup the tick device
+ */
+static void tick_setup_device(struct tick_device *td,
+ struct clock_event_device *newdev, int cpu,
+ const struct cpumask *cpumask)
+{
+ ktime_t next_event;
+ void (*handler)(struct clock_event_device *) = NULL;
+
+ /*
+ * First device setup ?
+ */
+ if (!td->evtdev) {
+ /*
+ * If no cpu took the do_timer update, assign it to
+ * this cpu:
+ */
+ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ if (!tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
+ else
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ tick_next_period = ktime_get();
+ tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
+ }
+
+ /*
+ * Startup in periodic mode first.
+ */
+ td->mode = TICKDEV_MODE_PERIODIC;
+ } else {
+ handler = td->evtdev->event_handler;
+ next_event = td->evtdev->next_event;
+ td->evtdev->event_handler = clockevents_handle_noop;
+ }
+
+ td->evtdev = newdev;
+
+ /*
+ * When the device is not per cpu, pin the interrupt to the
+ * current cpu:
+ */
+ if (!cpumask_equal(newdev->cpumask, cpumask))
+ irq_set_affinity(newdev->irq, cpumask);
+
+ /*
+ * When global broadcasting is active, check if the current
+ * device is registered as a placeholder for broadcast mode.
+ * This allows us to handle this x86 misfeature in a generic
+ * way. This function also returns !=0 when we keep the
+ * current active broadcast state for this CPU.
+ */
+ if (tick_device_uses_broadcast(newdev, cpu))
+ return;
+
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(newdev, 0);
+ else
+ tick_setup_oneshot(newdev, handler, next_event);
+}
+
+void tick_install_replacement(struct clock_event_device *newdev)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ int cpu = smp_processor_id();
+
+ clockevents_exchange_device(td->evtdev, newdev);
+ tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
+ if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_oneshot_notify();
+}
+
+static bool tick_check_percpu(struct clock_event_device *curdev,
+ struct clock_event_device *newdev, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, newdev->cpumask))
+ return false;
+ if (cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
+ return true;
+ /* Check if irq affinity can be set */
+ if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq))
+ return false;
+ /* Prefer an existing cpu local device */
+ if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
+ return false;
+ return true;
+}
+
+static bool tick_check_preferred(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
+{
+ /* Prefer oneshot capable device */
+ if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {
+ if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
+ if (tick_oneshot_mode_active())
+ return false;
+ }
+
+ /*
+ * Use the higher rated one, but prefer a CPU local device with a lower
+ * rating than a non-CPU local device
+ */
+ return !curdev ||
+ newdev->rating > curdev->rating ||
+ !cpumask_equal(curdev->cpumask, newdev->cpumask);
+}
+
+/*
+ * Check whether the new device is a better fit than curdev. curdev
+ * can be NULL !
+ */
+bool tick_check_replacement(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
+{
+ if (!tick_check_percpu(curdev, newdev, smp_processor_id()))
+ return false;
+
+ return tick_check_preferred(curdev, newdev);
+}
+
+/*
+ * Check, if the new registered device should be used. Called with
+ * clockevents_lock held and interrupts disabled.
+ */
+void tick_check_new_device(struct clock_event_device *newdev)
+{
+ struct clock_event_device *curdev;
+ struct tick_device *td;
+ int cpu;
+
+ cpu = smp_processor_id();
+ if (!cpumask_test_cpu(cpu, newdev->cpumask))
+ goto out_bc;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ curdev = td->evtdev;
+
+ /* cpu local device ? */
+ if (!tick_check_percpu(curdev, newdev, cpu))
+ goto out_bc;
+
+ /* Preference decision */
+ if (!tick_check_preferred(curdev, newdev))
+ goto out_bc;
+
+ if (!try_module_get(newdev->owner))
+ return;
+
+ /*
+ * Replace the eventually existing device by the new
+ * device. If the current device is the broadcast device, do
+ * not give it back to the clockevents layer !
+ */
+ if (tick_is_broadcast_device(curdev)) {
+ clockevents_shutdown(curdev);
+ curdev = NULL;
+ }
+ clockevents_exchange_device(curdev, newdev);
+ tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
+ if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_oneshot_notify();
+ return;
+
+out_bc:
+ /*
+ * Can the new device be used as a broadcast device ?
+ */
+ tick_install_broadcast_device(newdev);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Transfer the do_timer job away from a dying cpu.
+ *
+ * Called with interrupts disabled. Not locking required. If
+ * tick_do_timer_cpu is owned by this cpu, nothing can change it.
+ */
+void tick_handover_do_timer(void)
+{
+ if (tick_do_timer_cpu == smp_processor_id()) {
+ int cpu = cpumask_first(cpu_online_mask);
+
+ tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
+ TICK_DO_TIMER_NONE;
+ }
+}
+
+/*
+ * Shutdown an event device on a given cpu:
+ *
+ * This is called on a life CPU, when a CPU is dead. So we cannot
+ * access the hardware device itself.
+ * We just set the mode and remove it from the lists.
+ */
+void tick_shutdown(unsigned int cpu)
+{
+ struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+ struct clock_event_device *dev = td->evtdev;
+
+ td->mode = TICKDEV_MODE_PERIODIC;
+ if (dev) {
+ /*
+ * Prevent that the clock events layer tries to call
+ * the set mode function!
+ */
+ dev->state = CLOCK_EVT_STATE_DETACHED;
+ dev->mode = CLOCK_EVT_MODE_UNUSED;
+ clockevents_exchange_device(dev, NULL);
+ dev->event_handler = clockevents_handle_noop;
+ td->evtdev = NULL;
+ }
+}
+#endif
+
+/**
+ * tick_suspend_local - Suspend the local tick device
+ *
+ * Called from the local cpu for freeze with interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend_local(void)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+ clockevents_shutdown(td->evtdev);
+}
+
+/**
+ * tick_resume_local - Resume the local tick device
+ *
+ * Called from the local CPU for unfreeze or XEN resume magic.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume_local(void)
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ bool broadcast = tick_resume_check_broadcast();
+
+ clockevents_tick_resume(td->evtdev);
+ if (!broadcast) {
+ if (td->mode == TICKDEV_MODE_PERIODIC)
+ tick_setup_periodic(td->evtdev, 0);
+ else
+ tick_resume_oneshot();
+ }
+}
+
+/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend(void)
+{
+ tick_suspend_local();
+ tick_suspend_broadcast();
+}
+
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume with only one
+ * CPU online and interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume(void)
+{
+ tick_resume_broadcast();
+ tick_resume_local();
+}
+
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping. Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ tick_freeze_depth++;
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_suspend();
+ else
+ tick_suspend_local();
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping. Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_resume();
+ else
+ tick_resume_local();
+
+ tick_freeze_depth--;
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_init - initialize the tick control
+ */
+void __init tick_init(void)
+{
+ tick_broadcast_init();
+ tick_nohz_init();
+}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 000000000..b64fdd805
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,139 @@
+/*
+ * tick internal variable and functions used by low/high res code
+ */
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+
+#include "timekeeping.h"
+#include "tick-sched.h"
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+
+# define TICK_DO_TIMER_NONE -1
+# define TICK_DO_TIMER_BOOT -2
+
+DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
+extern ktime_t tick_next_period;
+extern ktime_t tick_period;
+extern int tick_do_timer_cpu __read_mostly;
+
+extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
+extern void tick_handle_periodic(struct clock_event_device *dev);
+extern void tick_check_new_device(struct clock_event_device *dev);
+extern void tick_shutdown(unsigned int cpu);
+extern void tick_suspend(void);
+extern void tick_resume(void);
+extern bool tick_check_replacement(struct clock_event_device *curdev,
+ struct clock_event_device *newdev);
+extern void tick_install_replacement(struct clock_event_device *dev);
+extern int tick_is_oneshot_available(void);
+extern struct tick_device *tick_get_device(int cpu);
+
+extern int clockevents_tick_resume(struct clock_event_device *dev);
+/* Check, if the device is functional or a dummy for broadcast */
+static inline int tick_device_is_functional(struct clock_event_device *dev)
+{
+ return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
+}
+
+extern void clockevents_shutdown(struct clock_event_device *dev);
+extern void clockevents_exchange_device(struct clock_event_device *old,
+ struct clock_event_device *new);
+extern void clockevents_set_state(struct clock_event_device *dev,
+ enum clock_event_state state);
+extern int clockevents_program_event(struct clock_event_device *dev,
+ ktime_t expires, bool force);
+extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+
+/* Broadcasting support */
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
+extern int tick_is_broadcast_device(struct clock_event_device *dev);
+extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_suspend_broadcast(void);
+extern void tick_resume_broadcast(void);
+extern bool tick_resume_check_broadcast(void);
+extern void tick_broadcast_init(void);
+extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
+extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
+extern struct tick_device *tick_get_broadcast_device(void);
+extern struct cpumask *tick_get_broadcast_mask(void);
+# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */
+static inline void tick_install_broadcast_device(struct clock_event_device *dev) { }
+static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
+static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
+static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
+static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_suspend_broadcast(void) { }
+static inline void tick_resume_broadcast(void) { }
+static inline bool tick_resume_check_broadcast(void) { return false; }
+static inline void tick_broadcast_init(void) { }
+static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
+
+/* Set the periodic handler in non broadcast mode */
+static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+ dev->event_handler = tick_handle_periodic;
+}
+# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
+
+#else /* !GENERIC_CLOCKEVENTS: */
+static inline void tick_suspend(void) { }
+static inline void tick_resume(void) { }
+#endif /* !GENERIC_CLOCKEVENTS */
+
+/* Oneshot related functions */
+#ifdef CONFIG_TICK_ONESHOT
+extern void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt);
+extern int tick_program_event(ktime_t expires, int force);
+extern void tick_oneshot_notify(void);
+extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
+extern void tick_resume_oneshot(void);
+static inline bool tick_oneshot_possible(void) { return true; }
+extern int tick_oneshot_mode_active(void);
+extern void tick_clock_notify(void);
+extern int tick_check_oneshot_change(int allow_nohz);
+extern int tick_init_highres(void);
+#else /* !CONFIG_TICK_ONESHOT: */
+static inline
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t nextevt) { BUG(); }
+static inline void tick_resume_oneshot(void) { BUG(); }
+static inline int tick_program_event(ktime_t expires, int force) { return 0; }
+static inline void tick_oneshot_notify(void) { }
+static inline bool tick_oneshot_possible(void) { return false; }
+static inline int tick_oneshot_mode_active(void) { return 0; }
+static inline void tick_clock_notify(void) { }
+static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+#endif /* !CONFIG_TICK_ONESHOT */
+
+/* Functions related to oneshot broadcasting */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
+extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
+extern void tick_broadcast_switch_to_oneshot(void);
+extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
+extern int tick_broadcast_oneshot_active(void);
+extern void tick_check_oneshot_broadcast_this_cpu(void);
+bool tick_broadcast_oneshot_available(void);
+extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+#else /* !(BROADCAST && ONESHOT): */
+static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
+static inline void tick_broadcast_switch_to_oneshot(void) { }
+static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
+static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
+#endif /* !(BROADCAST && ONESHOT) */
+
+/* NO_HZ_FULL internal */
+#ifdef CONFIG_NO_HZ_FULL
+extern void tick_nohz_init(void);
+# else
+static inline void tick_nohz_init(void) { }
+#endif
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 000000000..67a64b167
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,116 @@
+/*
+ * linux/kernel/time/tick-oneshot.c
+ *
+ * This file contains functions which manage high resolution tick
+ * related events.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+
+#include "tick-internal.h"
+
+/**
+ * tick_program_event
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+
+ return clockevents_program_event(dev, expires, force);
+}
+
+/**
+ * tick_resume_onshot - resume oneshot mode
+ */
+void tick_resume_oneshot(void)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_program_event(dev, ktime_get(), true);
+}
+
+/**
+ * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ */
+void tick_setup_oneshot(struct clock_event_device *newdev,
+ void (*handler)(struct clock_event_device *),
+ ktime_t next_event)
+{
+ newdev->event_handler = handler;
+ clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
+ clockevents_program_event(newdev, next_event, true);
+}
+
+/**
+ * tick_switch_to_oneshot - switch to oneshot mode
+ */
+int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
+{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+ struct clock_event_device *dev = td->evtdev;
+
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
+ !tick_device_is_functional(dev)) {
+
+ printk(KERN_INFO "Clockevents: "
+ "could not switch to one-shot mode:");
+ if (!dev) {
+ printk(" no tick device\n");
+ } else {
+ if (!tick_device_is_functional(dev))
+ printk(" %s is not functional.\n", dev->name);
+ else
+ printk(" %s does not support one-shot mode.\n",
+ dev->name);
+ }
+ return -EINVAL;
+ }
+
+ td->mode = TICKDEV_MODE_ONESHOT;
+ dev->event_handler = handler;
+ clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ tick_broadcast_switch_to_oneshot();
+ return 0;
+}
+
+/**
+ * tick_check_oneshot_mode - check whether the system is in oneshot mode
+ *
+ * returns 1 when either nohz or highres are enabled. otherwise 0.
+ */
+int tick_oneshot_mode_active(void)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * tick_init_highres - switch to high resolution mode
+ *
+ * Called with interrupts disabled.
+ */
+int tick_init_highres(void)
+{
+ return tick_switch_to_oneshot(hrtimer_interrupt);
+}
+#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000..914259128
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,1250 @@
+/*
+ * linux/kernel/time/tick-sched.c
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
+ *
+ * No idle tick implementation for low and high resolution timers
+ *
+ * Started by: Thomas Gleixner and Ingo Molnar
+ *
+ * Distribute under GPLv2.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/posix-timers.h>
+#include <linux/perf_event.h>
+#include <linux/context_tracking.h>
+
+#include <asm/irq_regs.h>
+
+#include "tick-internal.h"
+
+#include <trace/events/timer.h>
+
+/*
+ * Per cpu nohz control structure
+ */
+static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+
+/*
+ * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ */
+static ktime_t last_jiffies_update;
+
+struct tick_sched *tick_get_tick_sched(int cpu)
+{
+ return &per_cpu(tick_cpu_sched, cpu);
+}
+
+/*
+ * Must be called with interrupts disabled !
+ */
+static void tick_do_update_jiffies64(ktime_t now)
+{
+ unsigned long ticks = 0;
+ ktime_t delta;
+
+ /*
+ * Do a quick check without holding jiffies_lock:
+ */
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 < tick_period.tv64)
+ return;
+
+ /* Reevalute with jiffies_lock held */
+ write_seqlock(&jiffies_lock);
+
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 >= tick_period.tv64) {
+
+ delta = ktime_sub(delta, tick_period);
+ last_jiffies_update = ktime_add(last_jiffies_update,
+ tick_period);
+
+ /* Slow path for long timeouts */
+ if (unlikely(delta.tv64 >= tick_period.tv64)) {
+ s64 incr = ktime_to_ns(tick_period);
+
+ ticks = ktime_divns(delta, incr);
+
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ incr * ticks);
+ }
+ do_timer(++ticks);
+
+ /* Keep the tick_next_period variable up to date */
+ tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ } else {
+ write_sequnlock(&jiffies_lock);
+ return;
+ }
+ write_sequnlock(&jiffies_lock);
+ update_wall_time();
+}
+
+/*
+ * Initialize and return retrieve the jiffies update.
+ */
+static ktime_t tick_init_jiffy_update(void)
+{
+ ktime_t period;
+
+ write_seqlock(&jiffies_lock);
+ /* Did we start the jiffies update yet ? */
+ if (last_jiffies_update.tv64 == 0)
+ last_jiffies_update = tick_next_period;
+ period = last_jiffies_update;
+ write_sequnlock(&jiffies_lock);
+ return period;
+}
+
+
+static void tick_sched_do_timer(ktime_t now)
+{
+ int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themself to
+ * this duty, then the jiffies update is still serialized by
+ * jiffies_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ && !tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
+#endif
+
+ /* Check, if the jiffies need an update */
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
+}
+
+static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * When we are idle and the tick is stopped, we have to touch
+ * the watchdog as we might not schedule for a really long
+ * time. This happens on complete idle SMP systems while
+ * waiting on the login prompt. We also increment the "start of
+ * idle" jiffy stamp so the idle accounting adjustment we do
+ * when we go busy again does not account too much ticks.
+ */
+ if (ts->tick_stopped) {
+ touch_softlockup_watchdog();
+ if (is_idle_task(current))
+ ts->idle_jiffies++;
+ }
+#endif
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+cpumask_var_t tick_nohz_full_mask;
+cpumask_var_t housekeeping_mask;
+bool tick_nohz_full_running;
+
+static bool can_stop_full_tick(void)
+{
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!sched_can_stop_tick()) {
+ trace_tick_stop(0, "more than 1 task in runqueue\n");
+ return false;
+ }
+
+ if (!posix_cpu_timers_can_stop_tick(current)) {
+ trace_tick_stop(0, "posix timers running\n");
+ return false;
+ }
+
+ if (!perf_event_can_stop_tick()) {
+ trace_tick_stop(0, "perf events running\n");
+ return false;
+ }
+
+ /* sched_clock_tick() needs us? */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ /*
+ * TODO: kick full dynticks CPUs when
+ * sched_clock_stable is set.
+ */
+ if (!sched_clock_stable()) {
+ trace_tick_stop(0, "unstable sched clock\n");
+ /*
+ * Don't allow the user to think they can get
+ * full NO_HZ with this machine.
+ */
+ WARN_ONCE(tick_nohz_full_running,
+ "NO_HZ FULL will not work with unstable sched clock");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
+
+/*
+ * Re-evaluate the need for the tick on the current CPU
+ * and restart it if necessary.
+ */
+void __tick_nohz_full_check(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (tick_nohz_full_cpu(smp_processor_id())) {
+ if (ts->tick_stopped && !is_idle_task(current)) {
+ if (!can_stop_full_tick())
+ tick_nohz_restart_sched_tick(ts, ktime_get());
+ }
+ }
+}
+
+static void nohz_full_kick_work_func(struct irq_work *work)
+{
+ __tick_nohz_full_check();
+}
+
+static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
+ .func = nohz_full_kick_work_func,
+};
+
+/*
+ * Kick this CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
+ * is NMI safe.
+ */
+void tick_nohz_full_kick(void)
+{
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ return;
+
+ irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
+}
+
+/*
+ * Kick the CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_cpu(int cpu)
+{
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
+}
+
+static void nohz_full_kick_ipi(void *info)
+{
+ __tick_nohz_full_check();
+}
+
+/*
+ * Kick all full dynticks CPUs in order to force these to re-evaluate
+ * their dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_all(void)
+{
+ if (!tick_nohz_full_running)
+ return;
+
+ preempt_disable();
+ smp_call_function_many(tick_nohz_full_mask,
+ nohz_full_kick_ipi, NULL, false);
+ tick_nohz_full_kick();
+ preempt_enable();
+}
+
+/*
+ * Re-evaluate the need for the tick as we switch the current task.
+ * It might need the tick due to per task/process properties:
+ * perf events, posix cpu timers, ...
+ */
+void __tick_nohz_task_switch(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ goto out;
+
+ if (tick_nohz_tick_stopped() && !can_stop_full_tick())
+ tick_nohz_full_kick();
+
+out:
+ local_irq_restore(flags);
+}
+
+/* Parse the boot-time nohz CPU list from the kernel parameters. */
+static int __init tick_nohz_full_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+ if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
+ pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ free_bootmem_cpumask_var(tick_nohz_full_mask);
+ return 1;
+ }
+ tick_nohz_full_running = true;
+
+ return 1;
+}
+__setup("nohz_full=", tick_nohz_full_setup);
+
+static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ /*
+ * If we handle the timekeeping duty for full dynticks CPUs,
+ * we can't safely shutdown that CPU.
+ */
+ if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
+ return NOTIFY_BAD;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static int tick_nohz_init_all(void)
+{
+ int err = -1;
+
+#ifdef CONFIG_NO_HZ_FULL_ALL
+ if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
+ WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
+ return err;
+ }
+ err = 0;
+ cpumask_setall(tick_nohz_full_mask);
+ tick_nohz_full_running = true;
+#endif
+ return err;
+}
+
+void __init tick_nohz_init(void)
+{
+ int cpu;
+
+ if (!tick_nohz_full_running) {
+ if (tick_nohz_init_all() < 0)
+ return;
+ }
+
+ if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+ WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ cpumask_clear(tick_nohz_full_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ /*
+ * Full dynticks uses irq work to drive the tick rescheduling on safe
+ * locking contexts. But then we need irq work to raise its own
+ * interrupts to avoid circular dependency on the tick
+ */
+ if (!arch_irq_work_has_interrupt()) {
+ pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
+ "support irq work self-IPIs\n");
+ cpumask_clear(tick_nohz_full_mask);
+ cpumask_copy(housekeeping_mask, cpu_possible_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
+
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, tick_nohz_full_mask);
+
+ for_each_cpu(cpu, tick_nohz_full_mask)
+ context_tracking_cpu_set(cpu);
+
+ cpu_notifier(tick_nohz_cpu_down_callback, 0);
+ pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
+ cpumask_pr_args(tick_nohz_full_mask));
+}
+#endif
+
+/*
+ * NOHZ - aka dynamic tick functionality
+ */
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * NO HZ enabled ?
+ */
+static int tick_nohz_enabled __read_mostly = 1;
+int tick_nohz_active __read_mostly;
+/*
+ * Enable / Disable tickless mode
+ */
+static int __init setup_tick_nohz(char *str)
+{
+ if (!strcmp(str, "off"))
+ tick_nohz_enabled = 0;
+ else if (!strcmp(str, "on"))
+ tick_nohz_enabled = 1;
+ else
+ return 0;
+ return 1;
+}
+
+__setup("nohz=", setup_tick_nohz);
+
+int tick_nohz_tick_stopped(void)
+{
+ return __this_cpu_read(tick_cpu_sched.tick_stopped);
+}
+
+/**
+ * tick_nohz_update_jiffies - update jiffies when idle was interrupted
+ *
+ * Called from interrupt entry when the CPU was idle
+ *
+ * In case the sched_tick was stopped on this CPU, we have to check if jiffies
+ * must be updated. Otherwise an interrupt handler could use a stale jiffy
+ * value. We do this unconditionally on any cpu, as we don't know whether the
+ * cpu, which has the update task assigned is in a long sleep.
+ */
+static void tick_nohz_update_jiffies(ktime_t now)
+{
+ unsigned long flags;
+
+ __this_cpu_write(tick_cpu_sched.idle_waketime, now);
+
+ local_irq_save(flags);
+ tick_do_update_jiffies64(now);
+ local_irq_restore(flags);
+
+ touch_softlockup_watchdog();
+}
+
+/*
+ * Updates the per cpu time idle statistics counters
+ */
+static void
+update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+{
+ ktime_t delta;
+
+ if (ts->idle_active) {
+ delta = ktime_sub(now, ts->idle_entrytime);
+ if (nr_iowait_cpu(cpu) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ ts->idle_entrytime = now;
+ }
+
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+}
+
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
+{
+ update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+ ts->idle_active = 0;
+
+ sched_clock_idle_wakeup_event(0);
+}
+
+static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+{
+ ktime_t now = ktime_get();
+
+ ts->idle_entrytime = now;
+ ts->idle_active = 1;
+ sched_clock_idle_sleep_event();
+ return now;
+}
+
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cummulative idle time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t now, idle;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time) {
+ update_ts_time_stats(cpu, ts, now, last_update_time);
+ idle = ts->idle_sleeptime;
+ } else {
+ if (ts->idle_active && !nr_iowait_cpu(cpu)) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ idle = ktime_add(ts->idle_sleeptime, delta);
+ } else {
+ idle = ts->idle_sleeptime;
+ }
+ }
+
+ return ktime_to_us(idle);
+
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+
+/**
+ * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
+ *
+ * Return the cummulative iowait time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ ktime_t now, iowait;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time) {
+ update_ts_time_stats(cpu, ts, now, last_update_time);
+ iowait = ts->iowait_sleeptime;
+ } else {
+ if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ iowait = ktime_add(ts->iowait_sleeptime, delta);
+ } else {
+ iowait = ts->iowait_sleeptime;
+ }
+ }
+
+ return ktime_to_us(iowait);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
+static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
+ ktime_t now, int cpu)
+{
+ unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
+ ktime_t last_update, expires, ret = { .tv64 = 0 };
+ unsigned long rcu_delta_jiffies;
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ u64 time_delta;
+
+ time_delta = timekeeping_max_deferment();
+
+ /* Read jiffies and the time when jiffies were updated last */
+ do {
+ seq = read_seqbegin(&jiffies_lock);
+ last_update = last_jiffies_update;
+ last_jiffies = jiffies;
+ } while (read_seqretry(&jiffies_lock, seq));
+
+ if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+ arch_needs_cpu() || irq_work_needs_cpu()) {
+ next_jiffies = last_jiffies + 1;
+ delta_jiffies = 1;
+ } else {
+ /* Get the next timer wheel timer */
+ next_jiffies = get_next_timer_interrupt(last_jiffies);
+ delta_jiffies = next_jiffies - last_jiffies;
+ if (rcu_delta_jiffies < delta_jiffies) {
+ next_jiffies = last_jiffies + rcu_delta_jiffies;
+ delta_jiffies = rcu_delta_jiffies;
+ }
+ }
+
+ /*
+ * Do not stop the tick, if we are only one off (or less)
+ * or if the cpu is required for RCU:
+ */
+ if (!ts->tick_stopped && delta_jiffies <= 1)
+ goto out;
+
+ /* Schedule the tick, if we are at least one jiffie off */
+ if ((long)delta_jiffies >= 1) {
+
+ /*
+ * If this cpu is the one which updates jiffies, then
+ * give up the assignment and let it be taken by the
+ * cpu which runs the tick timer next, which might be
+ * this cpu as well. If we don't drop this here the
+ * jiffies might be stale and do_timer() never
+ * invoked. Keep track of the fact that it was the one
+ * which had the do_timer() duty last. If this cpu is
+ * the one which had the do_timer() duty last, we
+ * limit the sleep time to the timekeeping
+ * max_deferement value which we retrieved
+ * above. Otherwise we can sleep as long as we want.
+ */
+ if (cpu == tick_do_timer_cpu) {
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ ts->do_timer_last = 1;
+ } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+ time_delta = KTIME_MAX;
+ ts->do_timer_last = 0;
+ } else if (!ts->do_timer_last) {
+ time_delta = KTIME_MAX;
+ }
+
+#ifdef CONFIG_NO_HZ_FULL
+ if (!ts->inidle) {
+ time_delta = min(time_delta,
+ scheduler_tick_max_deferment());
+ }
+#endif
+
+ /*
+ * calculate the expiry time for the next timer wheel
+ * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+ * that there is no timer pending or at least extremely
+ * far into the future (12 days for HZ=1000). In this
+ * case we set the expiry to the end of time.
+ */
+ if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+ /*
+ * Calculate the time delta for the next timer event.
+ * If the time delta exceeds the maximum time delta
+ * permitted by the current clocksource then adjust
+ * the time delta accordingly to ensure the
+ * clocksource does not wrap.
+ */
+ time_delta = min_t(u64, time_delta,
+ tick_period.tv64 * delta_jiffies);
+ }
+
+ if (time_delta < KTIME_MAX)
+ expires = ktime_add_ns(last_update, time_delta);
+ else
+ expires.tv64 = KTIME_MAX;
+
+ /* Skip reprogram of event if its not changed */
+ if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+ goto out;
+
+ ret = expires;
+
+ /*
+ * nohz_stop_sched_tick can be called several times before
+ * the nohz_restart_sched_tick is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the
+ * first call we save the current tick time, so we can restart
+ * the scheduler tick in nohz_restart_sched_tick.
+ */
+ if (!ts->tick_stopped) {
+ nohz_balance_enter_idle(cpu);
+ calc_load_enter_idle();
+
+ ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
+ ts->tick_stopped = 1;
+ trace_tick_stop(1, " ");
+ }
+
+ /*
+ * If the expiration time == KTIME_MAX, then
+ * in this case we simply stop the tick timer.
+ */
+ if (unlikely(expires.tv64 == KTIME_MAX)) {
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_cancel(&ts->sched_timer);
+ goto out;
+ }
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start(&ts->sched_timer, expires,
+ HRTIMER_MODE_ABS_PINNED);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ goto out;
+ } else if (!tick_program_event(expires, 0))
+ goto out;
+ /*
+ * We are past the event already. So we crossed a
+ * jiffie boundary. Update jiffies and raise the
+ * softirq.
+ */
+ tick_do_update_jiffies64(ktime_get());
+ }
+ raise_softirq_irqoff(TIMER_SOFTIRQ);
+out:
+ ts->next_jiffies = next_jiffies;
+ ts->last_jiffies = last_jiffies;
+ ts->sleep_length = ktime_sub(dev->next_event, now);
+
+ return ret;
+}
+
+static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ int cpu = smp_processor_id();
+
+ if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ return;
+
+ if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
+ return;
+
+ if (!can_stop_full_tick())
+ return;
+
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+#endif
+}
+
+static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
+{
+ /*
+ * If this cpu is offline and it is the one which updates
+ * jiffies, then give up the assignment and let it be taken by
+ * the cpu which runs the tick timer next. If we don't drop
+ * this here the jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (unlikely(!cpu_online(cpu))) {
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ return false;
+ }
+
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
+ ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
+ return false;
+ }
+
+ if (need_resched())
+ return false;
+
+ if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
+ static int ratelimit;
+
+ if (ratelimit < 10 &&
+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
+ pr_warn("NOHZ: local_softirq_pending %02x\n",
+ (unsigned int) local_softirq_pending());
+ ratelimit++;
+ }
+ return false;
+ }
+
+ if (tick_nohz_full_enabled()) {
+ /*
+ * Keep the tick alive to guarantee timekeeping progression
+ * if there are full dynticks CPUs around
+ */
+ if (tick_do_timer_cpu == cpu)
+ return false;
+ /*
+ * Boot safety: make sure the timekeeping duty has been
+ * assigned before entering dyntick-idle mode,
+ */
+ if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ return false;
+ }
+
+ return true;
+}
+
+static void __tick_nohz_idle_enter(struct tick_sched *ts)
+{
+ ktime_t now, expires;
+ int cpu = smp_processor_id();
+
+ now = tick_nohz_start_idle(ts);
+
+ if (can_stop_idle_tick(cpu, ts)) {
+ int was_stopped = ts->tick_stopped;
+
+ ts->idle_calls++;
+
+ expires = tick_nohz_stop_sched_tick(ts, now, cpu);
+ if (expires.tv64 > 0LL) {
+ ts->idle_sleeps++;
+ ts->idle_expires = expires;
+ }
+
+ if (!was_stopped && ts->tick_stopped)
+ ts->idle_jiffies = ts->last_jiffies;
+ }
+}
+
+/**
+ * tick_nohz_idle_enter - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ * Called when we start the idle loop.
+ *
+ * The arch is responsible of calling:
+ *
+ * - rcu_idle_enter() after its last use of RCU before the CPU is put
+ * to sleep.
+ * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ */
+void tick_nohz_idle_enter(void)
+{
+ struct tick_sched *ts;
+
+ WARN_ON_ONCE(irqs_disabled());
+
+ /*
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
+ set_cpu_sd_state_idle();
+
+ local_irq_disable();
+
+ ts = this_cpu_ptr(&tick_cpu_sched);
+ ts->inidle = 1;
+ __tick_nohz_idle_enter(ts);
+
+ local_irq_enable();
+}
+
+/**
+ * tick_nohz_irq_exit - update next tick event from interrupt exit
+ *
+ * When an interrupt fires while we are idle and it doesn't cause
+ * a reschedule, it may still add, modify or delete a timer, enqueue
+ * an RCU callback, etc...
+ * So we need to re-calculate and reprogram the next tick event.
+ */
+void tick_nohz_irq_exit(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->inidle)
+ __tick_nohz_idle_enter(ts);
+ else
+ tick_nohz_full_stop_tick(ts);
+}
+
+/**
+ * tick_nohz_get_sleep_length - return the length of the current sleep
+ *
+ * Called from power state control code with interrupts disabled
+ */
+ktime_t tick_nohz_get_sleep_length(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->sleep_length;
+}
+
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_cancel(&ts->sched_timer);
+ hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+
+ while (1) {
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS_PINNED);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ } else {
+ if (!tick_program_event(
+ hrtimer_get_expires(&ts->sched_timer), 0))
+ break;
+ }
+ /* Reread time and update jiffies */
+ now = ktime_get();
+ tick_do_update_jiffies64(now);
+ }
+}
+
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+{
+ /* Update jiffies first */
+ tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
+
+ calc_load_exit_idle();
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
+
+ tick_nohz_restart(ts, now);
+}
+
+static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ unsigned long ticks;
+
+ if (vtime_accounting_enabled())
+ return;
+ /*
+ * We stopped the tick in idle. Update process times would miss the
+ * time we slept as update_process_times does only a 1 tick
+ * accounting. Enforce that this is accounted to idle !
+ */
+ ticks = jiffies - ts->idle_jiffies;
+ /*
+ * We might be one off. Do not randomly account a huge number of ticks!
+ */
+ if (ticks && ticks < LONG_MAX)
+ account_idle_ticks(ticks);
+#endif
+}
+
+/**
+ * tick_nohz_idle_exit - restart the idle tick from the idle task
+ *
+ * Restart the idle tick when the CPU is woken up from idle
+ * This also exit the RCU extended quiescent state. The CPU
+ * can use RCU again after this function is called.
+ */
+void tick_nohz_idle_exit(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ ktime_t now;
+
+ local_irq_disable();
+
+ WARN_ON_ONCE(!ts->inidle);
+
+ ts->inidle = 0;
+
+ if (ts->idle_active || ts->tick_stopped)
+ now = ktime_get();
+
+ if (ts->idle_active)
+ tick_nohz_stop_idle(ts, now);
+
+ if (ts->tick_stopped) {
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_ticks(ts);
+ }
+
+ local_irq_enable();
+}
+
+static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
+}
+
+/*
+ * The nohz low res interrupt handler
+ */
+static void tick_nohz_handler(struct clock_event_device *dev)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ struct pt_regs *regs = get_irq_regs();
+ ktime_t now = ktime_get();
+
+ dev->next_event.tv64 = KTIME_MAX;
+
+ tick_sched_do_timer(now);
+ tick_sched_handle(ts, regs);
+
+ /* No need to reprogram if we are running tickless */
+ if (unlikely(ts->tick_stopped))
+ return;
+
+ while (tick_nohz_reprogram(ts, now)) {
+ now = ktime_get();
+ tick_do_update_jiffies64(now);
+ }
+}
+
+/**
+ * tick_nohz_switch_to_nohz - switch to nohz mode
+ */
+static void tick_nohz_switch_to_nohz(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ ktime_t next;
+
+ if (!tick_nohz_enabled)
+ return;
+
+ local_irq_disable();
+ if (tick_switch_to_oneshot(tick_nohz_handler)) {
+ local_irq_enable();
+ return;
+ }
+ tick_nohz_active = 1;
+ ts->nohz_mode = NOHZ_MODE_LOWRES;
+
+ /*
+ * Recycle the hrtimer in ts, so we can share the
+ * hrtimer_forward with the highres code.
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ /* Get the next period */
+ next = tick_init_jiffy_update();
+
+ for (;;) {
+ hrtimer_set_expires(&ts->sched_timer, next);
+ if (!tick_program_event(next, 0))
+ break;
+ next = ktime_add(next, tick_period);
+ }
+ local_irq_enable();
+}
+
+/*
+ * When NOHZ is enabled and the tick is stopped, we need to kick the
+ * tick timer from irq_enter() so that the jiffies update is kept
+ * alive during long running softirqs. That's ugly as hell, but
+ * correctness is key even if we need to fix the offending softirq in
+ * the first place.
+ *
+ * Note, this is different to tick_nohz_restart. We just kick the
+ * timer and do not touch the other magic bits which need to be done
+ * when idle is left.
+ */
+static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
+{
+#if 0
+ /* Switch back to 2.6.27 behaviour */
+ ktime_t delta;
+
+ /*
+ * Do not touch the tick device, when the next expiry is either
+ * already reached or less/equal than the tick period.
+ */
+ delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
+ if (delta.tv64 <= tick_period.tv64)
+ return;
+
+ tick_nohz_restart(ts, now);
+#endif
+}
+
+static inline void tick_nohz_irq_enter(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ ktime_t now;
+
+ if (!ts->idle_active && !ts->tick_stopped)
+ return;
+ now = ktime_get();
+ if (ts->idle_active)
+ tick_nohz_stop_idle(ts, now);
+ if (ts->tick_stopped) {
+ tick_nohz_update_jiffies(now);
+ tick_nohz_kick_tick(ts, now);
+ }
+}
+
+#else
+
+static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_nohz_irq_enter(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Called from irq_enter to notify about the possible interruption of idle()
+ */
+void tick_irq_enter(void)
+{
+ tick_check_oneshot_broadcast_this_cpu();
+ tick_nohz_irq_enter();
+}
+
+/*
+ * High resolution timer specific code
+ */
+#ifdef CONFIG_HIGH_RES_TIMERS
+/*
+ * We rearm the timer until we get disabled by the idle code.
+ * Called with interrupts disabled.
+ */
+static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+{
+ struct tick_sched *ts =
+ container_of(timer, struct tick_sched, sched_timer);
+ struct pt_regs *regs = get_irq_regs();
+ ktime_t now = ktime_get();
+
+ tick_sched_do_timer(now);
+
+ /*
+ * Do not call, when we are not in irq context and have
+ * no valid regs pointer
+ */
+ if (regs)
+ tick_sched_handle(ts, regs);
+
+ /* No need to reprogram if we are in idle or full dynticks mode */
+ if (unlikely(ts->tick_stopped))
+ return HRTIMER_NORESTART;
+
+ hrtimer_forward(timer, now, tick_period);
+
+ return HRTIMER_RESTART;
+}
+
+static int sched_skew_tick;
+
+static int __init skew_tick(char *str)
+{
+ get_option(&str, &sched_skew_tick);
+
+ return 0;
+}
+early_param("skew_tick", skew_tick);
+
+/**
+ * tick_setup_sched_timer - setup the tick emulation timer
+ */
+void tick_setup_sched_timer(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ ktime_t now = ktime_get();
+
+ /*
+ * Emulate tick processing via per-CPU hrtimers:
+ */
+ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ts->sched_timer.function = tick_sched_timer;
+
+ /* Get the next period (per cpu) */
+ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+
+ /* Offset the tick to avert jiffies_lock contention. */
+ if (sched_skew_tick) {
+ u64 offset = ktime_to_ns(tick_period) >> 1;
+ do_div(offset, num_possible_cpus());
+ offset *= smp_processor_id();
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
+ }
+
+ for (;;) {
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_start_expires(&ts->sched_timer,
+ HRTIMER_MODE_ABS_PINNED);
+ /* Check, if the timer was already in the past */
+ if (hrtimer_active(&ts->sched_timer))
+ break;
+ now = ktime_get();
+ }
+
+#ifdef CONFIG_NO_HZ_COMMON
+ if (tick_nohz_enabled) {
+ ts->nohz_mode = NOHZ_MODE_HIGHRES;
+ tick_nohz_active = 1;
+ }
+#endif
+}
+#endif /* HIGH_RES_TIMERS */
+
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
+void tick_cancel_sched_timer(int cpu)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+# ifdef CONFIG_HIGH_RES_TIMERS
+ if (ts->sched_timer.base)
+ hrtimer_cancel(&ts->sched_timer);
+# endif
+
+ memset(ts, 0, sizeof(*ts));
+}
+#endif
+
+/**
+ * Async notification about clocksource changes
+ */
+void tick_clock_notify(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
+}
+
+/*
+ * Async notification about clock event changes
+ */
+void tick_oneshot_notify(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ set_bit(0, &ts->check_clocks);
+}
+
+/**
+ * Check, if a change happened, which makes oneshot possible.
+ *
+ * Called cyclic from the hrtimer softirq (driven by the timer
+ * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * mode, because high resolution timers are disabled (either compile
+ * or runtime).
+ */
+int tick_check_oneshot_change(int allow_nohz)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (!test_and_clear_bit(0, &ts->check_clocks))
+ return 0;
+
+ if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
+ return 0;
+
+ if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
+ return 0;
+
+ if (!allow_nohz)
+ return 1;
+
+ tick_nohz_switch_to_nohz();
+ return 0;
+}
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
new file mode 100644
index 000000000..28b5da3e1
--- /dev/null
+++ b/kernel/time/tick-sched.h
@@ -0,0 +1,74 @@
+#ifndef _TICK_SCHED_H
+#define _TICK_SCHED_H
+
+#include <linux/hrtimer.h>
+
+enum tick_device_mode {
+ TICKDEV_MODE_PERIODIC,
+ TICKDEV_MODE_ONESHOT,
+};
+
+struct tick_device {
+ struct clock_event_device *evtdev;
+ enum tick_device_mode mode;
+};
+
+enum tick_nohz_mode {
+ NOHZ_MODE_INACTIVE,
+ NOHZ_MODE_LOWRES,
+ NOHZ_MODE_HIGHRES,
+};
+
+/**
+ * struct tick_sched - sched tick emulation and no idle tick control/stats
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
+ * @last_tick: Store the last tick expiry time when the tick
+ * timer is modified for nohz sleeps. This is necessary
+ * to resume the tick timer operation in the timeline
+ * when the CPU returns from nohz sleep.
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_calls: Total number of idle calls
+ * @idle_sleeps: Number of idle calls, where the sched tick was stopped
+ * @idle_entrytime: Time when the idle call was entered
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_exittime: Time when the idle state was left
+ * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
+ * @sleep_length: Duration of the current idle sleep
+ * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ */
+struct tick_sched {
+ struct hrtimer sched_timer;
+ unsigned long check_clocks;
+ enum tick_nohz_mode nohz_mode;
+ ktime_t last_tick;
+ int inidle;
+ int tick_stopped;
+ unsigned long idle_jiffies;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+ int idle_active;
+ ktime_t idle_entrytime;
+ ktime_t idle_waketime;
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+ ktime_t sleep_length;
+ unsigned long last_jiffies;
+ unsigned long next_jiffies;
+ ktime_t idle_expires;
+ int do_timer_last;
+};
+
+extern struct tick_sched *tick_get_tick_sched(int cpu);
+
+extern void tick_setup_sched_timer(void);
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
+extern void tick_cancel_sched_timer(int cpu);
+#else
+static inline void tick_cancel_sched_timer(int cpu) { }
+#endif
+
+#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
new file mode 100644
index 000000000..2c85b7724
--- /dev/null
+++ b/kernel/time/time.c
@@ -0,0 +1,785 @@
+/*
+ * linux/kernel/time.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file contains the interface functions for the various
+ * time related system calls: time, stime, gettimeofday, settimeofday,
+ * adjtime
+ */
+/*
+ * Modification history kernel/time.c
+ *
+ * 1993-09-02 Philip Gladstone
+ * Created file with time related functions from sched/core.c and adjtimex()
+ * 1993-10-08 Torsten Duwe
+ * adjtime interface update and CMOS clock write code
+ * 1995-08-13 Torsten Duwe
+ * kernel PLL updated to 1994-12-13 specs (rfc-1589)
+ * 1999-01-16 Ulrich Windl
+ * Introduced error checking for many cases in adjtimex().
+ * Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
+ * (Even though the technical memorandum forbids it)
+ * 2004-07-14 Christoph Lameter
+ * Added getnstimeofday to allow the posix timer functions to return
+ * with nanosecond accuracy
+ */
+
+#include <linux/export.h>
+#include <linux/timex.h>
+#include <linux/capability.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/errno.h>
+#include <linux/syscalls.h>
+#include <linux/security.h>
+#include <linux/fs.h>
+#include <linux/math64.h>
+#include <linux/ptrace.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#include "timeconst.h"
+#include "timekeeping.h"
+
+/*
+ * The timezone where the local system is located. Used as a default by some
+ * programs who obtain this value by using gettimeofday.
+ */
+struct timezone sys_tz;
+
+EXPORT_SYMBOL(sys_tz);
+
+#ifdef __ARCH_WANT_SYS_TIME
+
+/*
+ * sys_time() can be implemented in user-level using
+ * sys_gettimeofday(). Is this for backwards compatibility? If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ */
+SYSCALL_DEFINE1(time, time_t __user *, tloc)
+{
+ time_t i = get_seconds();
+
+ if (tloc) {
+ if (put_user(i,tloc))
+ return -EFAULT;
+ }
+ force_successful_syscall_return();
+ return i;
+}
+
+/*
+ * sys_stime() can be implemented in user-level using
+ * sys_settimeofday(). Is this for backwards compatibility? If so,
+ * why not move it into the appropriate arch directory (for those
+ * architectures that need it).
+ */
+
+SYSCALL_DEFINE1(stime, time_t __user *, tptr)
+{
+ struct timespec tv;
+ int err;
+
+ if (get_user(tv.tv_sec, tptr))
+ return -EFAULT;
+
+ tv.tv_nsec = 0;
+
+ err = security_settime(&tv, NULL);
+ if (err)
+ return err;
+
+ do_settimeofday(&tv);
+ return 0;
+}
+
+#endif /* __ARCH_WANT_SYS_TIME */
+
+SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
+ struct timezone __user *, tz)
+{
+ if (likely(tv != NULL)) {
+ struct timeval ktv;
+ do_gettimeofday(&ktv);
+ if (copy_to_user(tv, &ktv, sizeof(ktv)))
+ return -EFAULT;
+ }
+ if (unlikely(tz != NULL)) {
+ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+
+/*
+ * Adjust the time obtained from the CMOS to be UTC time instead of
+ * local time.
+ *
+ * This is ugly, but preferable to the alternatives. Otherwise we
+ * would either need to write a program to do it in /etc/rc (and risk
+ * confusion if the program gets run more than once; it would also be
+ * hard to make the program warp the clock precisely n hours) or
+ * compile in the timezone information into the kernel. Bad, bad....
+ *
+ * - TYT, 1992-01-01
+ *
+ * The best thing to do is to keep the CMOS clock in universal time (UTC)
+ * as real UNIX machines always do it. This avoids all headaches about
+ * daylight saving times and warping kernel clocks.
+ */
+static inline void warp_clock(void)
+{
+ if (sys_tz.tz_minuteswest != 0) {
+ struct timespec adjust;
+
+ persistent_clock_is_local = 1;
+ adjust.tv_sec = sys_tz.tz_minuteswest * 60;
+ adjust.tv_nsec = 0;
+ timekeeping_inject_offset(&adjust);
+ }
+}
+
+/*
+ * In case for some reason the CMOS clock has not already been running
+ * in UTC, but in some local time: The first time we set the timezone,
+ * we will warp the clock so that it is ticking UTC time instead of
+ * local time. Presumably, if someone is setting the timezone then we
+ * are running in an environment where the programs understand about
+ * timezones. This should be done at boot time in the /etc/rc script,
+ * as soon as possible, so that the clock can be set right. Otherwise,
+ * various programs will get confused when the clock gets warped.
+ */
+
+int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
+{
+ static int firsttime = 1;
+ int error = 0;
+
+ if (tv && !timespec_valid(tv))
+ return -EINVAL;
+
+ error = security_settime(tv, tz);
+ if (error)
+ return error;
+
+ if (tz) {
+ sys_tz = *tz;
+ update_vsyscall_tz();
+ if (firsttime) {
+ firsttime = 0;
+ if (!tv)
+ warp_clock();
+ }
+ }
+ if (tv)
+ return do_settimeofday(tv);
+ return 0;
+}
+
+SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
+ struct timezone __user *, tz)
+{
+ struct timeval user_tv;
+ struct timespec new_ts;
+ struct timezone new_tz;
+
+ if (tv) {
+ if (copy_from_user(&user_tv, tv, sizeof(*tv)))
+ return -EFAULT;
+
+ if (!timeval_valid(&user_tv))
+ return -EINVAL;
+
+ new_ts.tv_sec = user_tv.tv_sec;
+ new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+ }
+ if (tz) {
+ if (copy_from_user(&new_tz, tz, sizeof(*tz)))
+ return -EFAULT;
+ }
+
+ return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+}
+
+SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
+{
+ struct timex txc; /* Local copy of parameter */
+ int ret;
+
+ /* Copy the user data space into the kernel copy
+ * structure. But bear in mind that the structures
+ * may change
+ */
+ if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
+ return -EFAULT;
+ ret = do_adjtimex(&txc);
+ return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+}
+
+/**
+ * current_fs_time - Return FS time
+ * @sb: Superblock.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs.
+ */
+struct timespec current_fs_time(struct super_block *sb)
+{
+ struct timespec now = current_kernel_time();
+ return timespec_trunc(now, sb->s_time_gran);
+}
+EXPORT_SYMBOL(current_fs_time);
+
+/*
+ * Convert jiffies to milliseconds and back.
+ *
+ * Avoid unnecessary multiplications/divisions in the
+ * two most common HZ cases:
+ */
+unsigned int jiffies_to_msecs(const unsigned long j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
+#else
+# if BITS_PER_LONG == 32
+ return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
+# else
+ return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
+# endif
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_msecs);
+
+unsigned int jiffies_to_usecs(const unsigned long j)
+{
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (USEC_PER_SEC / HZ) * j;
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
+#else
+# if BITS_PER_LONG == 32
+ return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
+# else
+ return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
+# endif
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_usecs);
+
+/**
+ * timespec_trunc - Truncate timespec to a granularity
+ * @t: Timespec
+ * @gran: Granularity in ns.
+ *
+ * Truncate a timespec to a granularity. gran must be smaller than a second.
+ * Always rounds down.
+ *
+ * This function should be only used for timestamps returned by
+ * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
+ * it doesn't handle the better resolution of the latter.
+ */
+struct timespec timespec_trunc(struct timespec t, unsigned gran)
+{
+ /*
+ * Division is pretty slow so avoid it for common cases.
+ * Currently current_kernel_time() never returns better than
+ * jiffies resolution. Exploit that.
+ */
+ if (gran <= jiffies_to_usecs(1) * 1000) {
+ /* nothing */
+ } else if (gran == 1000000000) {
+ t.tv_nsec = 0;
+ } else {
+ t.tv_nsec -= t.tv_nsec % gran;
+ }
+ return t;
+}
+EXPORT_SYMBOL(timespec_trunc);
+
+/*
+ * mktime64 - Converts date to seconds.
+ * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+ * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
+ * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
+ *
+ * [For the Julian calendar (which was used in Russia before 1917,
+ * Britain & colonies before 1752, anywhere else before 1582,
+ * and is still in use by some communities) leave out the
+ * -year/100+year/400 terms, and add 10.]
+ *
+ * This algorithm was first published by Gauss (I think).
+ */
+time64_t mktime64(const unsigned int year0, const unsigned int mon0,
+ const unsigned int day, const unsigned int hour,
+ const unsigned int min, const unsigned int sec)
+{
+ unsigned int mon = mon0, year = year0;
+
+ /* 1..12 -> 11,12,1..10 */
+ if (0 >= (int) (mon -= 2)) {
+ mon += 12; /* Puts Feb last since it has leap day */
+ year -= 1;
+ }
+
+ return ((((time64_t)
+ (year/4 - year/100 + year/400 + 367*mon/12 + day) +
+ year*365 - 719499
+ )*24 + hour /* now have hours */
+ )*60 + min /* now have minutes */
+ )*60 + sec; /* finally seconds */
+}
+EXPORT_SYMBOL(mktime64);
+
+/**
+ * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ *
+ * @ts: pointer to timespec variable to be set
+ * @sec: seconds to set
+ * @nsec: nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range of
+ * 0 <= tv_nsec < NSEC_PER_SEC
+ * For negative values only the tv_sec field is negative !
+ */
+void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ /*
+ * The following asm() prevents the compiler from
+ * optimising this loop into a modulo operation. See
+ * also __iter_div_u64_rem() in include/linux/time.h
+ */
+ asm("" : "+rm"(nsec));
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ asm("" : "+rm"(nsec));
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+EXPORT_SYMBOL(set_normalized_timespec);
+
+/**
+ * ns_to_timespec - Convert nanoseconds to timespec
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timespec representation of the nsec parameter.
+ */
+struct timespec ns_to_timespec(const s64 nsec)
+{
+ struct timespec ts;
+ s32 rem;
+
+ if (!nsec)
+ return (struct timespec) {0, 0};
+
+ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+ if (unlikely(rem < 0)) {
+ ts.tv_sec--;
+ rem += NSEC_PER_SEC;
+ }
+ ts.tv_nsec = rem;
+
+ return ts;
+}
+EXPORT_SYMBOL(ns_to_timespec);
+
+/**
+ * ns_to_timeval - Convert nanoseconds to timeval
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timeval representation of the nsec parameter.
+ */
+struct timeval ns_to_timeval(const s64 nsec)
+{
+ struct timespec ts = ns_to_timespec(nsec);
+ struct timeval tv;
+
+ tv.tv_sec = ts.tv_sec;
+ tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
+
+ return tv;
+}
+EXPORT_SYMBOL(ns_to_timeval);
+
+#if BITS_PER_LONG == 32
+/**
+ * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ *
+ * @ts: pointer to timespec variable to be set
+ * @sec: seconds to set
+ * @nsec: nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range of
+ * 0 <= tv_nsec < NSEC_PER_SEC
+ * For negative values only the tv_sec field is negative !
+ */
+void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ /*
+ * The following asm() prevents the compiler from
+ * optimising this loop into a modulo operation. See
+ * also __iter_div_u64_rem() in include/linux/time.h
+ */
+ asm("" : "+rm"(nsec));
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ asm("" : "+rm"(nsec));
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+EXPORT_SYMBOL(set_normalized_timespec64);
+
+/**
+ * ns_to_timespec64 - Convert nanoseconds to timespec64
+ * @nsec: the nanoseconds value to be converted
+ *
+ * Returns the timespec64 representation of the nsec parameter.
+ */
+struct timespec64 ns_to_timespec64(const s64 nsec)
+{
+ struct timespec64 ts;
+ s32 rem;
+
+ if (!nsec)
+ return (struct timespec64) {0, 0};
+
+ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+ if (unlikely(rem < 0)) {
+ ts.tv_sec--;
+ rem += NSEC_PER_SEC;
+ }
+ ts.tv_nsec = rem;
+
+ return ts;
+}
+EXPORT_SYMBOL(ns_to_timespec64);
+#endif
+/*
+ * When we convert to jiffies then we interpret incoming values
+ * the following way:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ * the input value by a factor or dividing it with a factor
+ *
+ * We must also be careful about 32-bit overflows.
+ */
+unsigned long msecs_to_jiffies(const unsigned int m)
+{
+ /*
+ * Negative value, means infinite timeout:
+ */
+ if ((int)m < 0)
+ return MAX_JIFFY_OFFSET;
+
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ /*
+ * HZ is equal to or smaller than 1000, and 1000 is a nice
+ * round multiple of HZ, divide with the factor between them,
+ * but round upwards:
+ */
+ return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ /*
+ * HZ is larger than 1000, and HZ is a nice round multiple of
+ * 1000 - simply multiply with the factor between them.
+ *
+ * But first make sure the multiplication result cannot
+ * overflow:
+ */
+ if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return m * (HZ / MSEC_PER_SEC);
+#else
+ /*
+ * Generic case - multiply, round and divide. But first
+ * check that if we are doing a net multiplication, that
+ * we wouldn't overflow:
+ */
+ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+ >> MSEC_TO_HZ_SHR32;
+#endif
+}
+EXPORT_SYMBOL(msecs_to_jiffies);
+
+unsigned long usecs_to_jiffies(const unsigned int u)
+{
+ if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return u * (HZ / USEC_PER_SEC);
+#else
+ return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+ >> USEC_TO_HZ_SHR32;
+#endif
+}
+EXPORT_SYMBOL(usecs_to_jiffies);
+
+/*
+ * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
+ * that a remainder subtract here would not do the right thing as the
+ * resolution values don't fall on second boundries. I.e. the line:
+ * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
+ * Note that due to the small error in the multiplier here, this
+ * rounding is incorrect for sufficiently large values of tv_nsec, but
+ * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
+ * OK.
+ *
+ * Rather, we just shift the bits off the right.
+ *
+ * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
+ * value to a scaled second value.
+ */
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
+{
+ nsec = nsec + TICK_NSEC - 1;
+
+ if (sec >= MAX_SEC_IN_JIFFIES){
+ sec = MAX_SEC_IN_JIFFIES;
+ nsec = 0;
+ }
+ return (((u64)sec * SEC_CONVERSION) +
+ (((u64)nsec * NSEC_CONVERSION) >>
+ (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
+
+}
+
+unsigned long
+timespec_to_jiffies(const struct timespec *value)
+{
+ return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+
+EXPORT_SYMBOL(timespec_to_jiffies);
+
+void
+jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+{
+ /*
+ * Convert jiffies to nanoseconds and separate with
+ * one divide.
+ */
+ u32 rem;
+ value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
+ NSEC_PER_SEC, &rem);
+ value->tv_nsec = rem;
+}
+EXPORT_SYMBOL(jiffies_to_timespec);
+
+/*
+ * We could use a similar algorithm to timespec_to_jiffies (with a
+ * different multiplier for usec instead of nsec). But this has a
+ * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
+ * usec value, since it's not necessarily integral.
+ *
+ * We could instead round in the intermediate scaled representation
+ * (i.e. in units of 1/2^(large scale) jiffies) but that's also
+ * perilous: the scaling introduces a small positive error, which
+ * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
+ * units to the intermediate before shifting) leads to accidental
+ * overflow and overestimates.
+ *
+ * At the cost of one additional multiplication by a constant, just
+ * use the timespec implementation.
+ */
+unsigned long
+timeval_to_jiffies(const struct timeval *value)
+{
+ return __timespec_to_jiffies(value->tv_sec,
+ value->tv_usec * NSEC_PER_USEC);
+}
+EXPORT_SYMBOL(timeval_to_jiffies);
+
+void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
+{
+ /*
+ * Convert jiffies to nanoseconds and separate with
+ * one divide.
+ */
+ u32 rem;
+
+ value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
+ NSEC_PER_SEC, &rem);
+ value->tv_usec = rem / NSEC_PER_USEC;
+}
+EXPORT_SYMBOL(jiffies_to_timeval);
+
+/*
+ * Convert jiffies/jiffies_64 to clock_t and back.
+ */
+clock_t jiffies_to_clock_t(unsigned long x)
+{
+#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+# if HZ < USER_HZ
+ return x * (USER_HZ / HZ);
+# else
+ return x / (HZ / USER_HZ);
+# endif
+#else
+ return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_clock_t);
+
+unsigned long clock_t_to_jiffies(unsigned long x)
+{
+#if (HZ % USER_HZ)==0
+ if (x >= ~0UL / (HZ / USER_HZ))
+ return ~0UL;
+ return x * (HZ / USER_HZ);
+#else
+ /* Don't worry about loss of precision here .. */
+ if (x >= ~0UL / HZ * USER_HZ)
+ return ~0UL;
+
+ /* .. but do try to contain it here */
+ return div_u64((u64)x * HZ, USER_HZ);
+#endif
+}
+EXPORT_SYMBOL(clock_t_to_jiffies);
+
+u64 jiffies_64_to_clock_t(u64 x)
+{
+#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+# if HZ < USER_HZ
+ x = div_u64(x * USER_HZ, HZ);
+# elif HZ > USER_HZ
+ x = div_u64(x, HZ / USER_HZ);
+# else
+ /* Nothing to do */
+# endif
+#else
+ /*
+ * There are better ways that don't overflow early,
+ * but even this doesn't overflow in hundreds of years
+ * in 64 bits, so..
+ */
+ x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
+#endif
+ return x;
+}
+EXPORT_SYMBOL(jiffies_64_to_clock_t);
+
+u64 nsec_to_clock_t(u64 x)
+{
+#if (NSEC_PER_SEC % USER_HZ) == 0
+ return div_u64(x, NSEC_PER_SEC / USER_HZ);
+#elif (USER_HZ % 512) == 0
+ return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
+#else
+ /*
+ * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
+ * overflow after 64.99 years.
+ * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
+ */
+ return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
+#endif
+}
+
+/**
+ * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+u64 nsecs_to_jiffies64(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+ /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+ return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+ /* overflow after 292 years if HZ = 1024 */
+ return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+ /*
+ * Generic case - optimized for cases where HZ is a multiple of 3.
+ * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+ */
+ return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
+EXPORT_SYMBOL(nsecs_to_jiffies64);
+
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+ return (unsigned long)nsecs_to_jiffies64(n);
+}
+EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
+
+/*
+ * Add two timespec values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0)
+ */
+struct timespec timespec_add_safe(const struct timespec lhs,
+ const struct timespec rhs)
+{
+ struct timespec res;
+
+ set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
+ lhs.tv_nsec + rhs.tv_nsec);
+
+ if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
+ res.tv_sec = TIME_T_MAX;
+
+ return res;
+}
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
new file mode 100644
index 000000000..511bdf2ca
--- /dev/null
+++ b/kernel/time/timeconst.bc
@@ -0,0 +1,108 @@
+scale=0
+
+define gcd(a,b) {
+ auto t;
+ while (b) {
+ t = b;
+ b = a % b;
+ a = t;
+ }
+ return a;
+}
+
+/* Division by reciprocal multiplication. */
+define fmul(b,n,d) {
+ return (2^b*n+d-1)/d;
+}
+
+/* Adjustment factor when a ceiling value is used. Use as:
+ (imul * n) + (fmulxx * n + fadjxx) >> xx) */
+define fadj(b,n,d) {
+ auto v;
+ d = d/gcd(n,d);
+ v = 2^b*(d-1)/d;
+ return v;
+}
+
+/* Compute the appropriate mul/adj values as well as a shift count,
+ which brings the mul value into the range 2^b-1 <= x < 2^b. Such
+ a shift value will be correct in the signed integer range and off
+ by at most one in the upper half of the unsigned range. */
+define fmuls(b,n,d) {
+ auto s, m;
+ for (s = 0; 1; s++) {
+ m = fmul(s,n,d);
+ if (m >= 2^(b-1))
+ return s;
+ }
+ return 0;
+}
+
+define timeconst(hz) {
+ print "/* Automatically generated by kernel/timeconst.bc */\n"
+ print "/* Time conversion constants for HZ == ", hz, " */\n"
+ print "\n"
+
+ print "#ifndef KERNEL_TIMECONST_H\n"
+ print "#define KERNEL_TIMECONST_H\n\n"
+
+ print "#include <linux/param.h>\n"
+ print "#include <linux/types.h>\n\n"
+
+ print "#if HZ != ", hz, "\n"
+ print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#endif\n\n"
+
+ if (hz < 2) {
+ print "#error Totally bogus HZ value!\n"
+ } else {
+ s=fmuls(32,1000,hz)
+ obase=16
+ print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
+ print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000)
+ obase=16
+ print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
+ print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
+ obase=10
+ print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000)
+ print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
+ print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
+ print "\n"
+
+ s=fmuls(32,1000000,hz)
+ obase=16
+ print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
+ print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
+ obase=10
+ print "#define HZ_TO_USEC_SHR32\t", s, "\n"
+
+ s=fmuls(32,hz,1000000)
+ obase=16
+ print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
+ print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
+ obase=10
+ print "#define USEC_TO_HZ_SHR32\t", s, "\n"
+
+ obase=10
+ cd=gcd(hz,1000000)
+ print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
+ print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
+ print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
+ print "\n"
+
+ print "#endif /* KERNEL_TIMECONST_H */\n"
+ }
+ halt
+}
+
+timeconst(hz)
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000..86628e755
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
+ * This file is part of the GNU C Library.
+ * Contributed by Paul Eggert (eggert@twinsun.com).
+ *
+ * The GNU C Library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * The GNU C Library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the GNU C Library; see the file COPYING.LIB. If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Converts the calendar time to broken-down time representation
+ * Based on code from glibc-2.6
+ *
+ * 2009-7-14:
+ * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
+ */
+
+#include <linux/time.h>
+#include <linux/module.h>
+
+/*
+ * Nonzero if YEAR is a leap year (every 4 years,
+ * except every 100th isn't, and every 400th is).
+ */
+static int __isleap(long year)
+{
+ return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
+}
+
+/* do a mathdiv for long type */
+static long math_div(long a, long b)
+{
+ return a / b - (a % b < 0);
+}
+
+/* How many leap years between y1 and y2, y1 must less or equal to y2 */
+static long leaps_between(long y1, long y2)
+{
+ long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
+ + math_div(y1 - 1, 400);
+ long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
+ + math_div(y2 - 1, 400);
+ return leaps2 - leaps1;
+}
+
+/* How many days come before each month (0-12). */
+static const unsigned short __mon_yday[2][13] = {
+ /* Normal years. */
+ {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
+ /* Leap years. */
+ {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
+};
+
+#define SECS_PER_HOUR (60 * 60)
+#define SECS_PER_DAY (SECS_PER_HOUR * 24)
+
+/**
+ * time_to_tm - converts the calendar time to local broken-down time
+ *
+ * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
+ * Coordinated Universal Time (UTC).
+ * @offset offset seconds adding to totalsecs.
+ * @result pointer to struct tm variable to receive broken-down time
+ */
+void time_to_tm(time_t totalsecs, int offset, struct tm *result)
+{
+ long days, rem, y;
+ const unsigned short *ip;
+
+ days = totalsecs / SECS_PER_DAY;
+ rem = totalsecs % SECS_PER_DAY;
+ rem += offset;
+ while (rem < 0) {
+ rem += SECS_PER_DAY;
+ --days;
+ }
+ while (rem >= SECS_PER_DAY) {
+ rem -= SECS_PER_DAY;
+ ++days;
+ }
+
+ result->tm_hour = rem / SECS_PER_HOUR;
+ rem %= SECS_PER_HOUR;
+ result->tm_min = rem / 60;
+ result->tm_sec = rem % 60;
+
+ /* January 1, 1970 was a Thursday. */
+ result->tm_wday = (4 + days) % 7;
+ if (result->tm_wday < 0)
+ result->tm_wday += 7;
+
+ y = 1970;
+
+ while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
+ /* Guess a corrected year, assuming 365 days per year. */
+ long yg = y + math_div(days, 365);
+
+ /* Adjust DAYS and Y to match the guessed year. */
+ days -= (yg - y) * 365 + leaps_between(y, yg);
+ y = yg;
+ }
+
+ result->tm_year = y - 1900;
+
+ result->tm_yday = days;
+
+ ip = __mon_yday[__isleap(y)];
+ for (y = 11; days < ip[y]; y--)
+ continue;
+ days -= ip[y];
+
+ result->tm_mon = y;
+ result->tm_mday = days + 1;
+}
+EXPORT_SYMBOL(time_to_tm);
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
new file mode 100644
index 000000000..4687b3104
--- /dev/null
+++ b/kernel/time/timecounter.c
@@ -0,0 +1,112 @@
+/*
+ * linux/kernel/time/timecounter.c
+ *
+ * based on code that migrated away from
+ * linux/kernel/time/clocksource.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/export.h>
+#include <linux/timecounter.h>
+
+void timecounter_init(struct timecounter *tc,
+ const struct cyclecounter *cc,
+ u64 start_tstamp)
+{
+ tc->cc = cc;
+ tc->cycle_last = cc->read(cc);
+ tc->nsec = start_tstamp;
+ tc->mask = (1ULL << cc->shift) - 1;
+ tc->frac = 0;
+}
+EXPORT_SYMBOL_GPL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc: Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+ cycle_t cycle_now, cycle_delta;
+ u64 ns_offset;
+
+ /* read cycle counter: */
+ cycle_now = tc->cc->read(tc->cc);
+
+ /* calculate the delta since the last timecounter_read_delta(): */
+ cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta,
+ tc->mask, &tc->frac);
+
+ /* update time stamp of timecounter_read_delta() call: */
+ tc->cycle_last = cycle_now;
+
+ return ns_offset;
+}
+
+u64 timecounter_read(struct timecounter *tc)
+{
+ u64 nsec;
+
+ /* increment time by nanoseconds since last call */
+ nsec = timecounter_read_delta(tc);
+ nsec += tc->nsec;
+ tc->nsec = nsec;
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_read);
+
+/*
+ * This is like cyclecounter_cyc2ns(), but it is used for computing a
+ * time previous to the time stored in the cycle counter.
+ */
+static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
+ cycle_t cycles, u64 mask, u64 frac)
+{
+ u64 ns = (u64) cycles;
+
+ ns = ((ns * cc->mult) - frac) >> cc->shift;
+
+ return ns;
+}
+
+u64 timecounter_cyc2time(struct timecounter *tc,
+ cycle_t cycle_tstamp)
+{
+ u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+ u64 nsec = tc->nsec, frac = tc->frac;
+
+ /*
+ * Instead of always treating cycle_tstamp as more recent
+ * than tc->cycle_last, detect when it is too far in the
+ * future and treat it as old time stamp instead.
+ */
+ if (delta > tc->cc->mask / 2) {
+ delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+ nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
+ } else {
+ nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
+ }
+
+ return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 000000000..946acb721
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,2072 @@
+/*
+ * linux/kernel/time/timekeeping.c
+ *
+ * Kernel timekeeping code and accessor functions
+ *
+ * This code was moved from linux/kernel/timer.c.
+ * Please see that file for copyright and history logs.
+ *
+ */
+
+#include <linux/timekeeper_internal.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/syscore_ops.h>
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/tick.h>
+#include <linux/stop_machine.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/compiler.h>
+
+#include "tick-internal.h"
+#include "ntp_internal.h"
+#include "timekeeping_internal.h"
+
+#define TK_CLEAR_NTP (1 << 0)
+#define TK_MIRROR (1 << 1)
+#define TK_CLOCK_WAS_SET (1 << 2)
+
+/*
+ * The most important data for readout fits into a single 64 byte
+ * cache line.
+ */
+static struct {
+ seqcount_t seq;
+ struct timekeeper timekeeper;
+} tk_core ____cacheline_aligned;
+
+static DEFINE_RAW_SPINLOCK(timekeeper_lock);
+static struct timekeeper shadow_timekeeper;
+
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq: Sequence counter for protecting updates. The lowest bit
+ * is the index for the tk_read_base array
+ * @base: tk_read_base array. Access is indexed by the lowest bit of
+ * @seq.
+ *
+ * See @update_fast_timekeeper() below.
+ */
+struct tk_fast {
+ seqcount_t seq;
+ struct tk_read_base base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+static struct tk_fast tk_fast_raw ____cacheline_aligned;
+
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
+static inline void tk_normalize_xtime(struct timekeeper *tk)
+{
+ while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
+ tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
+ tk->xtime_sec++;
+ }
+}
+
+static inline struct timespec64 tk_xtime(struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+ return ts;
+}
+
+static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
+{
+ tk->xtime_sec = ts->tv_sec;
+ tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+}
+
+static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
+{
+ tk->xtime_sec += ts->tv_sec;
+ tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
+ tk_normalize_xtime(tk);
+}
+
+static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
+{
+ struct timespec64 tmp;
+
+ /*
+ * Verify consistency of: offset_real = -wall_to_monotonic
+ * before modifying anything
+ */
+ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
+ -tk->wall_to_monotonic.tv_nsec);
+ WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);
+ tk->wall_to_monotonic = wtm;
+ set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
+ tk->offs_real = timespec64_to_ktime(tmp);
+ tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
+}
+
+static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
+{
+ tk->offs_boot = ktime_add(tk->offs_boot, delta);
+}
+
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
+/*
+ * These simple flag variables are managed
+ * without locks, which is racy, but ok since
+ * we don't really care about being super
+ * precise about how many events were seen,
+ * just that a problem was observed.
+ */
+static int timekeeping_underflow_seen;
+static int timekeeping_overflow_seen;
+
+/* last_warning is only modified under the timekeeping lock */
+static long timekeeping_last_warning;
+
+static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+
+ cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
+ const char *name = tk->tkr_mono.clock->name;
+
+ if (offset > max_cycles) {
+ printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
+ offset, name, max_cycles);
+ printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
+ } else {
+ if (offset > (max_cycles >> 1)) {
+ printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n",
+ offset, name, max_cycles >> 1);
+ printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
+ }
+ }
+
+ if (timekeeping_underflow_seen) {
+ if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ timekeeping_last_warning = jiffies;
+ }
+ timekeeping_underflow_seen = 0;
+ }
+
+ if (timekeeping_overflow_seen) {
+ if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+ printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
+ printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
+ printk_deferred(" Your kernel is probably still fine.\n");
+ timekeeping_last_warning = jiffies;
+ }
+ timekeeping_overflow_seen = 0;
+ }
+}
+
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+ cycle_t now, last, mask, max, delta;
+ unsigned int seq;
+
+ /*
+ * Since we're called holding a seqlock, the data may shift
+ * under us while we're doing the calculation. This can cause
+ * false positives, since we'd note a problem but throw the
+ * results away. So nest another seqlock here to atomically
+ * grab the points we are checking with.
+ */
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ now = tkr->read(tkr->clock);
+ last = tkr->cycle_last;
+ mask = tkr->mask;
+ max = tkr->clock->max_cycles;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ delta = clocksource_delta(now, last, mask);
+
+ /*
+ * Try to catch underflows by checking if we are seeing small
+ * mask-relative negative values.
+ */
+ if (unlikely((~delta & mask) < (mask >> 3))) {
+ timekeeping_underflow_seen = 1;
+ delta = 0;
+ }
+
+ /* Cap delta value to the max_cycles values to avoid mult overflows */
+ if (unlikely(delta > max)) {
+ timekeeping_overflow_seen = 1;
+ delta = tkr->clock->max_cycles;
+ }
+
+ return delta;
+}
+#else
+static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
+{
+}
+static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
+{
+ cycle_t cycle_now, delta;
+
+ /* read clocksource */
+ cycle_now = tkr->read(tkr->clock);
+
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
+
+ return delta;
+}
+#endif
+
+/**
+ * tk_setup_internals - Set up internals to use clocksource clock.
+ *
+ * @tk: The target timekeeper to setup.
+ * @clock: Pointer to clocksource.
+ *
+ * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
+ * pair and interval request.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
+{
+ cycle_t interval;
+ u64 tmp, ntpinterval;
+ struct clocksource *old_clock;
+
+ old_clock = tk->tkr_mono.clock;
+ tk->tkr_mono.clock = clock;
+ tk->tkr_mono.read = clock->read;
+ tk->tkr_mono.mask = clock->mask;
+ tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+
+ tk->tkr_raw.clock = clock;
+ tk->tkr_raw.read = clock->read;
+ tk->tkr_raw.mask = clock->mask;
+ tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
+
+ /* Do the ns -> cycle conversion first, using original mult */
+ tmp = NTP_INTERVAL_LENGTH;
+ tmp <<= clock->shift;
+ ntpinterval = tmp;
+ tmp += clock->mult/2;
+ do_div(tmp, clock->mult);
+ if (tmp == 0)
+ tmp = 1;
+
+ interval = (cycle_t) tmp;
+ tk->cycle_interval = interval;
+
+ /* Go back from cycles -> shifted ns */
+ tk->xtime_interval = (u64) interval * clock->mult;
+ tk->xtime_remainder = ntpinterval - tk->xtime_interval;
+ tk->raw_interval =
+ ((u64) interval * clock->mult) >> clock->shift;
+
+ /* if changing clocks, convert xtime_nsec shift units */
+ if (old_clock) {
+ int shift_change = clock->shift - old_clock->shift;
+ if (shift_change < 0)
+ tk->tkr_mono.xtime_nsec >>= -shift_change;
+ else
+ tk->tkr_mono.xtime_nsec <<= shift_change;
+ }
+ tk->tkr_raw.xtime_nsec = 0;
+
+ tk->tkr_mono.shift = clock->shift;
+ tk->tkr_raw.shift = clock->shift;
+
+ tk->ntp_error = 0;
+ tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+ tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
+
+ /*
+ * The timekeeper keeps its own mult values for the currently
+ * active clocksource. These value will be adjusted via NTP
+ * to counteract clock drifting.
+ */
+ tk->tkr_mono.mult = clock->mult;
+ tk->tkr_raw.mult = clock->mult;
+ tk->ntp_err_mult = 0;
+}
+
+/* Timekeeper helper functions. */
+
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+static u32 default_arch_gettimeoffset(void) { return 0; }
+u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
+#else
+static inline u32 arch_gettimeoffset(void) { return 0; }
+#endif
+
+static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
+{
+ cycle_t delta;
+ s64 nsec;
+
+ delta = timekeeping_get_delta(tkr);
+
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + arch_gettimeoffset();
+}
+
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tkr: Timekeeping readout base from which we take the update
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently than the other timekeeping accessor
+ * functions which retry when the sequence count has changed. The
+ * update side does:
+ *
+ * smp_wmb(); <- Ensure that the last base[1] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[0], tkr);
+ * smp_wmb(); <- Ensure that the base[0] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[1], tkr);
+ *
+ * The reader side does:
+ *
+ * do {
+ * seq = tkf->seq;
+ * smp_rmb();
+ * idx = seq & 0x01;
+ * now = now(tkf->base[idx]);
+ * smp_rmb();
+ * } while (seq != tkf->seq)
+ *
+ * As long as we update base[0] readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if a NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result is a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * @ktime_get_mono_fast_ns.
+ */
+static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf)
+{
+ struct tk_read_base *base = tkf->base;
+
+ /* Force readers off to base[1] */
+ raw_write_seqcount_latch(&tkf->seq);
+
+ /* Update base[0] */
+ memcpy(base, tkr, sizeof(*base));
+
+ /* Force readers back to base[0] */
+ raw_write_seqcount_latch(&tkf->seq);
+
+ /* Update base[1] */
+ memcpy(base + 1, base, sizeof(*base));
+}
+
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ * now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers who are forced to the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ * tmono
+ * ^
+ * | o n
+ * | o n
+ * | u
+ * | o
+ * |o
+ * |12345678---> reader order
+ *
+ * o = old slope
+ * u = update
+ * n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able observe that, the only way
+ * for a CPU local observation is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ } while (read_seqcount_retry(&tkf->seq, seq));
+
+ return now;
+}
+
+u64 ktime_get_mono_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_mono);
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
+u64 ktime_get_raw_fast_ns(void)
+{
+ return __ktime_get_fast_ns(&tk_fast_raw);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+
+/* Suspend-time cycles value for halted fast timekeeper. */
+static cycle_t cycles_at_suspend;
+
+static cycle_t dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+/**
+ * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
+ * @tk: Timekeeper to snapshot.
+ *
+ * It generally is unsafe to access the clocksource after timekeeping has been
+ * suspended, so take a snapshot of the readout base of @tk and use it as the
+ * fast timekeeper's readout base while suspended. It will return the same
+ * number of cycles every time until timekeeping is resumed at which time the
+ * proper readout base for the fast timekeeper will be restored automatically.
+ */
+static void halt_fast_timekeeper(struct timekeeper *tk)
+{
+ static struct tk_read_base tkr_dummy;
+ struct tk_read_base *tkr = &tk->tkr_mono;
+
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ cycles_at_suspend = tkr->read(tkr->clock);
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
+
+ tkr = &tk->tkr_raw;
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
+}
+
+#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
+
+static inline void update_vsyscall(struct timekeeper *tk)
+{
+ struct timespec xt, wm;
+
+ xt = timespec64_to_timespec(tk_xtime(tk));
+ wm = timespec64_to_timespec(tk->wall_to_monotonic);
+ update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
+ tk->tkr_mono.cycle_last);
+}
+
+static inline void old_vsyscall_fixup(struct timekeeper *tk)
+{
+ s64 remainder;
+
+ /*
+ * Store only full nanoseconds into xtime_nsec after rounding
+ * it up and add the remainder to the error difference.
+ * XXX - This is necessary to avoid small 1ns inconsistnecies caused
+ * by truncating the remainder in vsyscalls. However, it causes
+ * additional work to be done in timekeeping_adjust(). Once
+ * the vsyscall implementations are converted to use xtime_nsec
+ * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
+ * users are removed, this can be killed.
+ */
+ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
+ tk->tkr_mono.xtime_nsec -= remainder;
+ tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
+}
+#else
+#define old_vsyscall_fixup(tk)
+#endif
+
+static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
+
+static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
+{
+ raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
+}
+
+/**
+ * pvclock_gtod_register_notifier - register a pvclock timedata update listener
+ */
+int pvclock_gtod_register_notifier(struct notifier_block *nb)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
+ update_pvclock_gtod(tk, true);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
+
+/**
+ * pvclock_gtod_unregister_notifier - unregister a pvclock
+ * timedata update listener
+ */
+int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
+
+/*
+ * Update the ktime_t based scalar nsec members of the timekeeper
+ */
+static inline void tk_update_ktime_data(struct timekeeper *tk)
+{
+ u64 seconds;
+ u32 nsec;
+
+ /*
+ * The xtime based monotonic readout is:
+ * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
+ * The ktime based monotonic readout is:
+ * nsec = base_mono + now();
+ * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
+ */
+ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
+ nsec = (u32) tk->wall_to_monotonic.tv_nsec;
+ tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+
+ /* Update the monotonic raw base */
+ tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
+
+ /*
+ * The sum of the nanoseconds portions of xtime and
+ * wall_to_monotonic can be greater/equal one second. Take
+ * this into account before updating tk->ktime_sec.
+ */
+ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+ if (nsec >= NSEC_PER_SEC)
+ seconds++;
+ tk->ktime_sec = seconds;
+}
+
+/* must hold timekeeper_lock */
+static void timekeeping_update(struct timekeeper *tk, unsigned int action)
+{
+ if (action & TK_CLEAR_NTP) {
+ tk->ntp_error = 0;
+ ntp_clear();
+ }
+
+ tk_update_ktime_data(tk);
+
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+
+ if (action & TK_MIRROR)
+ memcpy(&shadow_timekeeper, &tk_core.timekeeper,
+ sizeof(tk_core.timekeeper));
+
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+}
+
+/**
+ * timekeeping_forward_now - update clock to the current time
+ *
+ * Forward the current clock to update its state since the last call to
+ * update_wall_time(). This is useful before significant clock changes,
+ * as it avoids having to deal with this time offset explicitly.
+ */
+static void timekeeping_forward_now(struct timekeeper *tk)
+{
+ struct clocksource *clock = tk->tkr_mono.clock;
+ cycle_t cycle_now, delta;
+ s64 nsec;
+
+ cycle_now = tk->tkr_mono.read(clock);
+ delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
+
+ tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
+
+ tk_normalize_xtime(tk);
+
+ nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
+ timespec64_add_ns(&tk->raw_time, nsec);
+}
+
+/**
+ * __getnstimeofday64 - Returns the time of day in a timespec64.
+ * @ts: pointer to the timespec to be set
+ *
+ * Updates the time of day in the timespec.
+ * Returns 0 on success, or -ve when suspended (timespec will be undefined).
+ */
+int __getnstimeofday64(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long seq;
+ s64 nsecs = 0;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ts->tv_sec = tk->xtime_sec;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
+
+ /*
+ * Do not bail out early, in case there were callers still using
+ * the value, even in the face of the WARN_ON.
+ */
+ if (unlikely(timekeeping_suspended))
+ return -EAGAIN;
+ return 0;
+}
+EXPORT_SYMBOL(__getnstimeofday64);
+
+/**
+ * getnstimeofday64 - Returns the time of day in a timespec64.
+ * @ts: pointer to the timespec64 to be set
+ *
+ * Returns the time of day in a timespec64 (WARN if suspended).
+ */
+void getnstimeofday64(struct timespec64 *ts)
+{
+ WARN_ON(__getnstimeofday64(ts));
+}
+EXPORT_SYMBOL(getnstimeofday64);
+
+ktime_t ktime_get(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+static ktime_t *offsets[TK_OFFS_MAX] = {
+ [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
+ [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
+ [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
+};
+
+ktime_t ktime_get_with_offset(enum tk_offsets offs)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base, *offset = offsets[offs];
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+
+}
+EXPORT_SYMBOL_GPL(ktime_get_with_offset);
+
+/**
+ * ktime_mono_to_any() - convert mononotic time to any other time
+ * @tmono: time to convert.
+ * @offs: which offset to use
+ */
+ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
+{
+ ktime_t *offset = offsets[offs];
+ unsigned long seq;
+ ktime_t tconv;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ tconv = ktime_add(tmono, *offset);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return tconv;
+}
+EXPORT_SYMBOL_GPL(ktime_mono_to_any);
+
+/**
+ * ktime_get_raw - Returns the raw monotonic time in ktime_t format
+ */
+ktime_t ktime_get_raw(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ s64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = tk->tkr_raw.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw);
+
+/**
+ * ktime_get_ts64 - get the monotonic clock in timespec64 format
+ * @ts: pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec64 format in the variable pointed to by @ts.
+ */
+void ktime_get_ts64(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 tomono;
+ s64 nsec;
+ unsigned int seq;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ ts->tv_sec = tk->xtime_sec;
+ nsec = timekeeping_get_ns(&tk->tkr_mono);
+ tomono = tk->wall_to_monotonic;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ ts->tv_sec += tomono.tv_sec;
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsec + tomono.tv_nsec);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts64);
+
+/**
+ * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
+ *
+ * Returns the seconds portion of CLOCK_MONOTONIC with a single non
+ * serialized read. tk->ktime_sec is of type 'unsigned long' so this
+ * works on both 32 and 64 bit systems. On 32 bit systems the readout
+ * covers ~136 years of uptime which should be enough to prevent
+ * premature wrap arounds.
+ */
+time64_t ktime_get_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ WARN_ON(timekeeping_suspended);
+ return tk->ktime_sec;
+}
+EXPORT_SYMBOL_GPL(ktime_get_seconds);
+
+/**
+ * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
+ *
+ * Returns the wall clock seconds since 1970. This replaces the
+ * get_seconds() interface which is not y2038 safe on 32bit systems.
+ *
+ * For 64bit systems the fast access to tk->xtime_sec is preserved. On
+ * 32bit systems the access must be protected with the sequence
+ * counter to provide "atomic" access to the 64bit tk->xtime_sec
+ * value.
+ */
+time64_t ktime_get_real_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ time64_t seconds;
+ unsigned int seq;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ return tk->xtime_sec;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ seconds = tk->xtime_sec;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return seconds;
+}
+EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
+
+#ifdef CONFIG_NTP_PPS
+
+/**
+ * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * @ts_raw: pointer to the timespec to be set to raw monotonic time
+ * @ts_real: pointer to the timespec to be set to the time of day
+ *
+ * This function reads both the time of day and raw monotonic time at the
+ * same time atomically and stores the resulting timestamps in timespec
+ * format.
+ */
+void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long seq;
+ s64 nsecs_raw, nsecs_real;
+
+ WARN_ON_ONCE(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ *ts_raw = timespec64_to_timespec(tk->raw_time);
+ ts_real->tv_sec = tk->xtime_sec;
+ ts_real->tv_nsec = 0;
+
+ nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
+ nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ timespec_add_ns(ts_raw, nsecs_raw);
+ timespec_add_ns(ts_real, nsecs_real);
+}
+EXPORT_SYMBOL(getnstime_raw_and_real);
+
+#endif /* CONFIG_NTP_PPS */
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv: pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using getnstimeofday()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+ struct timespec64 now;
+
+ getnstimeofday64(&now);
+ tv->tv_sec = now.tv_sec;
+ tv->tv_usec = now.tv_nsec/1000;
+}
+EXPORT_SYMBOL(do_gettimeofday);
+
+/**
+ * do_settimeofday64 - Sets the time of day.
+ * @ts: pointer to the timespec64 variable containing the new time
+ *
+ * Sets the time of day to the new time and update NTP and notify hrtimers
+ */
+int do_settimeofday64(const struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 ts_delta, xt;
+ unsigned long flags;
+
+ if (!timespec64_valid_strict(ts))
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ timekeeping_forward_now(tk);
+
+ xt = tk_xtime(tk);
+ ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
+ ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
+
+ tk_set_xtime(tk, ts);
+
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+EXPORT_SYMBOL(do_settimeofday64);
+
+/**
+ * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tv: pointer to the timespec variable containing the offset
+ *
+ * Adds or subtracts an offset value from the current time.
+ */
+int timekeeping_inject_offset(struct timespec *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+ struct timespec64 ts64, tmp;
+ int ret = 0;
+
+ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ ts64 = timespec_to_timespec64(*ts);
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ timekeeping_forward_now(tk);
+
+ /* Make sure the proposed value is valid */
+ tmp = timespec64_add(tk_xtime(tk), ts64);
+ if (!timespec64_valid_strict(&tmp)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ tk_xtime_add(tk, &ts64);
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
+
+error: /* even if we error out, we forwarded the time, so call update */
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return ret;
+}
+EXPORT_SYMBOL(timekeeping_inject_offset);
+
+
+/**
+ * timekeeping_get_tai_offset - Returns current TAI offset from UTC
+ *
+ */
+s32 timekeeping_get_tai_offset(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ s32 ret;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ ret = tk->tai_offset;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ret;
+}
+
+/**
+ * __timekeeping_set_tai_offset - Lock free worker function
+ *
+ */
+static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
+{
+ tk->tai_offset = tai_offset;
+ tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
+}
+
+/**
+ * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
+ *
+ */
+void timekeeping_set_tai_offset(s32 tai_offset)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+ __timekeeping_set_tai_offset(tk, tai_offset);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ clock_was_set();
+}
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+static int change_clocksource(void *data)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct clocksource *new, *old;
+ unsigned long flags;
+
+ new = (struct clocksource *) data;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ timekeeping_forward_now(tk);
+ /*
+ * If the cs is in module, get a module reference. Succeeds
+ * for built-in code (owner == NULL) as well.
+ */
+ if (try_module_get(new->owner)) {
+ if (!new->enable || new->enable(new) == 0) {
+ old = tk->tkr_mono.clock;
+ tk_setup_internals(tk, new);
+ if (old->disable)
+ old->disable(old);
+ module_put(old->owner);
+ } else {
+ module_put(new->owner);
+ }
+ }
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ return 0;
+}
+
+/**
+ * timekeeping_notify - Install a new clock source
+ * @clock: pointer to the clock source
+ *
+ * This function is called from clocksource.c after a new, better clock
+ * source has been registered. The caller holds the clocksource_mutex.
+ */
+int timekeeping_notify(struct clocksource *clock)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ if (tk->tkr_mono.clock == clock)
+ return 0;
+ stop_machine(change_clocksource, clock, NULL);
+ tick_clock_notify();
+ return tk->tkr_mono.clock == clock ? 0 : -1;
+}
+
+/**
+ * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * @ts: pointer to the timespec64 to be set
+ *
+ * Returns the raw monotonic time (completely un-modified by ntp)
+ */
+void getrawmonotonic64(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 ts64;
+ unsigned long seq;
+ s64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ nsecs = timekeeping_get_ns(&tk->tkr_raw);
+ ts64 = tk->raw_time;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ timespec64_add_ns(&ts64, nsecs);
+ *ts = ts64;
+}
+EXPORT_SYMBOL(getrawmonotonic64);
+
+
+/**
+ * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
+ */
+int timekeeping_valid_for_hres(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long seq;
+ int ret;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ret;
+}
+
+/**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ */
+u64 timekeeping_max_deferment(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long seq;
+ u64 ret;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ret = tk->tkr_mono.clock->max_idle_ns;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ret;
+}
+
+/**
+ * read_persistent_clock - Return time from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Reads the time from the battery backed persistent clock.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+void __weak read_persistent_clock(struct timespec *ts)
+{
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+}
+
+void __weak read_persistent_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_persistent_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
+/**
+ * read_boot_clock - Return time of the system start.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Function to read the exact time the system has been started.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+void __weak read_boot_clock(struct timespec *ts)
+{
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+}
+
+void __weak read_boot_clock64(struct timespec64 *ts64)
+{
+ struct timespec ts;
+
+ read_boot_clock(&ts);
+ *ts64 = timespec_to_timespec64(ts);
+}
+
+/* Flag for if timekeeping_resume() has injected sleeptime */
+static bool sleeptime_injected;
+
+/* Flag for if there is a persistent clock on this platform */
+static bool persistent_clock_exists;
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ */
+void __init timekeeping_init(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct clocksource *clock;
+ unsigned long flags;
+ struct timespec64 now, boot, tmp;
+
+ read_persistent_clock64(&now);
+ if (!timespec64_valid_strict(&now)) {
+ pr_warn("WARNING: Persistent clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ now.tv_sec = 0;
+ now.tv_nsec = 0;
+ } else if (now.tv_sec || now.tv_nsec)
+ persistent_clock_exists = true;
+
+ read_boot_clock64(&boot);
+ if (!timespec64_valid_strict(&boot)) {
+ pr_warn("WARNING: Boot clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ boot.tv_sec = 0;
+ boot.tv_nsec = 0;
+ }
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+ ntp_init();
+
+ clock = clocksource_default_clock();
+ if (clock->enable)
+ clock->enable(clock);
+ tk_setup_internals(tk, clock);
+
+ tk_set_xtime(tk, &now);
+ tk->raw_time.tv_sec = 0;
+ tk->raw_time.tv_nsec = 0;
+ if (boot.tv_sec == 0 && boot.tv_nsec == 0)
+ boot = tk_xtime(tk);
+
+ set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
+ tk_set_wall_to_mono(tk, tmp);
+
+ timekeeping_update(tk, TK_MIRROR);
+
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
+
+/* time in seconds when suspend began for persistent clock */
+static struct timespec64 timekeeping_suspend_time;
+
+/**
+ * __timekeeping_inject_sleeptime - Internal function to add sleep interval
+ * @delta: pointer to a timespec delta value
+ *
+ * Takes a timespec offset measuring a suspend interval and properly
+ * adds the sleep offset to the timekeeping variables.
+ */
+static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
+ struct timespec64 *delta)
+{
+ if (!timespec64_valid_strict(delta)) {
+ printk_deferred(KERN_WARNING
+ "__timekeeping_inject_sleeptime: Invalid "
+ "sleep delta value!\n");
+ return;
+ }
+ tk_xtime_add(tk, delta);
+ tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
+ tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
+ tk_debug_account_sleep_time(delta);
+}
+
+#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
+/**
+ * We have three kinds of time sources to use for sleep time
+ * injection, the preference order is:
+ * 1) non-stop clocksource
+ * 2) persistent clock (ie: RTC accessible when irqs are off)
+ * 3) RTC
+ *
+ * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
+ * If system has neither 1) nor 2), 3) will be used finally.
+ *
+ *
+ * If timekeeping has injected sleeptime via either 1) or 2),
+ * 3) becomes needless, so in this case we don't need to call
+ * rtc_resume(), and this is what timekeeping_rtc_skipresume()
+ * means.
+ */
+bool timekeeping_rtc_skipresume(void)
+{
+ return sleeptime_injected;
+}
+
+/**
+ * 1) can be determined whether to use or not only when doing
+ * timekeeping_resume() which is invoked after rtc_suspend(),
+ * so we can't skip rtc_suspend() surely if system has 1).
+ *
+ * But if system has 2), 2) will definitely be used, so in this
+ * case we don't need to call rtc_suspend(), and this is what
+ * timekeeping_rtc_skipsuspend() means.
+ */
+bool timekeeping_rtc_skipsuspend(void)
+{
+ return persistent_clock_exists;
+}
+
+/**
+ * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
+ * @delta: pointer to a timespec64 delta value
+ *
+ * This hook is for architectures that cannot support read_persistent_clock64
+ * because their RTC/persistent clock is only accessible when irqs are enabled.
+ * and also don't have an effective nonstop clocksource.
+ *
+ * This function should only be called by rtc_resume(), and allows
+ * a suspend offset to be injected into the timekeeping values.
+ */
+void timekeeping_inject_sleeptime64(struct timespec64 *delta)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ timekeeping_forward_now(tk);
+
+ __timekeeping_inject_sleeptime(tk, delta);
+
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+}
+#endif
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ */
+void timekeeping_resume(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct clocksource *clock = tk->tkr_mono.clock;
+ unsigned long flags;
+ struct timespec64 ts_new, ts_delta;
+ cycle_t cycle_now, cycle_delta;
+
+ sleeptime_injected = false;
+ read_persistent_clock64(&ts_new);
+
+ clockevents_resume();
+ clocksource_resume();
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ /*
+ * After system resumes, we need to calculate the suspended time and
+ * compensate it for the OS time. There are 3 sources that could be
+ * used: Nonstop clocksource during suspend, persistent clock and rtc
+ * device.
+ *
+ * One specific platform may have 1 or 2 or all of them, and the
+ * preference will be:
+ * suspend-nonstop clocksource -> persistent clock -> rtc
+ * The less preferred source will only be tried if there is no better
+ * usable source. The rtc part is handled separately in rtc core code.
+ */
+ cycle_now = tk->tkr_mono.read(clock);
+ if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
+ cycle_now > tk->tkr_mono.cycle_last) {
+ u64 num, max = ULLONG_MAX;
+ u32 mult = clock->mult;
+ u32 shift = clock->shift;
+ s64 nsec = 0;
+
+ cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+ tk->tkr_mono.mask);
+
+ /*
+ * "cycle_delta * mutl" may cause 64 bits overflow, if the
+ * suspended time is too long. In that case we need do the
+ * 64 bits math carefully
+ */
+ do_div(max, mult);
+ if (cycle_delta > max) {
+ num = div64_u64(cycle_delta, max);
+ nsec = (((u64) max * mult) >> shift) * num;
+ cycle_delta -= num * max;
+ }
+ nsec += ((u64) cycle_delta * mult) >> shift;
+
+ ts_delta = ns_to_timespec64(nsec);
+ sleeptime_injected = true;
+ } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
+ ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
+ sleeptime_injected = true;
+ }
+
+ if (sleeptime_injected)
+ __timekeeping_inject_sleeptime(tk, &ts_delta);
+
+ /* Re-base the last cycle value */
+ tk->tkr_mono.cycle_last = cycle_now;
+ tk->tkr_raw.cycle_last = cycle_now;
+
+ tk->ntp_error = 0;
+ timekeeping_suspended = 0;
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ touch_softlockup_watchdog();
+
+ tick_resume();
+ hrtimers_resume();
+}
+
+int timekeeping_suspend(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+ struct timespec64 delta, delta_delta;
+ static struct timespec64 old_delta;
+
+ read_persistent_clock64(&timekeeping_suspend_time);
+
+ /*
+ * On some systems the persistent_clock can not be detected at
+ * timekeeping_init by its return value, so if we see a valid
+ * value returned, update the persistent_clock_exists flag.
+ */
+ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
+ persistent_clock_exists = true;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+ timekeeping_forward_now(tk);
+ timekeeping_suspended = 1;
+
+ if (persistent_clock_exists) {
+ /*
+ * To avoid drift caused by repeated suspend/resumes,
+ * which each can add ~1 second drift error,
+ * try to compensate so the difference in system time
+ * and persistent_clock time stays close to constant.
+ */
+ delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
+ delta_delta = timespec64_sub(delta, old_delta);
+ if (abs(delta_delta.tv_sec) >= 2) {
+ /*
+ * if delta_delta is too large, assume time correction
+ * has occurred and set old_delta to the current delta.
+ */
+ old_delta = delta;
+ } else {
+ /* Otherwise try to adjust old_system to compensate */
+ timekeeping_suspend_time =
+ timespec64_add(timekeeping_suspend_time, delta_delta);
+ }
+ }
+
+ timekeeping_update(tk, TK_MIRROR);
+ halt_fast_timekeeper(tk);
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ tick_suspend();
+ clocksource_suspend();
+ clockevents_suspend();
+
+ return 0;
+}
+
+/* sysfs resume/suspend bits for timekeeping */
+static struct syscore_ops timekeeping_syscore_ops = {
+ .resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
+};
+
+static int __init timekeeping_init_ops(void)
+{
+ register_syscore_ops(&timekeeping_syscore_ops);
+ return 0;
+}
+device_initcall(timekeeping_init_ops);
+
+/*
+ * Apply a multiplier adjustment to the timekeeper
+ */
+static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
+ s64 offset,
+ bool negative,
+ int adj_scale)
+{
+ s64 interval = tk->cycle_interval;
+ s32 mult_adj = 1;
+
+ if (negative) {
+ mult_adj = -mult_adj;
+ interval = -interval;
+ offset = -offset;
+ }
+ mult_adj <<= adj_scale;
+ interval <<= adj_scale;
+ offset <<= adj_scale;
+
+ /*
+ * So the following can be confusing.
+ *
+ * To keep things simple, lets assume mult_adj == 1 for now.
+ *
+ * When mult_adj != 1, remember that the interval and offset values
+ * have been appropriately scaled so the math is the same.
+ *
+ * The basic idea here is that we're increasing the multiplier
+ * by one, this causes the xtime_interval to be incremented by
+ * one cycle_interval. This is because:
+ * xtime_interval = cycle_interval * mult
+ * So if mult is being incremented by one:
+ * xtime_interval = cycle_interval * (mult + 1)
+ * Its the same as:
+ * xtime_interval = (cycle_interval * mult) + cycle_interval
+ * Which can be shortened to:
+ * xtime_interval += cycle_interval
+ *
+ * So offset stores the non-accumulated cycles. Thus the current
+ * time (in shifted nanoseconds) is:
+ * now = (offset * adj) + xtime_nsec
+ * Now, even though we're adjusting the clock frequency, we have
+ * to keep time consistent. In other words, we can't jump back
+ * in time, and we also want to avoid jumping forward in time.
+ *
+ * So given the same offset value, we need the time to be the same
+ * both before and after the freq adjustment.
+ * now = (offset * adj_1) + xtime_nsec_1
+ * now = (offset * adj_2) + xtime_nsec_2
+ * So:
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * adj_2) + xtime_nsec_2
+ * And we know:
+ * adj_2 = adj_1 + 1
+ * So:
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * (adj_1+1)) + xtime_nsec_2
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * adj_1) + offset + xtime_nsec_2
+ * Canceling the sides:
+ * xtime_nsec_1 = offset + xtime_nsec_2
+ * Which gives us:
+ * xtime_nsec_2 = xtime_nsec_1 - offset
+ * Which simplfies to:
+ * xtime_nsec -= offset
+ *
+ * XXX - TODO: Doc ntp_error calculation.
+ */
+ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
+ /* NTP adjustment caused clocksource mult overflow */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ tk->tkr_mono.mult += mult_adj;
+ tk->xtime_interval += interval;
+ tk->tkr_mono.xtime_nsec -= offset;
+ tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
+}
+
+/*
+ * Calculate the multiplier adjustment needed to match the frequency
+ * specified by NTP
+ */
+static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
+ s64 offset)
+{
+ s64 interval = tk->cycle_interval;
+ s64 xinterval = tk->xtime_interval;
+ s64 tick_error;
+ bool negative;
+ u32 adj;
+
+ /* Remove any current error adj from freq calculation */
+ if (tk->ntp_err_mult)
+ xinterval -= tk->cycle_interval;
+
+ tk->ntp_tick = ntp_tick_length();
+
+ /* Calculate current error per tick */
+ tick_error = ntp_tick_length() >> tk->ntp_error_shift;
+ tick_error -= (xinterval + tk->xtime_remainder);
+
+ /* Don't worry about correcting it if its small */
+ if (likely((tick_error >= 0) && (tick_error <= interval)))
+ return;
+
+ /* preserve the direction of correction */
+ negative = (tick_error < 0);
+
+ /* Sort out the magnitude of the correction */
+ tick_error = abs(tick_error);
+ for (adj = 0; tick_error > interval; adj++)
+ tick_error >>= 1;
+
+ /* scale the corrections */
+ timekeeping_apply_adjustment(tk, offset, negative, adj);
+}
+
+/*
+ * Adjust the timekeeper's multiplier to the correct frequency
+ * and also to reduce the accumulated error value.
+ */
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
+{
+ /* Correct for the current frequency error */
+ timekeeping_freqadjust(tk, offset);
+
+ /* Next make a small adjustment to fix any cumulative error */
+ if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
+ tk->ntp_err_mult = 1;
+ timekeeping_apply_adjustment(tk, offset, 0, 0);
+ } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
+ /* Undo any existing error adjustment */
+ timekeeping_apply_adjustment(tk, offset, 1, 0);
+ tk->ntp_err_mult = 0;
+ }
+
+ if (unlikely(tk->tkr_mono.clock->maxadj &&
+ (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
+ > tk->tkr_mono.clock->maxadj))) {
+ printk_once(KERN_WARNING
+ "Adjusting %s more than 11%% (%ld vs %ld)\n",
+ tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
+ (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
+ }
+
+ /*
+ * It may be possible that when we entered this function, xtime_nsec
+ * was very small. Further, if we're slightly speeding the clocksource
+ * in the code above, its possible the required corrective factor to
+ * xtime_nsec could cause it to underflow.
+ *
+ * Now, since we already accumulated the second, cannot simply roll
+ * the accumulated second back, since the NTP subsystem has been
+ * notified via second_overflow. So instead we push xtime_nsec forward
+ * by the amount we underflowed, and add that amount into the error.
+ *
+ * We'll correct this error next time through this function, when
+ * xtime_nsec is not as small.
+ */
+ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
+ tk->tkr_mono.xtime_nsec = 0;
+ tk->ntp_error += neg << tk->ntp_error_shift;
+ }
+}
+
+/**
+ * accumulate_nsecs_to_secs - Accumulates nsecs into secs
+ *
+ * Helper function that accumulates a the nsecs greater then a second
+ * from the xtime_nsec field to the xtime_secs field.
+ * It also calls into the NTP code to handle leapsecond processing.
+ *
+ */
+static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
+{
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
+ unsigned int clock_set = 0;
+
+ while (tk->tkr_mono.xtime_nsec >= nsecps) {
+ int leap;
+
+ tk->tkr_mono.xtime_nsec -= nsecps;
+ tk->xtime_sec++;
+
+ /* Figure out if its a leap sec and apply if needed */
+ leap = second_overflow(tk->xtime_sec);
+ if (unlikely(leap)) {
+ struct timespec64 ts;
+
+ tk->xtime_sec += leap;
+
+ ts.tv_sec = leap;
+ ts.tv_nsec = 0;
+ tk_set_wall_to_mono(tk,
+ timespec64_sub(tk->wall_to_monotonic, ts));
+
+ __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
+
+ clock_set = TK_CLOCK_WAS_SET;
+ }
+ }
+ return clock_set;
+}
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This functions accumulates a shifted interval of cycles into
+ * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
+ u32 shift,
+ unsigned int *clock_set)
+{
+ cycle_t interval = tk->cycle_interval << shift;
+ u64 raw_nsecs;
+
+ /* If the offset is smaller then a shifted interval, do nothing */
+ if (offset < interval)
+ return offset;
+
+ /* Accumulate one shifted interval */
+ offset -= interval;
+ tk->tkr_mono.cycle_last += interval;
+ tk->tkr_raw.cycle_last += interval;
+
+ tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
+ *clock_set |= accumulate_nsecs_to_secs(tk);
+
+ /* Accumulate raw time */
+ raw_nsecs = (u64)tk->raw_interval << shift;
+ raw_nsecs += tk->raw_time.tv_nsec;
+ if (raw_nsecs >= NSEC_PER_SEC) {
+ u64 raw_secs = raw_nsecs;
+ raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
+ tk->raw_time.tv_sec += raw_secs;
+ }
+ tk->raw_time.tv_nsec = raw_nsecs;
+
+ /* Accumulate error between NTP and clock interval */
+ tk->ntp_error += tk->ntp_tick << shift;
+ tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
+ (tk->ntp_error_shift + shift);
+
+ return offset;
+}
+
+/**
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ */
+void update_wall_time(void)
+{
+ struct timekeeper *real_tk = &tk_core.timekeeper;
+ struct timekeeper *tk = &shadow_timekeeper;
+ cycle_t offset;
+ int shift = 0, maxshift;
+ unsigned int clock_set = 0;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ goto out;
+
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+ offset = real_tk->cycle_interval;
+#else
+ offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+#endif
+
+ /* Check if there's really nothing to do */
+ if (offset < real_tk->cycle_interval)
+ goto out;
+
+ /* Do some additional sanity checking */
+ timekeeping_check_update(real_tk, offset);
+
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
+ */
+ shift = ilog2(offset) - ilog2(tk->cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
+ shift = min(shift, maxshift);
+ while (offset >= tk->cycle_interval) {
+ offset = logarithmic_accumulation(tk, offset, shift,
+ &clock_set);
+ if (offset < tk->cycle_interval<<shift)
+ shift--;
+ }
+
+ /* correct the clock when NTP error is too big */
+ timekeeping_adjust(tk, offset);
+
+ /*
+ * XXX This can be killed once everyone converts
+ * to the new update_vsyscall.
+ */
+ old_vsyscall_fixup(tk);
+
+ /*
+ * Finally, make sure that after the rounding
+ * xtime_nsec isn't larger than NSEC_PER_SEC
+ */
+ clock_set |= accumulate_nsecs_to_secs(tk);
+
+ write_seqcount_begin(&tk_core.seq);
+ /*
+ * Update the real timekeeper.
+ *
+ * We could avoid this memcpy by switching pointers, but that
+ * requires changes to all other timekeeper usage sites as
+ * well, i.e. move the timekeeper pointer getter into the
+ * spinlocked/seqcount protected sections. And we trade this
+ * memcpy under the tk_core.seq against one before we start
+ * updating.
+ */
+ memcpy(real_tk, tk, sizeof(*tk));
+ timekeeping_update(real_tk, clock_set);
+ write_seqcount_end(&tk_core.seq);
+out:
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (clock_set)
+ /* Have to call _delayed version, since in irq context*/
+ clock_was_set_delayed();
+}
+
+/**
+ * getboottime64 - Return the real time of system boot.
+ * @ts: pointer to the timespec64 to be set
+ *
+ * Returns the wall-time of boot in a timespec64.
+ *
+ * This is based on the wall_to_monotonic offset and the total suspend
+ * time. Calls to settimeofday will affect the value returned (which
+ * basically means that however wrong your real time clock is at boot time,
+ * you get the right time here).
+ */
+void getboottime64(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
+
+ *ts = ktime_to_timespec64(t);
+}
+EXPORT_SYMBOL_GPL(getboottime64);
+
+unsigned long get_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return tk->xtime_sec;
+}
+EXPORT_SYMBOL(get_seconds);
+
+struct timespec __current_kernel_time(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return timespec64_to_timespec(tk_xtime(tk));
+}
+
+struct timespec current_kernel_time(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 now;
+ unsigned long seq;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ now = tk_xtime(tk);
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return timespec64_to_timespec(now);
+}
+EXPORT_SYMBOL(current_kernel_time);
+
+struct timespec64 get_monotonic_coarse64(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct timespec64 now, mono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ now = tk_xtime(tk);
+ mono = tk->wall_to_monotonic;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
+ now.tv_nsec + mono.tv_nsec);
+
+ return now;
+}
+
+/*
+ * Must hold jiffies_lock
+ */
+void do_timer(unsigned long ticks)
+{
+ jiffies_64 += ticks;
+ calc_global_load(ticks);
+}
+
+/**
+ * ktime_get_update_offsets_tick - hrtimer helper
+ * @offs_real: pointer to storage for monotonic -> realtime offset
+ * @offs_boot: pointer to storage for monotonic -> boottime offset
+ * @offs_tai: pointer to storage for monotonic -> clock tai offset
+ *
+ * Returns monotonic time at last tick and various offsets
+ */
+ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
+ ktime_t *offs_tai)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ base = tk->tkr_mono.base;
+ nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * ktime_get_update_offsets_now - hrtimer helper
+ * @offs_real: pointer to storage for monotonic -> realtime offset
+ * @offs_boot: pointer to storage for monotonic -> boottime offset
+ * @offs_tai: pointer to storage for monotonic -> clock tai offset
+ *
+ * Returns current monotonic time and updates the offsets
+ * Called from hrtimer_interrupt() or retrigger_next_event()
+ */
+ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
+ ktime_t *offs_tai)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ base = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ *offs_tai = tk->offs_tai;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+}
+#endif
+
+/**
+ * do_adjtimex() - Accessor function to NTP __do_adjtimex function
+ */
+int do_adjtimex(struct timex *txc)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+ struct timespec64 ts;
+ s32 orig_tai, tai;
+ int ret;
+
+ /* Validate the data before disabling interrupts */
+ ret = ntp_validate_timex(txc);
+ if (ret)
+ return ret;
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ struct timespec delta;
+ delta.tv_sec = txc->time.tv_sec;
+ delta.tv_nsec = txc->time.tv_usec;
+ if (!(txc->modes & ADJ_NANO))
+ delta.tv_nsec *= 1000;
+ ret = timekeeping_inject_offset(&delta);
+ if (ret)
+ return ret;
+ }
+
+ getnstimeofday64(&ts);
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ orig_tai = tai = tk->tai_offset;
+ ret = __do_adjtimex(txc, &ts, &tai);
+
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tk, tai);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ }
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ if (tai != orig_tai)
+ clock_was_set();
+
+ ntp_notify_cmos_timer();
+
+ return ret;
+}
+
+#ifdef CONFIG_NTP_PPS
+/**
+ * hardpps() - Accessor function to NTP __hardpps function
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+
+ __hardpps(phase_ts, raw_ts);
+
+ write_seqcount_end(&tk_core.seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+#endif
+
+/**
+ * xtime_update() - advances the timekeeping infrastructure
+ * @ticks: number of ticks, that have elapsed since the last call.
+ *
+ * Must be called with interrupts disabled.
+ */
+void xtime_update(unsigned long ticks)
+{
+ write_seqlock(&jiffies_lock);
+ do_timer(ticks);
+ write_sequnlock(&jiffies_lock);
+ update_wall_time();
+}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
new file mode 100644
index 000000000..ead8794b9
--- /dev/null
+++ b/kernel/time/timekeeping.h
@@ -0,0 +1,29 @@
+#ifndef _KERNEL_TIME_TIMEKEEPING_H
+#define _KERNEL_TIME_TIMEKEEPING_H
+/*
+ * Internal interfaces for kernel/time/
+ */
+extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
+ ktime_t *offs_boot,
+ ktime_t *offs_tai);
+
+extern int timekeeping_valid_for_hres(void);
+extern u64 timekeeping_max_deferment(void);
+extern int timekeeping_inject_offset(struct timespec *ts);
+extern s32 timekeeping_get_tai_offset(void);
+extern void timekeeping_set_tai_offset(s32 tai_offset);
+extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
+
+extern void do_timer(unsigned long ticks);
+extern void update_wall_time(void);
+
+extern seqlock_t jiffies_lock;
+
+#define CS_NAME_LEN 32
+
+#endif
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
new file mode 100644
index 000000000..f6bd65236
--- /dev/null
+++ b/kernel/time/timekeeping_debug.c
@@ -0,0 +1,74 @@
+/*
+ * debugfs file to track time spent in suspend
+ *
+ * Copyright (c) 2011, Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
+
+#include "timekeeping_internal.h"
+
+static unsigned int sleep_time_bin[32] = {0};
+
+static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
+{
+ unsigned int bin;
+ seq_puts(s, " time (secs) count\n");
+ seq_puts(s, "------------------------------\n");
+ for (bin = 0; bin < 32; bin++) {
+ if (sleep_time_bin[bin] == 0)
+ continue;
+ seq_printf(s, "%10u - %-10u %4u\n",
+ bin ? 1 << (bin - 1) : 0, 1 << bin,
+ sleep_time_bin[bin]);
+ }
+ return 0;
+}
+
+static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, tk_debug_show_sleep_time, NULL);
+}
+
+static const struct file_operations tk_debug_sleep_time_fops = {
+ .open = tk_debug_sleep_time_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init tk_debug_sleep_time_init(void)
+{
+ struct dentry *d;
+
+ d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
+ &tk_debug_sleep_time_fops);
+ if (!d) {
+ pr_err("Failed to create sleep_time debug file\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+late_initcall(tk_debug_sleep_time_init);
+
+void tk_debug_account_sleep_time(struct timespec64 *t)
+{
+ sleep_time_bin[fls(t->tv_sec)]++;
+}
+
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
new file mode 100644
index 000000000..4ea005a7f
--- /dev/null
+++ b/kernel/time/timekeeping_internal.h
@@ -0,0 +1,29 @@
+#ifndef _TIMEKEEPING_INTERNAL_H
+#define _TIMEKEEPING_INTERNAL_H
+/*
+ * timekeeping debug functions
+ */
+#include <linux/clocksource.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_FS
+extern void tk_debug_account_sleep_time(struct timespec64 *t);
+#else
+#define tk_debug_account_sleep_time(x)
+#endif
+
+#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ cycle_t ret = (now - last) & mask;
+
+ return (s64) ret > 0 ? ret : 0;
+}
+#else
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ return (now - last) & mask;
+}
+#endif
+
+#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
new file mode 100644
index 000000000..2ece3aa50
--- /dev/null
+++ b/kernel/time/timer.c
@@ -0,0 +1,1720 @@
+/*
+ * linux/kernel/timer.c
+ *
+ * Kernel internal timers
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ *
+ * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
+ * "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ * serialize accesses to xtime/lost_ticks).
+ * Copyright (C) 1998 Andrea Arcangeli
+ * 1999-03-10 Improved NTP compatibility by Ulrich Windl
+ * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
+ * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
+ * Copyright (C) 2000, 2001, 2002 Ingo Molnar
+ * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pid_namespace.h>
+#include <linux/notifier.h>
+#include <linux/thread_info.h>
+#include <linux/time.h>
+#include <linux/jiffies.h>
+#include <linux/posix-timers.h>
+#include <linux/cpu.h>
+#include <linux/syscalls.h>
+#include <linux/delay.h>
+#include <linux/tick.h>
+#include <linux/kallsyms.h>
+#include <linux/irq_work.h>
+#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/div64.h>
+#include <asm/timex.h>
+#include <asm/io.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/timer.h>
+
+__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
+/*
+ * per-CPU timer vector definitions:
+ */
+#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
+#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
+
+struct tvec {
+ struct list_head vec[TVN_SIZE];
+};
+
+struct tvec_root {
+ struct list_head vec[TVR_SIZE];
+};
+
+struct tvec_base {
+ spinlock_t lock;
+ struct timer_list *running_timer;
+ unsigned long timer_jiffies;
+ unsigned long next_timer;
+ unsigned long active_timers;
+ unsigned long all_timers;
+ int cpu;
+ struct tvec_root tv1;
+ struct tvec tv2;
+ struct tvec tv3;
+ struct tvec tv4;
+ struct tvec tv5;
+} ____cacheline_aligned;
+
+/*
+ * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
+ * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
+ * pointer to per-cpu entries because we don't know where we'll map the section,
+ * even for the boot cpu.
+ *
+ * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
+ * rest of them.
+ */
+struct tvec_base boot_tvec_bases;
+EXPORT_SYMBOL(boot_tvec_bases);
+
+static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+
+/* Functions below help us manage 'deferrable' flag */
+static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
+{
+ return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
+}
+
+static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
+{
+ return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
+}
+
+static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+{
+ return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
+}
+
+static inline void
+timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
+{
+ unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
+
+ timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
+}
+
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
+ bool force_up)
+{
+ int rem;
+ unsigned long original = j;
+
+ /*
+ * We don't want all cpus firing their timers at once hitting the
+ * same lock or cachelines, so we skew each extra cpu with an extra
+ * 3 jiffies. This 3 jiffies came originally from the mm/ code which
+ * already did this.
+ * The skew is done by adding 3*cpunr, then round, then subtract this
+ * extra offset again.
+ */
+ j += cpu * 3;
+
+ rem = j % HZ;
+
+ /*
+ * If the target jiffie is just after a whole second (which can happen
+ * due to delays of the timer irq, long irq off times etc etc) then
+ * we should round down to the whole second, not up. Use 1/4th second
+ * as cutoff for this rounding as an extreme upper bound for this.
+ * But never round down if @force_up is set.
+ */
+ if (rem < HZ/4 && !force_up) /* round down */
+ j = j - rem;
+ else /* round up */
+ j = j - rem + HZ;
+
+ /* now that we have rounded, subtract the extra skew again */
+ j -= cpu * 3;
+
+ /*
+ * Make sure j is still in the future. Otherwise return the
+ * unmodified value.
+ */
+ return time_is_after_jiffies(j) ? j : original;
+}
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, false);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies);
+
+/**
+ * __round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, false) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_relative);
+
+/**
+ * round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long round_jiffies(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), false);
+}
+EXPORT_SYMBOL_GPL(round_jiffies);
+
+/**
+ * round_jiffies_relative - function to round jiffies to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * round_jiffies_relative() rounds a time delta in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long round_jiffies_relative(unsigned long j)
+{
+ return __round_jiffies_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_relative);
+
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+ return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
+
+/**
+ * set_timer_slack - set the allowed slack for a timer
+ * @timer: the timer to be modified
+ * @slack_hz: the amount of time (in jiffies) allowed for rounding
+ *
+ * Set the amount of time, in jiffies, that a certain timer has
+ * in terms of slack. By setting this value, the timer subsystem
+ * will schedule the actual timer somewhere between
+ * the time mod_timer() asks for, and that time plus the slack.
+ *
+ * By setting the slack to -1, a percentage of the delay is used
+ * instead.
+ */
+void set_timer_slack(struct timer_list *timer, int slack_hz)
+{
+ timer->slack = slack_hz;
+}
+EXPORT_SYMBOL_GPL(set_timer_slack);
+
+/*
+ * If the list is empty, catch up ->timer_jiffies to the current time.
+ * The caller must hold the tvec_base lock. Returns true if the list
+ * was empty and therefore ->timer_jiffies was updated.
+ */
+static bool catchup_timer_jiffies(struct tvec_base *base)
+{
+ if (!base->all_timers) {
+ base->timer_jiffies = jiffies;
+ return true;
+ }
+ return false;
+}
+
+static void
+__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+{
+ unsigned long expires = timer->expires;
+ unsigned long idx = expires - base->timer_jiffies;
+ struct list_head *vec;
+
+ if (idx < TVR_SIZE) {
+ int i = expires & TVR_MASK;
+ vec = base->tv1.vec + i;
+ } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+ int i = (expires >> TVR_BITS) & TVN_MASK;
+ vec = base->tv2.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+ vec = base->tv3.vec + i;
+ } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+ int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+ vec = base->tv4.vec + i;
+ } else if ((signed long) idx < 0) {
+ /*
+ * Can happen if you add a timer with expires == jiffies,
+ * or you set a timer to go off in the past
+ */
+ vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+ } else {
+ int i;
+ /* If the timeout is larger than MAX_TVAL (on 64-bit
+ * architectures or with CONFIG_BASE_SMALL=1) then we
+ * use the maximum timeout.
+ */
+ if (idx > MAX_TVAL) {
+ idx = MAX_TVAL;
+ expires = idx + base->timer_jiffies;
+ }
+ i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+ vec = base->tv5.vec + i;
+ }
+ /*
+ * Timers are FIFO:
+ */
+ list_add_tail(&timer->entry, vec);
+}
+
+static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+{
+ (void)catchup_timer_jiffies(base);
+ __internal_add_timer(base, timer);
+ /*
+ * Update base->active_timers and base->next_timer
+ */
+ if (!tbase_get_deferrable(timer->base)) {
+ if (!base->active_timers++ ||
+ time_before(timer->expires, base->next_timer))
+ base->next_timer = timer->expires;
+ }
+ base->all_timers++;
+
+ /*
+ * Check whether the other CPU is in dynticks mode and needs
+ * to be triggered to reevaluate the timer wheel.
+ * We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to stop its tick can not
+ * evaluate the timer wheel.
+ *
+ * Spare the IPI for deferrable timers on idle targets though.
+ * The next busy ticks will take care of it. Except full dynticks
+ * require special care against races with idle_cpu(), lets deal
+ * with that later.
+ */
+ if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
+ wake_up_nohz_cpu(base->cpu);
+}
+
+#ifdef CONFIG_TIMER_STATS
+void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
+{
+ if (timer->start_site)
+ return;
+
+ timer->start_site = addr;
+ memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+ timer->start_pid = current->pid;
+}
+
+static void timer_stats_account_timer(struct timer_list *timer)
+{
+ unsigned int flag = 0;
+
+ if (likely(!timer->start_site))
+ return;
+ if (unlikely(tbase_get_deferrable(timer->base)))
+ flag |= TIMER_STATS_FLAG_DEFERRABLE;
+
+ timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+ timer->function, timer->start_comm, flag);
+}
+
+#else
+static void timer_stats_account_timer(struct timer_list *timer) {}
+#endif
+
+#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
+
+static struct debug_obj_descr timer_debug_descr;
+
+static void *timer_debug_hint(void *addr)
+{
+ return ((struct timer_list *) addr)->function;
+}
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int timer_fixup_init(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ del_timer_sync(timer);
+ debug_object_init(timer, &timer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/* Stub timer callback for improperly used timers. */
+static void stub_timer(unsigned long data)
+{
+ WARN_ON(1);
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ /*
+ * This is not really a fixup. The timer was
+ * statically initialized. We just make sure that it
+ * is tracked in the object tracker.
+ */
+ if (timer->entry.next == NULL &&
+ timer->entry.prev == TIMER_ENTRY_STATIC) {
+ debug_object_init(timer, &timer_debug_descr);
+ debug_object_activate(timer, &timer_debug_descr);
+ return 0;
+ } else {
+ setup_timer(timer, stub_timer, 0);
+ return 1;
+ }
+ return 0;
+
+ case ODEBUG_STATE_ACTIVE:
+ WARN_ON(1);
+
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int timer_fixup_free(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ del_timer_sync(timer);
+ debug_object_free(timer, &timer_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_assert_init is called when:
+ * - an untracked/uninit-ed object is found
+ */
+static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_NOTAVAILABLE:
+ if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+ /*
+ * This is not really a fixup. The timer was
+ * statically initialized. We just make sure that it
+ * is tracked in the object tracker.
+ */
+ debug_object_init(timer, &timer_debug_descr);
+ return 0;
+ } else {
+ setup_timer(timer, stub_timer, 0);
+ return 1;
+ }
+ default:
+ return 0;
+ }
+}
+
+static struct debug_obj_descr timer_debug_descr = {
+ .name = "timer_list",
+ .debug_hint = timer_debug_hint,
+ .fixup_init = timer_fixup_init,
+ .fixup_activate = timer_fixup_activate,
+ .fixup_free = timer_fixup_free,
+ .fixup_assert_init = timer_fixup_assert_init,
+};
+
+static inline void debug_timer_init(struct timer_list *timer)
+{
+ debug_object_init(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_activate(struct timer_list *timer)
+{
+ debug_object_activate(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_deactivate(struct timer_list *timer)
+{
+ debug_object_deactivate(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_free(struct timer_list *timer)
+{
+ debug_object_free(timer, &timer_debug_descr);
+}
+
+static inline void debug_timer_assert_init(struct timer_list *timer)
+{
+ debug_object_assert_init(timer, &timer_debug_descr);
+}
+
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key);
+
+void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
+{
+ debug_object_init_on_stack(timer, &timer_debug_descr);
+ do_init_timer(timer, flags, name, key);
+}
+EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
+
+void destroy_timer_on_stack(struct timer_list *timer)
+{
+ debug_object_free(timer, &timer_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+
+#else
+static inline void debug_timer_init(struct timer_list *timer) { }
+static inline void debug_timer_activate(struct timer_list *timer) { }
+static inline void debug_timer_deactivate(struct timer_list *timer) { }
+static inline void debug_timer_assert_init(struct timer_list *timer) { }
+#endif
+
+static inline void debug_init(struct timer_list *timer)
+{
+ debug_timer_init(timer);
+ trace_timer_init(timer);
+}
+
+static inline void
+debug_activate(struct timer_list *timer, unsigned long expires)
+{
+ debug_timer_activate(timer);
+ trace_timer_start(timer, expires);
+}
+
+static inline void debug_deactivate(struct timer_list *timer)
+{
+ debug_timer_deactivate(timer);
+ trace_timer_cancel(timer);
+}
+
+static inline void debug_assert_init(struct timer_list *timer)
+{
+ debug_timer_assert_init(timer);
+}
+
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
+{
+ struct tvec_base *base = raw_cpu_read(tvec_bases);
+
+ timer->entry.next = NULL;
+ timer->base = (void *)((unsigned long)base | flags);
+ timer->slack = -1;
+#ifdef CONFIG_TIMER_STATS
+ timer->start_site = NULL;
+ timer->start_pid = -1;
+ memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
+ lockdep_init_map(&timer->lockdep_map, name, key, 0);
+}
+
+/**
+ * init_timer_key - initialize a timer
+ * @timer: the timer to be initialized
+ * @flags: timer flags
+ * @name: name of the timer
+ * @key: lockdep class key of the fake lock used for tracking timer
+ * sync lock dependencies
+ *
+ * init_timer_key() must be done to a timer prior calling *any* of the
+ * other timer functions.
+ */
+void init_timer_key(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
+{
+ debug_init(timer);
+ do_init_timer(timer, flags, name, key);
+}
+EXPORT_SYMBOL(init_timer_key);
+
+static inline void detach_timer(struct timer_list *timer, bool clear_pending)
+{
+ struct list_head *entry = &timer->entry;
+
+ debug_deactivate(timer);
+
+ __list_del(entry->prev, entry->next);
+ if (clear_pending)
+ entry->next = NULL;
+ entry->prev = LIST_POISON2;
+}
+
+static inline void
+detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
+{
+ detach_timer(timer, true);
+ if (!tbase_get_deferrable(timer->base))
+ base->active_timers--;
+ base->all_timers--;
+ (void)catchup_timer_jiffies(base);
+}
+
+static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+ bool clear_pending)
+{
+ if (!timer_pending(timer))
+ return 0;
+
+ detach_timer(timer, clear_pending);
+ if (!tbase_get_deferrable(timer->base)) {
+ base->active_timers--;
+ if (timer->expires == base->next_timer)
+ base->next_timer = base->timer_jiffies;
+ }
+ base->all_timers--;
+ (void)catchup_timer_jiffies(base);
+ return 1;
+}
+
+/*
+ * We are using hashed locking: holding per_cpu(tvec_bases).lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on ->tvX lists.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static struct tvec_base *lock_timer_base(struct timer_list *timer,
+ unsigned long *flags)
+ __acquires(timer->base->lock)
+{
+ struct tvec_base *base;
+
+ for (;;) {
+ struct tvec_base *prelock_base = timer->base;
+ base = tbase_get_base(prelock_base);
+ if (likely(base != NULL)) {
+ spin_lock_irqsave(&base->lock, *flags);
+ if (likely(prelock_base == timer->base))
+ return base;
+ /* The timer has migrated to another CPU */
+ spin_unlock_irqrestore(&base->lock, *flags);
+ }
+ cpu_relax();
+ }
+}
+
+static inline int
+__mod_timer(struct timer_list *timer, unsigned long expires,
+ bool pending_only, int pinned)
+{
+ struct tvec_base *base, *new_base;
+ unsigned long flags;
+ int ret = 0 , cpu;
+
+ timer_stats_timer_set_start_info(timer);
+ BUG_ON(!timer->function);
+
+ base = lock_timer_base(timer, &flags);
+
+ ret = detach_if_pending(timer, base, false);
+ if (!ret && pending_only)
+ goto out_unlock;
+
+ debug_activate(timer, expires);
+
+ cpu = get_nohz_timer_target(pinned);
+ new_base = per_cpu(tvec_bases, cpu);
+
+ if (base != new_base) {
+ /*
+ * We are trying to schedule the timer on the local CPU.
+ * However we can't change timer's base while it is running,
+ * otherwise del_timer_sync() can't detect that the timer's
+ * handler yet has not finished. This also guarantees that
+ * the timer is serialized wrt itself.
+ */
+ if (likely(base->running_timer != timer)) {
+ /* See the comment in lock_timer_base() */
+ timer_set_base(timer, NULL);
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ timer_set_base(timer, base);
+ }
+ }
+
+ timer->expires = expires;
+ internal_add_timer(base, timer);
+
+out_unlock:
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ return ret;
+}
+
+/**
+ * mod_timer_pending - modify a pending timer's timeout
+ * @timer: the pending timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer_pending() is the same for pending timers as mod_timer(),
+ * but will not re-activate and modify already deleted timers.
+ *
+ * It is useful for unserialized use of timers.
+ */
+int mod_timer_pending(struct timer_list *timer, unsigned long expires)
+{
+ return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
+}
+EXPORT_SYMBOL(mod_timer_pending);
+
+/*
+ * Decide where to put the timer while taking the slack into account
+ *
+ * Algorithm:
+ * 1) calculate the maximum (absolute) time
+ * 2) calculate the highest bit where the expires and new max are different
+ * 3) use this bit to make a mask
+ * 4) use the bitmask to round down the maximum time, so that all last
+ * bits are zeros
+ */
+static inline
+unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
+{
+ unsigned long expires_limit, mask;
+ int bit;
+
+ if (timer->slack >= 0) {
+ expires_limit = expires + timer->slack;
+ } else {
+ long delta = expires - jiffies;
+
+ if (delta < 256)
+ return expires;
+
+ expires_limit = expires + delta / 256;
+ }
+ mask = expires ^ expires_limit;
+ if (mask == 0)
+ return expires;
+
+ bit = find_last_bit(&mask, BITS_PER_LONG);
+
+ mask = (1UL << bit) - 1;
+
+ expires_limit = expires_limit & ~(mask);
+
+ return expires_limit;
+}
+
+/**
+ * mod_timer - modify a timer's timeout
+ * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer() is a more efficient way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ *
+ * mod_timer(timer, expires) is equivalent to:
+ *
+ * del_timer(timer); timer->expires = expires; add_timer(timer);
+ *
+ * Note that if there are multiple unserialized concurrent users of the
+ * same timer, then mod_timer() is the only safe way to modify the timeout,
+ * since add_timer() cannot modify an already running timer.
+ *
+ * The function returns whether it has modified a pending timer or not.
+ * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
+ * active timer returns 1.)
+ */
+int mod_timer(struct timer_list *timer, unsigned long expires)
+{
+ expires = apply_slack(timer, expires);
+
+ /*
+ * This is a common optimization triggered by the
+ * networking code - if the timer is re-modified
+ * to be the same thing then just return:
+ */
+ if (timer_pending(timer) && timer->expires == expires)
+ return 1;
+
+ return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
+}
+EXPORT_SYMBOL(mod_timer);
+
+/**
+ * mod_timer_pinned - modify a timer's timeout
+ * @timer: the timer to be modified
+ * @expires: new timeout in jiffies
+ *
+ * mod_timer_pinned() is a way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ * and to ensure that the timer is scheduled on the current CPU.
+ *
+ * Note that this does not prevent the timer from being migrated
+ * when the current CPU goes offline. If this is a problem for
+ * you, use CPU-hotplug notifiers to handle it correctly, for
+ * example, cancelling the timer when the corresponding CPU goes
+ * offline.
+ *
+ * mod_timer_pinned(timer, expires) is equivalent to:
+ *
+ * del_timer(timer); timer->expires = expires; add_timer(timer);
+ */
+int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
+{
+ if (timer->expires == expires && timer_pending(timer))
+ return 1;
+
+ return __mod_timer(timer, expires, false, TIMER_PINNED);
+}
+EXPORT_SYMBOL(mod_timer_pinned);
+
+/**
+ * add_timer - start a timer
+ * @timer: the timer to be added
+ *
+ * The kernel will do a ->function(->data) callback from the
+ * timer interrupt at the ->expires point in the future. The
+ * current time is 'jiffies'.
+ *
+ * The timer's ->expires, ->function (and if the handler uses it, ->data)
+ * fields must be set prior calling this function.
+ *
+ * Timers with an ->expires field in the past will be executed in the next
+ * timer tick.
+ */
+void add_timer(struct timer_list *timer)
+{
+ BUG_ON(timer_pending(timer));
+ mod_timer(timer, timer->expires);
+}
+EXPORT_SYMBOL(add_timer);
+
+/**
+ * add_timer_on - start a timer on a particular CPU
+ * @timer: the timer to be added
+ * @cpu: the CPU to start it on
+ *
+ * This is not very scalable on SMP. Double adds are not possible.
+ */
+void add_timer_on(struct timer_list *timer, int cpu)
+{
+ struct tvec_base *base = per_cpu(tvec_bases, cpu);
+ unsigned long flags;
+
+ timer_stats_timer_set_start_info(timer);
+ BUG_ON(timer_pending(timer) || !timer->function);
+ spin_lock_irqsave(&base->lock, flags);
+ timer_set_base(timer, base);
+ debug_activate(timer, timer->expires);
+ internal_add_timer(base, timer);
+ spin_unlock_irqrestore(&base->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_timer_on);
+
+/**
+ * del_timer - deactive a timer.
+ * @timer: the timer to be deactivated
+ *
+ * del_timer() deactivates a timer - this works on both active and inactive
+ * timers.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ */
+int del_timer(struct timer_list *timer)
+{
+ struct tvec_base *base;
+ unsigned long flags;
+ int ret = 0;
+
+ debug_assert_init(timer);
+
+ timer_stats_timer_clear_start_info(timer);
+ if (timer_pending(timer)) {
+ base = lock_timer_base(timer, &flags);
+ ret = detach_if_pending(timer, base, true);
+ spin_unlock_irqrestore(&base->lock, flags);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(del_timer);
+
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer do del
+ *
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+ struct tvec_base *base;
+ unsigned long flags;
+ int ret = -1;
+
+ debug_assert_init(timer);
+
+ base = lock_timer_base(timer, &flags);
+
+ if (base->running_timer != timer) {
+ timer_stats_timer_clear_start_info(timer);
+ ret = detach_if_pending(timer, base, true);
+ }
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(try_to_del_timer_sync);
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
+
+/**
+ * del_timer_sync - deactivate a timer and wait for the handler to finish.
+ * @timer: the timer to be deactivated
+ *
+ * This function only differs from del_timer() on SMP: besides deactivating
+ * the timer it also makes sure the handler has finished executing on other
+ * CPUs.
+ *
+ * Synchronization rules: Callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts unless the timer is an irqsafe one. The caller must
+ * not hold locks which would prevent completion of the timer's
+ * handler. The timer's handler must not call add_timer_on(). Upon exit the
+ * timer is not queued and the handler is not running on any CPU.
+ *
+ * Note: For !irqsafe timers, you must not hold locks that are held in
+ * interrupt context while calling this function. Even if the lock has
+ * nothing to do with the timer in question. Here's why:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
+ * <IRQ>
+ * spin_lock(somelock);
+ * del_timer_sync(mytimer);
+ * while (base->running_timer == mytimer);
+ *
+ * Now del_timer_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but
+ * it has interrupted the softirq that CPU0 is waiting to finish.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ */
+int del_timer_sync(struct timer_list *timer)
+{
+#ifdef CONFIG_LOCKDEP
+ unsigned long flags;
+
+ /*
+ * If lockdep gives a backtrace here, please reference
+ * the synchronization rules above.
+ */
+ local_irq_save(flags);
+ lock_map_acquire(&timer->lockdep_map);
+ lock_map_release(&timer->lockdep_map);
+ local_irq_restore(flags);
+#endif
+ /*
+ * don't use it in hardirq context, because it
+ * could lead to deadlock.
+ */
+ WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
+ for (;;) {
+ int ret = try_to_del_timer_sync(timer);
+ if (ret >= 0)
+ return ret;
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL(del_timer_sync);
+#endif
+
+static int cascade(struct tvec_base *base, struct tvec *tv, int index)
+{
+ /* cascade all the timers from tv up one level */
+ struct timer_list *timer, *tmp;
+ struct list_head tv_list;
+
+ list_replace_init(tv->vec + index, &tv_list);
+
+ /*
+ * We are removing _all_ timers from the list, so we
+ * don't have to detach them individually.
+ */
+ list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
+ BUG_ON(tbase_get_base(timer->base) != base);
+ /* No accounting, while moving them */
+ __internal_add_timer(base, timer);
+ }
+
+ return index;
+}
+
+static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
+ unsigned long data)
+{
+ int count = preempt_count();
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It is permissible to free the timer from inside the
+ * function that is called from it, this we need to take into
+ * account for lockdep too. To avoid bogus "held lock freed"
+ * warnings as well as problems when looking into
+ * timer->lockdep_map, make a copy and use that here.
+ */
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
+#endif
+ /*
+ * Couple the lock chain with the lock chain at
+ * del_timer_sync() by acquiring the lock_map around the fn()
+ * call here and in del_timer_sync().
+ */
+ lock_map_acquire(&lockdep_map);
+
+ trace_timer_expire_entry(timer);
+ fn(data);
+ trace_timer_expire_exit(timer);
+
+ lock_map_release(&lockdep_map);
+
+ if (count != preempt_count()) {
+ WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
+ fn, count, preempt_count());
+ /*
+ * Restore the preempt count. That gives us a decent
+ * chance to survive and extract information. If the
+ * callback kept a lock held, bad luck, but not worse
+ * than the BUG() we had.
+ */
+ preempt_count_set(count);
+ }
+}
+
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+
+/**
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ *
+ * This function cascades all vectors and executes all expired timer
+ * vectors.
+ */
+static inline void __run_timers(struct tvec_base *base)
+{
+ struct timer_list *timer;
+
+ spin_lock_irq(&base->lock);
+ if (catchup_timer_jiffies(base)) {
+ spin_unlock_irq(&base->lock);
+ return;
+ }
+ while (time_after_eq(jiffies, base->timer_jiffies)) {
+ struct list_head work_list;
+ struct list_head *head = &work_list;
+ int index = base->timer_jiffies & TVR_MASK;
+
+ /*
+ * Cascade timers:
+ */
+ if (!index &&
+ (!cascade(base, &base->tv2, INDEX(0))) &&
+ (!cascade(base, &base->tv3, INDEX(1))) &&
+ !cascade(base, &base->tv4, INDEX(2)))
+ cascade(base, &base->tv5, INDEX(3));
+ ++base->timer_jiffies;
+ list_replace_init(base->tv1.vec + index, head);
+ while (!list_empty(head)) {
+ void (*fn)(unsigned long);
+ unsigned long data;
+ bool irqsafe;
+
+ timer = list_first_entry(head, struct timer_list,entry);
+ fn = timer->function;
+ data = timer->data;
+ irqsafe = tbase_get_irqsafe(timer->base);
+
+ timer_stats_account_timer(timer);
+
+ base->running_timer = timer;
+ detach_expired_timer(timer, base);
+
+ if (irqsafe) {
+ spin_unlock(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock(&base->lock);
+ } else {
+ spin_unlock_irq(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock_irq(&base->lock);
+ }
+ }
+ }
+ base->running_timer = NULL;
+ spin_unlock_irq(&base->lock);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Find out when the next timer event is due to happen. This
+ * is used on S/390 to stop all activity when a CPU is idle.
+ * This function needs to be called with interrupts disabled.
+ */
+static unsigned long __next_timer_interrupt(struct tvec_base *base)
+{
+ unsigned long timer_jiffies = base->timer_jiffies;
+ unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
+ int index, slot, array, found = 0;
+ struct timer_list *nte;
+ struct tvec *varray[4];
+
+ /* Look for timer events in tv1. */
+ index = slot = timer_jiffies & TVR_MASK;
+ do {
+ list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
+ found = 1;
+ expires = nte->expires;
+ /* Look at the cascade bucket(s)? */
+ if (!index || slot < index)
+ goto cascade;
+ return expires;
+ }
+ slot = (slot + 1) & TVR_MASK;
+ } while (slot != index);
+
+cascade:
+ /* Calculate the next cascade event */
+ if (index)
+ timer_jiffies += TVR_SIZE - index;
+ timer_jiffies >>= TVR_BITS;
+
+ /* Check tv2-tv5. */
+ varray[0] = &base->tv2;
+ varray[1] = &base->tv3;
+ varray[2] = &base->tv4;
+ varray[3] = &base->tv5;
+
+ for (array = 0; array < 4; array++) {
+ struct tvec *varp = varray[array];
+
+ index = slot = timer_jiffies & TVN_MASK;
+ do {
+ list_for_each_entry(nte, varp->vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
+ found = 1;
+ if (time_before(nte->expires, expires))
+ expires = nte->expires;
+ }
+ /*
+ * Do we still search for the first timer or are
+ * we looking up the cascade buckets ?
+ */
+ if (found) {
+ /* Look at the cascade bucket(s)? */
+ if (!index || slot < index)
+ break;
+ return expires;
+ }
+ slot = (slot + 1) & TVN_MASK;
+ } while (slot != index);
+
+ if (index)
+ timer_jiffies += TVN_SIZE - index;
+ timer_jiffies >>= TVN_BITS;
+ }
+ return expires;
+}
+
+/*
+ * Check, if the next hrtimer event is before the next timer wheel
+ * event:
+ */
+static unsigned long cmp_next_hrtimer_event(unsigned long now,
+ unsigned long expires)
+{
+ ktime_t hr_delta = hrtimer_get_next_event();
+ struct timespec tsdelta;
+ unsigned long delta;
+
+ if (hr_delta.tv64 == KTIME_MAX)
+ return expires;
+
+ /*
+ * Expired timer available, let it expire in the next tick
+ */
+ if (hr_delta.tv64 <= 0)
+ return now + 1;
+
+ tsdelta = ktime_to_timespec(hr_delta);
+ delta = timespec_to_jiffies(&tsdelta);
+
+ /*
+ * Limit the delta to the max value, which is checked in
+ * tick_nohz_stop_sched_tick():
+ */
+ if (delta > NEXT_TIMER_MAX_DELTA)
+ delta = NEXT_TIMER_MAX_DELTA;
+
+ /*
+ * Take rounding errors in to account and make sure, that it
+ * expires in the next tick. Otherwise we go into an endless
+ * ping pong due to tick_nohz_stop_sched_tick() retriggering
+ * the timer softirq
+ */
+ if (delta < 1)
+ delta = 1;
+ now += delta;
+ if (time_before(now, expires))
+ return now;
+ return expires;
+}
+
+/**
+ * get_next_timer_interrupt - return the jiffy of the next pending timer
+ * @now: current time (in jiffies)
+ */
+unsigned long get_next_timer_interrupt(unsigned long now)
+{
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
+ unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+
+ /*
+ * Pretend that there is no timer pending if the cpu is offline.
+ * Possible pending timers will be migrated later to an active cpu.
+ */
+ if (cpu_is_offline(smp_processor_id()))
+ return expires;
+
+ spin_lock(&base->lock);
+ if (base->active_timers) {
+ if (time_before_eq(base->next_timer, base->timer_jiffies))
+ base->next_timer = __next_timer_interrupt(base);
+ expires = base->next_timer;
+ }
+ spin_unlock(&base->lock);
+
+ if (time_before_eq(expires, now))
+ return now;
+
+ return cmp_next_hrtimer_event(now, expires);
+}
+#endif
+
+/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+ struct task_struct *p = current;
+
+ /* Note: this timer irq context must be accounted for as well. */
+ account_process_tick(p, user_tick);
+ run_local_timers();
+ rcu_check_callbacks(user_tick);
+#ifdef CONFIG_IRQ_WORK
+ if (in_irq())
+ irq_work_tick();
+#endif
+ scheduler_tick();
+ run_posix_cpu_timers(p);
+}
+
+/*
+ * This function runs timers and the timer-tq in bottom half context.
+ */
+static void run_timer_softirq(struct softirq_action *h)
+{
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
+
+ hrtimer_run_pending();
+
+ if (time_after_eq(jiffies, base->timer_jiffies))
+ __run_timers(base);
+}
+
+/*
+ * Called by the local, per-CPU timer interrupt on SMP.
+ */
+void run_local_timers(void)
+{
+ hrtimer_run_queues();
+ raise_softirq(TIMER_SOFTIRQ);
+}
+
+#ifdef __ARCH_WANT_SYS_ALARM
+
+/*
+ * For backwards compatibility? This can be done in libc so Alpha
+ * and all newer ports shouldn't need it.
+ */
+SYSCALL_DEFINE1(alarm, unsigned int, seconds)
+{
+ return alarm_setitimer(seconds);
+}
+
+#endif
+
+static void process_timeout(unsigned long __data)
+{
+ wake_up_process((struct task_struct *)__data);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * in jiffies will be returned, or 0 if the timer expired in time
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * In all cases the return value is guaranteed to be non-negative.
+ */
+signed long __sched schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful to be comfortable
+ * in the caller. Nothing more. We could take
+ * MAX_SCHEDULE_TIMEOUT from one of the negative value
+ * but I' d like to return a valid offset (>=0) to allow
+ * the caller to do everything it want with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be
+ * 0 since no piece of kernel is supposed to do a check
+ * for a negative retval of schedule_timeout() (since it
+ * should never happens anyway). You just have the printk()
+ * that will tell you if something is gone wrong and where.
+ */
+ if (timeout < 0) {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx\n", timeout);
+ dump_stack();
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
+ __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
+ schedule();
+ del_singleshot_timer_sync(&timer);
+
+ /* Remove the timer from the object tracker */
+ destroy_timer_on_stack(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
+EXPORT_SYMBOL(schedule_timeout);
+
+/*
+ * We can use __set_current_state() here because schedule_timeout() calls
+ * schedule() unconditionally.
+ */
+signed long __sched schedule_timeout_interruptible(signed long timeout)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_interruptible);
+
+signed long __sched schedule_timeout_killable(signed long timeout)
+{
+ __set_current_state(TASK_KILLABLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_killable);
+
+signed long __sched schedule_timeout_uninterruptible(signed long timeout)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+{
+ struct timer_list *timer;
+
+ while (!list_empty(head)) {
+ timer = list_first_entry(head, struct timer_list, entry);
+ /* We ignore the accounting on the dying cpu */
+ detach_timer(timer, false);
+ timer_set_base(timer, new_base);
+ internal_add_timer(new_base, timer);
+ }
+}
+
+static void migrate_timers(int cpu)
+{
+ struct tvec_base *old_base;
+ struct tvec_base *new_base;
+ int i;
+
+ BUG_ON(cpu_online(cpu));
+ old_base = per_cpu(tvec_bases, cpu);
+ new_base = get_cpu_var(tvec_bases);
+ /*
+ * The caller is globally serialized and nobody else
+ * takes two locks at once, deadlock is not possible.
+ */
+ spin_lock_irq(&new_base->lock);
+ spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+ BUG_ON(old_base->running_timer);
+
+ for (i = 0; i < TVR_SIZE; i++)
+ migrate_timer_list(new_base, old_base->tv1.vec + i);
+ for (i = 0; i < TVN_SIZE; i++) {
+ migrate_timer_list(new_base, old_base->tv2.vec + i);
+ migrate_timer_list(new_base, old_base->tv3.vec + i);
+ migrate_timer_list(new_base, old_base->tv4.vec + i);
+ migrate_timer_list(new_base, old_base->tv5.vec + i);
+ }
+
+ old_base->active_timers = 0;
+ old_base->all_timers = 0;
+
+ spin_unlock(&old_base->lock);
+ spin_unlock_irq(&new_base->lock);
+ put_cpu_var(tvec_bases);
+}
+
+static int timer_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ migrate_timers((long)hcpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static inline void timer_register_cpu_notifier(void)
+{
+ cpu_notifier(timer_cpu_notify, 0);
+}
+#else
+static inline void timer_register_cpu_notifier(void) { }
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+{
+ int j;
+
+ BUG_ON(base != tbase_get_base(base));
+
+ base->cpu = cpu;
+ per_cpu(tvec_bases, cpu) = base;
+ spin_lock_init(&base->lock);
+
+ for (j = 0; j < TVN_SIZE; j++) {
+ INIT_LIST_HEAD(base->tv5.vec + j);
+ INIT_LIST_HEAD(base->tv4.vec + j);
+ INIT_LIST_HEAD(base->tv3.vec + j);
+ INIT_LIST_HEAD(base->tv2.vec + j);
+ }
+ for (j = 0; j < TVR_SIZE; j++)
+ INIT_LIST_HEAD(base->tv1.vec + j);
+
+ base->timer_jiffies = jiffies;
+ base->next_timer = base->timer_jiffies;
+}
+
+static void __init init_timer_cpus(void)
+{
+ struct tvec_base *base;
+ int local_cpu = smp_processor_id();
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == local_cpu)
+ base = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+ else
+ base = per_cpu_ptr(&__tvec_bases, cpu);
+#endif
+
+ init_timer_cpu(base, cpu);
+ }
+}
+
+void __init init_timers(void)
+{
+ /* ensure there are enough low bits for flags in timer->base pointer */
+ BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
+
+ init_timer_cpus();
+ init_timer_stats();
+ timer_register_cpu_notifier();
+ open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
+}
+
+/**
+ * msleep - sleep safely even with waitqueue interruptions
+ * @msecs: Time in milliseconds to sleep for
+ */
+void msleep(unsigned int msecs)
+{
+ unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+
+ while (timeout)
+ timeout = schedule_timeout_uninterruptible(timeout);
+}
+
+EXPORT_SYMBOL(msleep);
+
+/**
+ * msleep_interruptible - sleep waiting for signals
+ * @msecs: Time in milliseconds to sleep for
+ */
+unsigned long msleep_interruptible(unsigned int msecs)
+{
+ unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+
+ while (timeout && !signal_pending(current))
+ timeout = schedule_timeout_interruptible(timeout);
+ return jiffies_to_msecs(timeout);
+}
+
+EXPORT_SYMBOL(msleep_interruptible);
+
+static int __sched do_usleep_range(unsigned long min, unsigned long max)
+{
+ ktime_t kmin;
+ unsigned long delta;
+
+ kmin = ktime_set(0, min * NSEC_PER_USEC);
+ delta = (max - min) * NSEC_PER_USEC;
+ return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/**
+ * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ */
+void usleep_range(unsigned long min, unsigned long max)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ do_usleep_range(min, max);
+}
+EXPORT_SYMBOL(usleep_range);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 000000000..e878c2e0b
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,396 @@
+/*
+ * kernel/time/timer_list.c
+ *
+ * List pending timers
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+
+#include <asm/uaccess.h>
+
+#include "tick-internal.h"
+
+struct timer_list_iter {
+ int cpu;
+ bool second_pass;
+ u64 now;
+};
+
+typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
+/*
+ * This allows printing both to /proc/timer_list and
+ * to the console (on SysRq-Q):
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+static void print_name_offset(struct seq_file *m, void *sym)
+{
+ char symname[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name((unsigned long)sym, symname) < 0)
+ SEQ_printf(m, "<%pK>", sym);
+ else
+ SEQ_printf(m, "%s", symname);
+}
+
+static void
+print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
+ int idx, u64 now)
+{
+#ifdef CONFIG_TIMER_STATS
+ char tmp[TASK_COMM_LEN + 1];
+#endif
+ SEQ_printf(m, " #%d: ", idx);
+ print_name_offset(m, taddr);
+ SEQ_printf(m, ", ");
+ print_name_offset(m, timer->function);
+ SEQ_printf(m, ", S:%02lx", timer->state);
+#ifdef CONFIG_TIMER_STATS
+ SEQ_printf(m, ", ");
+ print_name_offset(m, timer->start_site);
+ memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
+ tmp[TASK_COMM_LEN] = 0;
+ SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
+#endif
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
+ (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
+ (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
+ (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
+ (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
+}
+
+static void
+print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
+ u64 now)
+{
+ struct hrtimer *timer, tmp;
+ unsigned long next = 0, i;
+ struct timerqueue_node *curr;
+ unsigned long flags;
+
+next_one:
+ i = 0;
+ raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
+
+ curr = timerqueue_getnext(&base->active);
+ /*
+ * Crude but we have to do this O(N*N) thing, because
+ * we have to unlock the base when printing:
+ */
+ while (curr && i < next) {
+ curr = timerqueue_iterate_next(curr);
+ i++;
+ }
+
+ if (curr) {
+
+ timer = container_of(curr, struct hrtimer, node);
+ tmp = *timer;
+ raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+
+ print_timer(m, timer, &tmp, i, now);
+ next++;
+ goto next_one;
+ }
+ raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+}
+
+static void
+print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
+{
+ SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .index: %d\n",
+ base->index);
+ SEQ_printf(m, " .resolution: %Lu nsecs\n",
+ (unsigned long long)ktime_to_ns(base->resolution));
+ SEQ_printf(m, " .get_time: ");
+ print_name_offset(m, base->get_time);
+ SEQ_printf(m, "\n");
+#ifdef CONFIG_HIGH_RES_TIMERS
+ SEQ_printf(m, " .offset: %Lu nsecs\n",
+ (unsigned long long) ktime_to_ns(base->offset));
+#endif
+ SEQ_printf(m, "active timers:\n");
+ print_active_timers(m, base, now);
+}
+
+static void print_cpu(struct seq_file *m, int cpu, u64 now)
+{
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+ int i;
+
+ SEQ_printf(m, "cpu: %d\n", cpu);
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ SEQ_printf(m, " clock %d:\n", i);
+ print_base(m, cpu_base->clock_base + i, now);
+ }
+#define P(x) \
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(cpu_base->x))
+#define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(cpu_base->x)))
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ P_ns(expires_next);
+ P(hres_active);
+ P(nr_events);
+ P(nr_retries);
+ P(nr_hangs);
+ P_ns(max_hang_time);
+#endif
+#undef P
+#undef P_ns
+
+#ifdef CONFIG_TICK_ONESHOT
+# define P(x) \
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(ts->x))
+# define P_ns(x) \
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(ts->x)))
+ {
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+ P(nohz_mode);
+ P_ns(last_tick);
+ P(tick_stopped);
+ P(idle_jiffies);
+ P(idle_calls);
+ P(idle_sleeps);
+ P_ns(idle_entrytime);
+ P_ns(idle_waketime);
+ P_ns(idle_exittime);
+ P_ns(idle_sleeptime);
+ P_ns(iowait_sleeptime);
+ P(last_jiffies);
+ P(next_jiffies);
+ P_ns(idle_expires);
+ SEQ_printf(m, "jiffies: %Lu\n",
+ (unsigned long long)jiffies);
+ }
+#endif
+
+#undef P
+#undef P_ns
+ SEQ_printf(m, "\n");
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+static void
+print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
+{
+ struct clock_event_device *dev = td->evtdev;
+
+ SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
+ if (cpu < 0)
+ SEQ_printf(m, "Broadcast device\n");
+ else
+ SEQ_printf(m, "Per CPU device: %d\n", cpu);
+
+ SEQ_printf(m, "Clock Event Device: ");
+ if (!dev) {
+ SEQ_printf(m, "<NULL>\n");
+ return;
+ }
+ SEQ_printf(m, "%s\n", dev->name);
+ SEQ_printf(m, " max_delta_ns: %llu\n",
+ (unsigned long long) dev->max_delta_ns);
+ SEQ_printf(m, " min_delta_ns: %llu\n",
+ (unsigned long long) dev->min_delta_ns);
+ SEQ_printf(m, " mult: %u\n", dev->mult);
+ SEQ_printf(m, " shift: %u\n", dev->shift);
+ SEQ_printf(m, " mode: %d\n", dev->mode);
+ SEQ_printf(m, " next_event: %Ld nsecs\n",
+ (unsigned long long) ktime_to_ns(dev->next_event));
+
+ SEQ_printf(m, " set_next_event: ");
+ print_name_offset(m, dev->set_next_event);
+ SEQ_printf(m, "\n");
+
+ if (dev->set_mode) {
+ SEQ_printf(m, " set_mode: ");
+ print_name_offset(m, dev->set_mode);
+ SEQ_printf(m, "\n");
+ } else {
+ if (dev->set_state_shutdown) {
+ SEQ_printf(m, " shutdown: ");
+ print_name_offset(m, dev->set_state_shutdown);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_periodic) {
+ SEQ_printf(m, " periodic: ");
+ print_name_offset(m, dev->set_state_periodic);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->set_state_oneshot) {
+ SEQ_printf(m, " oneshot: ");
+ print_name_offset(m, dev->set_state_oneshot);
+ SEQ_printf(m, "\n");
+ }
+
+ if (dev->tick_resume) {
+ SEQ_printf(m, " resume: ");
+ print_name_offset(m, dev->tick_resume);
+ SEQ_printf(m, "\n");
+ }
+ }
+
+ SEQ_printf(m, " event_handler: ");
+ print_name_offset(m, dev->event_handler);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, " retries: %lu\n", dev->retries);
+ SEQ_printf(m, "\n");
+}
+
+static void timer_list_show_tickdevices_header(struct seq_file *m)
+{
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+ print_tickdevice(m, tick_get_broadcast_device(), -1);
+ SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
+ cpumask_bits(tick_get_broadcast_mask())[0]);
+#ifdef CONFIG_TICK_ONESHOT
+ SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
+ cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+#endif
+ SEQ_printf(m, "\n");
+#endif
+}
+#endif
+
+static inline void timer_list_header(struct seq_file *m, u64 now)
+{
+ SEQ_printf(m, "Timer List Version: v0.7\n");
+ SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
+ SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ SEQ_printf(m, "\n");
+}
+
+static int timer_list_show(struct seq_file *m, void *v)
+{
+ struct timer_list_iter *iter = v;
+
+ if (iter->cpu == -1 && !iter->second_pass)
+ timer_list_header(m, iter->now);
+ else if (!iter->second_pass)
+ print_cpu(m, iter->cpu, iter->now);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (iter->cpu == -1 && iter->second_pass)
+ timer_list_show_tickdevices_header(m);
+ else
+ print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
+#endif
+ return 0;
+}
+
+void sysrq_timer_list_show(void)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ timer_list_header(NULL, now);
+
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu, now);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ timer_list_show_tickdevices_header(NULL);
+ for_each_online_cpu(cpu)
+ print_tickdevice(NULL, tick_get_device(cpu), cpu);
+#endif
+ return;
+}
+
+static void *move_iter(struct timer_list_iter *iter, loff_t offset)
+{
+ for (; offset; offset--) {
+ iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+ if (iter->cpu >= nr_cpu_ids) {
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ if (!iter->second_pass) {
+ iter->cpu = -1;
+ iter->second_pass = true;
+ } else
+ return NULL;
+#else
+ return NULL;
+#endif
+ }
+ }
+ return iter;
+}
+
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+
+ if (!*offset)
+ iter->now = ktime_to_ns(ktime_get());
+ iter->cpu = -1;
+ iter->second_pass = false;
+ return move_iter(iter, *offset);
+}
+
+static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+ ++*offset;
+ return move_iter(iter, 1);
+}
+
+static void timer_list_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations timer_list_sops = {
+ .start = timer_list_start,
+ .next = timer_list_next,
+ .stop = timer_list_stop,
+ .show = timer_list_show,
+};
+
+static int timer_list_open(struct inode *inode, struct file *filp)
+{
+ return seq_open_private(filp, &timer_list_sops,
+ sizeof(struct timer_list_iter));
+}
+
+static const struct file_operations timer_list_fops = {
+ .open = timer_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init init_timer_list_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
+ if (!pe)
+ return -ENOMEM;
+ return 0;
+}
+__initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 000000000..1fb08f213
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,425 @@
+/*
+ * kernel/time/timer_stats.c
+ *
+ * Collect timer usage statistics.
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * timer_stats is based on timer_top, a similar functionality which was part of
+ * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
+ * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
+ * on dynamic allocation of the statistics entries and linear search based
+ * lookup combined with a global lock, rather than the static array, hash
+ * and per-CPU locking which is used by timer_stats. It was written for the
+ * pre hrtimer kernel code and therefore did not take hrtimers into account.
+ * Nevertheless it provided the base for the timer_stats implementation and
+ * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
+ * for this effort.
+ *
+ * timer_top.c is
+ * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
+ * Written by Daniel Petrini <d.pensator@gmail.com>
+ * timer_top.c was released under the GNU General Public License version 2
+ *
+ * We export the addresses and counting of timer functions being called,
+ * the pid and cmdline from the owner process if applicable.
+ *
+ * Start/stop data collection:
+ * # echo [1|0] >/proc/timer_stats
+ *
+ * Display the information collected so far:
+ * # cat /proc/timer_stats
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * This is our basic unit of interest: a timer expiry event identified
+ * by the timer, its start/expire functions and the PID of the task that
+ * started the timer. We count the number of times an event happens:
+ */
+struct entry {
+ /*
+ * Hash list:
+ */
+ struct entry *next;
+
+ /*
+ * Hash keys:
+ */
+ void *timer;
+ void *start_func;
+ void *expire_func;
+ pid_t pid;
+
+ /*
+ * Number of timeout events:
+ */
+ unsigned long count;
+ unsigned int timer_flag;
+
+ /*
+ * We save the command-line string to preserve
+ * this information past task exit:
+ */
+ char comm[TASK_COMM_LEN + 1];
+
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Spinlock protecting the tables - not taken during lookup:
+ */
+static DEFINE_RAW_SPINLOCK(table_lock);
+
+/*
+ * Per-CPU lookup locks for fast hash lookup:
+ */
+static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
+
+/*
+ * Mutex to serialize state changes with show-stats activities:
+ */
+static DEFINE_MUTEX(show_mutex);
+
+/*
+ * Collection status, active/inactive:
+ */
+int __read_mostly timer_stats_active;
+
+/*
+ * Beginning/end timestamps of measurement:
+ */
+static ktime_t time_start, time_stop;
+
+/*
+ * tstat entry structs only get allocated while collection is
+ * active and never freed during that time - this simplifies
+ * things quite a bit.
+ *
+ * They get freed when a new collection period is started.
+ */
+#define MAX_ENTRIES_BITS 10
+#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
+
+static unsigned long nr_entries;
+static struct entry entries[MAX_ENTRIES];
+
+static atomic_t overflow_count;
+
+/*
+ * The entries are in a hash-table, for fast lookup:
+ */
+#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
+#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
+#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
+
+#define __tstat_hashfn(entry) \
+ (((unsigned long)(entry)->timer ^ \
+ (unsigned long)(entry)->start_func ^ \
+ (unsigned long)(entry)->expire_func ^ \
+ (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
+
+#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
+
+static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
+
+static void reset_entries(void)
+{
+ nr_entries = 0;
+ memset(entries, 0, sizeof(entries));
+ memset(tstat_hash_table, 0, sizeof(tstat_hash_table));
+ atomic_set(&overflow_count, 0);
+}
+
+static struct entry *alloc_entry(void)
+{
+ if (nr_entries >= MAX_ENTRIES)
+ return NULL;
+
+ return entries + nr_entries++;
+}
+
+static int match_entries(struct entry *entry1, struct entry *entry2)
+{
+ return entry1->timer == entry2->timer &&
+ entry1->start_func == entry2->start_func &&
+ entry1->expire_func == entry2->expire_func &&
+ entry1->pid == entry2->pid;
+}
+
+/*
+ * Look up whether an entry matching this item is present
+ * in the hash already. Must be called with irqs off and the
+ * lookup lock held:
+ */
+static struct entry *tstat_lookup(struct entry *entry, char *comm)
+{
+ struct entry **head, *curr, *prev;
+
+ head = tstat_hashentry(entry);
+ curr = *head;
+
+ /*
+ * The fastpath is when the entry is already hashed,
+ * we do this with the lookup lock held, but with the
+ * table lock not held:
+ */
+ while (curr) {
+ if (match_entries(curr, entry))
+ return curr;
+
+ curr = curr->next;
+ }
+ /*
+ * Slowpath: allocate, set up and link a new hash entry:
+ */
+ prev = NULL;
+ curr = *head;
+
+ raw_spin_lock(&table_lock);
+ /*
+ * Make sure we have not raced with another CPU:
+ */
+ while (curr) {
+ if (match_entries(curr, entry))
+ goto out_unlock;
+
+ prev = curr;
+ curr = curr->next;
+ }
+
+ curr = alloc_entry();
+ if (curr) {
+ *curr = *entry;
+ curr->count = 0;
+ curr->next = NULL;
+ memcpy(curr->comm, comm, TASK_COMM_LEN);
+
+ smp_mb(); /* Ensure that curr is initialized before insert */
+
+ if (prev)
+ prev->next = curr;
+ else
+ *head = curr;
+ }
+ out_unlock:
+ raw_spin_unlock(&table_lock);
+
+ return curr;
+}
+
+/**
+ * timer_stats_update_stats - Update the statistics for a timer.
+ * @timer: pointer to either a timer_list or a hrtimer
+ * @pid: the pid of the task which set up the timer
+ * @startf: pointer to the function which did the timer setup
+ * @timerf: pointer to the timer callback function of the timer
+ * @comm: name of the process which set up the timer
+ *
+ * When the timer is already registered, then the event counter is
+ * incremented. Otherwise the timer is registered in a free slot.
+ */
+void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
+ void *timerf, char *comm,
+ unsigned int timer_flag)
+{
+ /*
+ * It doesn't matter which lock we take:
+ */
+ raw_spinlock_t *lock;
+ struct entry *entry, input;
+ unsigned long flags;
+
+ if (likely(!timer_stats_active))
+ return;
+
+ lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
+
+ input.timer = timer;
+ input.start_func = startf;
+ input.expire_func = timerf;
+ input.pid = pid;
+ input.timer_flag = timer_flag;
+
+ raw_spin_lock_irqsave(lock, flags);
+ if (!timer_stats_active)
+ goto out_unlock;
+
+ entry = tstat_lookup(&input, comm);
+ if (likely(entry))
+ entry->count++;
+ else
+ atomic_inc(&overflow_count);
+
+ out_unlock:
+ raw_spin_unlock_irqrestore(lock, flags);
+}
+
+static void print_name_offset(struct seq_file *m, unsigned long addr)
+{
+ char symname[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name(addr, symname) < 0)
+ seq_printf(m, "<%p>", (void *)addr);
+ else
+ seq_printf(m, "%s", symname);
+}
+
+static int tstats_show(struct seq_file *m, void *v)
+{
+ struct timespec period;
+ struct entry *entry;
+ unsigned long ms;
+ long events = 0;
+ ktime_t time;
+ int i;
+
+ mutex_lock(&show_mutex);
+ /*
+ * If still active then calculate up to now:
+ */
+ if (timer_stats_active)
+ time_stop = ktime_get();
+
+ time = ktime_sub(time_stop, time_start);
+
+ period = ktime_to_timespec(time);
+ ms = period.tv_nsec / 1000000;
+
+ seq_puts(m, "Timer Stats Version: v0.3\n");
+ seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+ if (atomic_read(&overflow_count))
+ seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
+ seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
+
+ for (i = 0; i < nr_entries; i++) {
+ entry = entries + i;
+ if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+ seq_printf(m, "%4luD, %5d %-16s ",
+ entry->count, entry->pid, entry->comm);
+ } else {
+ seq_printf(m, " %4lu, %5d %-16s ",
+ entry->count, entry->pid, entry->comm);
+ }
+
+ print_name_offset(m, (unsigned long)entry->start_func);
+ seq_puts(m, " (");
+ print_name_offset(m, (unsigned long)entry->expire_func);
+ seq_puts(m, ")\n");
+
+ events += entry->count;
+ }
+
+ ms += period.tv_sec * 1000;
+ if (!ms)
+ ms = 1;
+
+ if (events && period.tv_sec)
+ seq_printf(m, "%ld total events, %ld.%03ld events/sec\n",
+ events, events * 1000 / ms,
+ (events * 1000000 / ms) % 1000);
+ else
+ seq_printf(m, "%ld total events\n", events);
+
+ mutex_unlock(&show_mutex);
+
+ return 0;
+}
+
+/*
+ * After a state change, make sure all concurrent lookup/update
+ * activities have stopped:
+ */
+static void sync_access(void)
+{
+ unsigned long flags;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
+
+ raw_spin_lock_irqsave(lock, flags);
+ /* nothing */
+ raw_spin_unlock_irqrestore(lock, flags);
+ }
+}
+
+static ssize_t tstats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ char ctl[2];
+
+ if (count != 2 || *offs)
+ return -EINVAL;
+
+ if (copy_from_user(ctl, buf, count))
+ return -EFAULT;
+
+ mutex_lock(&show_mutex);
+ switch (ctl[0]) {
+ case '0':
+ if (timer_stats_active) {
+ timer_stats_active = 0;
+ time_stop = ktime_get();
+ sync_access();
+ }
+ break;
+ case '1':
+ if (!timer_stats_active) {
+ reset_entries();
+ time_start = ktime_get();
+ smp_mb();
+ timer_stats_active = 1;
+ }
+ break;
+ default:
+ count = -EINVAL;
+ }
+ mutex_unlock(&show_mutex);
+
+ return count;
+}
+
+static int tstats_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, tstats_show, NULL);
+}
+
+static const struct file_operations tstats_fops = {
+ .open = tstats_open,
+ .read = seq_read,
+ .write = tstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+void __init init_timer_stats(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
+}
+
+static int __init init_tstats_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
+ if (!pe)
+ return -ENOMEM;
+ return 0;
+}
+__initcall(init_tstats_procfs);
diff --git a/kernel/torture.c b/kernel/torture.c
new file mode 100644
index 000000000..dd70993c2
--- /dev/null
+++ b/kernel/torture.c
@@ -0,0 +1,741 @@
+/*
+ * Common functions for in-kernel torture tests.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+static char *torture_type;
+static bool verbose;
+
+/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
+#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
+#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */
+#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
+static int fullstop = FULLSTOP_RMMOD;
+static DEFINE_MUTEX(fullstop_mutex);
+static int *torture_runnable;
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Variables for online-offline handling. Only present if CPU hotplug
+ * is enabled, otherwise does nothing.
+ */
+
+static struct task_struct *onoff_task;
+static long onoff_holdoff;
+static long onoff_interval;
+static long n_offline_attempts;
+static long n_offline_successes;
+static unsigned long sum_offline;
+static int min_offline = -1;
+static int max_offline;
+static long n_online_attempts;
+static long n_online_successes;
+static unsigned long sum_online;
+static int min_online = -1;
+static int max_online;
+
+/*
+ * Execute random CPU-hotplug operations at the interval specified
+ * by the onoff_interval.
+ */
+static int
+torture_onoff(void *arg)
+{
+ int cpu;
+ unsigned long delta;
+ int maxcpu = -1;
+ DEFINE_TORTURE_RANDOM(rand);
+ int ret;
+ unsigned long starttime;
+
+ VERBOSE_TOROUT_STRING("torture_onoff task started");
+ for_each_online_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (onoff_holdoff > 0) {
+ VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
+ schedule_timeout_interruptible(onoff_holdoff);
+ VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
+ }
+ while (!torture_must_stop()) {
+ cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+ if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_offline_attempts++;
+ ret = cpu_down(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offline %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlined %d\n",
+ torture_type, cpu);
+ n_offline_successes++;
+ delta = jiffies - starttime;
+ sum_offline += delta;
+ if (min_offline < 0) {
+ min_offline = delta;
+ max_offline = delta;
+ }
+ if (min_offline > delta)
+ min_offline = delta;
+ if (max_offline < delta)
+ max_offline = delta;
+ }
+ } else if (cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_online_attempts++;
+ ret = cpu_up(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: online %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlined %d\n",
+ torture_type, cpu);
+ n_online_successes++;
+ delta = jiffies - starttime;
+ sum_online += delta;
+ if (min_online < 0) {
+ min_online = delta;
+ max_online = delta;
+ }
+ if (min_online > delta)
+ min_online = delta;
+ if (max_online < delta)
+ max_online = delta;
+ }
+ }
+ schedule_timeout_interruptible(onoff_interval);
+ }
+ torture_kthread_stopping("torture_onoff");
+ return 0;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Initiate online-offline handling.
+ */
+int torture_onoff_init(long ooholdoff, long oointerval)
+{
+ int ret = 0;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ onoff_holdoff = ooholdoff;
+ onoff_interval = oointerval;
+ if (onoff_interval <= 0)
+ return 0;
+ ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_onoff_init);
+
+/*
+ * Clean up after online/offline testing.
+ */
+static void torture_onoff_cleanup(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ if (onoff_task == NULL)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_onoff task");
+ kthread_stop(onoff_task);
+ onoff_task = NULL;
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
+
+/*
+ * Print online/offline testing statistics.
+ */
+void torture_onoff_stats(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+ n_online_successes, n_online_attempts,
+ n_offline_successes, n_offline_attempts,
+ min_online, max_online,
+ min_offline, max_offline,
+ sum_online, sum_offline, HZ);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_stats);
+
+/*
+ * Were all the online/offline operations successful?
+ */
+bool torture_onoff_failures(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ return n_online_successes != n_online_attempts ||
+ n_offline_successes != n_offline_attempts;
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+ return false;
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_failures);
+
+#define TORTURE_RANDOM_MULT 39916801 /* prime */
+#define TORTURE_RANDOM_ADD 479001701 /* prime */
+#define TORTURE_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from cpu_clock().
+ */
+unsigned long
+torture_random(struct torture_random_state *trsp)
+{
+ if (--trsp->trs_count < 0) {
+ trsp->trs_state += (unsigned long)local_clock();
+ trsp->trs_count = TORTURE_RANDOM_REFRESH;
+ }
+ trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
+ TORTURE_RANDOM_ADD;
+ return swahw32(trsp->trs_state);
+}
+EXPORT_SYMBOL_GPL(torture_random);
+
+/*
+ * Variables for shuffling. The idea is to ensure that each CPU stays
+ * idle for an extended period to test interactions with dyntick idle,
+ * as well as interactions with any per-CPU varibles.
+ */
+struct shuffle_task {
+ struct list_head st_l;
+ struct task_struct *st_t;
+};
+
+static long shuffle_interval; /* In jiffies. */
+static struct task_struct *shuffler_task;
+static cpumask_var_t shuffle_tmp_mask;
+static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */
+static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list);
+static DEFINE_MUTEX(shuffle_task_mutex);
+
+/*
+ * Register a task to be shuffled. If there is no memory, just splat
+ * and don't bother registering.
+ */
+void torture_shuffle_task_register(struct task_struct *tp)
+{
+ struct shuffle_task *stp;
+
+ if (WARN_ON_ONCE(tp == NULL))
+ return;
+ stp = kmalloc(sizeof(*stp), GFP_KERNEL);
+ if (WARN_ON_ONCE(stp == NULL))
+ return;
+ stp->st_t = tp;
+ mutex_lock(&shuffle_task_mutex);
+ list_add(&stp->st_l, &shuffle_task_list);
+ mutex_unlock(&shuffle_task_mutex);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_task_register);
+
+/*
+ * Unregister all tasks, for example, at the end of the torture run.
+ */
+static void torture_shuffle_task_unregister_all(void)
+{
+ struct shuffle_task *stp;
+ struct shuffle_task *p;
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) {
+ list_del(&stp->st_l);
+ kfree(stp);
+ }
+ mutex_unlock(&shuffle_task_mutex);
+}
+
+/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle.
+ * A special case is when shuffle_idle_cpu = -1, in which case we allow
+ * the tasks to run on all CPUs.
+ */
+static void torture_shuffle_tasks(void)
+{
+ struct shuffle_task *stp;
+
+ cpumask_setall(shuffle_tmp_mask);
+ get_online_cpus();
+
+ /* No point in shuffling if there is only one online CPU (ex: UP) */
+ if (num_online_cpus() == 1) {
+ put_online_cpus();
+ return;
+ }
+
+ /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */
+ shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
+ if (shuffle_idle_cpu >= nr_cpu_ids)
+ shuffle_idle_cpu = -1;
+ else
+ cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry(stp, &shuffle_task_list, st_l)
+ set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ mutex_unlock(&shuffle_task_mutex);
+
+ put_online_cpus();
+}
+
+/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
+ * system to become idle at a time and cut off its timer ticks. This is meant
+ * to test the support for such tickless idle CPU in RCU.
+ */
+static int torture_shuffle(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_shuffle task started");
+ do {
+ schedule_timeout_interruptible(shuffle_interval);
+ torture_shuffle_tasks();
+ torture_shutdown_absorb("torture_shuffle");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_shuffle");
+ return 0;
+}
+
+/*
+ * Start the shuffler, with shuffint in jiffies.
+ */
+int torture_shuffle_init(long shuffint)
+{
+ shuffle_interval = shuffint;
+
+ shuffle_idle_cpu = -1;
+
+ if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+ VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
+ return -ENOMEM;
+ }
+
+ /* Create the shuffler thread */
+ return torture_create_kthread(torture_shuffle, NULL, shuffler_task);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_init);
+
+/*
+ * Stop the shuffling.
+ */
+static void torture_shuffle_cleanup(void)
+{
+ torture_shuffle_task_unregister_all();
+ if (shuffler_task) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shuffle task");
+ kthread_stop(shuffler_task);
+ free_cpumask_var(shuffle_tmp_mask);
+ }
+ shuffler_task = NULL;
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
+
+/*
+ * Variables for auto-shutdown. This allows "lights out" torture runs
+ * to be fully scripted.
+ */
+static int shutdown_secs; /* desired test duration in seconds. */
+static struct task_struct *shutdown_task;
+static unsigned long shutdown_time; /* jiffies to system shutdown. */
+static void (*torture_shutdown_hook)(void);
+
+/*
+ * Absorb kthreads into a kernel function that won't return, so that
+ * they won't ever access module text or data again.
+ */
+void torture_shutdown_absorb(const char *title)
+{
+ while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_notice("torture thread %s parking due to system shutdown\n",
+ title);
+ schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
+
+/*
+ * Cause the torture test to shutdown the system after the test has
+ * run for the time specified by the shutdown_secs parameter.
+ */
+static int torture_shutdown(void *arg)
+{
+ long delta;
+ unsigned long jiffies_snap;
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task started");
+ jiffies_snap = jiffies;
+ while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+ !torture_must_stop()) {
+ delta = shutdown_time - jiffies_snap;
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_shutdown task: %lu jiffies remaining\n",
+ torture_type, delta);
+ schedule_timeout_interruptible(delta);
+ jiffies_snap = jiffies;
+ }
+ if (torture_must_stop()) {
+ torture_kthread_stopping("torture_shutdown");
+ return 0;
+ }
+
+ /* OK, shut down the system. */
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system");
+ shutdown_task = NULL; /* Avoid self-kill deadlock. */
+ if (torture_shutdown_hook)
+ torture_shutdown_hook();
+ else
+ VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
+ kernel_power_off(); /* Shut down the system. */
+ return 0;
+}
+
+/*
+ * Start up the shutdown task.
+ */
+int torture_shutdown_init(int ssecs, void (*cleanup)(void))
+{
+ int ret = 0;
+
+ shutdown_secs = ssecs;
+ torture_shutdown_hook = cleanup;
+ if (shutdown_secs > 0) {
+ shutdown_time = jiffies + shutdown_secs * HZ;
+ ret = torture_create_kthread(torture_shutdown, NULL,
+ shutdown_task);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_init);
+
+/*
+ * Detect and respond to a system shutdown.
+ */
+static int torture_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+ VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
+ ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+ } else {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ }
+ mutex_unlock(&fullstop_mutex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block torture_shutdown_nb = {
+ .notifier_call = torture_shutdown_notify,
+};
+
+/*
+ * Shut down the shutdown task. Say what??? Heh! This can happen if
+ * the torture module gets an rmmod before the shutdown time arrives. ;-)
+ */
+static void torture_shutdown_cleanup(void)
+{
+ unregister_reboot_notifier(&torture_shutdown_nb);
+ if (shutdown_task != NULL) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shutdown task");
+ kthread_stop(shutdown_task);
+ }
+ shutdown_task = NULL;
+}
+
+/*
+ * Variables for stuttering, which means to periodically pause and
+ * restart testing in order to catch bugs that appear when load is
+ * suddenly applied to or removed from the system.
+ */
+static struct task_struct *stutter_task;
+static int stutter_pause_test;
+static int stutter;
+
+/*
+ * Block until the stutter interval ends. This must be called periodically
+ * by all running kthreads that need to be subject to stuttering.
+ */
+void stutter_wait(const char *title)
+{
+ while (ACCESS_ONCE(stutter_pause_test) ||
+ (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+ if (stutter_pause_test)
+ if (ACCESS_ONCE(stutter_pause_test) == 1)
+ schedule_timeout_interruptible(1);
+ else
+ while (ACCESS_ONCE(stutter_pause_test))
+ cond_resched();
+ else
+ schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ torture_shutdown_absorb(title);
+ }
+}
+EXPORT_SYMBOL_GPL(stutter_wait);
+
+/*
+ * Cause the torture test to "stutter", starting and stopping all
+ * threads periodically.
+ */
+static int torture_stutter(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_stutter task started");
+ do {
+ if (!torture_must_stop()) {
+ if (stutter > 1) {
+ schedule_timeout_interruptible(stutter - 1);
+ ACCESS_ONCE(stutter_pause_test) = 2;
+ }
+ schedule_timeout_interruptible(1);
+ ACCESS_ONCE(stutter_pause_test) = 1;
+ }
+ if (!torture_must_stop())
+ schedule_timeout_interruptible(stutter);
+ ACCESS_ONCE(stutter_pause_test) = 0;
+ torture_shutdown_absorb("torture_stutter");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_stutter");
+ return 0;
+}
+
+/*
+ * Initialize and kick off the torture_stutter kthread.
+ */
+int torture_stutter_init(int s)
+{
+ int ret;
+
+ stutter = s;
+ ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_stutter_init);
+
+/*
+ * Cleanup after the torture_stutter kthread.
+ */
+static void torture_stutter_cleanup(void)
+{
+ if (!stutter_task)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_stutter task");
+ kthread_stop(stutter_task);
+ stutter_task = NULL;
+}
+
+/*
+ * Initialize torture module. Please note that this is -not- invoked via
+ * the usual module_init() mechanism, but rather by an explicit call from
+ * the client torture module. This call must be paired with a later
+ * torture_init_end().
+ *
+ * The runnable parameter points to a flag that controls whether or not
+ * the test is currently runnable. If there is no such flag, pass in NULL.
+ */
+bool torture_init_begin(char *ttype, bool v, int *runnable)
+{
+ mutex_lock(&fullstop_mutex);
+ if (torture_type != NULL) {
+ pr_alert("torture_init_begin: refusing %s init: %s running",
+ ttype, torture_type);
+ mutex_unlock(&fullstop_mutex);
+ return false;
+ }
+ torture_type = ttype;
+ verbose = v;
+ torture_runnable = runnable;
+ fullstop = FULLSTOP_DONTSTOP;
+ return true;
+}
+EXPORT_SYMBOL_GPL(torture_init_begin);
+
+/*
+ * Tell the torture module that initialization is complete.
+ */
+void torture_init_end(void)
+{
+ mutex_unlock(&fullstop_mutex);
+ register_reboot_notifier(&torture_shutdown_nb);
+}
+EXPORT_SYMBOL_GPL(torture_init_end);
+
+/*
+ * Clean up torture module. Please note that this is -not- invoked via
+ * the usual module_exit() mechanism, but rather by an explicit call from
+ * the client torture module. Returns true if a race with system shutdown
+ * is detected, otherwise, all kthreads started by functions in this file
+ * will be shut down.
+ *
+ * This must be called before the caller starts shutting down its own
+ * kthreads.
+ *
+ * Both torture_cleanup_begin() and torture_cleanup_end() must be paired,
+ * in order to correctly perform the cleanup. They are separated because
+ * threads can still need to reference the torture_type type, thus nullify
+ * only after completing all other relevant calls.
+ */
+bool torture_cleanup_begin(void)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ mutex_unlock(&fullstop_mutex);
+ schedule_timeout_uninterruptible(10);
+ return true;
+ }
+ ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+ mutex_unlock(&fullstop_mutex);
+ torture_shutdown_cleanup();
+ torture_shuffle_cleanup();
+ torture_stutter_cleanup();
+ torture_onoff_cleanup();
+ return false;
+}
+EXPORT_SYMBOL_GPL(torture_cleanup_begin);
+
+void torture_cleanup_end(void)
+{
+ mutex_lock(&fullstop_mutex);
+ torture_type = NULL;
+ mutex_unlock(&fullstop_mutex);
+}
+EXPORT_SYMBOL_GPL(torture_cleanup_end);
+
+/*
+ * Is it time for the current torture test to stop?
+ */
+bool torture_must_stop(void)
+{
+ return torture_must_stop_irq() || kthread_should_stop();
+}
+EXPORT_SYMBOL_GPL(torture_must_stop);
+
+/*
+ * Is it time for the current torture test to stop? This is the irq-safe
+ * version, hence no check for kthread_should_stop().
+ */
+bool torture_must_stop_irq(void)
+{
+ return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+}
+EXPORT_SYMBOL_GPL(torture_must_stop_irq);
+
+/*
+ * Each kthread must wait for kthread_should_stop() before returning from
+ * its top-level function, otherwise segfaults ensue. This function
+ * prints a "stopping" message and waits for kthread_should_stop(), and
+ * should be called from all torture kthreads immediately prior to
+ * returning.
+ */
+void torture_kthread_stopping(char *title)
+{
+ char buf[128];
+
+ snprintf(buf, sizeof(buf), "Stopping %s", title);
+ VERBOSE_TOROUT_STRING(buf);
+ while (!kthread_should_stop()) {
+ torture_shutdown_absorb(title);
+ schedule_timeout_uninterruptible(1);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_kthread_stopping);
+
+/*
+ * Create a generic torture kthread that is immediately runnable. If you
+ * need the kthread to be stopped so that you can do something to it before
+ * it starts, you will need to open-code your own.
+ */
+int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
+ char *f, struct task_struct **tp)
+{
+ int ret = 0;
+
+ VERBOSE_TOROUT_STRING(m);
+ *tp = kthread_run(fn, arg, "%s", s);
+ if (IS_ERR(*tp)) {
+ ret = PTR_ERR(*tp);
+ VERBOSE_TOROUT_ERRSTRING(f);
+ *tp = NULL;
+ }
+ torture_shuffle_task_register(*tp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_torture_create_kthread);
+
+/*
+ * Stop a generic kthread, emitting a message.
+ */
+void _torture_stop_kthread(char *m, struct task_struct **tp)
+{
+ if (*tp == NULL)
+ return;
+ VERBOSE_TOROUT_STRING(m);
+ kthread_stop(*tp);
+ *tp = NULL;
+}
+EXPORT_SYMBOL_GPL(_torture_stop_kthread);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000..3b9a48ae1
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,641 @@
+#
+# Architectures that offer an FUNCTION_TRACER implementation should
+# select HAVE_FUNCTION_TRACER:
+#
+
+config USER_STACKTRACE_SUPPORT
+ bool
+
+config NOP_TRACER
+ bool
+
+config HAVE_FTRACE_NMI_ENTER
+ bool
+ help
+ See Documentation/trace/ftrace-design.txt
+
+config HAVE_FUNCTION_TRACER
+ bool
+ help
+ See Documentation/trace/ftrace-design.txt
+
+config HAVE_FUNCTION_GRAPH_TRACER
+ bool
+ help
+ See Documentation/trace/ftrace-design.txt
+
+config HAVE_FUNCTION_GRAPH_FP_TEST
+ bool
+ help
+ See Documentation/trace/ftrace-design.txt
+
+config HAVE_DYNAMIC_FTRACE
+ bool
+ help
+ See Documentation/trace/ftrace-design.txt
+
+config HAVE_DYNAMIC_FTRACE_WITH_REGS
+ bool
+
+config HAVE_FTRACE_MCOUNT_RECORD
+ bool
+ help
+ See Documentation/trace/ftrace-design.txt
+
+config HAVE_SYSCALL_TRACEPOINTS
+ bool
+ help
+ See Documentation/trace/ftrace-design.txt
+
+config HAVE_FENTRY
+ bool
+ help
+ Arch supports the gcc options -pg with -mfentry
+
+config HAVE_C_RECORDMCOUNT
+ bool
+ help
+ C version of recordmcount available?
+
+config TRACER_MAX_TRACE
+ bool
+
+config TRACE_CLOCK
+ bool
+
+config RING_BUFFER
+ bool
+ select TRACE_CLOCK
+ select IRQ_WORK
+
+config FTRACE_NMI_ENTER
+ bool
+ depends on HAVE_FTRACE_NMI_ENTER
+ default y
+
+config EVENT_TRACING
+ select CONTEXT_SWITCH_TRACER
+ bool
+
+config CONTEXT_SWITCH_TRACER
+ bool
+
+config RING_BUFFER_ALLOW_SWAP
+ bool
+ help
+ Allow the use of ring_buffer_swap_cpu.
+ Adds a very slight overhead to tracing when enabled.
+
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hiding of the automatic options.
+
+config TRACING
+ bool
+ select DEBUG_FS
+ select RING_BUFFER
+ select STACKTRACE if STACKTRACE_SUPPORT
+ select TRACEPOINTS
+ select NOP_TRACER
+ select BINARY_PRINTF
+ select EVENT_TRACING
+ select TRACE_CLOCK
+
+config GENERIC_TRACER
+ bool
+ select TRACING
+
+#
+# Minimum requirements an architecture has to meet for us to
+# be able to offer generic tracing facilities:
+#
+config TRACING_SUPPORT
+ bool
+ # PPC32 has no irqflags tracing support, but it can use most of the
+ # tracers anyway, they were tested to build and work. Note that new
+ # exceptions to this list aren't welcomed, better implement the
+ # irqflags tracing for your architecture.
+ depends on TRACE_IRQFLAGS_SUPPORT || PPC32
+ depends on STACKTRACE_SUPPORT
+ default y
+
+if TRACING_SUPPORT
+
+menuconfig FTRACE
+ bool "Tracers"
+ default y if DEBUG_KERNEL
+ help
+ Enable the kernel tracing infrastructure.
+
+if FTRACE
+
+config FUNCTION_TRACER
+ bool "Kernel Function Tracer"
+ depends on HAVE_FUNCTION_TRACER
+ select KALLSYMS
+ select GENERIC_TRACER
+ select CONTEXT_SWITCH_TRACER
+ help
+ Enable the kernel to trace every kernel function. This is done
+ by using a compiler feature to insert a small, 5-byte No-Operation
+ instruction at the beginning of every kernel function, which NOP
+ sequence is then dynamically patched into a tracer call when
+ tracing is enabled by the administrator. If it's runtime disabled
+ (the bootup default), then the overhead of the instructions is very
+ small and not measurable even in micro-benchmarks.
+
+config FUNCTION_GRAPH_TRACER
+ bool "Kernel Function Graph Tracer"
+ depends on HAVE_FUNCTION_GRAPH_TRACER
+ depends on FUNCTION_TRACER
+ depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
+ default y
+ help
+ Enable the kernel to trace a function at both its return
+ and its entry.
+ Its first purpose is to trace the duration of functions and
+ draw a call graph for each thread with some information like
+ the return value. This is done by setting the current return
+ address on the current task structure into a stack of calls.
+
+
+config IRQSOFF_TRACER
+ bool "Interrupts-off Latency Tracer"
+ default n
+ depends on TRACE_IRQFLAGS_SUPPORT
+ depends on !ARCH_USES_GETTIMEOFFSET
+ select TRACE_IRQFLAGS
+ select GENERIC_TRACER
+ select TRACER_MAX_TRACE
+ select RING_BUFFER_ALLOW_SWAP
+ select TRACER_SNAPSHOT
+ select TRACER_SNAPSHOT_PER_CPU_SWAP
+ help
+ This option measures the time spent in irqs-off critical
+ sections, with microsecond accuracy.
+
+ The default measurement method is a maximum search, which is
+ disabled by default and can be runtime (re-)started
+ via:
+
+ echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+
+ (Note that kernel size and overhead increase with this option
+ enabled. This option and the preempt-off timing option can be
+ used together or separately.)
+
+config PREEMPT_TRACER
+ bool "Preemption-off Latency Tracer"
+ default n
+ depends on !ARCH_USES_GETTIMEOFFSET
+ depends on PREEMPT
+ select GENERIC_TRACER
+ select TRACER_MAX_TRACE
+ select RING_BUFFER_ALLOW_SWAP
+ select TRACER_SNAPSHOT
+ select TRACER_SNAPSHOT_PER_CPU_SWAP
+ help
+ This option measures the time spent in preemption-off critical
+ sections, with microsecond accuracy.
+
+ The default measurement method is a maximum search, which is
+ disabled by default and can be runtime (re-)started
+ via:
+
+ echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
+
+ (Note that kernel size and overhead increase with this option
+ enabled. This option and the irqs-off timing option can be
+ used together or separately.)
+
+config SCHED_TRACER
+ bool "Scheduling Latency Tracer"
+ select GENERIC_TRACER
+ select CONTEXT_SWITCH_TRACER
+ select TRACER_MAX_TRACE
+ select TRACER_SNAPSHOT
+ help
+ This tracer tracks the latency of the highest priority task
+ to be scheduled in, starting from the point it has woken up.
+
+config ENABLE_DEFAULT_TRACERS
+ bool "Trace process context switches and events"
+ depends on !GENERIC_TRACER
+ select TRACING
+ help
+ This tracer hooks to various trace points in the kernel,
+ allowing the user to pick and choose which trace point they
+ want to trace. It also includes the sched_switch tracer plugin.
+
+config FTRACE_SYSCALLS
+ bool "Trace syscalls"
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ select GENERIC_TRACER
+ select KALLSYMS
+ help
+ Basic tracer to catch the syscall entry and exit events.
+
+config TRACER_SNAPSHOT
+ bool "Create a snapshot trace buffer"
+ select TRACER_MAX_TRACE
+ help
+ Allow tracing users to take snapshot of the current buffer using the
+ ftrace interface, e.g.:
+
+ echo 1 > /sys/kernel/debug/tracing/snapshot
+ cat snapshot
+
+config TRACER_SNAPSHOT_PER_CPU_SWAP
+ bool "Allow snapshot to swap per CPU"
+ depends on TRACER_SNAPSHOT
+ select RING_BUFFER_ALLOW_SWAP
+ help
+ Allow doing a snapshot of a single CPU buffer instead of a
+ full swap (all buffers). If this is set, then the following is
+ allowed:
+
+ echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+
+ After which, only the tracing buffer for CPU 2 was swapped with
+ the main tracing buffer, and the other CPU buffers remain the same.
+
+ When this is enabled, this adds a little more overhead to the
+ trace recording, as it needs to add some checks to synchronize
+ recording with swaps. But this does not affect the performance
+ of the overall system. This is enabled by default when the preempt
+ or irq latency tracers are enabled, as those need to swap as well
+ and already adds the overhead (plus a lot more).
+
+config TRACE_BRANCH_PROFILING
+ bool
+ select GENERIC_TRACER
+
+choice
+ prompt "Branch Profiling"
+ default BRANCH_PROFILE_NONE
+ help
+ The branch profiling is a software profiler. It will add hooks
+ into the C conditionals to test which path a branch takes.
+
+ The likely/unlikely profiler only looks at the conditions that
+ are annotated with a likely or unlikely macro.
+
+ The "all branch" profiler will profile every if-statement in the
+ kernel. This profiler will also enable the likely/unlikely
+ profiler.
+
+ Either of the above profilers adds a bit of overhead to the system.
+ If unsure, choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+ bool "No branch profiling"
+ help
+ No branch profiling. Branch profiling adds a bit of overhead.
+ Only enable it if you want to analyse the branching behavior.
+ Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
+ bool "Trace likely/unlikely profiler"
+ select TRACE_BRANCH_PROFILING
+ help
+ This tracer profiles all likely and unlikely macros
+ in the kernel. It will display the results in:
+
+ /sys/kernel/debug/tracing/trace_stat/branch_annotated
+
+ Note: this will add a significant overhead; only turn this
+ on if you need to profile the system's use of these macros.
+
+config PROFILE_ALL_BRANCHES
+ bool "Profile all if conditionals"
+ select TRACE_BRANCH_PROFILING
+ help
+ This tracer profiles all branch conditions. Every if ()
+ taken in the kernel is recorded whether it hit or miss.
+ The results will be displayed in:
+
+ /sys/kernel/debug/tracing/trace_stat/branch_all
+
+ This option also enables the likely/unlikely profiler.
+
+ This configuration, when enabled, will impose a great overhead
+ on the system. This should only be enabled when the system
+ is to be analyzed in much detail.
+endchoice
+
+config TRACING_BRANCHES
+ bool
+ help
+ Selected by tracers that will trace the likely and unlikely
+ conditions. This prevents the tracers themselves from being
+ profiled. Profiling the tracing infrastructure can only happen
+ when the likelys and unlikelys are not being traced.
+
+config BRANCH_TRACER
+ bool "Trace likely/unlikely instances"
+ depends on TRACE_BRANCH_PROFILING
+ select TRACING_BRANCHES
+ help
+ This traces the events of likely and unlikely condition
+ calls in the kernel. The difference between this and the
+ "Trace likely/unlikely profiler" is that this is not a
+ histogram of the callers, but actually places the calling
+ events into a running trace buffer to see when and where the
+ events happened, as well as their results.
+
+ Say N if unsure.
+
+config STACK_TRACER
+ bool "Trace max stack"
+ depends on HAVE_FUNCTION_TRACER
+ select FUNCTION_TRACER
+ select STACKTRACE
+ select KALLSYMS
+ help
+ This special tracer records the maximum stack footprint of the
+ kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
+
+ This tracer works by hooking into every function call that the
+ kernel executes, and keeping a maximum stack depth value and
+ stack-trace saved. If this is configured with DYNAMIC_FTRACE
+ then it will not have any overhead while the stack tracer
+ is disabled.
+
+ To enable the stack tracer on bootup, pass in 'stacktrace'
+ on the kernel command line.
+
+ The stack tracer can also be enabled or disabled via the
+ sysctl kernel.stack_tracer_enabled
+
+ Say N if unsure.
+
+config BLK_DEV_IO_TRACE
+ bool "Support for tracing block IO actions"
+ depends on SYSFS
+ depends on BLOCK
+ select RELAY
+ select DEBUG_FS
+ select TRACEPOINTS
+ select GENERIC_TRACER
+ select STACKTRACE
+ help
+ Say Y here if you want to be able to trace the block layer actions
+ on a given queue. Tracing allows you to see any traffic happening
+ on a block device queue. For more information (and the userspace
+ support tools needed), fetch the blktrace tools from:
+
+ git://git.kernel.dk/blktrace.git
+
+ Tracing also is possible using the ftrace interface, e.g.:
+
+ echo 1 > /sys/block/sda/sda1/trace/enable
+ echo blk > /sys/kernel/debug/tracing/current_tracer
+ cat /sys/kernel/debug/tracing/trace_pipe
+
+ If unsure, say N.
+
+config KPROBE_EVENT
+ depends on KPROBES
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ bool "Enable kprobes-based dynamic events"
+ select TRACING
+ select PROBE_EVENTS
+ default y
+ help
+ This allows the user to add tracing events (similar to tracepoints)
+ on the fly via the ftrace interface. See
+ Documentation/trace/kprobetrace.txt for more details.
+
+ Those events can be inserted wherever kprobes can probe, and record
+ various register and memory values.
+
+ This option is also required by perf-probe subcommand of perf tools.
+ If you want to use perf tools, this option is strongly recommended.
+
+config UPROBE_EVENT
+ bool "Enable uprobes-based dynamic events"
+ depends on ARCH_SUPPORTS_UPROBES
+ depends on MMU
+ depends on PERF_EVENTS
+ select UPROBES
+ select PROBE_EVENTS
+ select TRACING
+ default n
+ help
+ This allows the user to add tracing events on top of userspace
+ dynamic events (similar to tracepoints) on the fly via the trace
+ events interface. Those events can be inserted wherever uprobes
+ can probe, and record various registers.
+ This option is required if you plan to use perf-probe subcommand
+ of perf tools on user space applications.
+
+config BPF_EVENTS
+ depends on BPF_SYSCALL
+ depends on KPROBE_EVENT
+ bool
+ default y
+ help
+ This allows the user to attach BPF programs to kprobe events.
+
+config PROBE_EVENTS
+ def_bool n
+
+config DYNAMIC_FTRACE
+ bool "enable/disable function tracing dynamically"
+ depends on FUNCTION_TRACER
+ depends on HAVE_DYNAMIC_FTRACE
+ default y
+ help
+ This option will modify all the calls to function tracing
+ dynamically (will patch them out of the binary image and
+ replace them with a No-Op instruction) on boot up. During
+ compile time, a table is made of all the locations that ftrace
+ can function trace, and this table is linked into the kernel
+ image. When this is enabled, functions can be individually
+ enabled, and the functions not enabled will not affect
+ performance of the system.
+
+ See the files in /sys/kernel/debug/tracing:
+ available_filter_functions
+ set_ftrace_filter
+ set_ftrace_notrace
+
+ This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
+ otherwise has native performance as long as no tracing is active.
+
+config DYNAMIC_FTRACE_WITH_REGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+
+config FUNCTION_PROFILER
+ bool "Kernel function profiler"
+ depends on FUNCTION_TRACER
+ default n
+ help
+ This option enables the kernel function profiler. A file is created
+ in debugfs called function_profile_enabled which defaults to zero.
+ When a 1 is echoed into this file profiling begins, and when a
+ zero is entered, profiling stops. A "functions" file is created in
+ the trace_stats directory; this file shows the list of functions that
+ have been hit and their counters.
+
+ If in doubt, say N.
+
+config FTRACE_MCOUNT_RECORD
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_FTRACE_MCOUNT_RECORD
+
+config FTRACE_SELFTEST
+ bool
+
+config FTRACE_STARTUP_TEST
+ bool "Perform a startup test on ftrace"
+ depends on GENERIC_TRACER
+ select FTRACE_SELFTEST
+ help
+ This option performs a series of startup tests on ftrace. On bootup
+ a series of tests are made to verify that the tracer is
+ functioning properly. It will do tests on all the configured
+ tracers of ftrace.
+
+config EVENT_TRACE_TEST_SYSCALLS
+ bool "Run selftest on syscall events"
+ depends on FTRACE_STARTUP_TEST
+ help
+ This option will also enable testing every syscall event.
+ It only enables the event and disables it and runs various loads
+ with the event enabled. This adds a bit more time for kernel boot
+ up since it runs this on every system call defined.
+
+ TBD - enable a way to actually call the syscalls as we test their
+ events
+
+config MMIOTRACE
+ bool "Memory mapped IO tracing"
+ depends on HAVE_MMIOTRACE_SUPPORT && PCI
+ select GENERIC_TRACER
+ help
+ Mmiotrace traces Memory Mapped I/O access and is meant for
+ debugging and reverse engineering. It is called from the ioremap
+ implementation and works via page faults. Tracing is disabled by
+ default and can be enabled at run-time.
+
+ See Documentation/trace/mmiotrace.txt.
+ If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+ tristate "Test module for mmiotrace"
+ depends on MMIOTRACE && m
+ help
+ This is a dumb module for testing mmiotrace. It is very dangerous
+ as it will write garbage to IO memory starting at a given address.
+ However, it should be safe to use on e.g. unused portion of VRAM.
+
+ Say N, unless you absolutely know what you are doing.
+
+config TRACEPOINT_BENCHMARK
+ bool "Add tracepoint that benchmarks tracepoints"
+ help
+ This option creates the tracepoint "benchmark:benchmark_event".
+ When the tracepoint is enabled, it kicks off a kernel thread that
+ goes into an infinite loop (calling cond_sched() to let other tasks
+ run), and calls the tracepoint. Each iteration will record the time
+ it took to write to the tracepoint and the next iteration that
+ data will be passed to the tracepoint itself. That is, the tracepoint
+ will report the time it took to do the previous tracepoint.
+ The string written to the tracepoint is a static string of 128 bytes
+ to keep the time the same. The initial string is simply a write of
+ "START". The second string records the cold cache time of the first
+ write which is not added to the rest of the calculations.
+
+ As it is a tight loop, it benchmarks as hot cache. That's fine because
+ we care most about hot paths that are probably in cache already.
+
+ An example of the output:
+
+ START
+ first=3672 [COLD CACHED]
+ last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
+ last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
+ last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
+ last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
+ last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
+ last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
+
+
+config RING_BUFFER_BENCHMARK
+ tristate "Ring buffer benchmark stress tester"
+ depends on RING_BUFFER
+ help
+ This option creates a test to stress the ring buffer and benchmark it.
+ It creates its own ring buffer such that it will not interfere with
+ any other users of the ring buffer (such as ftrace). It then creates
+ a producer and consumer that will run for 10 seconds and sleep for
+ 10 seconds. Each interval it will print out the number of events
+ it recorded and give a rough estimate of how long each iteration took.
+
+ It does not disable interrupts or raise its priority, so it may be
+ affected by processes that are running.
+
+ If unsure, say N.
+
+config RING_BUFFER_STARTUP_TEST
+ bool "Ring buffer startup self test"
+ depends on RING_BUFFER
+ help
+ Run a simple self test on the ring buffer on boot up. Late in the
+ kernel boot sequence, the test will start that kicks off
+ a thread per cpu. Each thread will write various size events
+ into the ring buffer. Another thread is created to send IPIs
+ to each of the threads, where the IPI handler will also write
+ to the ring buffer, to test/stress the nesting ability.
+ If any anomalies are discovered, a warning will be displayed
+ and all ring buffers will be disabled.
+
+ The test runs for 10 seconds. This will slow your boot time
+ by at least 10 more seconds.
+
+ At the end of the test, statics and more checks are done.
+ It will output the stats of each per cpu buffer. What
+ was written, the sizes, what was read, what was lost, and
+ other similar details.
+
+ If unsure, say N
+
+config TRACE_ENUM_MAP_FILE
+ bool "Show enum mappings for trace events"
+ depends on TRACING
+ help
+ The "print fmt" of the trace events will show the enum names instead
+ of their values. This can cause problems for user space tools that
+ use this string to parse the raw data as user space does not know
+ how to convert the string to its value.
+
+ To fix this, there's a special macro in the kernel that can be used
+ to convert the enum into its value. If this macro is used, then the
+ print fmt strings will have the enums converted to their values.
+
+ If something does not get converted properly, this option can be
+ used to show what enums the kernel tried to convert.
+
+ This option is for debugging the enum conversions. A file is created
+ in the tracing directory called "enum_map" that will show the enum
+ names matched with their values and what trace event system they
+ belong too.
+
+ Normally, the mapping of the strings to values will be freed after
+ boot up or module load. With this option, they will not be freed, as
+ they are needed for the "enum_map" file. Enabling this option will
+ increase the memory footprint of the running kernel.
+
+ If unsure, say N
+
+endif # FTRACE
+
+endif # TRACING_SUPPORT
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000..9b1044e93
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,70 @@
+
+# Do not instrument the tracer itself:
+
+ifdef CONFIG_FUNCTION_TRACER
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
+
+ifdef CONFIG_FTRACE_SELFTEST
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
+obj-y += trace_selftest_dynamic.o
+endif
+endif
+
+# If unlikely tracing is enabled, do not trace these files
+ifdef CONFIG_TRACING_BRANCHES
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+endif
+
+CFLAGS_trace_benchmark.o := -I$(src)
+CFLAGS_trace_events_filter.o := -I$(src)
+
+obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
+
+obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
+
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_seq.o
+obj-$(CONFIG_TRACING) += trace_stat.o
+obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_NOP_TRACER) += trace_nop.o
+obj-$(CONFIG_STACK_TRACER) += trace_stack.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
+obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
+endif
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
+obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
+ifeq ($(CONFIG_PM),y)
+obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
+endif
+ifeq ($(CONFIG_TRACING),y)
+obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
+endif
+obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
+obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+
+obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+
+libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
new file mode 100644
index 000000000..483cecfa5
--- /dev/null
+++ b/kernel/trace/blktrace.c
@@ -0,0 +1,1819 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/uaccess.h>
+#include <linux/list.h>
+
+#include <trace/events/block.h>
+
+#include "trace_output.h"
+
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
+static unsigned int blktrace_seq __read_mostly = 1;
+
+static struct trace_array *blk_tr;
+static bool blk_tracer_enabled __read_mostly;
+
+static LIST_HEAD(running_trace_list);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_BLK_OPT_CLASSIC 0x1
+
+static struct tracer_opt blk_tracer_opts[] = {
+ /* Default disable the minimalistic output */
+ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+ { }
+};
+
+static struct tracer_flags blk_tracer_flags = {
+ .val = 0,
+ .opts = blk_tracer_opts,
+};
+
+/* Global reference count of probes */
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static void blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
+/*
+ * Send out a notify message.
+ */
+static void trace_note(struct blk_trace *bt, pid_t pid, int action,
+ const void *data, size_t len)
+{
+ struct blk_io_trace *t;
+ struct ring_buffer_event *event = NULL;
+ struct ring_buffer *buffer = NULL;
+ int pc = 0;
+ int cpu = smp_processor_id();
+ bool blk_tracer = blk_tracer_enabled;
+
+ if (blk_tracer) {
+ buffer = blk_tr->trace_buffer.buffer;
+ pc = preempt_count();
+ event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
+ sizeof(*t) + len,
+ 0, pc);
+ if (!event)
+ return;
+ t = ring_buffer_event_data(event);
+ goto record_it;
+ }
+
+ if (!bt->rchan)
+ return;
+
+ t = relay_reserve(bt->rchan, sizeof(*t) + len);
+ if (t) {
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->time = ktime_to_ns(ktime_get());
+record_it:
+ t->device = bt->dev;
+ t->action = action;
+ t->pid = pid;
+ t->cpu = cpu;
+ t->pdu_len = len;
+ memcpy((void *) t + sizeof(*t), data, len);
+
+ if (blk_tracer)
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
+ }
+}
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct task_struct *tsk)
+{
+ unsigned long flags;
+ struct blk_trace *bt;
+
+ tsk->btrace_seq = blktrace_seq;
+ spin_lock_irqsave(&running_trace_lock, flags);
+ list_for_each_entry(bt, &running_trace_list, running_list) {
+ trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
+ sizeof(tsk->comm));
+ }
+ spin_unlock_irqrestore(&running_trace_lock, flags);
+}
+
+static void trace_note_time(struct blk_trace *bt)
+{
+ struct timespec now;
+ unsigned long flags;
+ u32 words[2];
+
+ getnstimeofday(&now);
+ words[0] = now.tv_sec;
+ words[1] = now.tv_nsec;
+
+ local_irq_save(flags);
+ trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+ local_irq_restore(flags);
+}
+
+void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+{
+ int n;
+ va_list args;
+ unsigned long flags;
+ char *buf;
+
+ if (unlikely(bt->trace_state != Blktrace_running &&
+ !blk_tracer_enabled))
+ return;
+
+ /*
+ * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
+ * message to the trace.
+ */
+ if (!(bt->act_mask & BLK_TC_NOTIFY))
+ return;
+
+ local_irq_save(flags);
+ buf = this_cpu_ptr(bt->msg_data);
+ va_start(args, fmt);
+ n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
+ va_end(args);
+
+ trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(__trace_note_message);
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+ pid_t pid)
+{
+ if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+ return 1;
+ if (sector && (sector < bt->start_lba || sector > bt->end_lba))
+ return 1;
+ if (bt->pid && pid != bt->pid)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
+ BLK_TC_ACT(BLK_TC_WRITE) };
+
+#define BLK_TC_RAHEAD BLK_TC_AHEAD
+
+/* The ilog2() calls fall out because they're constant */
+#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+ int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+ struct task_struct *tsk = current;
+ struct ring_buffer_event *event = NULL;
+ struct ring_buffer *buffer = NULL;
+ struct blk_io_trace *t;
+ unsigned long flags = 0;
+ unsigned long *sequence;
+ pid_t pid;
+ int cpu, pc = 0;
+ bool blk_tracer = blk_tracer_enabled;
+
+ if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
+ return;
+
+ what |= ddir_act[rw & WRITE];
+ what |= MASK_TC_BIT(rw, SYNC);
+ what |= MASK_TC_BIT(rw, RAHEAD);
+ what |= MASK_TC_BIT(rw, META);
+ what |= MASK_TC_BIT(rw, DISCARD);
+ what |= MASK_TC_BIT(rw, FLUSH);
+ what |= MASK_TC_BIT(rw, FUA);
+
+ pid = tsk->pid;
+ if (act_log_check(bt, what, sector, pid))
+ return;
+ cpu = raw_smp_processor_id();
+
+ if (blk_tracer) {
+ tracing_record_cmdline(current);
+
+ buffer = blk_tr->trace_buffer.buffer;
+ pc = preempt_count();
+ event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
+ sizeof(*t) + pdu_len,
+ 0, pc);
+ if (!event)
+ return;
+ t = ring_buffer_event_data(event);
+ goto record_it;
+ }
+
+ if (unlikely(tsk->btrace_seq != blktrace_seq))
+ trace_note_tsk(tsk);
+
+ /*
+ * A word about the locking here - we disable interrupts to reserve
+ * some space in the relay per-cpu buffer, to prevent an irq
+ * from coming in and stepping on our toes.
+ */
+ local_irq_save(flags);
+ t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+ if (t) {
+ sequence = per_cpu_ptr(bt->sequence, cpu);
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->sequence = ++(*sequence);
+ t->time = ktime_to_ns(ktime_get());
+record_it:
+ /*
+ * These two are not needed in ftrace as they are in the
+ * generic trace_entry, filled by tracing_generic_entry_update,
+ * but for the trace_event->bin() synthesizer benefit we do it
+ * here too.
+ */
+ t->cpu = cpu;
+ t->pid = pid;
+
+ t->sector = sector;
+ t->bytes = bytes;
+ t->action = what;
+ t->device = bt->dev;
+ t->error = error;
+ t->pdu_len = pdu_len;
+
+ if (pdu_len)
+ memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+
+ if (blk_tracer) {
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
+ return;
+ }
+ }
+
+ local_irq_restore(flags);
+}
+
+static struct dentry *blk_tree_root;
+static DEFINE_MUTEX(blk_tree_mutex);
+
+static void blk_trace_free(struct blk_trace *bt)
+{
+ debugfs_remove(bt->msg_file);
+ debugfs_remove(bt->dropped_file);
+ relay_close(bt->rchan);
+ debugfs_remove(bt->dir);
+ free_percpu(bt->sequence);
+ free_percpu(bt->msg_data);
+ kfree(bt);
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+ blk_trace_free(bt);
+ if (atomic_dec_and_test(&blk_probes_ref))
+ blk_unregister_tracepoints();
+}
+
+int blk_trace_remove(struct request_queue *q)
+{
+ struct blk_trace *bt;
+
+ bt = xchg(&q->blk_trace, NULL);
+ if (!bt)
+ return -EINVAL;
+
+ if (bt->trace_state != Blktrace_running)
+ blk_trace_cleanup(bt);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_remove);
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct blk_trace *bt = filp->private_data;
+ char buf[16];
+
+ snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+ return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static const struct file_operations blk_dropped_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = blk_dropped_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char *msg;
+ struct blk_trace *bt;
+
+ if (count >= BLK_TN_MAX_MSG)
+ return -EINVAL;
+
+ msg = kmalloc(count + 1, GFP_KERNEL);
+ if (msg == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(msg, buffer, count)) {
+ kfree(msg);
+ return -EFAULT;
+ }
+
+ msg[count] = '\0';
+ bt = filp->private_data;
+ __trace_note_message(bt, "%s", msg);
+ kfree(msg);
+
+ return count;
+}
+
+static const struct file_operations blk_msg_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .write = blk_msg_write,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+ void *prev_subbuf, size_t prev_padding)
+{
+ struct blk_trace *bt;
+
+ if (!relay_buf_full(buf))
+ return 1;
+
+ bt = buf->chan->private_data;
+ atomic_inc(&bt->dropped);
+ return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+ debugfs_remove(dentry);
+
+ return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+ struct dentry *parent,
+ umode_t mode,
+ struct rchan_buf *buf,
+ int *is_global)
+{
+ return debugfs_create_file(filename, mode, parent, buf,
+ &relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+ .subbuf_start = blk_subbuf_start_callback,
+ .create_buf_file = blk_create_buf_file_callback,
+ .remove_buf_file = blk_remove_buf_file_callback,
+};
+
+static void blk_trace_setup_lba(struct blk_trace *bt,
+ struct block_device *bdev)
+{
+ struct hd_struct *part = NULL;
+
+ if (bdev)
+ part = bdev->bd_part;
+
+ if (part) {
+ bt->start_lba = part->start_sect;
+ bt->end_lba = part->start_sect + part->nr_sects;
+ } else {
+ bt->start_lba = 0;
+ bt->end_lba = -1ULL;
+ }
+}
+
+/*
+ * Setup everything required to start tracing
+ */
+int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ struct blk_user_trace_setup *buts)
+{
+ struct blk_trace *old_bt, *bt = NULL;
+ struct dentry *dir = NULL;
+ int ret, i;
+
+ if (!buts->buf_size || !buts->buf_nr)
+ return -EINVAL;
+
+ strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
+ buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+
+ /*
+ * some device names have larger paths - convert the slashes
+ * to underscores for this to work as expected
+ */
+ for (i = 0; i < strlen(buts->name); i++)
+ if (buts->name[i] == '/')
+ buts->name[i] = '_';
+
+ bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+ if (!bt)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ bt->sequence = alloc_percpu(unsigned long);
+ if (!bt->sequence)
+ goto err;
+
+ bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
+ if (!bt->msg_data)
+ goto err;
+
+ ret = -ENOENT;
+
+ mutex_lock(&blk_tree_mutex);
+ if (!blk_tree_root) {
+ blk_tree_root = debugfs_create_dir("block", NULL);
+ if (!blk_tree_root) {
+ mutex_unlock(&blk_tree_mutex);
+ goto err;
+ }
+ }
+ mutex_unlock(&blk_tree_mutex);
+
+ dir = debugfs_create_dir(buts->name, blk_tree_root);
+
+ if (!dir)
+ goto err;
+
+ bt->dir = dir;
+ bt->dev = dev;
+ atomic_set(&bt->dropped, 0);
+ INIT_LIST_HEAD(&bt->running_list);
+
+ ret = -EIO;
+ bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
+ &blk_dropped_fops);
+ if (!bt->dropped_file)
+ goto err;
+
+ bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+ if (!bt->msg_file)
+ goto err;
+
+ bt->rchan = relay_open("trace", dir, buts->buf_size,
+ buts->buf_nr, &blk_relay_callbacks, bt);
+ if (!bt->rchan)
+ goto err;
+
+ bt->act_mask = buts->act_mask;
+ if (!bt->act_mask)
+ bt->act_mask = (u16) -1;
+
+ blk_trace_setup_lba(bt, bdev);
+
+ /* overwrite with user settings */
+ if (buts->start_lba)
+ bt->start_lba = buts->start_lba;
+ if (buts->end_lba)
+ bt->end_lba = buts->end_lba;
+
+ bt->pid = buts->pid;
+ bt->trace_state = Blktrace_setup;
+
+ ret = -EBUSY;
+ old_bt = xchg(&q->blk_trace, bt);
+ if (old_bt) {
+ (void) xchg(&q->blk_trace, old_bt);
+ goto err;
+ }
+
+ if (atomic_inc_return(&blk_probes_ref) == 1)
+ blk_register_tracepoints();
+
+ return 0;
+err:
+ blk_trace_free(bt);
+ return ret;
+}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
+{
+ struct blk_user_trace_setup buts;
+ int ret;
+
+ ret = copy_from_user(&buts, arg, sizeof(buts));
+ if (ret)
+ return -EFAULT;
+
+ ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, &buts, sizeof(buts))) {
+ blk_trace_remove(q);
+ return -EFAULT;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_setup);
+
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+static int compat_blk_trace_setup(struct request_queue *q, char *name,
+ dev_t dev, struct block_device *bdev,
+ char __user *arg)
+{
+ struct blk_user_trace_setup buts;
+ struct compat_blk_user_trace_setup cbuts;
+ int ret;
+
+ if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
+ return -EFAULT;
+
+ buts = (struct blk_user_trace_setup) {
+ .act_mask = cbuts.act_mask,
+ .buf_size = cbuts.buf_size,
+ .buf_nr = cbuts.buf_nr,
+ .start_lba = cbuts.start_lba,
+ .end_lba = cbuts.end_lba,
+ .pid = cbuts.pid,
+ };
+
+ ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
+ blk_trace_remove(q);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+#endif
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+ int ret;
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt == NULL)
+ return -EINVAL;
+
+ /*
+ * For starting a trace, we can transition from a setup or stopped
+ * trace. For stopping a trace, the state must be running
+ */
+ ret = -EINVAL;
+ if (start) {
+ if (bt->trace_state == Blktrace_setup ||
+ bt->trace_state == Blktrace_stopped) {
+ blktrace_seq++;
+ smp_mb();
+ bt->trace_state = Blktrace_running;
+ spin_lock_irq(&running_trace_lock);
+ list_add(&bt->running_list, &running_trace_list);
+ spin_unlock_irq(&running_trace_lock);
+
+ trace_note_time(bt);
+ ret = 0;
+ }
+ } else {
+ if (bt->trace_state == Blktrace_running) {
+ bt->trace_state = Blktrace_stopped;
+ spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+ ret = 0;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_trace_startstop);
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev: the block device
+ * @cmd: the ioctl cmd
+ * @arg: the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+ struct request_queue *q;
+ int ret, start = 0;
+ char b[BDEVNAME_SIZE];
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return -ENXIO;
+
+ mutex_lock(&bdev->bd_mutex);
+
+ switch (cmd) {
+ case BLKTRACESETUP:
+ bdevname(bdev, b);
+ ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ break;
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+ case BLKTRACESETUP32:
+ bdevname(bdev, b);
+ ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ break;
+#endif
+ case BLKTRACESTART:
+ start = 1;
+ case BLKTRACESTOP:
+ ret = blk_trace_startstop(q, start);
+ break;
+ case BLKTRACETEARDOWN:
+ ret = blk_trace_remove(q);
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q: the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(struct request_queue *q)
+{
+ if (q->blk_trace) {
+ blk_trace_startstop(q, 0);
+ blk_trace_remove(q);
+ }
+}
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q: queue the io is for
+ * @rq: the source request
+ * @nr_bytes: number of completed bytes
+ * @what: the action
+ *
+ * Description:
+ * Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+ unsigned int nr_bytes, u32 what)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+ what |= BLK_TC_ACT(BLK_TC_PC);
+ __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
+ what, rq->errors, rq->cmd_len, rq->cmd);
+ } else {
+ what |= BLK_TC_ACT(BLK_TC_FS);
+ __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
+ rq->cmd_flags, what, rq->errors, 0, NULL);
+ }
+}
+
+static void blk_add_trace_rq_abort(void *ignore,
+ struct request_queue *q, struct request *rq)
+{
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(void *ignore,
+ struct request_queue *q, struct request *rq)
+{
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(void *ignore,
+ struct request_queue *q, struct request *rq)
+{
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(void *ignore,
+ struct request_queue *q,
+ struct request *rq)
+{
+ blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(void *ignore,
+ struct request_queue *q,
+ struct request *rq,
+ unsigned int nr_bytes)
+{
+ blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @what: the action
+ * @error: error, if any
+ *
+ * Description:
+ * Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+ u32 what, int error)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ if (!error && !bio_flagged(bio, BIO_UPTODATE))
+ error = EIO;
+
+ __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
+ bio->bi_rw, what, error, 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(void *ignore,
+ struct request_queue *q, struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+}
+
+static void blk_add_trace_bio_complete(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ int error)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+}
+
+static void blk_add_trace_bio_backmerge(void *ignore,
+ struct request_queue *q,
+ struct request *rq,
+ struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+}
+
+static void blk_add_trace_bio_frontmerge(void *ignore,
+ struct request_queue *q,
+ struct request *rq,
+ struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+}
+
+static void blk_add_trace_bio_queue(void *ignore,
+ struct request_queue *q, struct bio *bio)
+{
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+}
+
+static void blk_add_trace_getrq(void *ignore,
+ struct request_queue *q,
+ struct bio *bio, int rw)
+{
+ if (bio)
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+ else {
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt)
+ __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ }
+}
+
+
+static void blk_add_trace_sleeprq(void *ignore,
+ struct request_queue *q,
+ struct bio *bio, int rw)
+{
+ if (bio)
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+ else {
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt)
+ __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+ 0, 0, NULL);
+ }
+}
+
+static void blk_add_trace_plug(void *ignore, struct request_queue *q)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt)
+ __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
+ unsigned int depth, bool explicit)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt) {
+ __be64 rpdu = cpu_to_be64(depth);
+ u32 what;
+
+ if (explicit)
+ what = BLK_TA_UNPLUG_IO;
+ else
+ what = BLK_TA_UNPLUG_TIMER;
+
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ }
+}
+
+static void blk_add_trace_split(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ unsigned int pdu)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (bt) {
+ __be64 rpdu = cpu_to_be64(pdu);
+
+ __blk_add_trace(bt, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
+ !bio_flagged(bio, BIO_UPTODATE),
+ sizeof(rpdu), &rpdu);
+ }
+}
+
+/**
+ * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
+ * @ignore: trace callback data parameter (not used)
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @dev: target device
+ * @from: source sector
+ *
+ * Description:
+ * Device mapper or raid target sometimes need to split a bio because
+ * it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_bio_remap(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
+{
+ struct blk_trace *bt = q->blk_trace;
+ struct blk_io_trace_remap r;
+
+ if (likely(!bt))
+ return;
+
+ r.device_from = cpu_to_be32(dev);
+ r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.sector_from = cpu_to_be64(from);
+
+ __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
+ bio->bi_rw, BLK_TA_REMAP,
+ !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_trace_rq_remap - Add a trace for a request-remap operation
+ * @ignore: trace callback data parameter (not used)
+ * @q: queue the io is for
+ * @rq: the source request
+ * @dev: target device
+ * @from: source sector
+ *
+ * Description:
+ * Device mapper remaps request to other devices.
+ * Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_rq_remap(void *ignore,
+ struct request_queue *q,
+ struct request *rq, dev_t dev,
+ sector_t from)
+{
+ struct blk_trace *bt = q->blk_trace;
+ struct blk_io_trace_remap r;
+
+ if (likely(!bt))
+ return;
+
+ r.device_from = cpu_to_be32(dev);
+ r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
+ r.sector_from = cpu_to_be64(from);
+
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+ rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+ sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q: queue the io is for
+ * @rq: io request
+ * @data: driver-specific data
+ * @len: length of driver-specific data
+ *
+ * Description:
+ * Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+ struct request *rq,
+ void *data, size_t len)
+{
+ struct blk_trace *bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+ BLK_TA_DRV_DATA, rq->errors, len, data);
+ else
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+ BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static void blk_register_tracepoints(void)
+{
+ int ret;
+
+ ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_plug(blk_add_trace_plug, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_split(blk_add_trace_split, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
+ WARN_ON(ret);
+ ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
+ WARN_ON(ret);
+}
+
+static void blk_unregister_tracepoints(void)
+{
+ unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
+ unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
+ unregister_trace_block_split(blk_add_trace_split, NULL);
+ unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
+ unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
+ unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
+ unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
+ unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
+ unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
+ unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
+ unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
+ unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
+ unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
+ unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
+ unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
+
+ tracepoint_synchronize_unregister();
+}
+
+/*
+ * struct blk_io_tracer formatting routines
+ */
+
+static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+{
+ int i = 0;
+ int tc = t->action >> BLK_TC_SHIFT;
+
+ if (t->action == BLK_TN_MESSAGE) {
+ rwbs[i++] = 'N';
+ goto out;
+ }
+
+ if (tc & BLK_TC_FLUSH)
+ rwbs[i++] = 'F';
+
+ if (tc & BLK_TC_DISCARD)
+ rwbs[i++] = 'D';
+ else if (tc & BLK_TC_WRITE)
+ rwbs[i++] = 'W';
+ else if (t->bytes)
+ rwbs[i++] = 'R';
+ else
+ rwbs[i++] = 'N';
+
+ if (tc & BLK_TC_FUA)
+ rwbs[i++] = 'F';
+ if (tc & BLK_TC_AHEAD)
+ rwbs[i++] = 'A';
+ if (tc & BLK_TC_SYNC)
+ rwbs[i++] = 'S';
+ if (tc & BLK_TC_META)
+ rwbs[i++] = 'M';
+out:
+ rwbs[i] = '\0';
+}
+
+static inline
+const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+{
+ return (const struct blk_io_trace *)ent;
+}
+
+static inline const void *pdu_start(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent) + 1;
+}
+
+static inline u32 t_action(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->bytes;
+}
+
+static inline u32 t_sec(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->bytes >> 9;
+}
+
+static inline unsigned long long t_sector(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->sector;
+}
+
+static inline __u16 t_error(const struct trace_entry *ent)
+{
+ return te_blk_io_trace(ent)->error;
+}
+
+static __u64 get_pdu_int(const struct trace_entry *ent)
+{
+ const __u64 *val = pdu_start(ent);
+ return be64_to_cpu(*val);
+}
+
+static void get_pdu_remap(const struct trace_entry *ent,
+ struct blk_io_trace_remap *r)
+{
+ const struct blk_io_trace_remap *__r = pdu_start(ent);
+ __u64 sector_from = __r->sector_from;
+
+ r->device_from = be32_to_cpu(__r->device_from);
+ r->device_to = be32_to_cpu(__r->device_to);
+ r->sector_from = be64_to_cpu(sector_from);
+}
+
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+{
+ char rwbs[RWBS_LEN];
+ unsigned long long ts = iter->ts;
+ unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
+ unsigned secs = (unsigned long)ts;
+ const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+
+ fill_rwbs(rwbs, t);
+
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), iter->cpu,
+ secs, nsec_rem, iter->ent->pid, act, rwbs);
+}
+
+static void blk_log_action(struct trace_iterator *iter, const char *act)
+{
+ char rwbs[RWBS_LEN];
+ const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+
+ fill_rwbs(rwbs, t);
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+ const unsigned char *pdu_buf;
+ int pdu_len;
+ int i, end;
+
+ pdu_buf = pdu_start(ent);
+ pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+ if (!pdu_len)
+ return;
+
+ /* find the last zero that needs to be printed */
+ for (end = pdu_len - 1; end >= 0; end--)
+ if (pdu_buf[end])
+ break;
+ end++;
+
+ trace_seq_putc(s, '(');
+
+ for (i = 0; i < pdu_len; i++) {
+
+ trace_seq_printf(s, "%s%02x",
+ i == 0 ? "" : " ", pdu_buf[i]);
+
+ /*
+ * stop when the rest is just zeroes and indicate so
+ * with a ".." appended
+ */
+ if (i == end && end != pdu_len - 1) {
+ trace_seq_puts(s, " ..) ");
+ return;
+ }
+ }
+
+ trace_seq_puts(s, ") ");
+}
+
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+ trace_seq_printf(s, "%u ", t_bytes(ent));
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%s]\n", cmd);
+ } else {
+ if (t_sec(ent))
+ trace_seq_printf(s, "%llu + %u [%s]\n",
+ t_sector(ent), t_sec(ent), cmd);
+ else
+ trace_seq_printf(s, "[%s]\n", cmd);
+ }
+}
+
+static void blk_log_with_error(struct trace_seq *s,
+ const struct trace_entry *ent)
+{
+ if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%d]\n", t_error(ent));
+ } else {
+ if (t_sec(ent))
+ trace_seq_printf(s, "%llu + %u [%d]\n",
+ t_sector(ent),
+ t_sec(ent), t_error(ent));
+ else
+ trace_seq_printf(s, "%llu [%d]\n",
+ t_sector(ent), t_error(ent));
+ }
+}
+
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+{
+ struct blk_io_trace_remap r = { .device_from = 0, };
+
+ get_pdu_remap(ent, &r);
+ trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+ t_sector(ent), t_sec(ent),
+ MAJOR(r.device_from), MINOR(r.device_from),
+ (unsigned long long)r.sector_from);
+}
+
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ trace_seq_printf(s, "[%s]\n", cmd);
+}
+
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+}
+
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+{
+ char cmd[TASK_COMM_LEN];
+
+ trace_find_cmdline(ent->pid, cmd);
+
+ trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+ get_pdu_int(ent), cmd);
+}
+
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+{
+ const struct blk_io_trace *t = te_blk_io_trace(ent);
+
+ trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putc(s, '\n');
+}
+
+/*
+ * struct tracer operations
+ */
+
+static void blk_tracer_print_header(struct seq_file *m)
+{
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ return;
+ seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
+ "# | | | | | |\n");
+}
+
+static void blk_tracer_start(struct trace_array *tr)
+{
+ blk_tracer_enabled = true;
+}
+
+static int blk_tracer_init(struct trace_array *tr)
+{
+ blk_tr = tr;
+ blk_tracer_start(tr);
+ return 0;
+}
+
+static void blk_tracer_stop(struct trace_array *tr)
+{
+ blk_tracer_enabled = false;
+}
+
+static void blk_tracer_reset(struct trace_array *tr)
+{
+ blk_tracer_stop(tr);
+}
+
+static const struct {
+ const char *act[2];
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent);
+} what2act[] = {
+ [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
+ [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
+ [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
+ [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
+ [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
+ [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
+ [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
+ [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
+ [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
+ [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
+ [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
+ [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
+ [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
+ [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
+ [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
+};
+
+static enum print_line_t print_one_line(struct trace_iterator *iter,
+ bool classic)
+{
+ struct trace_seq *s = &iter->seq;
+ const struct blk_io_trace *t;
+ u16 what;
+ bool long_act;
+ blk_log_action_t *log_action;
+
+ t = te_blk_io_trace(iter->ent);
+ what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+ long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+ log_action = classic ? &blk_log_action_classic : &blk_log_action;
+
+ if (t->action == BLK_TN_MESSAGE) {
+ log_action(iter, long_act ? "message" : "m");
+ blk_log_msg(s, iter->ent);
+ }
+
+ if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
+ trace_seq_printf(s, "Unknown action %x\n", what);
+ else {
+ log_action(iter, what2act[what].act[long_act]);
+ what2act[what].print(s, iter->ent);
+ }
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ return print_one_line(iter, false);
+}
+
+static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
+ const int offset = offsetof(struct blk_io_trace, sector);
+ struct blk_io_trace old = {
+ .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
+ .time = iter->ts,
+ };
+
+ trace_seq_putmem(s, &old, offset);
+ trace_seq_putmem(s, &t->sector,
+ sizeof(old) - offset + t->pdu_len);
+}
+
+static enum print_line_t
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ blk_trace_synthesize_old_trace(iter);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
+{
+ if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ return TRACE_TYPE_UNHANDLED;
+
+ return print_one_line(iter, true);
+}
+
+static int
+blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ /* don't output context-info for blk_classic output */
+ if (bit == TRACE_BLK_OPT_CLASSIC) {
+ if (set)
+ trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+ else
+ trace_flags |= TRACE_ITER_CONTEXT_INFO;
+ }
+ return 0;
+}
+
+static struct tracer blk_tracer __read_mostly = {
+ .name = "blk",
+ .init = blk_tracer_init,
+ .reset = blk_tracer_reset,
+ .start = blk_tracer_start,
+ .stop = blk_tracer_stop,
+ .print_header = blk_tracer_print_header,
+ .print_line = blk_tracer_print_line,
+ .flags = &blk_tracer_flags,
+ .set_flag = blk_tracer_set_flag,
+};
+
+static struct trace_event_functions trace_blk_event_funcs = {
+ .trace = blk_trace_event_print,
+ .binary = blk_trace_event_print_binary,
+};
+
+static struct trace_event trace_blk_event = {
+ .type = TRACE_BLK,
+ .funcs = &trace_blk_event_funcs,
+};
+
+static int __init init_blk_tracer(void)
+{
+ if (!register_ftrace_event(&trace_blk_event)) {
+ pr_warning("Warning: could not register block events\n");
+ return 1;
+ }
+
+ if (register_tracer(&blk_tracer) != 0) {
+ pr_warning("Warning: could not register the block tracer\n");
+ unregister_ftrace_event(&trace_blk_event);
+ return 1;
+ }
+
+ return 0;
+}
+
+device_initcall(init_blk_tracer);
+
+static int blk_trace_remove_queue(struct request_queue *q)
+{
+ struct blk_trace *bt;
+
+ bt = xchg(&q->blk_trace, NULL);
+ if (bt == NULL)
+ return -EINVAL;
+
+ if (atomic_dec_and_test(&blk_probes_ref))
+ blk_unregister_tracepoints();
+
+ blk_trace_free(bt);
+ return 0;
+}
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup_queue(struct request_queue *q,
+ struct block_device *bdev)
+{
+ struct blk_trace *old_bt, *bt = NULL;
+ int ret = -ENOMEM;
+
+ bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+ if (!bt)
+ return -ENOMEM;
+
+ bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
+ if (!bt->msg_data)
+ goto free_bt;
+
+ bt->dev = bdev->bd_dev;
+ bt->act_mask = (u16)-1;
+
+ blk_trace_setup_lba(bt, bdev);
+
+ old_bt = xchg(&q->blk_trace, bt);
+ if (old_bt != NULL) {
+ (void)xchg(&q->blk_trace, old_bt);
+ ret = -EBUSY;
+ goto free_bt;
+ }
+
+ if (atomic_inc_return(&blk_probes_ref) == 1)
+ blk_register_tracepoints();
+ return 0;
+
+free_bt:
+ blk_trace_free(bt);
+ return ret;
+}
+
+/*
+ * sysfs interface to enable and configure tracing
+ */
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count);
+#define BLK_TRACE_DEVICE_ATTR(_name) \
+ DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
+ sysfs_blk_trace_attr_show, \
+ sysfs_blk_trace_attr_store)
+
+static BLK_TRACE_DEVICE_ATTR(enable);
+static BLK_TRACE_DEVICE_ATTR(act_mask);
+static BLK_TRACE_DEVICE_ATTR(pid);
+static BLK_TRACE_DEVICE_ATTR(start_lba);
+static BLK_TRACE_DEVICE_ATTR(end_lba);
+
+static struct attribute *blk_trace_attrs[] = {
+ &dev_attr_enable.attr,
+ &dev_attr_act_mask.attr,
+ &dev_attr_pid.attr,
+ &dev_attr_start_lba.attr,
+ &dev_attr_end_lba.attr,
+ NULL
+};
+
+struct attribute_group blk_trace_attr_group = {
+ .name = "trace",
+ .attrs = blk_trace_attrs,
+};
+
+static const struct {
+ int mask;
+ const char *str;
+} mask_maps[] = {
+ { BLK_TC_READ, "read" },
+ { BLK_TC_WRITE, "write" },
+ { BLK_TC_FLUSH, "flush" },
+ { BLK_TC_SYNC, "sync" },
+ { BLK_TC_QUEUE, "queue" },
+ { BLK_TC_REQUEUE, "requeue" },
+ { BLK_TC_ISSUE, "issue" },
+ { BLK_TC_COMPLETE, "complete" },
+ { BLK_TC_FS, "fs" },
+ { BLK_TC_PC, "pc" },
+ { BLK_TC_AHEAD, "ahead" },
+ { BLK_TC_META, "meta" },
+ { BLK_TC_DISCARD, "discard" },
+ { BLK_TC_DRV_DATA, "drv_data" },
+ { BLK_TC_FUA, "fua" },
+};
+
+static int blk_trace_str2mask(const char *str)
+{
+ int i;
+ int mask = 0;
+ char *buf, *s, *token;
+
+ buf = kstrdup(str, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+ s = strstrip(buf);
+
+ while (1) {
+ token = strsep(&s, ",");
+ if (token == NULL)
+ break;
+
+ if (*token == '\0')
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+ if (strcasecmp(token, mask_maps[i].str) == 0) {
+ mask |= mask_maps[i].mask;
+ break;
+ }
+ }
+ if (i == ARRAY_SIZE(mask_maps)) {
+ mask = -EINVAL;
+ break;
+ }
+ }
+ kfree(buf);
+
+ return mask;
+}
+
+static ssize_t blk_trace_mask2str(char *buf, int mask)
+{
+ int i;
+ char *p = buf;
+
+ for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+ if (mask & mask_maps[i].mask) {
+ p += sprintf(p, "%s%s",
+ (p == buf) ? "" : ",", mask_maps[i].str);
+ }
+ }
+ *p++ = '\n';
+
+ return p - buf;
+}
+
+static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
+{
+ if (bdev->bd_disk == NULL)
+ return NULL;
+
+ return bdev_get_queue(bdev);
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ struct request_queue *q;
+ struct block_device *bdev;
+ ssize_t ret = -ENXIO;
+
+ bdev = bdget(part_devt(p));
+ if (bdev == NULL)
+ goto out;
+
+ q = blk_trace_get_queue(bdev);
+ if (q == NULL)
+ goto out_bdput;
+
+ mutex_lock(&bdev->bd_mutex);
+
+ if (attr == &dev_attr_enable) {
+ ret = sprintf(buf, "%u\n", !!q->blk_trace);
+ goto out_unlock_bdev;
+ }
+
+ if (q->blk_trace == NULL)
+ ret = sprintf(buf, "disabled\n");
+ else if (attr == &dev_attr_act_mask)
+ ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
+ else if (attr == &dev_attr_pid)
+ ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+ else if (attr == &dev_attr_start_lba)
+ ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+ else if (attr == &dev_attr_end_lba)
+ ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+
+out_unlock_bdev:
+ mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+ bdput(bdev);
+out:
+ return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct block_device *bdev;
+ struct request_queue *q;
+ struct hd_struct *p;
+ u64 value;
+ ssize_t ret = -EINVAL;
+
+ if (count == 0)
+ goto out;
+
+ if (attr == &dev_attr_act_mask) {
+ if (sscanf(buf, "%llx", &value) != 1) {
+ /* Assume it is a list of trace category names */
+ ret = blk_trace_str2mask(buf);
+ if (ret < 0)
+ goto out;
+ value = ret;
+ }
+ } else if (sscanf(buf, "%llu", &value) != 1)
+ goto out;
+
+ ret = -ENXIO;
+
+ p = dev_to_part(dev);
+ bdev = bdget(part_devt(p));
+ if (bdev == NULL)
+ goto out;
+
+ q = blk_trace_get_queue(bdev);
+ if (q == NULL)
+ goto out_bdput;
+
+ mutex_lock(&bdev->bd_mutex);
+
+ if (attr == &dev_attr_enable) {
+ if (value)
+ ret = blk_trace_setup_queue(q, bdev);
+ else
+ ret = blk_trace_remove_queue(q);
+ goto out_unlock_bdev;
+ }
+
+ ret = 0;
+ if (q->blk_trace == NULL)
+ ret = blk_trace_setup_queue(q, bdev);
+
+ if (ret == 0) {
+ if (attr == &dev_attr_act_mask)
+ q->blk_trace->act_mask = value;
+ else if (attr == &dev_attr_pid)
+ q->blk_trace->pid = value;
+ else if (attr == &dev_attr_start_lba)
+ q->blk_trace->start_lba = value;
+ else if (attr == &dev_attr_end_lba)
+ q->blk_trace->end_lba = value;
+ }
+
+out_unlock_bdev:
+ mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+ bdput(bdev);
+out:
+ return ret ? ret : count;
+}
+
+int blk_trace_init_sysfs(struct device *dev)
+{
+ return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
+void blk_trace_remove_sysfs(struct device *dev)
+{
+ sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
+}
+
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+ int i, end;
+ int len = rq->cmd_len;
+ unsigned char *cmd = rq->cmd;
+
+ if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
+ buf[0] = '\0';
+ return;
+ }
+
+ for (end = len - 1; end >= 0; end--)
+ if (cmd[end])
+ break;
+ end++;
+
+ for (i = 0; i < len; i++) {
+ buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+ if (i == end && end != len - 1) {
+ sprintf(buf, " ..");
+ break;
+ }
+ }
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+ int i = 0;
+
+ if (rw & REQ_FLUSH)
+ rwbs[i++] = 'F';
+
+ if (rw & WRITE)
+ rwbs[i++] = 'W';
+ else if (rw & REQ_DISCARD)
+ rwbs[i++] = 'D';
+ else if (bytes)
+ rwbs[i++] = 'R';
+ else
+ rwbs[i++] = 'N';
+
+ if (rw & REQ_FUA)
+ rwbs[i++] = 'F';
+ if (rw & REQ_RAHEAD)
+ rwbs[i++] = 'A';
+ if (rw & REQ_SYNC)
+ rwbs[i++] = 'S';
+ if (rw & REQ_META)
+ rwbs[i++] = 'M';
+ if (rw & REQ_SECURE)
+ rwbs[i++] = 'E';
+
+ rwbs[i] = '\0';
+}
+EXPORT_SYMBOL_GPL(blk_fill_rwbs);
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000..2d56ce501
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,222 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
+#include "trace.h"
+
+static DEFINE_PER_CPU(int, bpf_prog_active);
+
+/**
+ * trace_call_bpf - invoke BPF program
+ * @prog: BPF program
+ * @ctx: opaque context pointer
+ *
+ * kprobe handlers execute BPF programs via this helper.
+ * Can be used from static tracepoints in the future.
+ *
+ * Return: BPF programs always return an integer which is interpreted by
+ * kprobe handler as:
+ * 0 - return from kprobe (event is filtered out)
+ * 1 - store kprobe event into ring buffer
+ * Other values are reserved and currently alias to 1
+ */
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+ unsigned int ret;
+
+ if (in_nmi()) /* not supported yet */
+ return 1;
+
+ preempt_disable();
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ /*
+ * since some bpf program is already running on this cpu,
+ * don't call into another bpf program (same or different)
+ * and don't send kprobe event into ring-buffer,
+ * so return zero here
+ */
+ ret = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ ret = BPF_PROG_RUN(prog, ctx);
+ rcu_read_unlock();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *dst = (void *) (long) r1;
+ int size = (int) r2;
+ void *unsafe_ptr = (void *) (long) r3;
+
+ return probe_kernel_read(dst, unsafe_ptr, size);
+}
+
+static const struct bpf_func_proto bpf_probe_read_proto = {
+ .func = bpf_probe_read,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* NMI safe access to clock monotonic */
+ return ktime_get_mono_fast_ns();
+}
+
+static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+};
+
+/*
+ * limited trace_printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+ char *fmt = (char *) (long) r1;
+ int mod[3] = {};
+ int fmt_cnt = 0;
+ int i;
+
+ /*
+ * bpf_check()->check_func_arg()->check_stack_boundary()
+ * guarantees that fmt points to bpf program stack,
+ * fmt_size bytes of it were initialized and fmt_size > 0
+ */
+ if (fmt[--fmt_size] != 0)
+ return -EINVAL;
+
+ /* check format string for allowed specifiers */
+ for (i = 0; i < fmt_size; i++) {
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+ return -EINVAL;
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt_cnt >= 3)
+ return -EINVAL;
+
+ /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+ i++;
+ if (fmt[i] == 'l') {
+ mod[fmt_cnt]++;
+ i++;
+ } else if (fmt[i] == 'p') {
+ mod[fmt_cnt]++;
+ i++;
+ if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
+ return -EINVAL;
+ fmt_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'l') {
+ mod[fmt_cnt]++;
+ i++;
+ }
+
+ if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+ return -EINVAL;
+ fmt_cnt++;
+ }
+
+ return __trace_printk(1/* fake ip will not be printed */, fmt,
+ mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3,
+ mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4,
+ mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5);
+}
+
+static const struct bpf_func_proto bpf_trace_printk_proto = {
+ .func = bpf_trace_printk,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_probe_read:
+ return &bpf_probe_read_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+
+ case BPF_FUNC_trace_printk:
+ /*
+ * this program might be calling bpf_trace_printk,
+ * so allocate per-cpu printk buffers
+ */
+ trace_printk_init_buffers();
+
+ return &bpf_trace_printk_proto;
+ default:
+ return NULL;
+ }
+}
+
+/* bpf+kprobe programs can access fields of 'struct pt_regs' */
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /* check bounds */
+ if (off < 0 || off >= sizeof(struct pt_regs))
+ return false;
+
+ /* only read is allowed */
+ if (type != BPF_READ)
+ return false;
+
+ /* disallow misaligned access */
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+
+static struct bpf_verifier_ops kprobe_prog_ops = {
+ .get_func_proto = kprobe_prog_func_proto,
+ .is_valid_access = kprobe_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list kprobe_tl = {
+ .ops = &kprobe_prog_ops,
+ .type = BPF_PROG_TYPE_KPROBE,
+};
+
+static int __init register_kprobe_prog_ops(void)
+{
+ bpf_register_prog_type(&kprobe_tl);
+ return 0;
+}
+late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000..02bece4a9
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,5951 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 Nadia Yvette Chambers
+ */
+
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/suspend.h>
+#include <linux/tracefs.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <linux/bsearch.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/sort.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/rcupdate.h>
+
+#include <trace/events/sched.h>
+
+#include <asm/setup.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+
+#define FTRACE_WARN_ON(cond) \
+ ({ \
+ int ___r = cond; \
+ if (WARN_ON(___r)) \
+ ftrace_kill(); \
+ ___r; \
+ })
+
+#define FTRACE_WARN_ON_ONCE(cond) \
+ ({ \
+ int ___r = cond; \
+ if (WARN_ON_ONCE(___r)) \
+ ftrace_kill(); \
+ ___r; \
+ })
+
+/* hash bits for specific function selection */
+#define FTRACE_HASH_BITS 7
+#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
+#define FTRACE_HASH_DEFAULT_BITS 10
+#define FTRACE_HASH_MAX_BITS 12
+
+#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define INIT_OPS_HASH(opsname) \
+ .func_hash = &opsname.local_hash, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+#define ASSIGN_OPS_HASH(opsname, val) \
+ .func_hash = val, \
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+#else
+#define INIT_OPS_HASH(opsname)
+#define ASSIGN_OPS_HASH(opsname, val)
+#endif
+
+static struct ftrace_ops ftrace_list_end __read_mostly = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
+ INIT_OPS_HASH(ftrace_list_end)
+};
+
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
+static int last_ftrace_enabled;
+
+/* Current function tracing op */
+struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
+/* What to set function_trace_op to */
+static struct ftrace_ops *set_function_trace_op;
+
+/* List for set_ftrace_pid's pids. */
+LIST_HEAD(ftrace_pids);
+struct ftrace_pid {
+ struct list_head list;
+ struct pid *pid;
+};
+
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+
+static DEFINE_MUTEX(ftrace_lock);
+
+static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
+static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
+static struct ftrace_ops global_ops;
+static struct ftrace_ops control_ops;
+
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+#else
+/* See comment below, where ftrace_ops_list_func is defined */
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
+#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
+#endif
+
+/*
+ * Traverse the ftrace_global_list, invoking all entries. The reason that we
+ * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
+ * concurrent insertions into the ftrace_global_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
+#define do_for_each_ftrace_op(op, list) \
+ op = rcu_dereference_raw_notrace(list); \
+ do
+
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ */
+#define while_for_each_ftrace_op(op) \
+ while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
+ unlikely((op) != &ftrace_list_end))
+
+static inline void ftrace_ops_init(struct ftrace_ops *ops)
+{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
+ mutex_init(&ops->local_hash.regex_lock);
+ ops->func_hash = &ops->local_hash;
+ ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
+}
+
+/**
+ * ftrace_nr_registered_ops - return number of ops registered
+ *
+ * Returns the number of ftrace_ops registered and tracing functions
+ */
+int ftrace_nr_registered_ops(void)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ mutex_lock(&ftrace_lock);
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next)
+ cnt++;
+
+ mutex_unlock(&ftrace_lock);
+
+ return cnt;
+}
+
+static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ if (!test_tsk_trace_trace(current))
+ return;
+
+ ftrace_pid_function(ip, parent_ip, op, regs);
+}
+
+static void set_ftrace_pid_function(ftrace_func_t func)
+{
+ /* do not set ftrace_pid_function to itself! */
+ if (func != ftrace_pid_func)
+ ftrace_pid_function = func;
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing. There may be lag
+ */
+void clear_ftrace_function(void)
+{
+ ftrace_trace_function = ftrace_stub;
+ ftrace_pid_function = ftrace_stub;
+}
+
+static void control_ops_disable_all(struct ftrace_ops *ops)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(ops->disabled, cpu) = 1;
+}
+
+static int control_ops_alloc(struct ftrace_ops *ops)
+{
+ int __percpu *disabled;
+
+ disabled = alloc_percpu(int);
+ if (!disabled)
+ return -ENOMEM;
+
+ ops->disabled = disabled;
+ control_ops_disable_all(ops);
+ return 0;
+}
+
+static void ftrace_sync(struct work_struct *work)
+{
+ /*
+ * This function is just a stub to implement a hard force
+ * of synchronize_sched(). This requires synchronizing
+ * tasks even in userspace and idle.
+ *
+ * Yes, function tracing is rude.
+ */
+}
+
+static void ftrace_sync_ipi(void *data)
+{
+ /* Probably not needed, but do it anyway */
+ smp_rmb();
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void update_function_graph_func(void);
+#else
+static inline void update_function_graph_func(void) { }
+#endif
+
+
+static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
+{
+ /*
+ * If this is a dynamic ops or we force list func,
+ * then it needs to call the list anyway.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ return ftrace_ops_list_func;
+
+ return ftrace_ops_get_func(ops);
+}
+
+static void update_ftrace_function(void)
+{
+ ftrace_func_t func;
+
+ /*
+ * Prepare the ftrace_ops that the arch callback will use.
+ * If there's only one ftrace_ops registered, the ftrace_ops_list
+ * will point to the ops we want.
+ */
+ set_function_trace_op = ftrace_ops_list;
+
+ /* If there's no ftrace_ops registered, just call the stub function */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ func = ftrace_stub;
+
+ /*
+ * If we are at the end of the list and this ops is
+ * recursion safe and not dynamic and the arch supports passing ops,
+ * then have the mcount trampoline call the function directly.
+ */
+ } else if (ftrace_ops_list->next == &ftrace_list_end) {
+ func = ftrace_ops_get_list_func(ftrace_ops_list);
+
+ } else {
+ /* Just use the default ftrace_ops */
+ set_function_trace_op = &ftrace_list_end;
+ func = ftrace_ops_list_func;
+ }
+
+ update_function_graph_func();
+
+ /* If there's no change, then do nothing more here */
+ if (ftrace_trace_function == func)
+ return;
+
+ /*
+ * If we are using the list function, it doesn't care
+ * about the function_trace_ops.
+ */
+ if (func == ftrace_ops_list_func) {
+ ftrace_trace_function = func;
+ /*
+ * Don't even bother setting function_trace_ops,
+ * it would be racy to do so anyway.
+ */
+ return;
+ }
+
+#ifndef CONFIG_DYNAMIC_FTRACE
+ /*
+ * For static tracing, we need to be a bit more careful.
+ * The function change takes affect immediately. Thus,
+ * we need to coorditate the setting of the function_trace_ops
+ * with the setting of the ftrace_trace_function.
+ *
+ * Set the function to the list ops, which will call the
+ * function we want, albeit indirectly, but it handles the
+ * ftrace_ops and doesn't depend on function_trace_op.
+ */
+ ftrace_trace_function = ftrace_ops_list_func;
+ /*
+ * Make sure all CPUs see this. Yes this is slow, but static
+ * tracing is slow and nasty to have enabled.
+ */
+ schedule_on_each_cpu(ftrace_sync);
+ /* Now all cpus are using the list ops. */
+ function_trace_op = set_function_trace_op;
+ /* Make sure the function_trace_op is visible on all CPUs */
+ smp_wmb();
+ /* Nasty way to force a rmb on all cpus */
+ smp_call_function(ftrace_sync_ipi, NULL, 1);
+ /* OK, we are all set to update the ftrace_trace_function now! */
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+ ftrace_trace_function = func;
+}
+
+int using_ftrace_ops_list_func(void)
+{
+ return ftrace_trace_function == ftrace_ops_list_func;
+}
+
+static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+{
+ ops->next = *list;
+ /*
+ * We are entering ops into the list but another
+ * CPU might be walking that list. We need to make sure
+ * the ops->next pointer is valid before another CPU sees
+ * the ops pointer included into the list.
+ */
+ rcu_assign_pointer(*list, ops);
+}
+
+static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+{
+ struct ftrace_ops **p;
+
+ /*
+ * If we are removing the last function, then simply point
+ * to the ftrace_stub.
+ */
+ if (*list == ops && ops->next == &ftrace_list_end) {
+ *list = &ftrace_list_end;
+ return 0;
+ }
+
+ for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
+ if (*p == ops)
+ break;
+
+ if (*p != ops)
+ return -1;
+
+ *p = (*p)->next;
+ return 0;
+}
+
+static void add_ftrace_list_ops(struct ftrace_ops **list,
+ struct ftrace_ops *main_ops,
+ struct ftrace_ops *ops)
+{
+ int first = *list == &ftrace_list_end;
+ add_ftrace_ops(list, ops);
+ if (first)
+ add_ftrace_ops(&ftrace_ops_list, main_ops);
+}
+
+static int remove_ftrace_list_ops(struct ftrace_ops **list,
+ struct ftrace_ops *main_ops,
+ struct ftrace_ops *ops)
+{
+ int ret = remove_ftrace_ops(list, ops);
+ if (!ret && *list == &ftrace_list_end)
+ ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
+ return ret;
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+ if (ops->flags & FTRACE_OPS_FL_DELETED)
+ return -EINVAL;
+
+ if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EBUSY;
+
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+ /*
+ * If the ftrace_ops specifies SAVE_REGS, then it only can be used
+ * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
+ * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
+ */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
+ !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
+ return -EINVAL;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
+ ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
+#endif
+
+ if (!core_kernel_data((unsigned long)ops))
+ ops->flags |= FTRACE_OPS_FL_DYNAMIC;
+
+ if (ops->flags & FTRACE_OPS_FL_CONTROL) {
+ if (control_ops_alloc(ops))
+ return -ENOMEM;
+ add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
+ /* The control_ops needs the trampoline update */
+ ops = &control_ops;
+ } else
+ add_ftrace_ops(&ftrace_ops_list, ops);
+
+ ftrace_update_trampoline(ops);
+
+ if (ftrace_enabled)
+ update_ftrace_function();
+
+ return 0;
+}
+
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
+{
+ int ret;
+
+ if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
+ return -EBUSY;
+
+ if (ops->flags & FTRACE_OPS_FL_CONTROL) {
+ ret = remove_ftrace_list_ops(&ftrace_control_list,
+ &control_ops, ops);
+ } else
+ ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+
+ if (ret < 0)
+ return ret;
+
+ if (ftrace_enabled)
+ update_ftrace_function();
+
+ return 0;
+}
+
+static void ftrace_update_pid_func(void)
+{
+ /* Only do something if we are tracing something */
+ if (ftrace_trace_function == ftrace_stub)
+ return;
+
+ update_ftrace_function();
+}
+
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+ struct hlist_node node;
+ unsigned long ip;
+ unsigned long counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ unsigned long long time;
+ unsigned long long time_squared;
+#endif
+};
+
+struct ftrace_profile_page {
+ struct ftrace_profile_page *next;
+ unsigned long index;
+ struct ftrace_profile records[];
+};
+
+struct ftrace_profile_stat {
+ atomic_t disabled;
+ struct hlist_head *hash;
+ struct ftrace_profile_page *pages;
+ struct ftrace_profile_page *start;
+ struct tracer_stat stat;
+};
+
+#define PROFILE_RECORDS_SIZE \
+ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
+
+#define PROFILES_PER_PAGE \
+ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
+
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
+
+#define FTRACE_PROFILE_HASH_BITS 10
+#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
+
+static void *
+function_stat_next(void *v, int idx)
+{
+ struct ftrace_profile *rec = v;
+ struct ftrace_profile_page *pg;
+
+ pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+ if (idx != 0)
+ rec++;
+
+ if ((void *)rec >= (void *)&pg->records[pg->index]) {
+ pg = pg->next;
+ if (!pg)
+ return NULL;
+ rec = &pg->records[0];
+ if (!rec->counter)
+ goto again;
+ }
+
+ return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+ struct ftrace_profile_stat *stat =
+ container_of(trace, struct ftrace_profile_stat, stat);
+
+ if (!stat || !stat->start)
+ return NULL;
+
+ return function_stat_next(&stat->start->records[0], 0);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_profile *a = p1;
+ struct ftrace_profile *b = p2;
+
+ if (a->time < b->time)
+ return -1;
+ if (a->time > b->time)
+ return 1;
+ else
+ return 0;
+}
+#else
+/* not function graph compares against hits */
+static int function_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_profile *a = p1;
+ struct ftrace_profile *b = p2;
+
+ if (a->counter < b->counter)
+ return -1;
+ if (a->counter > b->counter)
+ return 1;
+ else
+ return 0;
+}
+#endif
+
+static int function_stat_headers(struct seq_file *m)
+{
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ seq_puts(m, " Function "
+ "Hit Time Avg s^2\n"
+ " -------- "
+ "--- ---- --- ---\n");
+#else
+ seq_puts(m, " Function Hit\n"
+ " -------- ---\n");
+#endif
+ return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_profile *rec = v;
+ char str[KSYM_SYMBOL_LEN];
+ int ret = 0;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ static struct trace_seq s;
+ unsigned long long avg;
+ unsigned long long stddev;
+#endif
+ mutex_lock(&ftrace_profile_lock);
+
+ /* we raced with function_profile_reset() */
+ if (unlikely(rec->counter == 0)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+ seq_printf(m, " %-30.30s %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ seq_puts(m, " ");
+ avg = rec->time;
+ do_div(avg, rec->counter);
+
+ /* Sample standard deviation (s^2) */
+ if (rec->counter <= 1)
+ stddev = 0;
+ else {
+ /*
+ * Apply Welford's method:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ */
+ stddev = rec->counter * rec->time_squared -
+ rec->time * rec->time;
+
+ /*
+ * Divide only 1000 for ns^2 -> us^2 conversion.
+ * trace_print_graph_duration will divide 1000 again.
+ */
+ do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
+ }
+
+ trace_seq_init(&s);
+ trace_print_graph_duration(rec->time, &s);
+ trace_seq_puts(&s, " ");
+ trace_print_graph_duration(avg, &s);
+ trace_seq_puts(&s, " ");
+ trace_print_graph_duration(stddev, &s);
+ trace_print_seq(m, &s);
+#endif
+ seq_putc(m, '\n');
+out:
+ mutex_unlock(&ftrace_profile_lock);
+
+ return ret;
+}
+
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
+{
+ struct ftrace_profile_page *pg;
+
+ pg = stat->pages = stat->start;
+
+ while (pg) {
+ memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+ pg->index = 0;
+ pg = pg->next;
+ }
+
+ memset(stat->hash, 0,
+ FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
+
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+{
+ struct ftrace_profile_page *pg;
+ int functions;
+ int pages;
+ int i;
+
+ /* If we already allocated, do nothing */
+ if (stat->pages)
+ return 0;
+
+ stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!stat->pages)
+ return -ENOMEM;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ functions = ftrace_update_tot_cnt;
+#else
+ /*
+ * We do not know the number of functions that exist because
+ * dynamic tracing is what counts them. With past experience
+ * we have around 20K functions. That should be more than enough.
+ * It is highly unlikely we will execute every function in
+ * the kernel.
+ */
+ functions = 20000;
+#endif
+
+ pg = stat->start = stat->pages;
+
+ pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+ for (i = 1; i < pages; i++) {
+ pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!pg->next)
+ goto out_free;
+ pg = pg->next;
+ }
+
+ return 0;
+
+ out_free:
+ pg = stat->start;
+ while (pg) {
+ unsigned long tmp = (unsigned long)pg;
+
+ pg = pg->next;
+ free_page(tmp);
+ }
+
+ stat->pages = NULL;
+ stat->start = NULL;
+
+ return -ENOMEM;
+}
+
+static int ftrace_profile_init_cpu(int cpu)
+{
+ struct ftrace_profile_stat *stat;
+ int size;
+
+ stat = &per_cpu(ftrace_profile_stats, cpu);
+
+ if (stat->hash) {
+ /* If the profile is already created, simply reset it */
+ ftrace_profile_reset(stat);
+ return 0;
+ }
+
+ /*
+ * We are profiling all functions, but usually only a few thousand
+ * functions are hit. We'll make a hash of 1024 items.
+ */
+ size = FTRACE_PROFILE_HASH_SIZE;
+
+ stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+ if (!stat->hash)
+ return -ENOMEM;
+
+ /* Preallocate the function profiling pages */
+ if (ftrace_profile_pages_init(stat) < 0) {
+ kfree(stat->hash);
+ stat->hash = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int ftrace_profile_init(void)
+{
+ int cpu;
+ int ret = 0;
+
+ for_each_possible_cpu(cpu) {
+ ret = ftrace_profile_init_cpu(cpu);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/* interrupts must be disabled */
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+ struct ftrace_profile *rec;
+ struct hlist_head *hhd;
+ unsigned long key;
+
+ key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
+ hhd = &stat->hash[key];
+
+ if (hlist_empty(hhd))
+ return NULL;
+
+ hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
+ if (rec->ip == ip)
+ return rec;
+ }
+
+ return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+ struct ftrace_profile *rec)
+{
+ unsigned long key;
+
+ key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
+ hlist_add_head_rcu(&rec->node, &stat->hash[key]);
+}
+
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
+static struct ftrace_profile *
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+ struct ftrace_profile *rec = NULL;
+
+ /* prevent recursion (from NMIs) */
+ if (atomic_inc_return(&stat->disabled) != 1)
+ goto out;
+
+ /*
+ * Try to find the function again since an NMI
+ * could have added it
+ */
+ rec = ftrace_find_profiled_func(stat, ip);
+ if (rec)
+ goto out;
+
+ if (stat->pages->index == PROFILES_PER_PAGE) {
+ if (!stat->pages->next)
+ goto out;
+ stat->pages = stat->pages->next;
+ }
+
+ rec = &stat->pages->records[stat->pages->index++];
+ rec->ip = ip;
+ ftrace_add_profile(stat, rec);
+
+ out:
+ atomic_dec(&stat->disabled);
+
+ return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct ftrace_profile_stat *stat;
+ struct ftrace_profile *rec;
+ unsigned long flags;
+
+ if (!ftrace_profile_enabled)
+ return;
+
+ local_irq_save(flags);
+
+ stat = this_cpu_ptr(&ftrace_profile_stats);
+ if (!stat->hash || !ftrace_profile_enabled)
+ goto out;
+
+ rec = ftrace_find_profiled_func(stat, ip);
+ if (!rec) {
+ rec = ftrace_profile_alloc(stat, ip);
+ if (!rec)
+ goto out;
+ }
+
+ rec->counter++;
+ out:
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+ function_profile_call(trace->func, 0, NULL, NULL);
+ return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct ftrace_profile_stat *stat;
+ unsigned long long calltime;
+ struct ftrace_profile *rec;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ stat = this_cpu_ptr(&ftrace_profile_stats);
+ if (!stat->hash || !ftrace_profile_enabled)
+ goto out;
+
+ /* If the calltime was zero'd ignore it */
+ if (!trace->calltime)
+ goto out;
+
+ calltime = trace->rettime - trace->calltime;
+
+ if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+ int index;
+
+ index = trace->depth;
+
+ /* Append this call time to the parent time to subtract */
+ if (index)
+ current->ret_stack[index - 1].subtime += calltime;
+
+ if (current->ret_stack[index].subtime < calltime)
+ calltime -= current->ret_stack[index].subtime;
+ else
+ calltime = 0;
+ }
+
+ rec = ftrace_find_profiled_func(stat, trace->func);
+ if (rec) {
+ rec->time += calltime;
+ rec->time_squared += calltime * calltime;
+ }
+
+ out:
+ local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+ return register_ftrace_graph(&profile_graph_return,
+ &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+ unregister_ftrace_graph();
+}
+#else
+static struct ftrace_ops ftrace_profile_ops __read_mostly = {
+ .func = function_profile_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_OPS_HASH(ftrace_profile_ops)
+};
+
+static int register_ftrace_profiler(void)
+{
+ return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+ unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ val = !!val;
+
+ mutex_lock(&ftrace_profile_lock);
+ if (ftrace_profile_enabled ^ val) {
+ if (val) {
+ ret = ftrace_profile_init();
+ if (ret < 0) {
+ cnt = ret;
+ goto out;
+ }
+
+ ret = register_ftrace_profiler();
+ if (ret < 0) {
+ cnt = ret;
+ goto out;
+ }
+ ftrace_profile_enabled = 1;
+ } else {
+ ftrace_profile_enabled = 0;
+ /*
+ * unregister_ftrace_profiler calls stop_machine
+ * so this acts like an synchronize_sched.
+ */
+ unregister_ftrace_profiler();
+ }
+ }
+ out:
+ mutex_unlock(&ftrace_profile_lock);
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64]; /* big enough to hold a number */
+ int r;
+
+ r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static const struct file_operations ftrace_profile_fops = {
+ .open = tracing_open_generic,
+ .read = ftrace_profile_read,
+ .write = ftrace_profile_write,
+ .llseek = default_llseek,
+};
+
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+ .name = "functions",
+ .stat_start = function_stat_start,
+ .stat_next = function_stat_next,
+ .stat_cmp = function_stat_cmp,
+ .stat_headers = function_stat_headers,
+ .stat_show = function_stat_show
+};
+
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
+{
+ struct ftrace_profile_stat *stat;
+ struct dentry *entry;
+ char *name;
+ int ret;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stat = &per_cpu(ftrace_profile_stats, cpu);
+
+ /* allocate enough for function name + cpu number */
+ name = kmalloc(32, GFP_KERNEL);
+ if (!name) {
+ /*
+ * The files created are permanent, if something happens
+ * we still do not free memory.
+ */
+ WARN(1,
+ "Could not allocate stat file for cpu %d\n",
+ cpu);
+ return;
+ }
+ stat->stat = function_stats;
+ snprintf(name, 32, "function%d", cpu);
+ stat->stat.name = name;
+ ret = register_stat_tracer(&stat->stat);
+ if (ret) {
+ WARN(1,
+ "Could not register function stat for cpu %d\n",
+ cpu);
+ kfree(name);
+ return;
+ }
+ }
+
+ entry = tracefs_create_file("function_profile_enabled", 0644,
+ d_tracer, NULL, &ftrace_profile_fops);
+ if (!entry)
+ pr_warning("Could not create tracefs "
+ "'function_profile_enabled' entry\n");
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int ftrace_graph_active;
+#else
+# define ftrace_graph_active 0
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static struct ftrace_ops *removed_ops;
+
+/*
+ * Set when doing a global update, like enabling all recs or disabling them.
+ * It is not set when just updating a single ftrace_ops.
+ */
+static bool update_all_ops;
+
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+# error Dynamic ftrace depends on MCOUNT_RECORD
+#endif
+
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_probe {
+ struct hlist_node node;
+ struct ftrace_probe_ops *ops;
+ unsigned long flags;
+ unsigned long ip;
+ void *data;
+ struct list_head free_list;
+};
+
+struct ftrace_func_entry {
+ struct hlist_node hlist;
+ unsigned long ip;
+};
+
+struct ftrace_hash {
+ unsigned long size_bits;
+ struct hlist_head *buckets;
+ unsigned long count;
+ struct rcu_head rcu;
+};
+
+/*
+ * We make these constant because no one should touch them,
+ * but they are used as the default "empty hash", to avoid allocating
+ * it all the time. These are in a read only section such that if
+ * anyone does try to modify it, it will cause an exception.
+ */
+static const struct hlist_head empty_buckets[1];
+static const struct ftrace_hash empty_hash = {
+ .buckets = (struct hlist_head *)empty_buckets,
+};
+#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
+
+static struct ftrace_ops global_ops = {
+ .func = ftrace_stub,
+ .local_hash.notrace_hash = EMPTY_HASH,
+ .local_hash.filter_hash = EMPTY_HASH,
+ INIT_OPS_HASH(global_ops)
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED,
+};
+
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ struct ftrace_ops *op;
+ bool ret = false;
+
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they are freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /*
+ * This is to check for dynamically allocated trampolines.
+ * Trampolines that are in kernel text will have
+ * core_kernel_text() return true.
+ */
+ if (op->trampoline && op->trampoline_size)
+ if (addr >= op->trampoline &&
+ addr < op->trampoline + op->trampoline_size) {
+ ret = true;
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+
+ out:
+ preempt_enable_notrace();
+
+ return ret;
+}
+
+struct ftrace_page {
+ struct ftrace_page *next;
+ struct dyn_ftrace *records;
+ int index;
+ int size;
+};
+
+#define ENTRY_SIZE sizeof(struct dyn_ftrace)
+#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
+
+/* estimate from running different kernels */
+#define NR_TO_INIT 10000
+
+static struct ftrace_page *ftrace_pages_start;
+static struct ftrace_page *ftrace_pages;
+
+static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
+{
+ return !hash || !hash->count;
+}
+
+static struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+{
+ unsigned long key;
+ struct ftrace_func_entry *entry;
+ struct hlist_head *hhd;
+
+ if (ftrace_hash_empty(hash))
+ return NULL;
+
+ if (hash->size_bits > 0)
+ key = hash_long(ip, hash->size_bits);
+ else
+ key = 0;
+
+ hhd = &hash->buckets[key];
+
+ hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
+ if (entry->ip == ip)
+ return entry;
+ }
+ return NULL;
+}
+
+static void __add_hash_entry(struct ftrace_hash *hash,
+ struct ftrace_func_entry *entry)
+{
+ struct hlist_head *hhd;
+ unsigned long key;
+
+ if (hash->size_bits)
+ key = hash_long(entry->ip, hash->size_bits);
+ else
+ key = 0;
+
+ hhd = &hash->buckets[key];
+ hlist_add_head(&entry->hlist, hhd);
+ hash->count++;
+}
+
+static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+{
+ struct ftrace_func_entry *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->ip = ip;
+ __add_hash_entry(hash, entry);
+
+ return 0;
+}
+
+static void
+free_hash_entry(struct ftrace_hash *hash,
+ struct ftrace_func_entry *entry)
+{
+ hlist_del(&entry->hlist);
+ kfree(entry);
+ hash->count--;
+}
+
+static void
+remove_hash_entry(struct ftrace_hash *hash,
+ struct ftrace_func_entry *entry)
+{
+ hlist_del(&entry->hlist);
+ hash->count--;
+}
+
+static void ftrace_hash_clear(struct ftrace_hash *hash)
+{
+ struct hlist_head *hhd;
+ struct hlist_node *tn;
+ struct ftrace_func_entry *entry;
+ int size = 1 << hash->size_bits;
+ int i;
+
+ if (!hash->count)
+ return;
+
+ for (i = 0; i < size; i++) {
+ hhd = &hash->buckets[i];
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist)
+ free_hash_entry(hash, entry);
+ }
+ FTRACE_WARN_ON(hash->count);
+}
+
+static void free_ftrace_hash(struct ftrace_hash *hash)
+{
+ if (!hash || hash == EMPTY_HASH)
+ return;
+ ftrace_hash_clear(hash);
+ kfree(hash->buckets);
+ kfree(hash);
+}
+
+static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
+{
+ struct ftrace_hash *hash;
+
+ hash = container_of(rcu, struct ftrace_hash, rcu);
+ free_ftrace_hash(hash);
+}
+
+static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
+{
+ if (!hash || hash == EMPTY_HASH)
+ return;
+ call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
+}
+
+void ftrace_free_filter(struct ftrace_ops *ops)
+{
+ ftrace_ops_init(ops);
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
+}
+
+static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
+{
+ struct ftrace_hash *hash;
+ int size;
+
+ hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+ if (!hash)
+ return NULL;
+
+ size = 1 << size_bits;
+ hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
+
+ if (!hash->buckets) {
+ kfree(hash);
+ return NULL;
+ }
+
+ hash->size_bits = size_bits;
+
+ return hash;
+}
+
+static struct ftrace_hash *
+alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *new_hash;
+ int size;
+ int ret;
+ int i;
+
+ new_hash = alloc_ftrace_hash(size_bits);
+ if (!new_hash)
+ return NULL;
+
+ /* Empty hash? */
+ if (ftrace_hash_empty(hash))
+ return new_hash;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ ret = add_hash_entry(new_hash, entry->ip);
+ if (ret < 0)
+ goto free_hash;
+ }
+ }
+
+ FTRACE_WARN_ON(new_hash->count != hash->count);
+
+ return new_hash;
+
+ free_hash:
+ free_ftrace_hash(new_hash);
+ return NULL;
+}
+
+static void
+ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
+static void
+ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash);
+
+static int
+ftrace_hash_move(struct ftrace_ops *ops, int enable,
+ struct ftrace_hash **dst, struct ftrace_hash *src)
+{
+ struct ftrace_func_entry *entry;
+ struct hlist_node *tn;
+ struct hlist_head *hhd;
+ struct ftrace_hash *new_hash;
+ int size = src->count;
+ int bits = 0;
+ int ret;
+ int i;
+
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
+
+ /*
+ * If the new source is empty, just free dst and assign it
+ * the empty_hash.
+ */
+ if (!src->count) {
+ new_hash = EMPTY_HASH;
+ goto update;
+ }
+
+ /*
+ * Make the hash size about 1/2 the # found
+ */
+ for (size /= 2; size; size >>= 1)
+ bits++;
+
+ /* Don't allocate too much */
+ if (bits > FTRACE_HASH_MAX_BITS)
+ bits = FTRACE_HASH_MAX_BITS;
+
+ new_hash = alloc_ftrace_hash(bits);
+ if (!new_hash)
+ return -ENOMEM;
+
+ size = 1 << src->size_bits;
+ for (i = 0; i < size; i++) {
+ hhd = &src->buckets[i];
+ hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
+ remove_hash_entry(src, entry);
+ __add_hash_entry(new_hash, entry);
+ }
+ }
+
+update:
+ /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
+ if (enable) {
+ /* IPMODIFY should be updated only when filter_hash updating */
+ ret = ftrace_hash_ipmodify_update(ops, new_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return ret;
+ }
+ }
+
+ /*
+ * Remove the current set, update the hash and add
+ * them back.
+ */
+ ftrace_hash_rec_disable_modify(ops, enable);
+
+ rcu_assign_pointer(*dst, new_hash);
+
+ ftrace_hash_rec_enable_modify(ops, enable);
+
+ return 0;
+}
+
+static bool hash_contains_ip(unsigned long ip,
+ struct ftrace_ops_hash *hash)
+{
+ /*
+ * The function record is a match if it exists in the filter
+ * hash and not in the notrace hash. Note, an emty hash is
+ * considered a match for the filter hash, but an empty
+ * notrace hash is considered not in the notrace hash.
+ */
+ return (ftrace_hash_empty(hash->filter_hash) ||
+ ftrace_lookup_ip(hash->filter_hash, ip)) &&
+ (ftrace_hash_empty(hash->notrace_hash) ||
+ !ftrace_lookup_ip(hash->notrace_hash, ip));
+}
+
+/*
+ * Test the hashes for this ops to see if we want to call
+ * the ops->func or not.
+ *
+ * It's a match if the ip is in the ops->filter_hash or
+ * the filter_hash does not exist or is empty,
+ * AND
+ * the ip is not in the ops->notrace_hash.
+ *
+ * This needs to be called with preemption disabled as
+ * the hashes are freed with call_rcu_sched().
+ */
+static int
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
+{
+ struct ftrace_ops_hash hash;
+ int ret;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+ /*
+ * There's a small race when adding ops that the ftrace handler
+ * that wants regs, may be called without them. We can not
+ * allow that handler to be called if regs is NULL.
+ */
+ if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS))
+ return 0;
+#endif
+
+ hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
+ hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
+
+ if (hash_contains_ip(ip, &hash))
+ ret = 1;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * This is a double for. Do not use 'break' to break out of the loop,
+ * you must use a goto.
+ */
+#define do_for_each_ftrace_rec(pg, rec) \
+ for (pg = ftrace_pages_start; pg; pg = pg->next) { \
+ int _____i; \
+ for (_____i = 0; _____i < pg->index; _____i++) { \
+ rec = &pg->records[_____i];
+
+#define while_for_each_ftrace_rec() \
+ } \
+ }
+
+
+static int ftrace_cmp_recs(const void *a, const void *b)
+{
+ const struct dyn_ftrace *key = a;
+ const struct dyn_ftrace *rec = b;
+
+ if (key->flags < rec->ip)
+ return -1;
+ if (key->ip >= rec->ip + MCOUNT_INSN_SIZE)
+ return 1;
+ return 0;
+}
+
+static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ struct dyn_ftrace key;
+
+ key.ip = start;
+ key.flags = end; /* overload flags, as it is unsigned long */
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ if (end < pg->records[0].ip ||
+ start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
+ continue;
+ rec = bsearch(&key, pg->records, pg->index,
+ sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs);
+ if (rec)
+ return rec->ip;
+ }
+
+ return 0;
+}
+
+/**
+ * ftrace_location - return true if the ip giving is a traced location
+ * @ip: the instruction pointer to check
+ *
+ * Returns rec->ip if @ip given is a pointer to a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+unsigned long ftrace_location(unsigned long ip)
+{
+ return ftrace_location_range(ip, ip);
+}
+
+/**
+ * ftrace_text_reserved - return true if range contains an ftrace location
+ * @start: start of range to search
+ * @end: end of range to search (inclusive). @end points to the last byte to check.
+ *
+ * Returns 1 if @start and @end contains a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+int ftrace_text_reserved(const void *start, const void *end)
+{
+ unsigned long ret;
+
+ ret = ftrace_location_range((unsigned long)start,
+ (unsigned long)end);
+
+ return (int)!!ret;
+}
+
+/* Test if ops registered to this rec needs regs */
+static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ bool keep_regs = false;
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(ops, rec->ip, rec)) {
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ keep_regs = true;
+ break;
+ }
+ }
+ }
+
+ return keep_regs;
+}
+
+static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
+ int filter_hash,
+ bool inc)
+{
+ struct ftrace_hash *hash;
+ struct ftrace_hash *other_hash;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int count = 0;
+ int all = 0;
+
+ /* Only update if the ops has been registered */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return;
+
+ /*
+ * In the filter_hash case:
+ * If the count is zero, we update all records.
+ * Otherwise we just update the items in the hash.
+ *
+ * In the notrace_hash case:
+ * We enable the update in the hash.
+ * As disabling notrace means enabling the tracing,
+ * and enabling notrace means disabling, the inc variable
+ * gets inversed.
+ */
+ if (filter_hash) {
+ hash = ops->func_hash->filter_hash;
+ other_hash = ops->func_hash->notrace_hash;
+ if (ftrace_hash_empty(hash))
+ all = 1;
+ } else {
+ inc = !inc;
+ hash = ops->func_hash->notrace_hash;
+ other_hash = ops->func_hash->filter_hash;
+ /*
+ * If the notrace hash has no items,
+ * then there's nothing to do.
+ */
+ if (ftrace_hash_empty(hash))
+ return;
+ }
+
+ do_for_each_ftrace_rec(pg, rec) {
+ int in_other_hash = 0;
+ int in_hash = 0;
+ int match = 0;
+
+ if (all) {
+ /*
+ * Only the filter_hash affects all records.
+ * Update if the record is not in the notrace hash.
+ */
+ if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
+ match = 1;
+ } else {
+ in_hash = !!ftrace_lookup_ip(hash, rec->ip);
+ in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
+
+ /*
+ * If filter_hash is set, we want to match all functions
+ * that are in the hash but not in the other hash.
+ *
+ * If filter_hash is not set, then we are decrementing.
+ * That means we match anything that is in the hash
+ * and also in the other_hash. That is, we need to turn
+ * off functions in the other hash because they are disabled
+ * by this hash.
+ */
+ if (filter_hash && in_hash && !in_other_hash)
+ match = 1;
+ else if (!filter_hash && in_hash &&
+ (in_other_hash || ftrace_hash_empty(other_hash)))
+ match = 1;
+ }
+ if (!match)
+ continue;
+
+ if (inc) {
+ rec->flags++;
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
+ return;
+
+ /*
+ * If there's only a single callback registered to a
+ * function, and the ops has a trampoline registered
+ * for it, then we can call it directly.
+ */
+ if (ftrace_rec_count(rec) == 1 && ops->trampoline)
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ /*
+ * If we are adding another function callback
+ * to this function, and the previous had a
+ * custom trampoline in use, then we need to go
+ * back to the default trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ /*
+ * If any ops wants regs saved for this function
+ * then all ops will get saved regs.
+ */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ } else {
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
+ return;
+ rec->flags--;
+
+ /*
+ * If the rec had REGS enabled and the ops that is
+ * being removed had REGS set, then see if there is
+ * still any ops for this record that wants regs.
+ * If not, we can stop recording them.
+ */
+ if (ftrace_rec_count(rec) > 0 &&
+ rec->flags & FTRACE_FL_REGS &&
+ ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ if (!test_rec_ops_needs_regs(rec))
+ rec->flags &= ~FTRACE_FL_REGS;
+ }
+
+ /*
+ * If the rec had TRAMP enabled, then it needs to
+ * be cleared. As TRAMP can only be enabled iff
+ * there is only a single ops attached to it.
+ * In otherwords, always disable it on decrementing.
+ * In the future, we may set it if rec count is
+ * decremented to one, and the ops that is left
+ * has a trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ /*
+ * flags will be cleared in ftrace_check_record()
+ * if rec count is zero.
+ */
+ }
+ count++;
+ /* Shortcut, if we handled all records, we are done. */
+ if (!all && count == hash->count)
+ return;
+ } while_for_each_ftrace_rec();
+}
+
+static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ __ftrace_hash_rec_update(ops, filter_hash, 0);
+}
+
+static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ __ftrace_hash_rec_update(ops, filter_hash, 1);
+}
+
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
+ int filter_hash, int inc)
+{
+ struct ftrace_ops *op;
+
+ __ftrace_hash_rec_update(ops, filter_hash, inc);
+
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ /*
+ * If the ops shares the global_ops hash, then we need to update
+ * all ops that are enabled and use this hash.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* Already done */
+ if (op == ops)
+ continue;
+ if (op->func_hash == &global_ops.local_hash)
+ __ftrace_hash_rec_update(op, filter_hash, inc);
+ } while_for_each_ftrace_op(op);
+}
+
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+}
+
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+}
+
+/*
+ * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
+ * or no-needed to update, -EBUSY if it detects a conflict of the flag
+ * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
+ * Note that old_hash and new_hash has below meanings
+ * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
+ * - If the hash is EMPTY_HASH, it hits nothing
+ * - Anything else hits the recs which match the hash entries.
+ */
+static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
+ struct ftrace_hash *old_hash,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec, *end = NULL;
+ int in_old, in_new;
+
+ /* Only update if the ops has been registered */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ /*
+ * Since the IPMODIFY is a very address sensitive action, we do not
+ * allow ftrace_ops to set all functions to new hash.
+ */
+ if (!new_hash || !old_hash)
+ return -EINVAL;
+
+ /* Update rec->flags */
+ do_for_each_ftrace_rec(pg, rec) {
+ /* We need to update only differences of filter_hash */
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new) {
+ /* New entries must ensure no others are using it */
+ if (rec->flags & FTRACE_FL_IPMODIFY)
+ goto rollback;
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } else /* Removed entry */
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+ return 0;
+
+rollback:
+ end = rec;
+
+ /* Roll back what we did above */
+ do_for_each_ftrace_rec(pg, rec) {
+ if (rec == end)
+ goto err_out;
+
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new)
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ else
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+err_out:
+ return -EBUSY;
+}
+
+static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+}
+
+/* Disabling always succeeds */
+static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+}
+
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(old_hash))
+ old_hash = NULL;
+
+ if (ftrace_hash_empty(new_hash))
+ new_hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+}
+
+static void print_ip_ins(const char *fmt, unsigned char *p)
+{
+ int i;
+
+ printk(KERN_CONT "%s", fmt);
+
+ for (i = 0; i < MCOUNT_INSN_SIZE; i++)
+ printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+
+/**
+ * ftrace_bug - report and shutdown function tracer
+ * @failed: The failed type (EFAULT, EINVAL, EPERM)
+ * @rec: The record that failed
+ *
+ * The arch code that enables or disables the function tracing
+ * can call ftrace_bug() when it has detected a problem in
+ * modifying the code. @failed should be one of either:
+ * EFAULT - if the problem happens on reading the @ip address
+ * EINVAL - if what is read at @ip is not what was expected
+ * EPERM - if the problem happens on writting to the @ip address
+ */
+void ftrace_bug(int failed, struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec ? rec->ip : 0;
+
+ switch (failed) {
+ case -EFAULT:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on modifying ");
+ print_ip_sym(ip);
+ break;
+ case -EINVAL:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace failed to modify ");
+ print_ip_sym(ip);
+ print_ip_ins(" actual: ", (unsigned char *)ip);
+ pr_cont("\n");
+ break;
+ case -EPERM:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on writing ");
+ print_ip_sym(ip);
+ break;
+ default:
+ FTRACE_WARN_ON_ONCE(1);
+ pr_info("ftrace faulted on unknown error ");
+ print_ip_sym(ip);
+ }
+ if (rec) {
+ struct ftrace_ops *ops = NULL;
+
+ pr_info("ftrace record flags: %lx\n", rec->flags);
+ pr_cont(" (%ld)%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
+ pr_cont("\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ pr_cont("\ttramp: ERROR!");
+
+ }
+ ip = ftrace_get_addr_curr(rec);
+ pr_cont(" expected tramp: %lx\n", ip);
+ }
+}
+
+static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
+{
+ unsigned long flag = 0UL;
+
+ /*
+ * If we are updating calls:
+ *
+ * If the record has a ref count, then we need to enable it
+ * because someone is using it.
+ *
+ * Otherwise we make sure its disabled.
+ *
+ * If we are disabling calls, then disable all records that
+ * are enabled.
+ */
+ if (enable && ftrace_rec_count(rec))
+ flag = FTRACE_FL_ENABLED;
+
+ /*
+ * If enabling and the REGS flag does not match the REGS_EN, or
+ * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
+ * this record. Set flags to fail the compare against ENABLED.
+ */
+ if (flag) {
+ if (!(rec->flags & FTRACE_FL_REGS) !=
+ !(rec->flags & FTRACE_FL_REGS_EN))
+ flag |= FTRACE_FL_REGS;
+
+ if (!(rec->flags & FTRACE_FL_TRAMP) !=
+ !(rec->flags & FTRACE_FL_TRAMP_EN))
+ flag |= FTRACE_FL_TRAMP;
+ }
+
+ /* If the state of this record hasn't changed, then do nothing */
+ if ((rec->flags & FTRACE_FL_ENABLED) == flag)
+ return FTRACE_UPDATE_IGNORE;
+
+ if (flag) {
+ /* Save off if rec is being enabled (for return value) */
+ flag ^= rec->flags & FTRACE_FL_ENABLED;
+
+ if (update) {
+ rec->flags |= FTRACE_FL_ENABLED;
+ if (flag & FTRACE_FL_REGS) {
+ if (rec->flags & FTRACE_FL_REGS)
+ rec->flags |= FTRACE_FL_REGS_EN;
+ else
+ rec->flags &= ~FTRACE_FL_REGS_EN;
+ }
+ if (flag & FTRACE_FL_TRAMP) {
+ if (rec->flags & FTRACE_FL_TRAMP)
+ rec->flags |= FTRACE_FL_TRAMP_EN;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP_EN;
+ }
+ }
+
+ /*
+ * If this record is being updated from a nop, then
+ * return UPDATE_MAKE_CALL.
+ * Otherwise,
+ * return UPDATE_MODIFY_CALL to tell the caller to convert
+ * from the save regs, to a non-save regs function or
+ * vice versa, or from a trampoline call.
+ */
+ if (flag & FTRACE_FL_ENABLED)
+ return FTRACE_UPDATE_MAKE_CALL;
+
+ return FTRACE_UPDATE_MODIFY_CALL;
+ }
+
+ if (update) {
+ /* If there's no more users, clear all flags */
+ if (!ftrace_rec_count(rec))
+ rec->flags = 0;
+ else
+ /*
+ * Just disable the record, but keep the ops TRAMP
+ * and REGS states. The _EN flags must be disabled though.
+ */
+ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN |
+ FTRACE_FL_REGS_EN);
+ }
+
+ return FTRACE_UPDATE_MAKE_NOP;
+}
+
+/**
+ * ftrace_update_record, set a record that now is tracing or not
+ * @rec: the record to update
+ * @enable: set to 1 if the record is tracing, zero to force disable
+ *
+ * The records that represent all functions that can be traced need
+ * to be updated when tracing has been enabled.
+ */
+int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+{
+ return ftrace_check_record(rec, enable, 1);
+}
+
+/**
+ * ftrace_test_record, check if the record has been enabled or not
+ * @rec: the record to test
+ * @enable: set to 1 to check if enabled, 0 if it is disabled
+ *
+ * The arch code may need to test if a record is already set to
+ * tracing to determine how to modify the function code that it
+ * represents.
+ */
+int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+{
+ return ftrace_check_record(rec, enable, 0);
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ /*
+ * Need to check removed ops first.
+ * If they are being removed, and this rec has a tramp,
+ * and this rec is in the ops list, then it would be the
+ * one with the tramp.
+ */
+ if (removed_ops) {
+ if (hash_contains_ip(ip, &removed_ops->old_hash))
+ return removed_ops;
+ }
+
+ /*
+ * Need to find the current trampoline for a rec.
+ * Now, a trampoline is only attached to a rec if there
+ * was a single 'ops' attached to it. But this can be called
+ * when we are adding another op to the rec or removing the
+ * current one. Thus, if the op is being added, we can
+ * ignore it because it hasn't attached itself to the rec
+ * yet.
+ *
+ * If an ops is being modified (hooking to different functions)
+ * then we don't care about the new functions that are being
+ * added, just the old ones (that are probably being removed).
+ *
+ * If we are adding an ops to a function that already is using
+ * a trampoline, it needs to be removed (trampolines are only
+ * for single ops connected), then an ops that is not being
+ * modified also needs to be checked.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (!op->trampoline)
+ continue;
+
+ /*
+ * If the ops is being added, it hasn't gotten to
+ * the point to be removed from this tree yet.
+ */
+ if (op->flags & FTRACE_OPS_FL_ADDING)
+ continue;
+
+
+ /*
+ * If the ops is being modified and is in the old
+ * hash, then it is probably being removed from this
+ * function.
+ */
+ if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, &op->old_hash))
+ return op;
+ /*
+ * If the ops is not being added or modified, and it's
+ * in its normal filter hash, then this must be the one
+ * we want!
+ */
+ if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, op->func_hash))
+ return op;
+
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* pass rec in as regs to have non-NULL val */
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+/**
+ * ftrace_get_addr_new - Get the call address to set to
+ * @rec: The ftrace record descriptor
+ *
+ * If the record has the FTRACE_FL_REGS set, that means that it
+ * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
+ * is not not set, then it wants to convert to the normal callback.
+ *
+ * Returns the address of the trampoline to set to
+ */
+unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP) {
+ ops = ftrace_find_tramp_ops_new(rec);
+ if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
+ pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n",
+ (void *)rec->ip, (void *)rec->ip, rec->flags);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
+ if (rec->flags & FTRACE_FL_REGS)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
+/**
+ * ftrace_get_addr_curr - Get the call address that is already there
+ * @rec: The ftrace record descriptor
+ *
+ * The FTRACE_FL_REGS_EN is set when the record already points to
+ * a function that saves all the regs. Basically the '_EN' version
+ * represents the current state of the function.
+ *
+ * Returns the address of the trampoline that is currently being called
+ */
+unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (FTRACE_WARN_ON(!ops)) {
+ pr_warning("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
+ if (rec->flags & FTRACE_FL_REGS_EN)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_old_addr;
+ unsigned long ftrace_addr;
+ int ret;
+
+ ftrace_addr = ftrace_get_addr_new(rec);
+
+ /* This needs to be done before we call ftrace_update_record */
+ ftrace_old_addr = ftrace_get_addr_curr(rec);
+
+ ret = ftrace_update_record(rec, enable);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ return ftrace_make_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ return ftrace_make_nop(NULL, rec, ftrace_old_addr);
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
+ }
+
+ return -1; /* unknow ftrace bug */
+}
+
+void __weak ftrace_replace_code(int enable)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+ int failed;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ failed = __ftrace_replace_code(rec, enable);
+ if (failed) {
+ ftrace_bug(failed, rec);
+ /* Stop processing */
+ return;
+ }
+ } while_for_each_ftrace_rec();
+}
+
+struct ftrace_rec_iter {
+ struct ftrace_page *pg;
+ int index;
+};
+
+/**
+ * ftrace_rec_iter_start, start up iterating over traced functions
+ *
+ * Returns an iterator handle that is used to iterate over all
+ * the records that represent address locations where functions
+ * are traced.
+ *
+ * May return NULL if no records are available.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_start(void)
+{
+ /*
+ * We only use a single iterator.
+ * Protected by the ftrace_lock mutex.
+ */
+ static struct ftrace_rec_iter ftrace_rec_iter;
+ struct ftrace_rec_iter *iter = &ftrace_rec_iter;
+
+ iter->pg = ftrace_pages_start;
+ iter->index = 0;
+
+ /* Could have empty pages */
+ while (iter->pg && !iter->pg->index)
+ iter->pg = iter->pg->next;
+
+ if (!iter->pg)
+ return NULL;
+
+ return iter;
+}
+
+/**
+ * ftrace_rec_iter_next, get the next record to process.
+ * @iter: The handle to the iterator.
+ *
+ * Returns the next iterator after the given iterator @iter.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
+{
+ iter->index++;
+
+ if (iter->index >= iter->pg->index) {
+ iter->pg = iter->pg->next;
+ iter->index = 0;
+
+ /* Could have empty pages */
+ while (iter->pg && !iter->pg->index)
+ iter->pg = iter->pg->next;
+ }
+
+ if (!iter->pg)
+ return NULL;
+
+ return iter;
+}
+
+/**
+ * ftrace_rec_iter_record, get the record at the iterator location
+ * @iter: The current iterator location
+ *
+ * Returns the record that the current @iter is at.
+ */
+struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
+{
+ return &iter->pg->records[iter->index];
+}
+
+static int
+ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
+{
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return 0;
+
+ ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
+ if (ret) {
+ ftrace_bug(ret, rec);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * archs can override this function if they must do something
+ * before the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_prepare(void)
+{
+ return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * after the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_post_process(void)
+{
+ return 0;
+}
+
+void ftrace_modify_all_code(int command)
+{
+ int update = command & FTRACE_UPDATE_TRACE_FUNC;
+ int err = 0;
+
+ /*
+ * If the ftrace_caller calls a ftrace_ops func directly,
+ * we need to make sure that it only traces functions it
+ * expects to trace. When doing the switch of functions,
+ * we need to update to the ftrace_ops_list_func first
+ * before the transition between old and new calls are set,
+ * as the ftrace_ops_list_func will check the ops hashes
+ * to make sure the ops are having the right functions
+ * traced.
+ */
+ if (update) {
+ err = ftrace_update_ftrace_func(ftrace_ops_list_func);
+ if (FTRACE_WARN_ON(err))
+ return;
+ }
+
+ if (command & FTRACE_UPDATE_CALLS)
+ ftrace_replace_code(1);
+ else if (command & FTRACE_DISABLE_CALLS)
+ ftrace_replace_code(0);
+
+ if (update && ftrace_trace_function != ftrace_ops_list_func) {
+ function_trace_op = set_function_trace_op;
+ smp_wmb();
+ /* If irqs are disabled, we are in stop machine */
+ if (!irqs_disabled())
+ smp_call_function(ftrace_sync_ipi, NULL, 1);
+ err = ftrace_update_ftrace_func(ftrace_trace_function);
+ if (FTRACE_WARN_ON(err))
+ return;
+ }
+
+ if (command & FTRACE_START_FUNC_RET)
+ err = ftrace_enable_ftrace_graph_caller();
+ else if (command & FTRACE_STOP_FUNC_RET)
+ err = ftrace_disable_ftrace_graph_caller();
+ FTRACE_WARN_ON(err);
+}
+
+static int __ftrace_modify_code(void *data)
+{
+ int *command = data;
+
+ ftrace_modify_all_code(*command);
+
+ return 0;
+}
+
+/**
+ * ftrace_run_stop_machine, go back to the stop machine method
+ * @command: The command to tell ftrace what to do
+ *
+ * If an arch needs to fall back to the stop machine method, the
+ * it can call this function.
+ */
+void ftrace_run_stop_machine(int command)
+{
+ stop_machine(__ftrace_modify_code, &command, NULL);
+}
+
+/**
+ * arch_ftrace_update_code, modify the code to trace or not trace
+ * @command: The command that needs to be done
+ *
+ * Archs can override this function if it does not need to
+ * run stop_machine() to modify code.
+ */
+void __weak arch_ftrace_update_code(int command)
+{
+ ftrace_run_stop_machine(command);
+}
+
+static void ftrace_run_update_code(int command)
+{
+ int ret;
+
+ ret = ftrace_arch_code_modify_prepare();
+ FTRACE_WARN_ON(ret);
+ if (ret)
+ return;
+
+ /*
+ * By default we use stop_machine() to modify the code.
+ * But archs can do what ever they want as long as it
+ * is safe. The stop_machine() is the safest, but also
+ * produces the most overhead.
+ */
+ arch_ftrace_update_code(command);
+
+ ret = ftrace_arch_code_modify_post_process();
+ FTRACE_WARN_ON(ret);
+}
+
+static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
+ struct ftrace_ops_hash *old_hash)
+{
+ ops->flags |= FTRACE_OPS_FL_MODIFYING;
+ ops->old_hash.filter_hash = old_hash->filter_hash;
+ ops->old_hash.notrace_hash = old_hash->notrace_hash;
+ ftrace_run_update_code(command);
+ ops->old_hash.filter_hash = NULL;
+ ops->old_hash.notrace_hash = NULL;
+ ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
+}
+
+static ftrace_func_t saved_ftrace_func;
+static int ftrace_start_up;
+
+void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+}
+
+static void control_ops_free(struct ftrace_ops *ops)
+{
+ free_percpu(ops->disabled);
+}
+
+static void ftrace_startup_enable(int command)
+{
+ if (saved_ftrace_func != ftrace_trace_function) {
+ saved_ftrace_func = ftrace_trace_function;
+ command |= FTRACE_UPDATE_TRACE_FUNC;
+ }
+
+ if (!command || !ftrace_enabled)
+ return;
+
+ ftrace_run_update_code(command);
+}
+
+static void ftrace_startup_all(int command)
+{
+ update_all_ops = true;
+ ftrace_startup_enable(command);
+ update_all_ops = false;
+}
+
+static int ftrace_startup(struct ftrace_ops *ops, int command)
+{
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ ret = __register_ftrace_function(ops);
+ if (ret)
+ return ret;
+
+ ftrace_start_up++;
+ command |= FTRACE_UPDATE_CALLS;
+
+ /*
+ * Note that ftrace probes uses this to start up
+ * and modify functions it will probe. But we still
+ * set the ADDING flag for modification, as probes
+ * do not have trampolines. If they add them in the
+ * future, then the probes will need to distinguish
+ * between adding and updating probes.
+ */
+ ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
+
+ ret = ftrace_hash_ipmodify_enable(ops);
+ if (ret < 0) {
+ /* Rollback registration process */
+ __unregister_ftrace_function(ops);
+ ftrace_start_up--;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ return ret;
+ }
+
+ ftrace_hash_rec_enable(ops, 1);
+
+ ftrace_startup_enable(command);
+
+ ops->flags &= ~FTRACE_OPS_FL_ADDING;
+
+ return 0;
+}
+
+static int ftrace_shutdown(struct ftrace_ops *ops, int command)
+{
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ ret = __unregister_ftrace_function(ops);
+ if (ret)
+ return ret;
+
+ ftrace_start_up--;
+ /*
+ * Just warn in case of unbalance, no need to kill ftrace, it's not
+ * critical but the ftrace_call callers may be never nopped again after
+ * further ftrace uses.
+ */
+ WARN_ON_ONCE(ftrace_start_up < 0);
+
+ /* Disabling ipmodify never fails */
+ ftrace_hash_ipmodify_disable(ops);
+ ftrace_hash_rec_disable(ops, 1);
+
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+
+ command |= FTRACE_UPDATE_CALLS;
+
+ if (saved_ftrace_func != ftrace_trace_function) {
+ saved_ftrace_func = ftrace_trace_function;
+ command |= FTRACE_UPDATE_TRACE_FUNC;
+ }
+
+ if (!command || !ftrace_enabled) {
+ /*
+ * If these are control ops, they still need their
+ * per_cpu field freed. Since, function tracing is
+ * not currently active, we can just free them
+ * without synchronizing all CPUs.
+ */
+ if (ops->flags & FTRACE_OPS_FL_CONTROL)
+ control_ops_free(ops);
+ return 0;
+ }
+
+ /*
+ * If the ops uses a trampoline, then it needs to be
+ * tested first on update.
+ */
+ ops->flags |= FTRACE_OPS_FL_REMOVING;
+ removed_ops = ops;
+
+ /* The trampoline logic checks the old hashes */
+ ops->old_hash.filter_hash = ops->func_hash->filter_hash;
+ ops->old_hash.notrace_hash = ops->func_hash->notrace_hash;
+
+ ftrace_run_update_code(command);
+
+ /*
+ * If there's no more ops registered with ftrace, run a
+ * sanity check to make sure all rec flags are cleared.
+ */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (FTRACE_WARN_ON_ONCE(rec->flags))
+ pr_warn(" %pS flags:%lx\n",
+ (void *)rec->ip, rec->flags);
+ } while_for_each_ftrace_rec();
+ }
+
+ ops->old_hash.filter_hash = NULL;
+ ops->old_hash.notrace_hash = NULL;
+
+ removed_ops = NULL;
+ ops->flags &= ~FTRACE_OPS_FL_REMOVING;
+
+ /*
+ * Dynamic ops may be freed, we must make sure that all
+ * callers are done before leaving this function.
+ * The same goes for freeing the per_cpu data of the control
+ * ops.
+ *
+ * Again, normal synchronize_sched() is not good enough.
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ schedule_on_each_cpu(ftrace_sync);
+
+ arch_ftrace_trampoline_free(ops);
+
+ if (ops->flags & FTRACE_OPS_FL_CONTROL)
+ control_ops_free(ops);
+ }
+
+ return 0;
+}
+
+static void ftrace_startup_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* Force update next time */
+ saved_ftrace_func = NULL;
+ /* ftrace_start_up is true if we want ftrace running */
+ if (ftrace_start_up) {
+ command = FTRACE_UPDATE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_START_FUNC_RET;
+ ftrace_startup_enable(command);
+ }
+}
+
+static void ftrace_shutdown_sysctl(void)
+{
+ int command;
+
+ if (unlikely(ftrace_disabled))
+ return;
+
+ /* ftrace_start_up is true if ftrace is running */
+ if (ftrace_start_up) {
+ command = FTRACE_DISABLE_CALLS;
+ if (ftrace_graph_active)
+ command |= FTRACE_STOP_FUNC_RET;
+ ftrace_run_update_code(command);
+ }
+}
+
+static cycle_t ftrace_update_time;
+unsigned long ftrace_update_tot_cnt;
+
+static inline int ops_traces_mod(struct ftrace_ops *ops)
+{
+ /*
+ * Filter_hash being empty will default to trace module.
+ * But notrace hash requires a test of individual module functions.
+ */
+ return ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ ftrace_hash_empty(ops->func_hash->notrace_hash);
+}
+
+/*
+ * Check if the current ops references the record.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static inline bool
+ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ /* If ops isn't enabled, ignore it */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ /* If ops traces all mods, we already accounted for it */
+ if (ops_traces_mod(ops))
+ return 0;
+
+ /* The function must be in the filter */
+ if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
+ !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+ return 0;
+
+ /* If in notrace hash, we ignore it too */
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
+ return 0;
+
+ return 1;
+}
+
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
+}
+
+static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *p;
+ cycle_t start, stop;
+ unsigned long update_cnt = 0;
+ unsigned long ref = 0;
+ bool test = false;
+ int i;
+
+ /*
+ * When adding a module, we need to check if tracers are
+ * currently enabled and if they are set to trace all functions.
+ * If they are, we need to enable the module functions as well
+ * as update the reference counts for those function records.
+ */
+ if (mod) {
+ struct ftrace_ops *ops;
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next) {
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
+ if (ops_traces_mod(ops))
+ ref++;
+ else
+ test = true;
+ }
+ }
+ }
+
+ start = ftrace_now(raw_smp_processor_id());
+
+ for (pg = new_pgs; pg; pg = pg->next) {
+
+ for (i = 0; i < pg->index; i++) {
+ int cnt = ref;
+
+ /* If something went wrong, bail without enabling anything */
+ if (unlikely(ftrace_disabled))
+ return -1;
+
+ p = &pg->records[i];
+ if (test)
+ cnt += referenced_filters(p);
+ p->flags = cnt;
+
+ /*
+ * Do the initial record conversion from mcount jump
+ * to the NOP instructions.
+ */
+ if (!ftrace_code_disable(mod, p))
+ break;
+
+ update_cnt++;
+
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediatelly is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ */
+ if (ftrace_start_up && cnt) {
+ int failed = __ftrace_replace_code(p, 1);
+ if (failed)
+ ftrace_bug(failed, p);
+ }
+ }
+ }
+
+ stop = ftrace_now(raw_smp_processor_id());
+ ftrace_update_time = stop - start;
+ ftrace_update_tot_cnt += update_cnt;
+
+ return 0;
+}
+
+static int ftrace_allocate_records(struct ftrace_page *pg, int count)
+{
+ int order;
+ int cnt;
+
+ if (WARN_ON(!count))
+ return -EINVAL;
+
+ order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
+
+ /*
+ * We want to fill as much as possible. No more than a page
+ * may be empty.
+ */
+ while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
+ order--;
+
+ again:
+ pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+
+ if (!pg->records) {
+ /* if we can't allocate this size, try something smaller */
+ if (!order)
+ return -ENOMEM;
+ order >>= 1;
+ goto again;
+ }
+
+ cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
+ pg->size = cnt;
+
+ if (cnt > count)
+ cnt = count;
+
+ return cnt;
+}
+
+static struct ftrace_page *
+ftrace_allocate_pages(unsigned long num_to_init)
+{
+ struct ftrace_page *start_pg;
+ struct ftrace_page *pg;
+ int order;
+ int cnt;
+
+ if (!num_to_init)
+ return 0;
+
+ start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+ if (!pg)
+ return NULL;
+
+ /*
+ * Try to allocate as much as possible in one continues
+ * location that fills in all of the space. We want to
+ * waste as little space as possible.
+ */
+ for (;;) {
+ cnt = ftrace_allocate_records(pg, num_to_init);
+ if (cnt < 0)
+ goto free_pages;
+
+ num_to_init -= cnt;
+ if (!num_to_init)
+ break;
+
+ pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
+ if (!pg->next)
+ goto free_pages;
+
+ pg = pg->next;
+ }
+
+ return start_pg;
+
+ free_pages:
+ pg = start_pg;
+ while (pg) {
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ start_pg = pg->next;
+ kfree(pg);
+ pg = start_pg;
+ }
+ pr_info("ftrace: FAILED to allocate memory for functions\n");
+ return NULL;
+}
+
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+
+struct ftrace_iterator {
+ loff_t pos;
+ loff_t func_pos;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *func;
+ struct ftrace_func_probe *probe;
+ struct trace_parser parser;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *ops;
+ int hidx;
+ int idx;
+ unsigned flags;
+};
+
+static void *
+t_hash_next(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct hlist_node *hnd = NULL;
+ struct hlist_head *hhd;
+
+ (*pos)++;
+ iter->pos = *pos;
+
+ if (iter->probe)
+ hnd = &iter->probe->node;
+ retry:
+ if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
+ return NULL;
+
+ hhd = &ftrace_func_hash[iter->hidx];
+
+ if (hlist_empty(hhd)) {
+ iter->hidx++;
+ hnd = NULL;
+ goto retry;
+ }
+
+ if (!hnd)
+ hnd = hhd->first;
+ else {
+ hnd = hnd->next;
+ if (!hnd) {
+ iter->hidx++;
+ goto retry;
+ }
+ }
+
+ if (WARN_ON_ONCE(!hnd))
+ return NULL;
+
+ iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+
+ return iter;
+}
+
+static void *t_hash_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l;
+
+ if (!(iter->flags & FTRACE_ITER_DO_HASH))
+ return NULL;
+
+ if (iter->func_pos > *pos)
+ return NULL;
+
+ iter->hidx = 0;
+ for (l = 0; l <= (*pos - iter->func_pos); ) {
+ p = t_hash_next(m, &l);
+ if (!p)
+ break;
+ }
+ if (!p)
+ return NULL;
+
+ /* Only set this if we have an item */
+ iter->flags |= FTRACE_ITER_HASH;
+
+ return iter;
+}
+
+static int
+t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
+{
+ struct ftrace_func_probe *rec;
+
+ rec = iter->probe;
+ if (WARN_ON_ONCE(!rec))
+ return -EIO;
+
+ if (rec->ops->print)
+ return rec->ops->print(m, rec->ip, rec->ops, rec->data);
+
+ seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
+
+ if (rec->data)
+ seq_printf(m, ":%p", rec->data);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct ftrace_ops *ops = iter->ops;
+ struct dyn_ftrace *rec = NULL;
+
+ if (unlikely(ftrace_disabled))
+ return NULL;
+
+ if (iter->flags & FTRACE_ITER_HASH)
+ return t_hash_next(m, pos);
+
+ (*pos)++;
+ iter->pos = iter->func_pos = *pos;
+
+ if (iter->flags & FTRACE_ITER_PRINTALL)
+ return t_hash_start(m, pos);
+
+ retry:
+ if (iter->idx >= iter->pg->index) {
+ if (iter->pg->next) {
+ iter->pg = iter->pg->next;
+ iter->idx = 0;
+ goto retry;
+ }
+ } else {
+ rec = &iter->pg->records[iter->idx++];
+ if (((iter->flags & FTRACE_ITER_FILTER) &&
+ !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
+
+ ((iter->flags & FTRACE_ITER_NOTRACE) &&
+ !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
+
+ ((iter->flags & FTRACE_ITER_ENABLED) &&
+ !(rec->flags & FTRACE_FL_ENABLED))) {
+
+ rec = NULL;
+ goto retry;
+ }
+ }
+
+ if (!rec)
+ return t_hash_start(m, pos);
+
+ iter->func = rec;
+
+ return iter;
+}
+
+static void reset_iter_read(struct ftrace_iterator *iter)
+{
+ iter->pos = 0;
+ iter->func_pos = 0;
+ iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct ftrace_ops *ops = iter->ops;
+ void *p = NULL;
+ loff_t l;
+
+ mutex_lock(&ftrace_lock);
+
+ if (unlikely(ftrace_disabled))
+ return NULL;
+
+ /*
+ * If an lseek was done, then reset and start from beginning.
+ */
+ if (*pos < iter->pos)
+ reset_iter_read(iter);
+
+ /*
+ * For set_ftrace_filter reading, if we have the filter
+ * off, we can short cut and just print out that all
+ * functions are enabled.
+ */
+ if ((iter->flags & FTRACE_ITER_FILTER &&
+ ftrace_hash_empty(ops->func_hash->filter_hash)) ||
+ (iter->flags & FTRACE_ITER_NOTRACE &&
+ ftrace_hash_empty(ops->func_hash->notrace_hash))) {
+ if (*pos > 0)
+ return t_hash_start(m, pos);
+ iter->flags |= FTRACE_ITER_PRINTALL;
+ /* reset in case of seek/pread */
+ iter->flags &= ~FTRACE_ITER_HASH;
+ return iter;
+ }
+
+ if (iter->flags & FTRACE_ITER_HASH)
+ return t_hash_start(m, pos);
+
+ /*
+ * Unfortunately, we need to restart at ftrace_pages_start
+ * every time we let go of the ftrace_mutex. This is because
+ * those pointers can change without the lock.
+ */
+ iter->pg = ftrace_pages_start;
+ iter->idx = 0;
+ for (l = 0; l <= *pos; ) {
+ p = t_next(m, p, &l);
+ if (!p)
+ break;
+ }
+
+ if (!p)
+ return t_hash_start(m, pos);
+
+ return iter;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&ftrace_lock);
+}
+
+void * __weak
+arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ return NULL;
+}
+
+static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ void *ptr;
+
+ ptr = arch_ftrace_trampoline_func(ops, rec);
+ if (ptr)
+ seq_printf(m, " ->%pS", ptr);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct dyn_ftrace *rec;
+
+ if (iter->flags & FTRACE_ITER_HASH)
+ return t_hash_show(m, iter);
+
+ if (iter->flags & FTRACE_ITER_PRINTALL) {
+ if (iter->flags & FTRACE_ITER_NOTRACE)
+ seq_puts(m, "#### no functions disabled ####\n");
+ else
+ seq_puts(m, "#### all functions enabled ####\n");
+ return 0;
+ }
+
+ rec = iter->func;
+
+ if (!rec)
+ return 0;
+
+ seq_printf(m, "%ps", (void *)rec->ip);
+ if (iter->flags & FTRACE_ITER_ENABLED) {
+ struct ftrace_ops *ops = NULL;
+
+ seq_printf(m, " (%ld)%s%s",
+ ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
+ seq_printf(m, "\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ seq_puts(m, "\ttramp: ERROR!");
+
+ }
+ add_trampoline_func(m, ops, rec);
+ }
+
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations show_ftrace_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (iter) {
+ iter->pg = ftrace_pages_start;
+ iter->ops = &global_ops;
+ }
+
+ return iter ? 0 : -ENOMEM;
+}
+
+static int
+ftrace_enabled_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (iter) {
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_ENABLED;
+ iter->ops = &global_ops;
+ }
+
+ return iter ? 0 : -ENOMEM;
+}
+
+/**
+ * ftrace_regex_open - initialize function tracer filter files
+ * @ops: The ftrace_ops that hold the hash filters
+ * @flag: The type of filter to process
+ * @inode: The inode, usually passed in to your open routine
+ * @file: The file, usually passed in to your open routine
+ *
+ * ftrace_regex_open() initializes the filter files for the
+ * @ops. Depending on @flag it may process the filter hash or
+ * the notrace hash of @ops. With this called from the open
+ * routine, you can use ftrace_filter_write() for the write
+ * routine if @flag has FTRACE_ITER_FILTER set, or
+ * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
+ * tracing_lseek() should be used as the lseek routine, and
+ * release must call ftrace_regex_release().
+ */
+int
+ftrace_regex_open(struct ftrace_ops *ops, int flag,
+ struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+ struct ftrace_hash *hash;
+ int ret = 0;
+
+ ftrace_ops_init(ops);
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
+ kfree(iter);
+ return -ENOMEM;
+ }
+
+ iter->ops = ops;
+ iter->flags = flag;
+
+ mutex_lock(&ops->func_hash->regex_lock);
+
+ if (flag & FTRACE_ITER_NOTRACE)
+ hash = ops->func_hash->notrace_hash;
+ else
+ hash = ops->func_hash->filter_hash;
+
+ if (file->f_mode & FMODE_WRITE) {
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (file->f_flags & O_TRUNC)
+ iter->hash = alloc_ftrace_hash(size_bits);
+ else
+ iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+
+ if (!iter->hash) {
+ trace_parser_put(&iter->parser);
+ kfree(iter);
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ }
+
+ if (file->f_mode & FMODE_READ) {
+ iter->pg = ftrace_pages_start;
+
+ ret = seq_open(file, &show_ftrace_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = iter;
+ } else {
+ /* Failed */
+ free_ftrace_hash(iter->hash);
+ trace_parser_put(&iter->parser);
+ kfree(iter);
+ }
+ } else
+ file->private_data = iter;
+
+ out_unlock:
+ mutex_unlock(&ops->func_hash->regex_lock);
+
+ return ret;
+}
+
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_ops *ops = inode->i_private;
+
+ return ftrace_regex_open(ops,
+ FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
+ inode, file);
+}
+
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_ops *ops = inode->i_private;
+
+ return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE,
+ inode, file);
+}
+
+static int ftrace_match(char *str, char *regex, int len, int type)
+{
+ int matched = 0;
+ int slen;
+
+ switch (type) {
+ case MATCH_FULL:
+ if (strcmp(str, regex) == 0)
+ matched = 1;
+ break;
+ case MATCH_FRONT_ONLY:
+ if (strncmp(str, regex, len) == 0)
+ matched = 1;
+ break;
+ case MATCH_MIDDLE_ONLY:
+ if (strstr(str, regex))
+ matched = 1;
+ break;
+ case MATCH_END_ONLY:
+ slen = strlen(str);
+ if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
+ matched = 1;
+ break;
+ }
+
+ return matched;
+}
+
+static int
+enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
+{
+ struct ftrace_func_entry *entry;
+ int ret = 0;
+
+ entry = ftrace_lookup_ip(hash, rec->ip);
+ if (not) {
+ /* Do nothing if it doesn't exist */
+ if (!entry)
+ return 0;
+
+ free_hash_entry(hash, entry);
+ } else {
+ /* Do nothing if it exists */
+ if (entry)
+ return 0;
+
+ ret = add_hash_entry(hash, rec->ip);
+ }
+ return ret;
+}
+
+static int
+ftrace_match_record(struct dyn_ftrace *rec, char *mod,
+ char *regex, int len, int type)
+{
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+
+ kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+
+ if (mod) {
+ /* module lookup requires matching the module */
+ if (!modname || strcmp(modname, mod))
+ return 0;
+
+ /* blank search means to match all funcs in the mod */
+ if (!len)
+ return 1;
+ }
+
+ return ftrace_match(str, regex, len, type);
+}
+
+static int
+match_records(struct ftrace_hash *hash, char *buff,
+ int len, char *mod, int not)
+{
+ unsigned search_len = 0;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int type = MATCH_FULL;
+ char *search = buff;
+ int found = 0;
+ int ret;
+
+ if (len) {
+ type = filter_parse_regex(buff, len, &search, &not);
+ search_len = strlen(search);
+ }
+
+ mutex_lock(&ftrace_lock);
+
+ if (unlikely(ftrace_disabled))
+ goto out_unlock;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (ftrace_match_record(rec, mod, search, search_len, type)) {
+ ret = enter_record(hash, rec, not);
+ if (ret < 0) {
+ found = ret;
+ goto out_unlock;
+ }
+ found = 1;
+ }
+ } while_for_each_ftrace_rec();
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
+
+ return found;
+}
+
+static int
+ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
+{
+ return match_records(hash, buff, len, NULL, 0);
+}
+
+static int
+ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
+{
+ int not = 0;
+
+ /* blank or '*' mean the same */
+ if (strcmp(buff, "*") == 0)
+ buff[0] = 0;
+
+ /* handle the case of 'dont filter this module' */
+ if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
+ buff[0] = 0;
+ not = 1;
+ }
+
+ return match_records(hash, buff, strlen(buff), mod, not);
+}
+
+/*
+ * We register the module command as a template to show others how
+ * to register the a command as well.
+ */
+
+static int
+ftrace_mod_callback(struct ftrace_hash *hash,
+ char *func, char *cmd, char *param, int enable)
+{
+ char *mod;
+ int ret = -EINVAL;
+
+ /*
+ * cmd == 'mod' because we only registered this func
+ * for the 'mod' ftrace_func_command.
+ * But if you register one func with multiple commands,
+ * you can tell which command was used by the cmd
+ * parameter.
+ */
+
+ /* we must have a module name */
+ if (!param)
+ return ret;
+
+ mod = strsep(&param, ":");
+ if (!strlen(mod))
+ return ret;
+
+ ret = ftrace_match_module_records(hash, func, mod);
+ if (!ret)
+ ret = -EINVAL;
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static struct ftrace_func_command ftrace_mod_cmd = {
+ .name = "mod",
+ .func = ftrace_mod_callback,
+};
+
+static int __init ftrace_mod_cmd_init(void)
+{
+ return register_ftrace_command(&ftrace_mod_cmd);
+}
+core_initcall(ftrace_mod_cmd_init);
+
+static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+{
+ struct ftrace_func_probe *entry;
+ struct hlist_head *hhd;
+ unsigned long key;
+
+ key = hash_long(ip, FTRACE_HASH_BITS);
+
+ hhd = &ftrace_func_hash[key];
+
+ if (hlist_empty(hhd))
+ return;
+
+ /*
+ * Disable preemption for these calls to prevent a RCU grace
+ * period. This syncs the hash iteration and freeing of items
+ * on the hash. rcu_read_lock is too dangerous here.
+ */
+ preempt_disable_notrace();
+ hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
+ if (entry->ip == ip)
+ entry->ops->func(ip, parent_ip, &entry->data);
+ }
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_probe_ops __read_mostly =
+{
+ .func = function_trace_probe_call,
+ .flags = FTRACE_OPS_FL_INITIALIZED,
+ INIT_OPS_HASH(trace_probe_ops)
+};
+
+static int ftrace_probe_registered;
+
+static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
+{
+ int ret;
+ int i;
+
+ if (ftrace_probe_registered) {
+ /* still need to update the function call sites */
+ if (ftrace_enabled)
+ ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+ old_hash);
+ return;
+ }
+
+ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+ struct hlist_head *hhd = &ftrace_func_hash[i];
+ if (hhd->first)
+ break;
+ }
+ /* Nothing registered? */
+ if (i == FTRACE_FUNC_HASHSIZE)
+ return;
+
+ ret = ftrace_startup(&trace_probe_ops, 0);
+
+ ftrace_probe_registered = 1;
+}
+
+static void __disable_ftrace_function_probe(void)
+{
+ int i;
+
+ if (!ftrace_probe_registered)
+ return;
+
+ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+ struct hlist_head *hhd = &ftrace_func_hash[i];
+ if (hhd->first)
+ return;
+ }
+
+ /* no more funcs left */
+ ftrace_shutdown(&trace_probe_ops, 0);
+
+ ftrace_probe_registered = 0;
+}
+
+
+static void ftrace_free_entry(struct ftrace_func_probe *entry)
+{
+ if (entry->ops->free)
+ entry->ops->free(entry->ops, entry->ip, &entry->data);
+ kfree(entry);
+}
+
+int
+register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_func_probe *entry;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
+ struct ftrace_hash *hash;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int type, len, not;
+ unsigned long key;
+ int count = 0;
+ char *search;
+ int ret;
+
+ type = filter_parse_regex(glob, strlen(glob), &search, &not);
+ len = strlen(search);
+
+ /* we do not support '!' for function probes */
+ if (WARN_ON(not))
+ return -EINVAL;
+
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
+
+ old_hash_ops.filter_hash = old_hash;
+ /* Probes only have filters */
+ old_hash_ops.notrace_hash = NULL;
+
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
+ if (!hash) {
+ count = -ENOMEM;
+ goto out;
+ }
+
+ if (unlikely(ftrace_disabled)) {
+ count = -ENODEV;
+ goto out;
+ }
+
+ mutex_lock(&ftrace_lock);
+
+ do_for_each_ftrace_rec(pg, rec) {
+
+ if (!ftrace_match_record(rec, NULL, search, len, type))
+ continue;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ /* If we did not process any, then return error */
+ if (!count)
+ count = -ENOMEM;
+ goto out_unlock;
+ }
+
+ count++;
+
+ entry->data = data;
+
+ /*
+ * The caller might want to do something special
+ * for each function we find. We call the callback
+ * to give the caller an opportunity to do so.
+ */
+ if (ops->init) {
+ if (ops->init(ops, rec->ip, &entry->data) < 0) {
+ /* caller does not like this func */
+ kfree(entry);
+ continue;
+ }
+ }
+
+ ret = enter_record(hash, rec, 0);
+ if (ret < 0) {
+ kfree(entry);
+ count = ret;
+ goto out_unlock;
+ }
+
+ entry->ops = ops;
+ entry->ip = rec->ip;
+
+ key = hash_long(entry->ip, FTRACE_HASH_BITS);
+ hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
+
+ } while_for_each_ftrace_rec();
+
+ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+ __enable_ftrace_function_probe(&old_hash_ops);
+
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+ else
+ count = ret;
+
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
+ out:
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
+ free_ftrace_hash(hash);
+
+ return count;
+}
+
+enum {
+ PROBE_TEST_FUNC = 1,
+ PROBE_TEST_DATA = 2
+};
+
+static void
+__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
+ void *data, int flags)
+{
+ struct ftrace_func_entry *rec_entry;
+ struct ftrace_func_probe *entry;
+ struct ftrace_func_probe *p;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
+ struct list_head free_list;
+ struct ftrace_hash *hash;
+ struct hlist_node *tmp;
+ char str[KSYM_SYMBOL_LEN];
+ int type = MATCH_FULL;
+ int i, len = 0;
+ char *search;
+ int ret;
+
+ if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
+ glob = NULL;
+ else if (glob) {
+ int not;
+
+ type = filter_parse_regex(glob, strlen(glob), &search, &not);
+ len = strlen(search);
+
+ /* we do not support '!' for function probes */
+ if (WARN_ON(not))
+ return;
+ }
+
+ mutex_lock(&trace_probe_ops.func_hash->regex_lock);
+
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (!hash)
+ /* Hmm, should report this somehow */
+ goto out_unlock;
+
+ INIT_LIST_HEAD(&free_list);
+
+ for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+ struct hlist_head *hhd = &ftrace_func_hash[i];
+
+ hlist_for_each_entry_safe(entry, tmp, hhd, node) {
+
+ /* break up if statements for readability */
+ if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
+ continue;
+
+ if ((flags & PROBE_TEST_DATA) && entry->data != data)
+ continue;
+
+ /* do this last, since it is the most expensive */
+ if (glob) {
+ kallsyms_lookup(entry->ip, NULL, NULL,
+ NULL, str);
+ if (!ftrace_match(str, glob, len, type))
+ continue;
+ }
+
+ rec_entry = ftrace_lookup_ip(hash, entry->ip);
+ /* It is possible more than one entry had this ip */
+ if (rec_entry)
+ free_hash_entry(hash, rec_entry);
+
+ hlist_del_rcu(&entry->node);
+ list_add(&entry->free_list, &free_list);
+ }
+ }
+ mutex_lock(&ftrace_lock);
+ __disable_ftrace_function_probe();
+ /*
+ * Remove after the disable is called. Otherwise, if the last
+ * probe is removed, a null hash means *all enabled*.
+ */
+ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ synchronize_sched();
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+
+ list_for_each_entry_safe(entry, p, &free_list, free_list) {
+ list_del(&entry->free_list);
+ ftrace_free_entry(entry);
+ }
+ mutex_unlock(&ftrace_lock);
+
+ out_unlock:
+ mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
+ free_ftrace_hash(hash);
+}
+
+void
+unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
+ void *data)
+{
+ __unregister_ftrace_function_probe(glob, ops, data,
+ PROBE_TEST_FUNC | PROBE_TEST_DATA);
+}
+
+void
+unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
+{
+ __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
+}
+
+void unregister_ftrace_function_probe_all(char *glob)
+{
+ __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
+}
+
+static LIST_HEAD(ftrace_commands);
+static DEFINE_MUTEX(ftrace_cmd_mutex);
+
+/*
+ * Currently we only register ftrace commands from __init, so mark this
+ * __init too.
+ */
+__init int register_ftrace_command(struct ftrace_func_command *cmd)
+{
+ struct ftrace_func_command *p;
+ int ret = 0;
+
+ mutex_lock(&ftrace_cmd_mutex);
+ list_for_each_entry(p, &ftrace_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ list_add(&cmd->list, &ftrace_commands);
+ out_unlock:
+ mutex_unlock(&ftrace_cmd_mutex);
+
+ return ret;
+}
+
+/*
+ * Currently we only unregister ftrace commands from __init, so mark
+ * this __init too.
+ */
+__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
+{
+ struct ftrace_func_command *p, *n;
+ int ret = -ENODEV;
+
+ mutex_lock(&ftrace_cmd_mutex);
+ list_for_each_entry_safe(p, n, &ftrace_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = 0;
+ list_del_init(&p->list);
+ goto out_unlock;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&ftrace_cmd_mutex);
+
+ return ret;
+}
+
+static int ftrace_process_regex(struct ftrace_hash *hash,
+ char *buff, int len, int enable)
+{
+ char *func, *command, *next = buff;
+ struct ftrace_func_command *p;
+ int ret = -EINVAL;
+
+ func = strsep(&next, ":");
+
+ if (!next) {
+ ret = ftrace_match_records(hash, func, len);
+ if (!ret)
+ ret = -EINVAL;
+ if (ret < 0)
+ return ret;
+ return 0;
+ }
+
+ /* command found */
+
+ command = strsep(&next, ":");
+
+ mutex_lock(&ftrace_cmd_mutex);
+ list_for_each_entry(p, &ftrace_commands, list) {
+ if (strcmp(p->name, command) == 0) {
+ ret = p->func(hash, func, command, next, enable);
+ goto out_unlock;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&ftrace_cmd_mutex);
+
+ return ret;
+}
+
+static ssize_t
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, int enable)
+{
+ struct ftrace_iterator *iter;
+ struct trace_parser *parser;
+ ssize_t ret, read;
+
+ if (!cnt)
+ return 0;
+
+ if (file->f_mode & FMODE_READ) {
+ struct seq_file *m = file->private_data;
+ iter = m->private;
+ } else
+ iter = file->private_data;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ /* iter->hash is a local copy, so we don't need regex_lock */
+
+ parser = &iter->parser;
+ read = trace_get_user(parser, ubuf, cnt, ppos);
+
+ if (read >= 0 && trace_parser_loaded(parser) &&
+ !trace_parser_cont(parser)) {
+ ret = ftrace_process_regex(iter->hash, parser->buffer,
+ parser->idx, enable);
+ trace_parser_clear(parser);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = read;
+ out:
+ return ret;
+}
+
+ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+
+ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+
+static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+{
+ struct ftrace_func_entry *entry;
+
+ if (!ftrace_location(ip))
+ return -EINVAL;
+
+ if (remove) {
+ entry = ftrace_lookup_ip(hash, ip);
+ if (!entry)
+ return -ENOENT;
+ free_hash_entry(hash, entry);
+ return 0;
+ }
+
+ return add_hash_entry(hash, ip);
+}
+
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_ops_hash *old_hash)
+{
+ struct ftrace_ops *op;
+
+ if (!ftrace_enabled)
+ return;
+
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
+ ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
+ return;
+ }
+
+ /*
+ * If this is the shared global_ops filter, then we need to
+ * check if there is another ops that shares it, is enabled.
+ * If so, we still need to run the modify code.
+ */
+ if (ops->func_hash != &global_ops.local_hash)
+ return;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->func_hash == &global_ops.local_hash &&
+ op->flags & FTRACE_OPS_FL_ENABLED) {
+ ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
+ /* Only need to do this once */
+ return;
+ }
+ } while_for_each_ftrace_op(op);
+}
+
+static int
+ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
+ unsigned long ip, int remove, int reset, int enable)
+{
+ struct ftrace_hash **orig_hash;
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_hash *old_hash;
+ struct ftrace_hash *hash;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ mutex_lock(&ops->func_hash->regex_lock);
+
+ if (enable)
+ orig_hash = &ops->func_hash->filter_hash;
+ else
+ orig_hash = &ops->func_hash->notrace_hash;
+
+ if (reset)
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ else
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+
+ if (!hash) {
+ ret = -ENOMEM;
+ goto out_regex_unlock;
+ }
+
+ if (buf && !ftrace_match_records(hash, buf, len)) {
+ ret = -EINVAL;
+ goto out_regex_unlock;
+ }
+ if (ip) {
+ ret = ftrace_match_addr(hash, ip, remove);
+ if (ret < 0)
+ goto out_regex_unlock;
+ }
+
+ mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
+ ret = ftrace_hash_move(ops, enable, orig_hash, hash);
+ if (!ret) {
+ ftrace_ops_update_code(ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
+ mutex_unlock(&ftrace_lock);
+
+ out_regex_unlock:
+ mutex_unlock(&ops->func_hash->regex_lock);
+
+ free_ftrace_hash(hash);
+ return ret;
+}
+
+static int
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
+ int reset, int enable)
+{
+ return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
+}
+
+/**
+ * ftrace_set_filter_ip - set a function to filter on in ftrace by address
+ * @ops - the ops to set the filter with
+ * @ip - the address to add to or remove from the filter.
+ * @remove - non zero to remove the ip from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ip is NULL, it failes to update filter.
+ */
+int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
+ int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ip, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
+
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+ int reset, int enable)
+{
+ return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+}
+
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @ops - the ops to set the filter with
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_regex(ops, buf, len, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter);
+
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @ops - the ops to set the notrace filter with
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_regex(ops, buf, len, reset, 0);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_notrace);
+/**
+ * ftrace_set_global_filter - set a function to filter on with global tracers
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
+{
+ ftrace_set_regex(&global_ops, buf, len, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
+
+/**
+ * ftrace_set_global_notrace - set a function to not trace with global tracers
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
+{
+ ftrace_set_regex(&global_ops, buf, len, reset, 0);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
+
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+/* Used by function selftest to not test if filter is set */
+bool ftrace_filter_param __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+ ftrace_filter_param = true;
+ strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+ ftrace_filter_param = true;
+ strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+
+static unsigned long save_global_trampoline;
+static unsigned long save_global_flags;
+
+static int __init set_graph_function(char *str)
+{
+ strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_filter=", set_graph_function);
+
+static int __init set_graph_notrace_function(char *str)
+{
+ strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_notrace=", set_graph_notrace_function);
+
+static void __init set_ftrace_early_graph(char *buf, int enable)
+{
+ int ret;
+ char *func;
+ unsigned long *table = ftrace_graph_funcs;
+ int *count = &ftrace_graph_count;
+
+ if (!enable) {
+ table = ftrace_graph_notrace_funcs;
+ count = &ftrace_graph_notrace_count;
+ }
+
+ while (buf) {
+ func = strsep(&buf, ",");
+ /* we allow only one expression at a time */
+ ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
+ if (ret)
+ printk(KERN_DEBUG "ftrace: function %s not "
+ "traceable\n", func);
+ }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+void __init
+ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
+{
+ char *func;
+
+ ftrace_ops_init(ops);
+
+ while (buf) {
+ func = strsep(&buf, ",");
+ ftrace_set_regex(ops, func, strlen(func), 0, enable);
+ }
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+ if (ftrace_filter_buf[0])
+ ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
+ if (ftrace_notrace_buf[0])
+ ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (ftrace_graph_buf[0])
+ set_ftrace_early_graph(ftrace_graph_buf, 1);
+ if (ftrace_graph_notrace_buf[0])
+ set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+}
+
+int ftrace_regex_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = (struct seq_file *)file->private_data;
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_iterator *iter;
+ struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
+ struct trace_parser *parser;
+ int filter_hash;
+ int ret;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = m->private;
+ seq_release(inode, file);
+ } else
+ iter = file->private_data;
+
+ parser = &iter->parser;
+ if (trace_parser_loaded(parser)) {
+ parser->buffer[parser->idx] = 0;
+ ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+ }
+
+ trace_parser_put(parser);
+
+ mutex_lock(&iter->ops->func_hash->regex_lock);
+
+ if (file->f_mode & FMODE_WRITE) {
+ filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
+
+ if (filter_hash)
+ orig_hash = &iter->ops->func_hash->filter_hash;
+ else
+ orig_hash = &iter->ops->func_hash->notrace_hash;
+
+ mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
+ ret = ftrace_hash_move(iter->ops, filter_hash,
+ orig_hash, iter->hash);
+ if (!ret) {
+ ftrace_ops_update_code(iter->ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
+ mutex_unlock(&ftrace_lock);
+ }
+
+ mutex_unlock(&iter->ops->func_hash->regex_lock);
+ free_ftrace_hash(iter->hash);
+ kfree(iter);
+
+ return 0;
+}
+
+static const struct file_operations ftrace_avail_fops = {
+ .open = ftrace_avail_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static const struct file_operations ftrace_enabled_fops = {
+ .open = ftrace_enabled_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static const struct file_operations ftrace_filter_fops = {
+ .open = ftrace_filter_open,
+ .read = seq_read,
+ .write = ftrace_filter_write,
+ .llseek = tracing_lseek,
+ .release = ftrace_regex_release,
+};
+
+static const struct file_operations ftrace_notrace_fops = {
+ .open = ftrace_notrace_open,
+ .read = seq_read,
+ .write = ftrace_notrace_write,
+ .llseek = tracing_lseek,
+ .release = ftrace_regex_release,
+};
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static DEFINE_MUTEX(graph_lock);
+
+int ftrace_graph_count;
+int ftrace_graph_notrace_count;
+unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+
+struct ftrace_graph_data {
+ unsigned long *table;
+ size_t size;
+ int *count;
+ const struct seq_operations *seq_ops;
+};
+
+static void *
+__g_next(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_graph_data *fgd = m->private;
+
+ if (*pos >= *fgd->count)
+ return NULL;
+ return &fgd->table[*pos];
+}
+
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return __g_next(m, pos);
+}
+
+static void *g_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_graph_data *fgd = m->private;
+
+ mutex_lock(&graph_lock);
+
+ /* Nothing, tell g_show to print all functions are enabled */
+ if (!*fgd->count && !*pos)
+ return (void *)1;
+
+ return __g_next(m, pos);
+}
+
+static void g_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&graph_lock);
+}
+
+static int g_show(struct seq_file *m, void *v)
+{
+ unsigned long *ptr = v;
+
+ if (!ptr)
+ return 0;
+
+ if (ptr == (unsigned long *)1) {
+ struct ftrace_graph_data *fgd = m->private;
+
+ if (fgd->table == ftrace_graph_funcs)
+ seq_puts(m, "#### all functions enabled ####\n");
+ else
+ seq_puts(m, "#### no functions disabled ####\n");
+ return 0;
+ }
+
+ seq_printf(m, "%ps\n", (void *)*ptr);
+
+ return 0;
+}
+
+static const struct seq_operations ftrace_graph_seq_ops = {
+ .start = g_start,
+ .next = g_next,
+ .stop = g_stop,
+ .show = g_show,
+};
+
+static int
+__ftrace_graph_open(struct inode *inode, struct file *file,
+ struct ftrace_graph_data *fgd)
+{
+ int ret = 0;
+
+ mutex_lock(&graph_lock);
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC)) {
+ *fgd->count = 0;
+ memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
+ }
+ mutex_unlock(&graph_lock);
+
+ if (file->f_mode & FMODE_READ) {
+ ret = seq_open(file, fgd->seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = fgd;
+ }
+ } else
+ file->private_data = fgd;
+
+ return ret;
+}
+
+static int
+ftrace_graph_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_graph_data *fgd;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
+ if (fgd == NULL)
+ return -ENOMEM;
+
+ fgd->table = ftrace_graph_funcs;
+ fgd->size = FTRACE_GRAPH_MAX_FUNCS;
+ fgd->count = &ftrace_graph_count;
+ fgd->seq_ops = &ftrace_graph_seq_ops;
+
+ return __ftrace_graph_open(inode, file, fgd);
+}
+
+static int
+ftrace_graph_notrace_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_graph_data *fgd;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
+ if (fgd == NULL)
+ return -ENOMEM;
+
+ fgd->table = ftrace_graph_notrace_funcs;
+ fgd->size = FTRACE_GRAPH_MAX_FUNCS;
+ fgd->count = &ftrace_graph_notrace_count;
+ fgd->seq_ops = &ftrace_graph_seq_ops;
+
+ return __ftrace_graph_open(inode, file, fgd);
+}
+
+static int
+ftrace_graph_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ) {
+ struct seq_file *m = file->private_data;
+
+ kfree(m->private);
+ seq_release(inode, file);
+ } else {
+ kfree(file->private_data);
+ }
+
+ return 0;
+}
+
+static int
+ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page *pg;
+ int search_len;
+ int fail = 1;
+ int type, not;
+ char *search;
+ bool exists;
+ int i;
+
+ /* decode regex */
+ type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
+ if (!not && *idx >= size)
+ return -EBUSY;
+
+ search_len = strlen(search);
+
+ mutex_lock(&ftrace_lock);
+
+ if (unlikely(ftrace_disabled)) {
+ mutex_unlock(&ftrace_lock);
+ return -ENODEV;
+ }
+
+ do_for_each_ftrace_rec(pg, rec) {
+
+ if (ftrace_match_record(rec, NULL, search, search_len, type)) {
+ /* if it is in the array */
+ exists = false;
+ for (i = 0; i < *idx; i++) {
+ if (array[i] == rec->ip) {
+ exists = true;
+ break;
+ }
+ }
+
+ if (!not) {
+ fail = 0;
+ if (!exists) {
+ array[(*idx)++] = rec->ip;
+ if (*idx >= size)
+ goto out;
+ }
+ } else {
+ if (exists) {
+ array[i] = array[--(*idx)];
+ array[*idx] = 0;
+ fail = 0;
+ }
+ }
+ }
+ } while_for_each_ftrace_rec();
+out:
+ mutex_unlock(&ftrace_lock);
+
+ if (fail)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t
+ftrace_graph_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_parser parser;
+ ssize_t read, ret = 0;
+ struct ftrace_graph_data *fgd = file->private_data;
+
+ if (!cnt)
+ return 0;
+
+ if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
+ return -ENOMEM;
+
+ read = trace_get_user(&parser, ubuf, cnt, ppos);
+
+ if (read >= 0 && trace_parser_loaded((&parser))) {
+ parser.buffer[parser.idx] = 0;
+
+ mutex_lock(&graph_lock);
+
+ /* we allow only one expression at a time */
+ ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
+ parser.buffer);
+
+ mutex_unlock(&graph_lock);
+ }
+
+ if (!ret)
+ ret = read;
+
+ trace_parser_put(&parser);
+
+ return ret;
+}
+
+static const struct file_operations ftrace_graph_fops = {
+ .open = ftrace_graph_open,
+ .read = seq_read,
+ .write = ftrace_graph_write,
+ .llseek = tracing_lseek,
+ .release = ftrace_graph_release,
+};
+
+static const struct file_operations ftrace_graph_notrace_fops = {
+ .open = ftrace_graph_notrace_open,
+ .read = seq_read,
+ .write = ftrace_graph_write,
+ .llseek = tracing_lseek,
+ .release = ftrace_graph_release,
+};
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+void ftrace_create_filter_files(struct ftrace_ops *ops,
+ struct dentry *parent)
+{
+
+ trace_create_file("set_ftrace_filter", 0644, parent,
+ ops, &ftrace_filter_fops);
+
+ trace_create_file("set_ftrace_notrace", 0644, parent,
+ ops, &ftrace_notrace_fops);
+}
+
+/*
+ * The name "destroy_filter_files" is really a misnomer. Although
+ * in the future, it may actualy delete the files, but this is
+ * really intended to make sure the ops passed in are disabled
+ * and that when this function returns, the caller is free to
+ * free the ops.
+ *
+ * The "destroy" name is only to match the "create" name that this
+ * should be paired with.
+ */
+void ftrace_destroy_filter_files(struct ftrace_ops *ops)
+{
+ mutex_lock(&ftrace_lock);
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ ftrace_shutdown(ops, 0);
+ ops->flags |= FTRACE_OPS_FL_DELETED;
+ mutex_unlock(&ftrace_lock);
+}
+
+static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
+{
+
+ trace_create_file("available_filter_functions", 0444,
+ d_tracer, NULL, &ftrace_avail_fops);
+
+ trace_create_file("enabled_functions", 0444,
+ d_tracer, NULL, &ftrace_enabled_fops);
+
+ ftrace_create_filter_files(&global_ops, d_tracer);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ trace_create_file("set_graph_function", 0444, d_tracer,
+ NULL,
+ &ftrace_graph_fops);
+ trace_create_file("set_graph_notrace", 0444, d_tracer,
+ NULL,
+ &ftrace_graph_notrace_fops);
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+ return 0;
+}
+
+static int ftrace_cmp_ips(const void *a, const void *b)
+{
+ const unsigned long *ipa = a;
+ const unsigned long *ipb = b;
+
+ if (*ipa > *ipb)
+ return 1;
+ if (*ipa < *ipb)
+ return -1;
+ return 0;
+}
+
+static void ftrace_swap_ips(void *a, void *b, int size)
+{
+ unsigned long *ipa = a;
+ unsigned long *ipb = b;
+ unsigned long t;
+
+ t = *ipa;
+ *ipa = *ipb;
+ *ipb = t;
+}
+
+static int ftrace_process_locs(struct module *mod,
+ unsigned long *start,
+ unsigned long *end)
+{
+ struct ftrace_page *start_pg;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ unsigned long count;
+ unsigned long *p;
+ unsigned long addr;
+ unsigned long flags = 0; /* Shut up gcc */
+ int ret = -ENOMEM;
+
+ count = end - start;
+
+ if (!count)
+ return 0;
+
+ sort(start, count, sizeof(*start),
+ ftrace_cmp_ips, ftrace_swap_ips);
+
+ start_pg = ftrace_allocate_pages(count);
+ if (!start_pg)
+ return -ENOMEM;
+
+ mutex_lock(&ftrace_lock);
+
+ /*
+ * Core and each module needs their own pages, as
+ * modules will free them when they are removed.
+ * Force a new page to be allocated for modules.
+ */
+ if (!mod) {
+ WARN_ON(ftrace_pages || ftrace_pages_start);
+ /* First initialization */
+ ftrace_pages = ftrace_pages_start = start_pg;
+ } else {
+ if (!ftrace_pages)
+ goto out;
+
+ if (WARN_ON(ftrace_pages->next)) {
+ /* Hmm, we have free pages? */
+ while (ftrace_pages->next)
+ ftrace_pages = ftrace_pages->next;
+ }
+
+ ftrace_pages->next = start_pg;
+ }
+
+ p = start;
+ pg = start_pg;
+ while (p < end) {
+ addr = ftrace_call_adjust(*p++);
+ /*
+ * Some architecture linkers will pad between
+ * the different mcount_loc sections of different
+ * object files to satisfy alignments.
+ * Skip any NULL pointers.
+ */
+ if (!addr)
+ continue;
+
+ if (pg->index == pg->size) {
+ /* We should have allocated enough */
+ if (WARN_ON(!pg->next))
+ break;
+ pg = pg->next;
+ }
+
+ rec = &pg->records[pg->index++];
+ rec->ip = addr;
+ }
+
+ /* We should have used all pages */
+ WARN_ON(pg->next);
+
+ /* Assign the last page to ftrace_pages */
+ ftrace_pages = pg;
+
+ /*
+ * We only need to disable interrupts on start up
+ * because we are modifying code that an interrupt
+ * may execute, and the modification is not atomic.
+ * But for modules, nothing runs the code we modify
+ * until we are finished with it, and there's no
+ * reason to cause large interrupt latencies while we do it.
+ */
+ if (!mod)
+ local_irq_save(flags);
+ ftrace_update_code(mod, start_pg);
+ if (!mod)
+ local_irq_restore(flags);
+ ret = 0;
+ out:
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+
+#ifdef CONFIG_MODULES
+
+#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+
+void ftrace_release_mod(struct module *mod)
+{
+ struct dyn_ftrace *rec;
+ struct ftrace_page **last_pg;
+ struct ftrace_page *pg;
+ int order;
+
+ mutex_lock(&ftrace_lock);
+
+ if (ftrace_disabled)
+ goto out_unlock;
+
+ /*
+ * Each module has its own ftrace_pages, remove
+ * them from the list.
+ */
+ last_pg = &ftrace_pages_start;
+ for (pg = ftrace_pages_start; pg; pg = *last_pg) {
+ rec = &pg->records[0];
+ if (within_module_core(rec->ip, mod)) {
+ /*
+ * As core pages are first, the first
+ * page should never be a module page.
+ */
+ if (WARN_ON(pg == ftrace_pages_start))
+ goto out_unlock;
+
+ /* Check if we are deleting the last page */
+ if (pg == ftrace_pages)
+ ftrace_pages = next_to_ftrace_page(last_pg);
+
+ *last_pg = pg->next;
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ kfree(pg);
+ } else
+ last_pg = &pg->next;
+ }
+ out_unlock:
+ mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+ unsigned long *start, unsigned long *end)
+{
+ if (ftrace_disabled || start == end)
+ return;
+ ftrace_process_locs(mod, start, end);
+}
+
+void ftrace_module_init(struct module *mod)
+{
+ ftrace_init_module(mod, mod->ftrace_callsites,
+ mod->ftrace_callsites +
+ mod->num_ftrace_callsites);
+}
+
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ if (val == MODULE_STATE_GOING)
+ ftrace_release_mod(mod);
+
+ return 0;
+}
+#else
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_exit_nb = {
+ .notifier_call = ftrace_module_notify_exit,
+ .priority = INT_MIN, /* Run after anything that can remove kprobes */
+};
+
+void __init ftrace_init(void)
+{
+ extern unsigned long __start_mcount_loc[];
+ extern unsigned long __stop_mcount_loc[];
+ unsigned long count, flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = ftrace_dyn_arch_init();
+ local_irq_restore(flags);
+ if (ret)
+ goto failed;
+
+ count = __stop_mcount_loc - __start_mcount_loc;
+ if (!count) {
+ pr_info("ftrace: No functions to be traced?\n");
+ goto failed;
+ }
+
+ pr_info("ftrace: allocating %ld entries in %ld pages\n",
+ count, count / ENTRIES_PER_PAGE + 1);
+
+ last_ftrace_enabled = ftrace_enabled = 1;
+
+ ret = ftrace_process_locs(NULL,
+ __start_mcount_loc,
+ __stop_mcount_loc);
+
+ ret = register_module_notifier(&ftrace_module_exit_nb);
+ if (ret)
+ pr_warning("Failed to register trace ftrace module exit notifier\n");
+
+ set_ftrace_early_filters();
+
+ return;
+ failed:
+ ftrace_disabled = 1;
+}
+
+/* Do nothing if arch does not support this */
+void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+
+/*
+ * Currently there's no safe way to free a trampoline when the kernel
+ * is configured with PREEMPT. That is because a task could be preempted
+ * when it jumped to the trampoline, it may be preempted for a long time
+ * depending on the system load, and currently there's no way to know
+ * when it will be off the trampoline. If the trampoline is freed
+ * too early, when the task runs again, it will be executing on freed
+ * memory and crash.
+ */
+#ifdef CONFIG_PREEMPT
+ /* Currently, only non dynamic ops can have a trampoline */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ return;
+#endif
+
+ arch_ftrace_update_trampoline(ops);
+}
+
+#else
+
+static struct ftrace_ops global_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+};
+
+static int __init ftrace_nodyn_init(void)
+{
+ ftrace_enabled = 1;
+ return 0;
+}
+core_initcall(ftrace_nodyn_init);
+
+static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
+static inline void ftrace_startup_enable(int command) { }
+static inline void ftrace_startup_all(int command) { }
+/* Keep as macros so we do not need to define the commands */
+# define ftrace_startup(ops, command) \
+ ({ \
+ int ___ret = __register_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
+ })
+# define ftrace_shutdown(ops, command) \
+ ({ \
+ int ___ret = __unregister_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
+ })
+
+# define ftrace_startup_sysctl() do { } while (0)
+# define ftrace_shutdown_sysctl() do { } while (0)
+
+static inline int
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
+{
+ return 1;
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+__init void ftrace_init_global_array_ops(struct trace_array *tr)
+{
+ tr->ops = &global_ops;
+ tr->ops->private = tr;
+}
+
+void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
+{
+ /* If we filter on pids, update to use the pid function */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
+ if (WARN_ON(tr->ops->func != ftrace_stub))
+ printk("ftrace ops had %pS for function\n",
+ tr->ops->func);
+ /* Only the top level instance does pid tracing */
+ if (!list_empty(&ftrace_pids)) {
+ set_ftrace_pid_function(func);
+ func = ftrace_pid_func;
+ }
+ }
+ tr->ops->func = func;
+ tr->ops->private = tr;
+}
+
+void ftrace_reset_array_ops(struct trace_array *tr)
+{
+ tr->ops->func = ftrace_stub;
+}
+
+static void
+ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
+ return;
+
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they must be freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+ trace_recursion_set(TRACE_CONTROL_BIT);
+
+ /*
+ * Control funcs (perf) uses RCU. Only trace if
+ * RCU is currently active.
+ */
+ if (!rcu_is_watching())
+ goto out;
+
+ do_for_each_ftrace_op(op, ftrace_control_list) {
+ if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+ !ftrace_function_local_disabled(op) &&
+ ftrace_ops_test(op, ip, regs))
+ op->func(ip, parent_ip, op, regs);
+ } while_for_each_ftrace_op(op);
+ out:
+ trace_recursion_clear(TRACE_CONTROL_BIT);
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops control_ops = {
+ .func = ftrace_ops_control_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
+ INIT_OPS_HASH(control_ops)
+};
+
+static inline void
+__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ignored, struct pt_regs *regs)
+{
+ struct ftrace_ops *op;
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
+ return;
+
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they must be freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (ftrace_ops_test(op, ip, regs)) {
+ if (FTRACE_WARN_ON(!op->func)) {
+ pr_warn("op=%p %pS\n", op, op);
+ goto out;
+ }
+ op->func(ip, parent_ip, op, regs);
+ }
+ } while_for_each_ftrace_op(op);
+out:
+ preempt_enable_notrace();
+ trace_clear_recursion(bit);
+}
+
+/*
+ * Some archs only support passing ip and parent_ip. Even though
+ * the list function ignores the op parameter, we do not want any
+ * C side effects, where a function is called without the caller
+ * sending a third parameter.
+ * Archs are to support both the regs and ftrace_ops at the same time.
+ * If they support ftrace_ops, it is assumed they support regs.
+ * If call backs want to use regs, they must either check for regs
+ * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
+ * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
+ * An architecture can pass partial regs with ftrace_ops and still
+ * set the ARCH_SUPPORT_FTARCE_OPS.
+ */
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
+}
+#else
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+{
+ __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
+}
+#endif
+
+/*
+ * If there's only one function registered but it does not support
+ * recursion, this function will be called by the mcount trampoline.
+ * This function will handle recursion protection.
+ */
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
+ return;
+
+ op->func(ip, parent_ip, op, regs);
+
+ trace_clear_recursion(bit);
+}
+
+/**
+ * ftrace_ops_get_func - get the function a trampoline should call
+ * @ops: the ops to get the function for
+ *
+ * Normally the mcount trampoline will call the ops->func, but there
+ * are times that it should not. For example, if the ops does not
+ * have its own recursion protection, then it should call the
+ * ftrace_ops_recurs_func() instead.
+ *
+ * Returns the function that the trampoline should call for @ops.
+ */
+ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
+{
+ /*
+ * If the func handles its own recursion, call it directly.
+ * Otherwise call the recursion protected function that
+ * will call the ftrace ops function.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
+ return ftrace_ops_recurs_func;
+
+ return ops->func;
+}
+
+static void clear_ftrace_swapper(void)
+{
+ struct task_struct *p;
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ p = idle_task(cpu);
+ clear_tsk_trace_trace(p);
+ }
+ put_online_cpus();
+}
+
+static void set_ftrace_swapper(void)
+{
+ struct task_struct *p;
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ p = idle_task(cpu);
+ set_tsk_trace_trace(p);
+ }
+ put_online_cpus();
+}
+
+static void clear_ftrace_pid(struct pid *pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ do_each_pid_task(pid, PIDTYPE_PID, p) {
+ clear_tsk_trace_trace(p);
+ } while_each_pid_task(pid, PIDTYPE_PID, p);
+ rcu_read_unlock();
+
+ put_pid(pid);
+}
+
+static void set_ftrace_pid(struct pid *pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ do_each_pid_task(pid, PIDTYPE_PID, p) {
+ set_tsk_trace_trace(p);
+ } while_each_pid_task(pid, PIDTYPE_PID, p);
+ rcu_read_unlock();
+}
+
+static void clear_ftrace_pid_task(struct pid *pid)
+{
+ if (pid == ftrace_swapper_pid)
+ clear_ftrace_swapper();
+ else
+ clear_ftrace_pid(pid);
+}
+
+static void set_ftrace_pid_task(struct pid *pid)
+{
+ if (pid == ftrace_swapper_pid)
+ set_ftrace_swapper();
+ else
+ set_ftrace_pid(pid);
+}
+
+static int ftrace_pid_add(int p)
+{
+ struct pid *pid;
+ struct ftrace_pid *fpid;
+ int ret = -EINVAL;
+
+ mutex_lock(&ftrace_lock);
+
+ if (!p)
+ pid = ftrace_swapper_pid;
+ else
+ pid = find_get_pid(p);
+
+ if (!pid)
+ goto out;
+
+ ret = 0;
+
+ list_for_each_entry(fpid, &ftrace_pids, list)
+ if (fpid->pid == pid)
+ goto out_put;
+
+ ret = -ENOMEM;
+
+ fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
+ if (!fpid)
+ goto out_put;
+
+ list_add(&fpid->list, &ftrace_pids);
+ fpid->pid = pid;
+
+ set_ftrace_pid_task(pid);
+
+ ftrace_update_pid_func();
+
+ ftrace_startup_all(0);
+
+ mutex_unlock(&ftrace_lock);
+ return 0;
+
+out_put:
+ if (pid != ftrace_swapper_pid)
+ put_pid(pid);
+
+out:
+ mutex_unlock(&ftrace_lock);
+ return ret;
+}
+
+static void ftrace_pid_reset(void)
+{
+ struct ftrace_pid *fpid, *safe;
+
+ mutex_lock(&ftrace_lock);
+ list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
+ struct pid *pid = fpid->pid;
+
+ clear_ftrace_pid_task(pid);
+
+ list_del(&fpid->list);
+ kfree(fpid);
+ }
+
+ ftrace_update_pid_func();
+ ftrace_startup_all(0);
+
+ mutex_unlock(&ftrace_lock);
+}
+
+static void *fpid_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&ftrace_lock);
+
+ if (list_empty(&ftrace_pids) && (!*pos))
+ return (void *) 1;
+
+ return seq_list_start(&ftrace_pids, *pos);
+}
+
+static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ if (v == (void *)1)
+ return NULL;
+
+ return seq_list_next(v, &ftrace_pids, pos);
+}
+
+static void fpid_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&ftrace_lock);
+}
+
+static int fpid_show(struct seq_file *m, void *v)
+{
+ const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
+
+ if (v == (void *)1) {
+ seq_puts(m, "no pid\n");
+ return 0;
+ }
+
+ if (fpid->pid == ftrace_swapper_pid)
+ seq_puts(m, "swapper tasks\n");
+ else
+ seq_printf(m, "%u\n", pid_vnr(fpid->pid));
+
+ return 0;
+}
+
+static const struct seq_operations ftrace_pid_sops = {
+ .start = fpid_start,
+ .next = fpid_next,
+ .stop = fpid_stop,
+ .show = fpid_show,
+};
+
+static int
+ftrace_pid_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_pid_reset();
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_open(file, &ftrace_pid_sops);
+
+ return ret;
+}
+
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64], *tmp;
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ /*
+ * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
+ * to clean the filter quietly.
+ */
+ tmp = strstrip(buf);
+ if (strlen(tmp) == 0)
+ return 1;
+
+ ret = kstrtol(tmp, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ ret = ftrace_pid_add(val);
+
+ return ret ? ret : cnt;
+}
+
+static int
+ftrace_pid_release(struct inode *inode, struct file *file)
+{
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+
+ return 0;
+}
+
+static const struct file_operations ftrace_pid_fops = {
+ .open = ftrace_pid_open,
+ .write = ftrace_pid_write,
+ .read = seq_read,
+ .llseek = tracing_lseek,
+ .release = ftrace_pid_release,
+};
+
+static __init int ftrace_init_tracefs(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ ftrace_init_dyn_tracefs(d_tracer);
+
+ trace_create_file("set_ftrace_pid", 0644, d_tracer,
+ NULL, &ftrace_pid_fops);
+
+ ftrace_profile_tracefs(d_tracer);
+
+ return 0;
+}
+fs_initcall(ftrace_init_tracefs);
+
+/**
+ * ftrace_kill - kill ftrace
+ *
+ * This function should be used by panic code. It stops ftrace
+ * but in a not so nice way. If you need to simply kill ftrace
+ * from a non-atomic section, use ftrace_kill.
+ */
+void ftrace_kill(void)
+{
+ ftrace_disabled = 1;
+ ftrace_enabled = 0;
+ clear_ftrace_function();
+}
+
+/**
+ * Test if ftrace is dead or not.
+ */
+int ftrace_is_dead(void)
+{
+ return ftrace_disabled;
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ * with "notrace", otherwise it will go into a
+ * recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+ int ret = -1;
+
+ ftrace_ops_init(ops);
+
+ mutex_lock(&ftrace_lock);
+
+ ret = ftrace_startup(ops, 0);
+
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_function);
+
+/**
+ * unregister_ftrace_function - unregister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+ int ret;
+
+ mutex_lock(&ftrace_lock);
+ ret = ftrace_shutdown(ops, 0);
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_function);
+
+int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = -ENODEV;
+
+ mutex_lock(&ftrace_lock);
+
+ if (unlikely(ftrace_disabled))
+ goto out;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
+ goto out;
+
+ last_ftrace_enabled = !!ftrace_enabled;
+
+ if (ftrace_enabled) {
+
+ /* we are starting ftrace again */
+ if (ftrace_ops_list != &ftrace_list_end)
+ update_ftrace_function();
+
+ ftrace_startup_sysctl();
+
+ } else {
+ /* stopping ftrace calls (just send to ftrace_stub) */
+ ftrace_trace_function = ftrace_stub;
+
+ ftrace_shutdown_sysctl();
+ }
+
+ out:
+ mutex_unlock(&ftrace_lock);
+ return ret;
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static struct ftrace_ops graph_ops = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE |
+ FTRACE_OPS_FL_INITIALIZED |
+ FTRACE_OPS_FL_STUB,
+#ifdef FTRACE_GRAPH_TRAMP_ADDR
+ .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+ /* trampoline_size is only needed for dynamically allocated tramps */
+#endif
+ ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
+};
+
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+ return 0;
+}
+
+/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return =
+ (trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
+static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
+
+/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+{
+ int i;
+ int ret = 0;
+ unsigned long flags;
+ int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
+ struct task_struct *g, *t;
+
+ for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
+ ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack_list[i]) {
+ start = 0;
+ end = i;
+ ret = -ENOMEM;
+ goto free;
+ }
+ }
+
+ read_lock_irqsave(&tasklist_lock, flags);
+ do_each_thread(g, t) {
+ if (start == end) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ if (t->ret_stack == NULL) {
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->curr_ret_stack = -1;
+ /* Make sure the tasks see the -1 first: */
+ smp_wmb();
+ t->ret_stack = ret_stack_list[start++];
+ }
+ } while_each_thread(g, t);
+
+unlock:
+ read_unlock_irqrestore(&tasklist_lock, flags);
+free:
+ for (i = start; i < end; i++)
+ kfree(ret_stack_list[i]);
+ return ret;
+}
+
+static void
+ftrace_graph_probe_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
+{
+ unsigned long long timestamp;
+ int index;
+
+ /*
+ * Does the user want to count the time a function was asleep.
+ * If so, do not update the time stamps.
+ */
+ if (trace_flags & TRACE_ITER_SLEEP_TIME)
+ return;
+
+ timestamp = trace_clock_local();
+
+ prev->ftrace_timestamp = timestamp;
+
+ /* only process tasks that we timestamped */
+ if (!next->ftrace_timestamp)
+ return;
+
+ /*
+ * Update all the counters in next to make up for the
+ * time next was sleeping.
+ */
+ timestamp -= next->ftrace_timestamp;
+
+ for (index = next->curr_ret_stack; index >= 0; index--)
+ next->ret_stack[index].calltime += timestamp;
+}
+
+/* Allocate a return stack for each task */
+static int start_graph_tracing(void)
+{
+ struct ftrace_ret_stack **ret_stack_list;
+ int ret, cpu;
+
+ ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
+ sizeof(struct ftrace_ret_stack *),
+ GFP_KERNEL);
+
+ if (!ret_stack_list)
+ return -ENOMEM;
+
+ /* The cpu_boot init_task->ret_stack will never be freed */
+ for_each_online_cpu(cpu) {
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ }
+
+ do {
+ ret = alloc_retstack_tasklist(ret_stack_list);
+ } while (ret == -EAGAIN);
+
+ if (!ret) {
+ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ if (ret)
+ pr_info("ftrace_graph: Couldn't activate tracepoint"
+ " probe to kernel_sched_switch\n");
+ }
+
+ kfree(ret_stack_list);
+ return ret;
+}
+
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+ void *unused)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ pause_graph_tracing();
+ break;
+
+ case PM_POST_HIBERNATION:
+ unpause_graph_tracing();
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
+{
+ if (!ftrace_ops_test(&global_ops, trace->func, NULL))
+ return 0;
+ return __ftrace_graph_entry(trace);
+}
+
+/*
+ * The function graph tracer should only trace the functions defined
+ * by set_ftrace_filter and set_ftrace_notrace. If another function
+ * tracer ops is registered, the graph tracer requires testing the
+ * function against the global ops, and not just trace any function
+ * that any ftrace_ops registered.
+ */
+static void update_function_graph_func(void)
+{
+ struct ftrace_ops *op;
+ bool do_test = false;
+
+ /*
+ * The graph and global ops share the same set of functions
+ * to test. If any other ops is on the list, then
+ * the graph tracing needs to test if its the function
+ * it should call.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op != &global_ops && op != &graph_ops &&
+ op != &ftrace_list_end) {
+ do_test = true;
+ /* in double loop, break out with goto */
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+ out:
+ if (do_test)
+ ftrace_graph_entry = ftrace_graph_entry_test;
+ else
+ ftrace_graph_entry = __ftrace_graph_entry;
+}
+
+static struct notifier_block ftrace_suspend_notifier = {
+ .notifier_call = ftrace_suspend_notifier_call,
+};
+
+int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+ trace_func_graph_ent_t entryfunc)
+{
+ int ret = 0;
+
+ mutex_lock(&ftrace_lock);
+
+ /* we currently allow only one tracer registered at a time */
+ if (ftrace_graph_active) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ register_pm_notifier(&ftrace_suspend_notifier);
+
+ ftrace_graph_active++;
+ ret = start_graph_tracing();
+ if (ret) {
+ ftrace_graph_active--;
+ goto out;
+ }
+
+ ftrace_graph_return = retfunc;
+
+ /*
+ * Update the indirect function to the entryfunc, and the
+ * function that gets called to the entry_test first. Then
+ * call the update fgraph entry function to determine if
+ * the entryfunc should be called directly or not.
+ */
+ __ftrace_graph_entry = entryfunc;
+ ftrace_graph_entry = ftrace_graph_entry_test;
+ update_function_graph_func();
+
+ ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
+out:
+ mutex_unlock(&ftrace_lock);
+ return ret;
+}
+
+void unregister_ftrace_graph(void)
+{
+ mutex_lock(&ftrace_lock);
+
+ if (unlikely(!ftrace_graph_active))
+ goto out;
+
+ ftrace_graph_active--;
+ ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+ ftrace_graph_entry = ftrace_graph_entry_stub;
+ __ftrace_graph_entry = ftrace_graph_entry_stub;
+ ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
+ unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * Function graph does not allocate the trampoline, but
+ * other global_ops do. We need to reset the ALLOC_TRAMP flag
+ * if one was used.
+ */
+ global_ops.trampoline = save_global_trampoline;
+ if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
+ global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+#endif
+
+ out:
+ mutex_unlock(&ftrace_lock);
+}
+
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->ftrace_timestamp = 0;
+ /* make curr_ret_stack visible before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+ t->curr_ret_stack = -1;
+ /*
+ * The idle task has no parent, it either has its own
+ * stack or no stack at all.
+ */
+ if (t->ret_stack)
+ WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = per_cpu(idle_ret_stack, cpu);
+ if (!ret_stack) {
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ per_cpu(idle_ret_stack, cpu) = ret_stack;
+ }
+ graph_init_task(t, ret_stack);
+ }
+}
+
+/* Allocate a return stack for newly created task */
+void ftrace_graph_init_task(struct task_struct *t)
+{
+ /* Make sure we do not use the parent ret_stack */
+ t->ret_stack = NULL;
+ t->curr_ret_stack = -1;
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ graph_init_task(t, ret_stack);
+ }
+}
+
+void ftrace_graph_exit_task(struct task_struct *t)
+{
+ struct ftrace_ret_stack *ret_stack = t->ret_stack;
+
+ t->ret_stack = NULL;
+ /* NULL must become visible to IRQs before we free it: */
+ barrier();
+
+ kfree(ret_stack);
+}
+#endif
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000..eb4220a13
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,18 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
+
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 000000000..0315d4317
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,5014 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ftrace_event.h>
+#include <linux/ring_buffer.h>
+#include <linux/trace_clock.h>
+#include <linux/trace_seq.h>
+#include <linux/spinlock.h>
+#include <linux/irq_work.h>
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h> /* for self test */
+#include <linux/kmemcheck.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/cpu.h>
+
+#include <asm/local.h>
+
+static void update_pages_handler(struct work_struct *work);
+
+/*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+ trace_seq_puts(s, "# compressed entry header\n");
+ trace_seq_puts(s, "\ttype_len : 5 bits\n");
+ trace_seq_puts(s, "\ttime_delta : 27 bits\n");
+ trace_seq_puts(s, "\tarray : 32 bits\n");
+ trace_seq_putc(s, '\n');
+ trace_seq_printf(s, "\tpadding : type == %d\n",
+ RINGBUF_TYPE_PADDING);
+ trace_seq_printf(s, "\ttime_extend : type == %d\n",
+ RINGBUF_TYPE_TIME_EXTEND);
+ trace_seq_printf(s, "\tdata max type_len == %d\n",
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+
+ return !trace_seq_has_overflowed(s);
+}
+
+/*
+ * The ring buffer is made up of a list of pages. A separate list of pages is
+ * allocated for each CPU. A writer may only write to a buffer that is
+ * associated with the CPU it is currently executing on. A reader may read
+ * from any per cpu buffer.
+ *
+ * The reader is special. For each per cpu buffer, the reader has its own
+ * reader page. When a reader has read the entire reader page, this reader
+ * page is swapped with another page in the ring buffer.
+ *
+ * Now, as long as the writer is off the reader page, the reader can do what
+ * ever it wants with that page. The writer will never write to that page
+ * again (as long as it is out of the ring buffer).
+ *
+ * Here's some silly ASCII art.
+ *
+ * +------+
+ * |reader| RING BUFFER
+ * |page |
+ * +------+ +---+ +---+ +---+
+ * | |-->| |-->| |
+ * +---+ +---+ +---+
+ * ^ |
+ * | |
+ * +---------------+
+ *
+ *
+ * +------+
+ * |reader| RING BUFFER
+ * |page |------------------v
+ * +------+ +---+ +---+ +---+
+ * | |-->| |-->| |
+ * +---+ +---+ +---+
+ * ^ |
+ * | |
+ * +---------------+
+ *
+ *
+ * +------+
+ * |reader| RING BUFFER
+ * |page |------------------v
+ * +------+ +---+ +---+ +---+
+ * ^ | |-->| |-->| |
+ * | +---+ +---+ +---+
+ * | |
+ * | |
+ * +------------------------------+
+ *
+ *
+ * +------+
+ * |buffer| RING BUFFER
+ * |page |------------------v
+ * +------+ +---+ +---+ +---+
+ * ^ | | | |-->| |
+ * | New +---+ +---+ +---+
+ * | Reader------^ |
+ * | page |
+ * +------------------------------+
+ *
+ *
+ * After we make this swap, the reader can hand this page off to the splice
+ * code and be done with it. It can even allocate a new page if it needs to
+ * and swap that into the ring buffer.
+ *
+ * We will be using cmpxchg soon to make all this lockless.
+ *
+ */
+
+/*
+ * A fast way to enable or disable all ring buffers is to
+ * call tracing_on or tracing_off. Turning off the ring buffers
+ * prevents all ring buffers from being recorded to.
+ * Turning this switch on, makes it OK to write to the
+ * ring buffer, if the ring buffer is enabled itself.
+ *
+ * There's three layers that must be on in order to write
+ * to the ring buffer.
+ *
+ * 1) This global flag must be set.
+ * 2) The ring buffer must be enabled for recording.
+ * 3) The per cpu buffer must be enabled for recording.
+ *
+ * In case of an anomaly, this global flag has a bit set that
+ * will permantly disable all ring buffers.
+ */
+
+/*
+ * Global flag to disable all recording to ring buffers
+ * This has two bits: ON, DISABLED
+ *
+ * ON DISABLED
+ * ---- ----------
+ * 0 0 : ring buffers are off
+ * 1 0 : ring buffers are on
+ * X 1 : ring buffers are permanently disabled
+ */
+
+enum {
+ RB_BUFFERS_ON_BIT = 0,
+ RB_BUFFERS_DISABLED_BIT = 1,
+};
+
+enum {
+ RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
+ RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
+};
+
+static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
+
+/* Used for individual buffers (after the counter) */
+#define RB_BUFFER_OFF (1 << 20)
+
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
+
+/**
+ * tracing_off_permanent - permanently disable ring buffers
+ *
+ * This function, once called, will disable all ring buffers
+ * permanently.
+ */
+void tracing_off_permanent(void)
+{
+ set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
+}
+
+#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
+#define RB_ALIGNMENT 4U
+#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+
+enum {
+ RB_LEN_TIME_EXTEND = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+#define skip_time_extend(event) \
+ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+ return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
+}
+
+static void rb_event_set_padding(struct ring_buffer_event *event)
+{
+ /* padding has a NULL time_delta */
+ event->type_len = RINGBUF_TYPE_PADDING;
+ event->time_delta = 0;
+}
+
+static unsigned
+rb_event_data_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ if (event->type_len)
+ length = event->type_len * RB_ALIGNMENT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+}
+
+/*
+ * Return the length of the given event. Will return
+ * the length of the time extend if the event is a
+ * time extend.
+ */
+static inline unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ if (rb_null_event(event))
+ /* undefined */
+ return -1;
+ return event->array[0] + RB_EVNT_HDR_SIZE;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return RB_LEN_TIME_EXTEND;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RINGBUF_TYPE_DATA:
+ return rb_event_data_length(event);
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/*
+ * Return total length of time extend and data,
+ * or just the event length for all other events.
+ */
+static inline unsigned
+rb_event_ts_length(struct ring_buffer_event *event)
+{
+ unsigned len = 0;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ /* time extends include the data event after it */
+ len = RB_LEN_TIME_EXTEND;
+ event = skip_time_extend(event);
+ }
+ return len + rb_event_length(event);
+}
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ *
+ * Returns the size of the data load of a data event.
+ * If the event is something other than a data event, it
+ * returns the size of the event itself. With the exception
+ * of a TIME EXTEND, where it still returns the size of the
+ * data load of the data event after it.
+ */
+unsigned ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
+ length = rb_event_length(event);
+ if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+ return length;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
+ length -= sizeof(event->array[0]);
+ return length;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_length);
+
+/* inline for ring buffer fast paths */
+static void *
+rb_event_data(struct ring_buffer_event *event)
+{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+ BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+ /* If length is in len field, then array[0] has the data */
+ if (event->type_len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+void *ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ return rb_event_data(event);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_data);
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/* Flag when events were overwritten */
+#define RB_MISSED_EVENTS (1 << 31)
+/* Missed count stored at end */
+#define RB_MISSED_STORED (1 << 30)
+
+struct buffer_data_page {
+ u64 time_stamp; /* page time stamp */
+ local_t commit; /* write committed index */
+ unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
+};
+
+/*
+ * Note, the buffer_page list must be first. The buffer pages
+ * are allocated in cache lines, which means that each buffer
+ * page will be at the beginning of a cache line, and thus
+ * the least significant bits will be zero. We use this to
+ * add flags in the list struct pointers, to make the ring buffer
+ * lockless.
+ */
+struct buffer_page {
+ struct list_head list; /* list of buffer pages */
+ local_t write; /* index for next write */
+ unsigned read; /* index for next read */
+ local_t entries; /* entries on this page */
+ unsigned long real_end; /* real end of data */
+ struct buffer_data_page *page; /* Actual data page */
+};
+
+/*
+ * The buffer page counters, write and entries, must be reset
+ * atomically when crossing page boundaries. To synchronize this
+ * update, two counters are inserted into the number. One is
+ * the actual counter for the write position or count on the page.
+ *
+ * The other is a counter of updaters. Before an update happens
+ * the update partition of the counter is incremented. This will
+ * allow the updater to update the counter atomically.
+ *
+ * The counter is 20 bits, and the state data is 12.
+ */
+#define RB_WRITE_MASK 0xfffff
+#define RB_WRITE_INTCNT (1 << 20)
+
+static void rb_init_page(struct buffer_data_page *bpage)
+{
+ local_set(&bpage->commit, 0);
+}
+
+/**
+ * ring_buffer_page_len - the size of data on the page.
+ * @page: The page to read
+ *
+ * Returns the amount of data on the page, including buffer page header.
+ */
+size_t ring_buffer_page_len(void *page)
+{
+ return local_read(&((struct buffer_data_page *)page)->commit)
+ + BUF_PAGE_HDR_SIZE;
+}
+
+/*
+ * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
+ * this issue out.
+ */
+static void free_buffer_page(struct buffer_page *bpage)
+{
+ free_page((unsigned long)bpage->page);
+ kfree(bpage);
+}
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int test_time_stamp(u64 delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
+
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+ struct buffer_data_page field;
+
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
+
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)is_signed_type(char));
+
+ return !trace_seq_has_overflowed(s);
+}
+
+struct rb_irq_work {
+ struct irq_work work;
+ wait_queue_head_t waiters;
+ wait_queue_head_t full_waiters;
+ bool waiters_pending;
+ bool full_waiters_pending;
+ bool wakeup_full;
+};
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ atomic_t record_disabled;
+ struct ring_buffer *buffer;
+ raw_spinlock_t reader_lock; /* serialize readers */
+ arch_spinlock_t lock;
+ struct lock_class_key lock_key;
+ unsigned int nr_pages;
+ struct list_head *pages;
+ struct buffer_page *head_page; /* read from head */
+ struct buffer_page *tail_page; /* write to tail */
+ struct buffer_page *commit_page; /* committed pages */
+ struct buffer_page *reader_page;
+ unsigned long lost_events;
+ unsigned long last_overrun;
+ local_t entries_bytes;
+ local_t entries;
+ local_t overrun;
+ local_t commit_overrun;
+ local_t dropped_events;
+ local_t committing;
+ local_t commits;
+ unsigned long read;
+ unsigned long read_bytes;
+ u64 write_stamp;
+ u64 read_stamp;
+ /* ring buffer pages to update, > 0 to add, < 0 to remove */
+ int nr_pages_to_update;
+ struct list_head new_pages; /* new pages to add */
+ struct work_struct update_pages_work;
+ struct completion update_done;
+
+ struct rb_irq_work irq_work;
+};
+
+struct ring_buffer {
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+ atomic_t resize_disabled;
+ cpumask_var_t cpumask;
+
+ struct lock_class_key *reader_lock_key;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ struct notifier_block cpu_notify;
+#endif
+ u64 (*clock)(void);
+
+ struct rb_irq_work irq_work;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ struct buffer_page *cache_reader_page;
+ unsigned long cache_read;
+ u64 read_stamp;
+};
+
+/*
+ * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
+ *
+ * Schedules a delayed work to wake up any task that is blocked on the
+ * ring buffer waiters queue.
+ */
+static void rb_wake_up_waiters(struct irq_work *work)
+{
+ struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+
+ wake_up_all(&rbwork->waiters);
+ if (rbwork->wakeup_full) {
+ rbwork->wakeup_full = false;
+ wake_up_all(&rbwork->full_waiters);
+ }
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
+{
+ struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
+ DEFINE_WAIT(wait);
+ struct rb_irq_work *work;
+ int ret = 0;
+
+ /*
+ * Depending on what the caller is waiting for, either any
+ * data in any cpu buffer, or a specific buffer, put the
+ * caller on the appropriate wait queue.
+ */
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ work = &buffer->irq_work;
+ /* Full only makes sense on per cpu reads */
+ full = false;
+ } else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -ENODEV;
+ cpu_buffer = buffer->buffers[cpu];
+ work = &cpu_buffer->irq_work;
+ }
+
+
+ while (true) {
+ if (full)
+ prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+ else
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ if (full)
+ work->full_waiters_pending = true;
+ else
+ work->waiters_pending = true;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
+ break;
+
+ if (cpu != RING_BUFFER_ALL_CPUS &&
+ !ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ break;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ if (!pagebusy)
+ break;
+ }
+
+ schedule();
+ }
+
+ if (full)
+ finish_wait(&work->full_waiters, &wait);
+ else
+ finish_wait(&work->waiters, &wait);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_poll_wait - poll on buffer input
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @filp: the file descriptor
+ * @poll_table: The poll descriptor
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ *
+ * Returns POLLIN | POLLRDNORM if data exists in the buffers,
+ * zero otherwise.
+ */
+int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+ struct file *filp, poll_table *poll_table)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *work;
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ work = &buffer->irq_work;
+ else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ work = &cpu_buffer->irq_work;
+ }
+
+ poll_wait(filp, &work->waiters, poll_table);
+ work->waiters_pending = true;
+ /*
+ * There's a tight race between setting the waiters_pending and
+ * checking if the ring buffer is empty. Once the waiters_pending bit
+ * is set, the next event will wake the task up, but we can get stuck
+ * if there's only a single event in.
+ *
+ * FIXME: Ideally, we need a memory barrier on the writer side as well,
+ * but adding a memory barrier to all events will cause too much of a
+ * performance hit in the fast path. We only need a memory barrier when
+ * the buffer goes from empty to having content. But as this race is
+ * extremely small, and it's not a problem if another event comes in, we
+ * will fix it later.
+ */
+ smp_mb();
+
+ if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+ (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+/* buffer may be either ring_buffer or ring_buffer_per_cpu */
+#define RB_WARN_ON(b, cond) \
+ ({ \
+ int _____ret = unlikely(cond); \
+ if (_____ret) { \
+ if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
+ struct ring_buffer_per_cpu *__b = \
+ (void *)b; \
+ atomic_inc(&__b->buffer->record_disabled); \
+ } else \
+ atomic_inc(&b->record_disabled); \
+ WARN_ON(1); \
+ } \
+ _____ret; \
+ })
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+static inline u64 rb_time_stamp(struct ring_buffer *buffer)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return buffer->clock() << DEBUG_SHIFT;
+}
+
+u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+ u64 time;
+
+ preempt_disable_notrace();
+ time = rb_time_stamp(buffer);
+ preempt_enable_no_resched_notrace();
+
+ return time;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
+
+void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
+ int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
+
+/*
+ * Making the ring buffer lockless makes things tricky.
+ * Although writes only happen on the CPU that they are on,
+ * and they only need to worry about interrupts. Reads can
+ * happen on any CPU.
+ *
+ * The reader page is always off the ring buffer, but when the
+ * reader finishes with a page, it needs to swap its page with
+ * a new one from the buffer. The reader needs to take from
+ * the head (writes go to the tail). But if a writer is in overwrite
+ * mode and wraps, it must push the head page forward.
+ *
+ * Here lies the problem.
+ *
+ * The reader must be careful to replace only the head page, and
+ * not another one. As described at the top of the file in the
+ * ASCII art, the reader sets its old page to point to the next
+ * page after head. It then sets the page after head to point to
+ * the old reader page. But if the writer moves the head page
+ * during this operation, the reader could end up with the tail.
+ *
+ * We use cmpxchg to help prevent this race. We also do something
+ * special with the page before head. We set the LSB to 1.
+ *
+ * When the writer must push the page forward, it will clear the
+ * bit that points to the head page, move the head, and then set
+ * the bit that points to the new head page.
+ *
+ * We also don't want an interrupt coming in and moving the head
+ * page on another writer. Thus we use the second LSB to catch
+ * that too. Thus:
+ *
+ * head->list->prev->next bit 1 bit 0
+ * ------- -------
+ * Normal page 0 0
+ * Points to head page 0 1
+ * New head page 1 0
+ *
+ * Note we can not trust the prev pointer of the head page, because:
+ *
+ * +----+ +-----+ +-----+
+ * | |------>| T |---X--->| N |
+ * | |<------| | | |
+ * +----+ +-----+ +-----+
+ * ^ ^ |
+ * | +-----+ | |
+ * +----------| R |----------+ |
+ * | |<-----------+
+ * +-----+
+ *
+ * Key: ---X--> HEAD flag set in pointer
+ * T Tail page
+ * R Reader page
+ * N Next page
+ *
+ * (see __rb_reserve_next() to see where this happens)
+ *
+ * What the above shows is that the reader just swapped out
+ * the reader page with a page in the buffer, but before it
+ * could make the new header point back to the new page added
+ * it was preempted by a writer. The writer moved forward onto
+ * the new page added by the reader and is about to move forward
+ * again.
+ *
+ * You can see, it is legitimate for the previous pointer of
+ * the head (or any page) not to point back to itself. But only
+ * temporarially.
+ */
+
+#define RB_PAGE_NORMAL 0UL
+#define RB_PAGE_HEAD 1UL
+#define RB_PAGE_UPDATE 2UL
+
+
+#define RB_FLAG_MASK 3UL
+
+/* PAGE_MOVED is not part of the mask */
+#define RB_PAGE_MOVED 4UL
+
+/*
+ * rb_list_head - remove any bit
+ */
+static struct list_head *rb_list_head(struct list_head *list)
+{
+ unsigned long val = (unsigned long)list;
+
+ return (struct list_head *)(val & ~RB_FLAG_MASK);
+}
+
+/*
+ * rb_is_head_page - test if the given page is the head page
+ *
+ * Because the reader may move the head_page pointer, we can
+ * not trust what the head page is (it may be pointing to
+ * the reader page). But if the next page is a header page,
+ * its flags will be non zero.
+ */
+static inline int
+rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *page, struct list_head *list)
+{
+ unsigned long val;
+
+ val = (unsigned long)list->next;
+
+ if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
+ return RB_PAGE_MOVED;
+
+ return val & RB_FLAG_MASK;
+}
+
+/*
+ * rb_is_reader_page
+ *
+ * The unique thing about the reader page, is that, if the
+ * writer is ever on it, the previous pointer never points
+ * back to the reader page.
+ */
+static int rb_is_reader_page(struct buffer_page *page)
+{
+ struct list_head *list = page->list.prev;
+
+ return rb_list_head(list->next) != &page->list;
+}
+
+/*
+ * rb_set_list_to_head - set a list_head to be pointing to head.
+ */
+static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ unsigned long *ptr;
+
+ ptr = (unsigned long *)&list->next;
+ *ptr |= RB_PAGE_HEAD;
+ *ptr &= ~RB_PAGE_UPDATE;
+}
+
+/*
+ * rb_head_page_activate - sets up head page
+ */
+static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+
+ head = cpu_buffer->head_page;
+ if (!head)
+ return;
+
+ /*
+ * Set the previous list pointer to have the HEAD flag.
+ */
+ rb_set_list_to_head(cpu_buffer, head->list.prev);
+}
+
+static void rb_list_head_clear(struct list_head *list)
+{
+ unsigned long *ptr = (unsigned long *)&list->next;
+
+ *ptr &= ~RB_FLAG_MASK;
+}
+
+/*
+ * rb_head_page_dactivate - clears head page ptr (for free list)
+ */
+static void
+rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *hd;
+
+ /* Go through the whole list and clear any pointers found. */
+ rb_list_head_clear(cpu_buffer->pages);
+
+ list_for_each(hd, cpu_buffer->pages)
+ rb_list_head_clear(hd);
+}
+
+static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag, int new_flag)
+{
+ struct list_head *list;
+ unsigned long val = (unsigned long)&head->list;
+ unsigned long ret;
+
+ list = &prev->list;
+
+ val &= ~RB_FLAG_MASK;
+
+ ret = cmpxchg((unsigned long *)&list->next,
+ val | old_flag, val | new_flag);
+
+ /* check if the reader took the page */
+ if ((ret & ~RB_FLAG_MASK) != val)
+ return RB_PAGE_MOVED;
+
+ return ret & RB_FLAG_MASK;
+}
+
+static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_UPDATE);
+}
+
+static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_HEAD);
+}
+
+static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *head,
+ struct buffer_page *prev,
+ int old_flag)
+{
+ return rb_head_page_set(cpu_buffer, head, prev,
+ old_flag, RB_PAGE_NORMAL);
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.next);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
+static struct buffer_page *
+rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *head;
+ struct buffer_page *page;
+ struct list_head *list;
+ int i;
+
+ if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
+ return NULL;
+
+ /* sanity check */
+ list = cpu_buffer->pages;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
+ return NULL;
+
+ page = head = cpu_buffer->head_page;
+ /*
+ * It is possible that the writer moves the header behind
+ * where we started, and we miss in one loop.
+ * A second loop should grab the header, but we'll do
+ * three loops just because I'm paranoid.
+ */
+ for (i = 0; i < 3; i++) {
+ do {
+ if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
+ cpu_buffer->head_page = page;
+ return page;
+ }
+ rb_inc_page(cpu_buffer, &page);
+ } while (page != head);
+ }
+
+ RB_WARN_ON(cpu_buffer, 1);
+
+ return NULL;
+}
+
+static int rb_head_page_replace(struct buffer_page *old,
+ struct buffer_page *new)
+{
+ unsigned long *ptr = (unsigned long *)&old->list.prev->next;
+ unsigned long val;
+ unsigned long ret;
+
+ val = *ptr & ~RB_FLAG_MASK;
+ val |= RB_PAGE_HEAD;
+
+ ret = cmpxchg(ptr, val, (unsigned long)&new->list);
+
+ return ret == val;
+}
+
+/*
+ * rb_tail_page_update - move the tail page forward
+ *
+ * Returns 1 if moved tail page, 0 if someone else did.
+ */
+static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *old_tail;
+ unsigned long old_entries;
+ unsigned long old_write;
+ int ret = 0;
+
+ /*
+ * The tail page now needs to be moved forward.
+ *
+ * We need to reset the tail page, but without messing
+ * with possible erasing of data brought in by interrupts
+ * that have moved the tail page and are currently on it.
+ *
+ * We add a counter to the write field to denote this.
+ */
+ old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
+ old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
+
+ /*
+ * Just make sure we have seen our old_write and synchronize
+ * with any interrupts that come in.
+ */
+ barrier();
+
+ /*
+ * If the tail page is still the same as what we think
+ * it is, then it is up to us to update the tail
+ * pointer.
+ */
+ if (tail_page == cpu_buffer->tail_page) {
+ /* Zero the write counter */
+ unsigned long val = old_write & ~RB_WRITE_MASK;
+ unsigned long eval = old_entries & ~RB_WRITE_MASK;
+
+ /*
+ * This will only succeed if an interrupt did
+ * not come in and change it. In which case, we
+ * do not want to modify it.
+ *
+ * We add (void) to let the compiler know that we do not care
+ * about the return value of these functions. We use the
+ * cmpxchg to only update if an interrupt did not already
+ * do it for us. If the cmpxchg fails, we don't care.
+ */
+ (void)local_cmpxchg(&next_page->write, old_write, val);
+ (void)local_cmpxchg(&next_page->entries, old_entries, eval);
+
+ /*
+ * No need to worry about races with clearing out the commit.
+ * it only can increment when a commit takes place. But that
+ * only happens in the outer most nested commit.
+ */
+ local_set(&next_page->page->commit, 0);
+
+ old_tail = cmpxchg(&cpu_buffer->tail_page,
+ tail_page, next_page);
+
+ if (old_tail == tail_page)
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ unsigned long val = (unsigned long)bpage;
+
+ if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * rb_check_list - make sure a pointer to a list has the last bits zero
+ */
+static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
+ return 1;
+ if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
+ return 1;
+ return 0;
+}
+
+/**
+ * rb_check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safety measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = cpu_buffer->pages;
+ struct buffer_page *bpage, *tmp;
+
+ /* Reset the head page if it exists */
+ if (cpu_buffer->head_page)
+ rb_set_head_page(cpu_buffer);
+
+ rb_head_page_deactivate(cpu_buffer);
+
+ if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
+ return -1;
+ if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+ return -1;
+
+ if (rb_check_list(cpu_buffer, head))
+ return -1;
+
+ list_for_each_entry_safe(bpage, tmp, head, list) {
+ if (RB_WARN_ON(cpu_buffer,
+ bpage->list.next->prev != &bpage->list))
+ return -1;
+ if (RB_WARN_ON(cpu_buffer,
+ bpage->list.prev->next != &bpage->list))
+ return -1;
+ if (rb_check_list(cpu_buffer, &bpage->list))
+ return -1;
+ }
+
+ rb_head_page_activate(cpu_buffer);
+
+ return 0;
+}
+
+static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
+{
+ int i;
+ struct buffer_page *bpage, *tmp;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+ /*
+ * __GFP_NORETRY flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is
+ * not destabilized.
+ */
+ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
+ GFP_KERNEL | __GFP_NORETRY,
+ cpu_to_node(cpu));
+ if (!bpage)
+ goto free_pages;
+
+ list_add(&bpage->list, pages);
+
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY, 0);
+ if (!page)
+ goto free_pages;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
+
+ return 0;
+
+free_pages:
+ list_for_each_entry_safe(bpage, tmp, pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+
+ return -ENOMEM;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ LIST_HEAD(pages);
+
+ WARN_ON(!nr_pages);
+
+ if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu))
+ return -ENOMEM;
+
+ /*
+ * The ring buffer page list is a circular list that does not
+ * start and end with a list head. All page list items point to
+ * other pages.
+ */
+ cpu_buffer->pages = pages.next;
+ list_del(&pages);
+
+ cpu_buffer->nr_pages = nr_pages;
+
+ rb_check_pages(cpu_buffer);
+
+ return 0;
+}
+
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage;
+ struct page *page;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ raw_spin_lock_init(&cpu_buffer->reader_lock);
+ lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
+ cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+ INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
+ init_completion(&cpu_buffer->update_done);
+ init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
+ init_waitqueue_head(&cpu_buffer->irq_work.waiters);
+ init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+
+ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!bpage)
+ goto fail_free_buffer;
+
+ rb_check_bpage(cpu_buffer, bpage);
+
+ cpu_buffer->reader_page = bpage;
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+ if (!page)
+ goto fail_free_reader;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+
+ INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ ret = rb_allocate_pages(cpu_buffer, nr_pages);
+ if (ret < 0)
+ goto fail_free_reader;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ rb_head_page_activate(cpu_buffer);
+
+ return cpu_buffer;
+
+ fail_free_reader:
+ free_buffer_page(cpu_buffer->reader_page);
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = cpu_buffer->pages;
+ struct buffer_page *bpage, *tmp;
+
+ free_buffer_page(cpu_buffer->reader_page);
+
+ rb_head_page_deactivate(cpu_buffer);
+
+ if (head) {
+ list_for_each_entry_safe(bpage, tmp, head, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ bpage = list_entry(head, struct buffer_page, list);
+ free_buffer_page(bpage);
+ }
+
+ kfree(cpu_buffer);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int rb_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu);
+#endif
+
+/**
+ * __ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
+{
+ struct ring_buffer *buffer;
+ int bsize;
+ int cpu, nr_pages;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
+ goto fail_free_buffer;
+
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ buffer->flags = flags;
+ buffer->clock = trace_clock_local;
+ buffer->reader_lock_key = key;
+
+ init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
+ init_waitqueue_head(&buffer->irq_work.waiters);
+
+ /* need at least two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ /*
+ * In case of non-hotplug cpu, if the ring-buffer is allocated
+ * in early initcall, it will not be notified of secondary cpus.
+ * In that off case, we need to allocate for all possible cpus.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_begin();
+ cpumask_copy(buffer->cpumask, cpu_online_mask);
+#else
+ cpumask_copy(buffer->cpumask, cpu_possible_mask);
+#endif
+ buffer->cpus = nr_cpu_ids;
+
+ bsize = sizeof(void *) * nr_cpu_ids;
+ buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer->buffers)
+ goto fail_free_cpumask;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+#ifdef CONFIG_HOTPLUG_CPU
+ buffer->cpu_notify.notifier_call = rb_cpu_notify;
+ buffer->cpu_notify.priority = 0;
+ __register_cpu_notifier(&buffer->cpu_notify);
+ cpu_notifier_register_done();
+#endif
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_buffer_cpu(buffer, cpu) {
+ if (buffer->buffers[cpu])
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+ kfree(buffer->buffers);
+
+ fail_free_cpumask:
+ free_cpumask_var(buffer->cpumask);
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
+
+ fail_free_buffer:
+ kfree(buffer);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&buffer->cpu_notify);
+#endif
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ cpu_notifier_register_done();
+#endif
+
+ kfree(buffer->buffers);
+ free_cpumask_var(buffer->cpumask);
+
+ kfree(buffer);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_free);
+
+void ring_buffer_set_clock(struct ring_buffer *buffer,
+ u64 (*clock)(void))
+{
+ buffer->clock = clock;
+}
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static inline unsigned long rb_page_entries(struct buffer_page *bpage)
+{
+ return local_read(&bpage->entries) & RB_WRITE_MASK;
+}
+
+static inline unsigned long rb_page_write(struct buffer_page *bpage)
+{
+ return local_read(&bpage->write) & RB_WRITE_MASK;
+}
+
+static int
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+{
+ struct list_head *tail_page, *to_remove, *next_page;
+ struct buffer_page *to_remove_page, *tmp_iter_page;
+ struct buffer_page *last_page, *first_page;
+ unsigned int nr_removed;
+ unsigned long head_bit;
+ int page_entries;
+
+ head_bit = 0;
+
+ raw_spin_lock_irq(&cpu_buffer->reader_lock);
+ atomic_inc(&cpu_buffer->record_disabled);
+ /*
+ * We don't race with the readers since we have acquired the reader
+ * lock. We also don't race with writers after disabling recording.
+ * This makes it easy to figure out the first and the last page to be
+ * removed from the list. We unlink all the pages in between including
+ * the first and last pages. This is done in a busy loop so that we
+ * lose the least number of traces.
+ * The pages are freed after we restart recording and unlock readers.
+ */
+ tail_page = &cpu_buffer->tail_page->list;
+
+ /*
+ * tail page might be on reader page, we remove the next page
+ * from the ring buffer
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+ tail_page = rb_list_head(tail_page->next);
+ to_remove = tail_page;
+
+ /* start of pages to remove */
+ first_page = list_entry(rb_list_head(to_remove->next),
+ struct buffer_page, list);
+
+ for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
+ to_remove = rb_list_head(to_remove)->next;
+ head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
+ }
+
+ next_page = rb_list_head(to_remove)->next;
+
+ /*
+ * Now we remove all pages between tail_page and next_page.
+ * Make sure that we have head_bit value preserved for the
+ * next page
+ */
+ tail_page->next = (struct list_head *)((unsigned long)next_page |
+ head_bit);
+ next_page = rb_list_head(next_page);
+ next_page->prev = tail_page;
+
+ /* make sure pages points to a valid page in the ring buffer */
+ cpu_buffer->pages = next_page;
+
+ /* update head page */
+ if (head_bit)
+ cpu_buffer->head_page = list_entry(next_page,
+ struct buffer_page, list);
+
+ /*
+ * change read pointer to make sure any read iterators reset
+ * themselves
+ */
+ cpu_buffer->read = 0;
+
+ /* pages are removed, resume tracing and then free the pages */
+ atomic_dec(&cpu_buffer->record_disabled);
+ raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+
+ RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
+
+ /* last buffer page to remove */
+ last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
+ list);
+ tmp_iter_page = first_page;
+
+ do {
+ to_remove_page = tmp_iter_page;
+ rb_inc_page(cpu_buffer, &tmp_iter_page);
+
+ /* update the counters */
+ page_entries = rb_page_entries(to_remove_page);
+ if (page_entries) {
+ /*
+ * If something was added to this page, it was full
+ * since it is not the tail page. So we deduct the
+ * bytes consumed in ring buffer from here.
+ * Increment overrun to account for the lost events.
+ */
+ local_add(page_entries, &cpu_buffer->overrun);
+ local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+ }
+
+ /*
+ * We have already removed references to this list item, just
+ * free up the buffer_page and its page
+ */
+ free_buffer_page(to_remove_page);
+ nr_removed--;
+
+ } while (to_remove_page != last_page);
+
+ RB_WARN_ON(cpu_buffer, nr_removed);
+
+ return nr_removed == 0;
+}
+
+static int
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *pages = &cpu_buffer->new_pages;
+ int retries, success;
+
+ raw_spin_lock_irq(&cpu_buffer->reader_lock);
+ /*
+ * We are holding the reader lock, so the reader page won't be swapped
+ * in the ring buffer. Now we are racing with the writer trying to
+ * move head page and the tail page.
+ * We are going to adapt the reader page update process where:
+ * 1. We first splice the start and end of list of new pages between
+ * the head page and its previous page.
+ * 2. We cmpxchg the prev_page->next to point from head page to the
+ * start of new pages list.
+ * 3. Finally, we update the head->prev to the end of new list.
+ *
+ * We will try this process 10 times, to make sure that we don't keep
+ * spinning.
+ */
+ retries = 10;
+ success = 0;
+ while (retries--) {
+ struct list_head *head_page, *prev_page, *r;
+ struct list_head *last_page, *first_page;
+ struct list_head *head_page_with_bit;
+
+ head_page = &rb_set_head_page(cpu_buffer)->list;
+ if (!head_page)
+ break;
+ prev_page = head_page->prev;
+
+ first_page = pages->next;
+ last_page = pages->prev;
+
+ head_page_with_bit = (struct list_head *)
+ ((unsigned long)head_page | RB_PAGE_HEAD);
+
+ last_page->next = head_page_with_bit;
+ first_page->prev = prev_page;
+
+ r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
+
+ if (r == head_page_with_bit) {
+ /*
+ * yay, we replaced the page pointer to our new list,
+ * now, we just have to update to head page's prev
+ * pointer to point to end of list
+ */
+ head_page->prev = last_page;
+ success = 1;
+ break;
+ }
+ }
+
+ if (success)
+ INIT_LIST_HEAD(pages);
+ /*
+ * If we weren't successful in adding in new pages, warn and stop
+ * tracing
+ */
+ RB_WARN_ON(cpu_buffer, !success);
+ raw_spin_unlock_irq(&cpu_buffer->reader_lock);
+
+ /* free pages if they weren't inserted */
+ if (!success) {
+ struct buffer_page *bpage, *tmp;
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+ return success;
+}
+
+static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ int success;
+
+ if (cpu_buffer->nr_pages_to_update > 0)
+ success = rb_insert_pages(cpu_buffer);
+ else
+ success = rb_remove_pages(cpu_buffer,
+ -cpu_buffer->nr_pages_to_update);
+
+ if (success)
+ cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
+}
+
+static void update_pages_handler(struct work_struct *work)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
+ struct ring_buffer_per_cpu, update_pages_work);
+ rb_update_pages(cpu_buffer);
+ complete(&cpu_buffer->update_done);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ * @cpu_id: the cpu buffer to resize
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns 0 on success and < 0 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
+ int cpu_id)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned nr_pages;
+ int cpu, err = 0;
+
+ /*
+ * Always succeed at resizing a non-existent buffer:
+ */
+ if (!buffer)
+ return size;
+
+ /* Make sure the requested buffer exists */
+ if (cpu_id != RING_BUFFER_ALL_CPUS &&
+ !cpumask_test_cpu(cpu_id, buffer->cpumask))
+ return size;
+
+ size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size *= BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+ /*
+ * Don't succeed if resizing is disabled, as a reader might be
+ * manipulating the ring buffer and is expecting a sane state while
+ * this is true.
+ */
+ if (atomic_read(&buffer->resize_disabled))
+ return -EBUSY;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ /* calculate the pages to update */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+ /*
+ * nothing more to do for removing pages or no update
+ */
+ if (cpu_buffer->nr_pages_to_update <= 0)
+ continue;
+ /*
+ * to add pages, make sure all new pages can be
+ * allocated without receiving ENOMEM
+ */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto out_err;
+ }
+ }
+
+ get_online_cpus();
+ /*
+ * Fire off all the required work handlers
+ * We can't schedule on offline CPUs, but it's not necessary
+ * since we can change their buffer sizes without any race.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu)) {
+ rb_update_pages(cpu_buffer);
+ cpu_buffer->nr_pages_to_update = 0;
+ } else {
+ schedule_work_on(cpu,
+ &cpu_buffer->update_pages_work);
+ }
+ }
+
+ /* wait for all the updates to complete */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ if (cpu_online(cpu))
+ wait_for_completion(&cpu_buffer->update_done);
+ cpu_buffer->nr_pages_to_update = 0;
+ }
+
+ put_online_cpus();
+ } else {
+ /* Make sure this CPU has been intitialized */
+ if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
+ goto out;
+
+ cpu_buffer = buffer->buffers[cpu_id];
+
+ if (nr_pages == cpu_buffer->nr_pages)
+ goto out;
+
+ cpu_buffer->nr_pages_to_update = nr_pages -
+ cpu_buffer->nr_pages;
+
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (cpu_buffer->nr_pages_to_update > 0 &&
+ __rb_allocate_pages(cpu_buffer->nr_pages_to_update,
+ &cpu_buffer->new_pages, cpu_id)) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ get_online_cpus();
+
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu_id))
+ rb_update_pages(cpu_buffer);
+ else {
+ schedule_work_on(cpu_id,
+ &cpu_buffer->update_pages_work);
+ wait_for_completion(&cpu_buffer->update_done);
+ }
+
+ cpu_buffer->nr_pages_to_update = 0;
+ put_online_cpus();
+ }
+
+ out:
+ /*
+ * The ring buffer resize can happen with the ring buffer
+ * enabled, so that the update disturbs the tracing as little
+ * as possible. But if the buffer is disabled, we do not need
+ * to worry about that, and we can take the time to verify
+ * that the buffer is not corrupt.
+ */
+ if (atomic_read(&buffer->record_disabled)) {
+ atomic_inc(&buffer->record_disabled);
+ /*
+ * Even though the buffer was disabled, we must make sure
+ * that it is truly disabled before calling rb_check_pages.
+ * There could have been a race between checking
+ * record_disable and incrementing it.
+ */
+ synchronize_sched();
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_check_pages(cpu_buffer);
+ }
+ atomic_dec(&buffer->record_disabled);
+ }
+
+ mutex_unlock(&buffer->mutex);
+ return size;
+
+ out_err:
+ for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_page *bpage, *tmp;
+
+ cpu_buffer = buffer->buffers[cpu];
+ cpu_buffer->nr_pages_to_update = 0;
+
+ if (list_empty(&cpu_buffer->new_pages))
+ continue;
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
+ list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+ mutex_unlock(&buffer->mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_resize);
+
+void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
+{
+ mutex_lock(&buffer->mutex);
+ if (val)
+ buffer->flags |= RB_FL_OVERWRITE;
+ else
+ buffer->flags &= ~RB_FL_OVERWRITE;
+ mutex_unlock(&buffer->mutex);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
+
+static inline void *
+__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
+{
+ return bpage->data + index;
+}
+
+static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
+{
+ return bpage->page->data + index;
+}
+
+static inline struct ring_buffer_event *
+rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return __rb_page_index(cpu_buffer->reader_page,
+ cpu_buffer->reader_page->read);
+}
+
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return __rb_page_index(iter->head_page, iter->head);
+}
+
+static inline unsigned rb_page_commit(struct buffer_page *bpage)
+{
+ return local_read(&bpage->page->commit);
+}
+
+/* Size is determined by what has been committed */
+static inline unsigned rb_page_size(struct buffer_page *bpage)
+{
+ return rb_page_commit(bpage);
+}
+
+static inline unsigned
+rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_commit(cpu_buffer->commit_page);
+}
+
+static inline unsigned
+rb_event_index(struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+
+ return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+}
+
+static inline int
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
+
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long max_count;
+
+ /*
+ * We only race with interrupts and NMIs on this CPU.
+ * If we own the commit event, then we can commit
+ * all others that interrupted us, since the interruptions
+ * are in stack format (they finish before they come
+ * back to us). This allows us to do a simple loop to
+ * assign the commit to the tail.
+ */
+ again:
+ max_count = cpu_buffer->nr_pages * 100;
+
+ while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+ return;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_is_reader_page(cpu_buffer->tail_page)))
+ return;
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ /* add barrier to keep gcc from optimizing too much */
+ barrier();
+ }
+ while (rb_commit_index(cpu_buffer) !=
+ rb_page_write(cpu_buffer->commit_page)) {
+
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ RB_WARN_ON(cpu_buffer,
+ local_read(&cpu_buffer->commit_page->page->commit) &
+ ~RB_WRITE_MASK);
+ barrier();
+ }
+
+ /* again, keep gcc from optimizing */
+ barrier();
+
+ /*
+ * If an interrupt came in just after the first while loop
+ * and pushed the tail page forward, we will be left with
+ * a dangling commit that will never go forward.
+ */
+ if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ goto again;
+}
+
+static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ cpu_buffer->reader_page->read = 0;
+}
+
+static void rb_inc_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ /*
+ * The iterator could be on the reader page (it starts there).
+ * But the head could have moved, since the reader was
+ * found. Check for this case and assign the iterator
+ * to the head page instead of next.
+ */
+ if (iter->head_page == cpu_buffer->reader_page)
+ iter->head_page = rb_set_head_page(cpu_buffer);
+ else
+ rb_inc_page(cpu_buffer, &iter->head_page);
+
+ iter->read_stamp = iter->head_page->page->time_stamp;
+ iter->head = 0;
+}
+
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+{
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+
+ /* Not the first event on the page? */
+ if (rb_event_index(event)) {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ } else {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
+
+ return skip_time_extend(event);
+}
+
+/**
+ * rb_update_event - update event type and data
+ * @event: the event to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static void
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, unsigned length,
+ int add_timestamp, u64 delta)
+{
+ /* Only a commit updates the timestamp */
+ if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+ delta = 0;
+
+ /*
+ * If we need to add a timestamp, then we
+ * add it to the start of the resevered space.
+ */
+ if (unlikely(add_timestamp)) {
+ event = rb_add_time_stamp(event, delta);
+ length -= RB_LEN_TIME_EXTEND;
+ delta = 0;
+ }
+
+ event->time_delta = delta;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ event->type_len = 0;
+ event->array[0] = length;
+ } else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
+
+/*
+ * rb_handle_head_page - writer hit the head page
+ *
+ * Returns: +1 to retry page
+ * 0 to continue
+ * -1 on error
+ */
+static int
+rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ struct buffer_page *next_page)
+{
+ struct buffer_page *new_head;
+ int entries;
+ int type;
+ int ret;
+
+ entries = rb_page_entries(next_page);
+
+ /*
+ * The hard part is here. We need to move the head
+ * forward, and protect against both readers on
+ * other CPUs and writers coming in via interrupts.
+ */
+ type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
+ RB_PAGE_HEAD);
+
+ /*
+ * type can be one of four:
+ * NORMAL - an interrupt already moved it for us
+ * HEAD - we are the first to get here.
+ * UPDATE - we are the interrupt interrupting
+ * a current move.
+ * MOVED - a reader on another CPU moved the next
+ * pointer to its reader page. Give up
+ * and try again.
+ */
+
+ switch (type) {
+ case RB_PAGE_HEAD:
+ /*
+ * We changed the head to UPDATE, thus
+ * it is our responsibility to update
+ * the counters.
+ */
+ local_add(entries, &cpu_buffer->overrun);
+ local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+
+ /*
+ * The entries will be zeroed out when we move the
+ * tail page.
+ */
+
+ /* still more to do */
+ break;
+
+ case RB_PAGE_UPDATE:
+ /*
+ * This is an interrupt that interrupt the
+ * previous update. Still more to do.
+ */
+ break;
+ case RB_PAGE_NORMAL:
+ /*
+ * An interrupt came in before the update
+ * and processed this for us.
+ * Nothing left to do.
+ */
+ return 1;
+ case RB_PAGE_MOVED:
+ /*
+ * The reader is on another CPU and just did
+ * a swap with our next_page.
+ * Try again.
+ */
+ return 1;
+ default:
+ RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
+ return -1;
+ }
+
+ /*
+ * Now that we are here, the old head pointer is
+ * set to UPDATE. This will keep the reader from
+ * swapping the head page with the reader page.
+ * The reader (on another CPU) will spin till
+ * we are finished.
+ *
+ * We just need to protect against interrupts
+ * doing the job. We will set the next pointer
+ * to HEAD. After that, we set the old pointer
+ * to NORMAL, but only if it was HEAD before.
+ * otherwise we are an interrupt, and only
+ * want the outer most commit to reset it.
+ */
+ new_head = next_page;
+ rb_inc_page(cpu_buffer, &new_head);
+
+ ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
+ RB_PAGE_NORMAL);
+
+ /*
+ * Valid returns are:
+ * HEAD - an interrupt came in and already set it.
+ * NORMAL - One of two things:
+ * 1) We really set it.
+ * 2) A bunch of interrupts came in and moved
+ * the page forward again.
+ */
+ switch (ret) {
+ case RB_PAGE_HEAD:
+ case RB_PAGE_NORMAL:
+ /* OK */
+ break;
+ default:
+ RB_WARN_ON(cpu_buffer, 1);
+ return -1;
+ }
+
+ /*
+ * It is possible that an interrupt came in,
+ * set the head up, then more interrupts came in
+ * and moved it again. When we get back here,
+ * the page would have been set to NORMAL but we
+ * just set it back to HEAD.
+ *
+ * How do you detect this? Well, if that happened
+ * the tail page would have moved.
+ */
+ if (ret == RB_PAGE_NORMAL) {
+ /*
+ * If the tail had moved passed next, then we need
+ * to reset the pointer.
+ */
+ if (cpu_buffer->tail_page != tail_page &&
+ cpu_buffer->tail_page != next_page)
+ rb_head_page_set_normal(cpu_buffer, new_head,
+ next_page,
+ RB_PAGE_HEAD);
+ }
+
+ /*
+ * If this was the outer most commit (the one that
+ * changed the original pointer from HEAD to UPDATE),
+ * then it is up to us to reset it to NORMAL.
+ */
+ if (type == RB_PAGE_HEAD) {
+ ret = rb_head_page_set_normal(cpu_buffer, next_page,
+ tail_page,
+ RB_PAGE_UPDATE);
+ if (RB_WARN_ON(cpu_buffer,
+ ret != RB_PAGE_UPDATE))
+ return -1;
+ }
+
+ return 0;
+}
+
+static unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
+
+ return length;
+}
+
+static inline void
+rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *tail_page,
+ unsigned long tail, unsigned long length)
+{
+ struct ring_buffer_event *event;
+
+ /*
+ * Only the event that crossed the page boundary
+ * must fill the old tail_page with padding.
+ */
+ if (tail >= BUF_PAGE_SIZE) {
+ /*
+ * If the page was filled, then we still need
+ * to update the real_end. Reset it to zero
+ * and the reader will ignore it.
+ */
+ if (tail == BUF_PAGE_SIZE)
+ tail_page->real_end = 0;
+
+ local_sub(length, &tail_page->write);
+ return;
+ }
+
+ event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
+
+ /* account for padding bytes */
+ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+
+ /*
+ * Save the original length to the meta data.
+ * This will be used by the reader to add lost event
+ * counter.
+ */
+ tail_page->real_end = tail;
+
+ /*
+ * If this event is bigger than the minimum size, then
+ * we need to be careful that we don't subtract the
+ * write counter enough to allow another writer to slip
+ * in on this page.
+ * We put in a discarded commit instead, to make sure
+ * that this space is not used again.
+ *
+ * If we are less than the minimum size, we don't need to
+ * worry about it.
+ */
+ if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ /* No room for any events */
+
+ /* Mark the rest of the page with padding */
+ rb_event_set_padding(event);
+
+ /* Set the write back to the previous setting */
+ local_sub(length, &tail_page->write);
+ return;
+ }
+
+ /* Put in a discarded event */
+ event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ event->time_delta = 1;
+
+ /* Set write to end of buffer */
+ length = (tail + length) - BUF_PAGE_SIZE;
+ local_sub(length, &tail_page->write);
+}
+
+/*
+ * This is the slow path, force gcc not to inline it.
+ */
+static noinline struct ring_buffer_event *
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length, unsigned long tail,
+ struct buffer_page *tail_page, u64 ts)
+{
+ struct buffer_page *commit_page = cpu_buffer->commit_page;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct buffer_page *next_page;
+ int ret;
+
+ next_page = tail_page;
+
+ rb_inc_page(cpu_buffer, &next_page);
+
+ /*
+ * If for some reason, we had an interrupt storm that made
+ * it all the way around the buffer, bail, and warn
+ * about it.
+ */
+ if (unlikely(next_page == commit_page)) {
+ local_inc(&cpu_buffer->commit_overrun);
+ goto out_reset;
+ }
+
+ /*
+ * This is where the fun begins!
+ *
+ * We are fighting against races between a reader that
+ * could be on another CPU trying to swap its reader
+ * page with the buffer head.
+ *
+ * We are also fighting against interrupts coming in and
+ * moving the head or tail on us as well.
+ *
+ * If the next page is the head page then we have filled
+ * the buffer, unless the commit page is still on the
+ * reader page.
+ */
+ if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
+
+ /*
+ * If the commit is not on the reader page, then
+ * move the header page.
+ */
+ if (!rb_is_reader_page(cpu_buffer->commit_page)) {
+ /*
+ * If we are not in overwrite mode,
+ * this is easy, just stop here.
+ */
+ if (!(buffer->flags & RB_FL_OVERWRITE)) {
+ local_inc(&cpu_buffer->dropped_events);
+ goto out_reset;
+ }
+
+ ret = rb_handle_head_page(cpu_buffer,
+ tail_page,
+ next_page);
+ if (ret < 0)
+ goto out_reset;
+ if (ret)
+ goto out_again;
+ } else {
+ /*
+ * We need to be careful here too. The
+ * commit page could still be on the reader
+ * page. We could have a small buffer, and
+ * have filled up the buffer with events
+ * from interrupts and such, and wrapped.
+ *
+ * Note, if the tail page is also the on the
+ * reader_page, we let it move out.
+ */
+ if (unlikely((cpu_buffer->commit_page !=
+ cpu_buffer->tail_page) &&
+ (cpu_buffer->commit_page ==
+ cpu_buffer->reader_page))) {
+ local_inc(&cpu_buffer->commit_overrun);
+ goto out_reset;
+ }
+ }
+ }
+
+ ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
+ if (ret) {
+ /*
+ * Nested commits always have zero deltas, so
+ * just reread the time stamp
+ */
+ ts = rb_time_stamp(buffer);
+ next_page->page->time_stamp = ts;
+ }
+
+ out_again:
+
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
+
+ /* fail and let the caller try again */
+ return ERR_PTR(-EAGAIN);
+
+ out_reset:
+ /* reset write */
+ rb_reset_tail(cpu_buffer, tail_page, tail, length);
+
+ return NULL;
+}
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length, u64 ts,
+ u64 delta, int add_timestamp)
+{
+ struct buffer_page *tail_page;
+ struct ring_buffer_event *event;
+ unsigned long tail, write;
+
+ /*
+ * If the time delta since the last event is too big to
+ * hold in the time field of the event, then we append a
+ * TIME EXTEND event ahead of the data event.
+ */
+ if (unlikely(add_timestamp))
+ length += RB_LEN_TIME_EXTEND;
+
+ tail_page = cpu_buffer->tail_page;
+ write = local_add_return(length, &tail_page->write);
+
+ /* set write to only the index of the write */
+ write &= RB_WRITE_MASK;
+ tail = write - length;
+
+ /*
+ * If this is the first commit on the page, then it has the same
+ * timestamp as the page itself.
+ */
+ if (!tail)
+ delta = 0;
+
+ /* See if we shot pass the end of this buffer page */
+ if (unlikely(write > BUF_PAGE_SIZE))
+ return rb_move_tail(cpu_buffer, length, tail,
+ tail_page, ts);
+
+ /* We reserved something on the buffer */
+
+ event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
+ rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
+
+ local_inc(&tail_page->entries);
+
+ /*
+ * If this is the first commit on the page, then update
+ * its timestamp.
+ */
+ if (!tail)
+ tail_page->page->time_stamp = ts;
+
+ /* account for these added bytes */
+ local_add(length, &cpu_buffer->entries_bytes);
+
+ return event;
+}
+
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long new_index, old_index;
+ struct buffer_page *bpage;
+ unsigned long index;
+ unsigned long addr;
+
+ new_index = rb_event_index(event);
+ old_index = new_index + rb_event_ts_length(event);
+ addr = (unsigned long)event;
+ addr &= PAGE_MASK;
+
+ bpage = cpu_buffer->tail_page;
+
+ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+ unsigned long write_mask =
+ local_read(&bpage->write) & ~RB_WRITE_MASK;
+ unsigned long event_length = rb_event_length(event);
+ /*
+ * This is on the tail page. It is possible that
+ * a write could come in and move the tail page
+ * and write to the next page. That is fine
+ * because we just shorten what is on this page.
+ */
+ old_index += write_mask;
+ new_index += write_mask;
+ index = local_cmpxchg(&bpage->write, old_index, new_index);
+ if (index == old_index) {
+ /* update counters */
+ local_sub(event_length, &cpu_buffer->entries_bytes);
+ return 1;
+ }
+ }
+
+ /* could not discard */
+ return 0;
+}
+
+static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_inc(&cpu_buffer->committing);
+ local_inc(&cpu_buffer->commits);
+}
+
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long commits;
+
+ if (RB_WARN_ON(cpu_buffer,
+ !local_read(&cpu_buffer->committing)))
+ return;
+
+ again:
+ commits = local_read(&cpu_buffer->commits);
+ /* synchronize with interrupts */
+ barrier();
+ if (local_read(&cpu_buffer->committing) == 1)
+ rb_set_commit_to_write(cpu_buffer);
+
+ local_dec(&cpu_buffer->committing);
+
+ /* synchronize with interrupts */
+ barrier();
+
+ /*
+ * Need to account for interrupts coming in between the
+ * updating of the commit page and the clearing of the
+ * committing counter.
+ */
+ if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
+ !local_read(&cpu_buffer->committing)) {
+ local_inc(&cpu_buffer->committing);
+ goto again;
+ }
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer *buffer,
+ struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int nr_loops = 0;
+ int add_timestamp;
+ u64 diff;
+
+ rb_start_commit(cpu_buffer);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ /*
+ * Due to the ability to swap a cpu buffer from a buffer
+ * it is possible it was swapped before we committed.
+ * (committing stops a swap). We check for it here and
+ * if it happened, we have to fail the write.
+ */
+ barrier();
+ if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ local_dec(&cpu_buffer->committing);
+ local_dec(&cpu_buffer->commits);
+ return NULL;
+ }
+#endif
+
+ length = rb_calculate_event_length(length);
+ again:
+ add_timestamp = 0;
+ delta = 0;
+
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
+ goto out_fail;
+
+ ts = rb_time_stamp(cpu_buffer->buffer);
+ diff = ts - cpu_buffer->write_stamp;
+
+ /* make sure this diff is calculated here */
+ barrier();
+
+ /* Did the write stamp get updated already? */
+ if (likely(ts >= cpu_buffer->write_stamp)) {
+ delta = diff;
+ if (unlikely(test_time_stamp(delta))) {
+ int local_clock_stable = 1;
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ local_clock_stable = sched_clock_stable();
+#endif
+ WARN_ONCE(delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+ (unsigned long long)delta,
+ (unsigned long long)ts,
+ (unsigned long long)cpu_buffer->write_stamp,
+ local_clock_stable ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n");
+ add_timestamp = 1;
+ }
+ }
+
+ event = __rb_reserve_next(cpu_buffer, length, ts,
+ delta, add_timestamp);
+ if (unlikely(PTR_ERR(event) == -EAGAIN))
+ goto again;
+
+ if (!event)
+ goto out_fail;
+
+ return event;
+
+ out_fail:
+ rb_end_commit(cpu_buffer);
+ return NULL;
+}
+
+#ifdef CONFIG_TRACING
+
+/*
+ * The lock and unlock are done within a preempt disable section.
+ * The current_context per_cpu variable can only be modified
+ * by the current task between lock and unlock. But it can
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
+ *
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
+ */
+static DEFINE_PER_CPU(unsigned int, current_context);
+
+static __always_inline int trace_recursive_lock(void)
+{
+ unsigned int val = __this_cpu_read(current_context);
+ int bit;
+
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
+
+ if (unlikely(val & (1 << bit)))
+ return 1;
+
+ val |= (1 << bit);
+ __this_cpu_write(current_context, val);
+
+ return 0;
+}
+
+static __always_inline void trace_recursive_unlock(void)
+{
+ __this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
+}
+
+#else
+
+#define trace_recursive_lock() (0)
+#define trace_recursive_unlock() do { } while (0)
+
+#endif
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (ring_buffer_flags != RB_BUFFERS_ON)
+ return NULL;
+
+ /* If we are tracing schedule, we don't want to recurse */
+ preempt_disable_notrace();
+
+ if (atomic_read(&buffer->record_disabled))
+ goto out_nocheck;
+
+ if (trace_recursive_lock())
+ goto out_nocheck;
+
+ cpu = raw_smp_processor_id();
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ goto out;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ if (length > BUF_MAX_DATA_SIZE)
+ goto out;
+
+ event = rb_reserve_next_event(buffer, cpu_buffer, length);
+ if (!event)
+ goto out;
+
+ return event;
+
+ out:
+ trace_recursive_unlock();
+
+ out_nocheck:
+ preempt_enable_notrace();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
+
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ /*
+ * The event first in the commit queue updates the
+ * time stamp.
+ */
+ if (rb_event_is_commit(cpu_buffer, event)) {
+ /*
+ * A commit event that is first on a page
+ * updates the write timestamp with the page stamp
+ */
+ if (!rb_event_index(event))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->write_stamp += delta;
+ } else
+ cpu_buffer->write_stamp += event->time_delta;
+ }
+}
+
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ local_inc(&cpu_buffer->entries);
+ rb_update_write_stamp(cpu_buffer, event);
+ rb_end_commit(cpu_buffer);
+}
+
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+ bool pagebusy;
+
+ if (buffer->irq_work.waiters_pending) {
+ buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&buffer->irq_work.work);
+ }
+
+ if (cpu_buffer->irq_work.waiters_pending) {
+ cpu_buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
+
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+
+ if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+ cpu_buffer->irq_work.wakeup_full = true;
+ cpu_buffer->irq_work.full_waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ rb_commit(cpu_buffer, event);
+
+ rb_wakeups(buffer, cpu_buffer);
+
+ trace_recursive_unlock();
+
+ preempt_enable_notrace();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
+ /* array[0] holds the actual length for the discarded event */
+ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ if (!event->time_delta)
+ event->time_delta = 1;
+}
+
+/*
+ * Decrement the entries to the page that an event is on.
+ * The event does not even need to exist, only the pointer
+ * to the page it is on. This may only be called before the commit
+ * takes place.
+ */
+static inline void
+rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ struct buffer_page *bpage = cpu_buffer->commit_page;
+ struct buffer_page *start;
+
+ addr &= PAGE_MASK;
+
+ /* Do the likely case first */
+ if (likely(bpage->page == (void *)addr)) {
+ local_dec(&bpage->entries);
+ return;
+ }
+
+ /*
+ * Because the commit page may be on the reader page we
+ * start with the next page and check the end loop there.
+ */
+ rb_inc_page(cpu_buffer, &bpage);
+ start = bpage;
+ do {
+ if (bpage->page == (void *)addr) {
+ local_dec(&bpage->entries);
+ return;
+ }
+ rb_inc_page(cpu_buffer, &bpage);
+ } while (bpage != start);
+
+ /* commit not part of this buffer?? */
+ RB_WARN_ON(cpu_buffer, 1);
+}
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * Sometimes an event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * This function only works if it is called before the the item has been
+ * committed. It will try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* The event is discarded regardless */
+ rb_event_discard(event);
+
+ cpu = smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+
+ /*
+ * This must only be called if the event has not been
+ * committed yet. Thus we can assume that preemption
+ * is still disabled.
+ */
+ RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
+
+ rb_decrement_entry(cpu_buffer, event);
+ if (rb_try_to_discard(cpu_buffer, event))
+ goto out;
+
+ /*
+ * The commit is still visible by the reader, so we
+ * must still update the timestamp.
+ */
+ rb_update_write_stamp(cpu_buffer, event);
+ out:
+ rb_end_commit(cpu_buffer);
+
+ trace_recursive_unlock();
+
+ preempt_enable_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ void *body;
+ int ret = -EBUSY;
+ int cpu;
+
+ if (ring_buffer_flags != RB_BUFFERS_ON)
+ return -EBUSY;
+
+ preempt_disable_notrace();
+
+ if (atomic_read(&buffer->record_disabled))
+ goto out;
+
+ cpu = raw_smp_processor_id();
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ goto out;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ if (length > BUF_MAX_DATA_SIZE)
+ goto out;
+
+ event = rb_reserve_next_event(buffer, cpu_buffer, length);
+ if (!event)
+ goto out;
+
+ body = rb_event_data(event);
+
+ memcpy(body, data, length);
+
+ rb_commit(cpu_buffer, event);
+
+ rb_wakeups(buffer, cpu_buffer);
+
+ ret = 0;
+ out:
+ preempt_enable_notrace();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_write);
+
+static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *reader = cpu_buffer->reader_page;
+ struct buffer_page *head = rb_set_head_page(cpu_buffer);
+ struct buffer_page *commit = cpu_buffer->commit_page;
+
+ /* In case of error, head will be NULL */
+ if (unlikely(!head))
+ return 1;
+
+ return reader->read == rb_page_commit(reader) &&
+ (commit == reader ||
+ (commit == head &&
+ head->read == rb_page_commit(commit)));
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truly enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
+
+/**
+ * ring_buffer_record_off - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * This is different than ring_buffer_record_disable() as
+ * it works like an on/off switch, where as the disable() version
+ * must be paired with a enable().
+ */
+void ring_buffer_record_off(struct ring_buffer *buffer)
+{
+ unsigned int rd;
+ unsigned int new_rd;
+
+ do {
+ rd = atomic_read(&buffer->record_disabled);
+ new_rd = rd | RB_BUFFER_OFF;
+ } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_off);
+
+/**
+ * ring_buffer_record_on - restart writes into the buffer
+ * @buffer: The ring buffer to start writes to.
+ *
+ * This enables all writes to the buffer that was disabled by
+ * ring_buffer_record_off().
+ *
+ * This is different than ring_buffer_record_enable() as
+ * it works like an on/off switch, where as the enable() version
+ * must be paired with a disable().
+ */
+void ring_buffer_record_on(struct ring_buffer *buffer)
+{
+ unsigned int rd;
+ unsigned int new_rd;
+
+ do {
+ rd = atomic_read(&buffer->record_disabled);
+ new_rd = rd & ~RB_BUFFER_OFF;
+ } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_on);
+
+/**
+ * ring_buffer_record_is_on - return true if the ring buffer can write
+ * @buffer: The ring buffer to see if write is enabled
+ *
+ * Returns true if the ring buffer is in a state that it accepts writes.
+ */
+int ring_buffer_record_is_on(struct ring_buffer *buffer)
+{
+ return !atomic_read(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truly enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
+
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return local_read(&cpu_buffer->entries) -
+ (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
+
+/**
+ * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
+{
+ unsigned long flags;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage;
+ u64 ret = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ /*
+ * if the tail is on reader_page, oldest time stamp is on the reader
+ * page
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+ bpage = cpu_buffer->reader_page;
+ else
+ bpage = rb_set_head_page(cpu_buffer);
+ if (bpage)
+ ret = bpage->page->time_stamp;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
+
+/**
+ * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ return rb_num_of_entries(cpu_buffer);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
+ * buffer wrapping around (only if RB_FL_OVERWRITE is on).
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = local_read(&cpu_buffer->overrun);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
+ * commits failing due to the buffer wrapping around while there are uncommitted
+ * events, such as during an interrupt storm.
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = local_read(&cpu_buffer->commit_overrun);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
+/**
+ * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
+ * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = local_read(&cpu_buffer->dropped_events);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
+
+/**
+ * ring_buffer_read_events_cpu - get the number of events successfully read
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of events read
+ */
+unsigned long
+ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->read;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += rb_num_of_entries(cpu_buffer);
+ }
+
+ return entries;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_entries);
+
+/**
+ * ring_buffer_overruns - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += local_read(&cpu_buffer->overrun);
+ }
+
+ return overruns;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_overruns);
+
+static void rb_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ /* Iterator usage is expected to have record disabled */
+ iter->head_page = cpu_buffer->reader_page;
+ iter->head = cpu_buffer->reader_page->read;
+
+ iter->cache_reader_page = iter->head_page;
+ iter->cache_read = cpu_buffer->read;
+
+ if (iter->head)
+ iter->read_stamp = cpu_buffer->read_stamp;
+ else
+ iter->read_stamp = iter->head_page->page->time_stamp;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ if (!iter)
+ return;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_iter_reset(iter);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->commit_page &&
+ iter->head == rb_commit_index(cpu_buffer);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ return;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RINGBUF_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ return;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RINGBUF_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *reader = NULL;
+ unsigned long overwrite;
+ unsigned long flags;
+ int nr_loops = 0;
+ int ret;
+
+ local_irq_save(flags);
+ arch_spin_lock(&cpu_buffer->lock);
+
+ again:
+ /*
+ * This should normally only loop twice. But because the
+ * start of the reader inserts an empty page, it causes
+ * a case where we will loop three times. There should be no
+ * reason to loop four times (that I know of).
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
+ reader = NULL;
+ goto out;
+ }
+
+ reader = cpu_buffer->reader_page;
+
+ /* If there's more to read, return this page */
+ if (cpu_buffer->reader_page->read < rb_page_size(reader))
+ goto out;
+
+ /* Never should we have an index greater than the size */
+ if (RB_WARN_ON(cpu_buffer,
+ cpu_buffer->reader_page->read > rb_page_size(reader)))
+ goto out;
+
+ /* check if we caught up to the tail */
+ reader = NULL;
+ if (cpu_buffer->commit_page == cpu_buffer->reader_page)
+ goto out;
+
+ /* Don't bother swapping if the ring buffer is empty */
+ if (rb_num_of_entries(cpu_buffer) == 0)
+ goto out;
+
+ /*
+ * Reset the reader page to size zero.
+ */
+ local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
+ cpu_buffer->reader_page->real_end = 0;
+
+ spin:
+ /*
+ * Splice the empty reader page into the list around the head.
+ */
+ reader = rb_set_head_page(cpu_buffer);
+ if (!reader)
+ goto out;
+ cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
+ cpu_buffer->reader_page->list.prev = reader->list.prev;
+
+ /*
+ * cpu_buffer->pages just needs to point to the buffer, it
+ * has no specific buffer page to point to. Lets move it out
+ * of our way so we don't accidentally swap it.
+ */
+ cpu_buffer->pages = reader->list.prev;
+
+ /* The reader page will be pointing to the new head */
+ rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
+
+ /*
+ * We want to make sure we read the overruns after we set up our
+ * pointers to the next object. The writer side does a
+ * cmpxchg to cross pages which acts as the mb on the writer
+ * side. Note, the reader will constantly fail the swap
+ * while the writer is updating the pointers, so this
+ * guarantees that the overwrite recorded here is the one we
+ * want to compare with the last_overrun.
+ */
+ smp_mb();
+ overwrite = local_read(&(cpu_buffer->overrun));
+
+ /*
+ * Here's the tricky part.
+ *
+ * We need to move the pointer past the header page.
+ * But we can only do that if a writer is not currently
+ * moving it. The page before the header page has the
+ * flag bit '1' set if it is pointing to the page we want.
+ * but if the writer is in the process of moving it
+ * than it will be '2' or already moved '0'.
+ */
+
+ ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
+
+ /*
+ * If we did not convert it, then we must try again.
+ */
+ if (!ret)
+ goto spin;
+
+ /*
+ * Yeah! We succeeded in replacing the page.
+ *
+ * Now make the new head point back to the reader page.
+ */
+ rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+
+ /* Finally update the reader page to the new head */
+ cpu_buffer->reader_page = reader;
+ rb_reset_reader_page(cpu_buffer);
+
+ if (overwrite != cpu_buffer->last_overrun) {
+ cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
+ cpu_buffer->last_overrun = overwrite;
+ }
+
+ goto again;
+
+ out:
+ arch_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return reader;
+}
+
+static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ struct buffer_page *reader;
+ unsigned length;
+
+ reader = rb_get_reader_page(cpu_buffer);
+
+ /* This function should not be called when buffer is empty */
+ if (RB_WARN_ON(cpu_buffer, !reader))
+ return;
+
+ event = rb_reader_event(cpu_buffer);
+
+ if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+ cpu_buffer->read++;
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ length = rb_event_length(event);
+ cpu_buffer->reader_page->read += length;
+}
+
+static void rb_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (iter->head >= rb_page_size(iter->head_page)) {
+ /* discarded commits can make the page empty */
+ if (iter->head_page == cpu_buffer->commit_page)
+ return;
+ rb_inc_iter(iter);
+ return;
+ }
+
+ event = rb_iter_head_event(iter);
+
+ length = rb_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ if (RB_WARN_ON(cpu_buffer,
+ (iter->head_page == cpu_buffer->commit_page) &&
+ (iter->head + length > rb_commit_index(cpu_buffer))))
+ return;
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ if ((iter->head >= rb_page_size(iter->head_page)) &&
+ (iter->head_page != cpu_buffer->commit_page))
+ rb_inc_iter(iter);
+}
+
+static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->lost_events;
+}
+
+static struct ring_buffer_event *
+rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
+ unsigned long *lost_events)
+{
+ struct ring_buffer_event *event;
+ struct buffer_page *reader;
+ int nr_loops = 0;
+
+ again:
+ /*
+ * We repeat when a time extend is encountered.
+ * Since the time extend is always attached to a data event,
+ * we should never loop more than once.
+ * (We never hit the following condition more than twice).
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
+ return NULL;
+
+ reader = rb_get_reader_page(cpu_buffer);
+ if (!reader)
+ return NULL;
+
+ event = rb_reader_event(cpu_buffer);
+
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ if (rb_null_event(event))
+ RB_WARN_ON(cpu_buffer, 1);
+ /*
+ * Because the writer could be discarding every
+ * event it creates (which would probably be bad)
+ * if we were to go back to "again" then we may never
+ * catch up, and will trigger the warn on, or lock
+ * the box. Return the padding, and we will release
+ * the current locks, and try again.
+ */
+ return event;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_reader(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_reader(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+ cpu_buffer->cpu, ts);
+ }
+ if (lost_events)
+ *lost_events = rb_lost_events(cpu_buffer);
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_peek);
+
+static struct ring_buffer_event *
+rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int nr_loops = 0;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ /*
+ * Check if someone performed a consuming read to
+ * the buffer. A consuming read invalidates the iterator
+ * and we need to reset the iterator in this case.
+ */
+ if (unlikely(iter->cache_read != cpu_buffer->read ||
+ iter->cache_reader_page != cpu_buffer->reader_page))
+ rb_iter_reset(iter);
+
+ again:
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ /*
+ * We repeat when a time extend is encountered or we hit
+ * the end of the page. Since the time extend is always attached
+ * to a data event, we should never loop more than three times.
+ * Once for going to next page, once on time extend, and
+ * finally once to get the event.
+ * (We never hit the following condition more than thrice).
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
+ return NULL;
+
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ if (iter->head >= rb_page_size(iter->head_page)) {
+ rb_inc_iter(iter);
+ goto again;
+ }
+
+ event = rb_iter_head_event(iter);
+
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ if (rb_null_event(event)) {
+ rb_inc_iter(iter);
+ goto again;
+ }
+ rb_advance_iter(iter);
+ return event;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(buffer,
+ cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
+
+static inline int rb_ok_to_lock(void)
+{
+ /*
+ * If an NMI die dumps out the content of the ring buffer
+ * do not grab locks. We also permanently disable the ring
+ * buffer too. A one time deal is all you get from reading
+ * the ring buffer from an NMI.
+ */
+ if (likely(!in_nmi()))
+ return 1;
+
+ tracing_off_permanent();
+ return 0;
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ * @lost_events: a variable to store if events were lost (may be NULL)
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_event *event;
+ unsigned long flags;
+ int dolock;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return NULL;
+
+ dolock = rb_ok_to_lock();
+ again:
+ local_irq_save(flags);
+ if (dolock)
+ raw_spin_lock(&cpu_buffer->reader_lock);
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ rb_advance_reader(cpu_buffer);
+ if (dolock)
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
+ return event;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long flags;
+
+ again:
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ event = rb_iter_peek(iter, ts);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
+ return event;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store if events were lost (may be NULL)
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event = NULL;
+ unsigned long flags;
+ int dolock;
+
+ dolock = rb_ok_to_lock();
+
+ again:
+ /* might be called in atomic */
+ preempt_disable();
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ goto out;
+
+ cpu_buffer = buffer->buffers[cpu];
+ local_irq_save(flags);
+ if (dolock)
+ raw_spin_lock(&cpu_buffer->reader_lock);
+
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+ if (event) {
+ cpu_buffer->lost_events = 0;
+ rb_advance_reader(cpu_buffer);
+ }
+
+ if (dolock)
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ out:
+ preempt_enable();
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
+ return event;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_consume);
+
+/**
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This performs the initial preparations necessary to iterate
+ * through the buffer. Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
+ *
+ * Disabling buffer recordng prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_read_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * This overall must be paired with ring_buffer_read_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return NULL;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized. Afterwards, read_buffer_read_start
+ * calls on those iterators are allowed.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * Must be paired with ring_buffer_read_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ if (!iter)
+ return;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ arch_spin_lock(&cpu_buffer->lock);
+ rb_iter_reset(iter);
+ arch_spin_unlock(&cpu_buffer->lock);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_start);
+
+/**
+ * ring_buffer_read_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ unsigned long flags;
+
+ /*
+ * Ring buffer is disabled from recording, here's a good place
+ * to check the integrity of the ring buffer.
+ * Must prevent readers from trying to read, as the check
+ * clears the HEAD page and readers require it.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_check_pages(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&cpu_buffer->buffer->resize_disabled);
+ kfree(iter);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ again:
+ event = rb_iter_peek(iter, ts);
+ if (!event)
+ goto out;
+
+ if (event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
+ rb_advance_iter(iter);
+ out:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return event;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read);
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
+{
+ /*
+ * Earlier, this method returned
+ * BUF_PAGE_SIZE * buffer->nr_pages
+ * Since the nr_pages field is now removed, we have converted this to
+ * return the per cpu buffer value.
+ */
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_size);
+
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ rb_head_page_deactivate(cpu_buffer);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ local_set(&cpu_buffer->head_page->write, 0);
+ local_set(&cpu_buffer->head_page->entries, 0);
+ local_set(&cpu_buffer->head_page->page->commit, 0);
+
+ cpu_buffer->head_page->read = 0;
+
+ cpu_buffer->tail_page = cpu_buffer->head_page;
+ cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ local_set(&cpu_buffer->reader_page->write, 0);
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
+ cpu_buffer->reader_page->read = 0;
+
+ local_set(&cpu_buffer->entries_bytes, 0);
+ local_set(&cpu_buffer->overrun, 0);
+ local_set(&cpu_buffer->commit_overrun, 0);
+ local_set(&cpu_buffer->dropped_events, 0);
+ local_set(&cpu_buffer->entries, 0);
+ local_set(&cpu_buffer->committing, 0);
+ local_set(&cpu_buffer->commits, 0);
+ cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
+
+ cpu_buffer->write_stamp = 0;
+ cpu_buffer->read_stamp = 0;
+
+ cpu_buffer->lost_events = 0;
+ cpu_buffer->last_overrun = 0;
+
+ rb_head_page_activate(cpu_buffer);
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return;
+
+ atomic_inc(&buffer->resize_disabled);
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_sched();
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+ goto out;
+
+ arch_spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ arch_spin_unlock(&cpu_buffer->lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ atomic_dec(&buffer->resize_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_reset_cpu(buffer, cpu);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset);
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int dolock;
+ int cpu;
+ int ret;
+
+ dolock = rb_ok_to_lock();
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ local_irq_save(flags);
+ if (dolock)
+ raw_spin_lock(&cpu_buffer->reader_lock);
+ ret = rb_per_cpu_empty(cpu_buffer);
+ if (dolock)
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ if (!ret)
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_empty);
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int dolock;
+ int ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 1;
+
+ dolock = rb_ok_to_lock();
+
+ cpu_buffer = buffer->buffers[cpu];
+ local_irq_save(flags);
+ if (dolock)
+ raw_spin_lock(&cpu_buffer->reader_lock);
+ ret = rb_per_cpu_empty(cpu_buffer);
+ if (dolock)
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+ int ret = -EINVAL;
+
+ if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
+ !cpumask_test_cpu(cpu, buffer_b->cpumask))
+ goto out;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
+ goto out;
+
+ ret = -EAGAIN;
+
+ if (ring_buffer_flags != RB_BUFFERS_ON)
+ goto out;
+
+ if (atomic_read(&buffer_a->record_disabled))
+ goto out;
+
+ if (atomic_read(&buffer_b->record_disabled))
+ goto out;
+
+ if (atomic_read(&cpu_buffer_a->record_disabled))
+ goto out;
+
+ if (atomic_read(&cpu_buffer_b->record_disabled))
+ goto out;
+
+ /*
+ * We can't do a synchronize_sched here because this
+ * function can be called in atomic context.
+ * Normally this will be called from the same CPU as cpu.
+ * If not it's up to the caller to protect this.
+ */
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ ret = -EBUSY;
+ if (local_read(&cpu_buffer_a->committing))
+ goto out_dec;
+ if (local_read(&cpu_buffer_b->committing))
+ goto out_dec;
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ ret = 0;
+
+out_dec:
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
+#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
+
+/**
+ * ring_buffer_alloc_read_page - allocate a page to read from buffer
+ * @buffer: the buffer to allocate for.
+ * @cpu: the cpu buffer to allocate.
+ *
+ * This function is used in conjunction with ring_buffer_read_page.
+ * When reading a full page from the ring buffer, these functions
+ * can be used to speed up the process. The calling function should
+ * allocate a few pages first with this function. Then when it
+ * needs to get pages from the ring buffer, it passes the result
+ * of this function into ring_buffer_read_page, which will swap
+ * the page that was allocated, with the read page of the buffer.
+ *
+ * Returns:
+ * The page allocated, or NULL on error.
+ */
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
+{
+ struct buffer_data_page *bpage;
+ struct page *page;
+
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY, 0);
+ if (!page)
+ return NULL;
+
+ bpage = page_address(page);
+
+ rb_init_page(bpage);
+
+ return bpage;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
+
+/**
+ * ring_buffer_free_read_page - free an allocated read page
+ * @buffer: the buffer the page was allocate for
+ * @data: the page to free
+ *
+ * Free a page allocated from ring_buffer_alloc_read_page.
+ */
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
+{
+ free_page((unsigned long)data);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
+
+/**
+ * ring_buffer_read_page - extract a page from the ring buffer
+ * @buffer: buffer to extract from
+ * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @len: amount to extract
+ * @cpu: the cpu of the buffer to extract
+ * @full: should the extraction only happen when the page is full.
+ *
+ * This function will pull out a page from the ring buffer and consume it.
+ * @data_page must be the address of the variable that was returned
+ * from ring_buffer_alloc_read_page. This is because the page might be used
+ * to swap with a page in the ring buffer.
+ *
+ * for example:
+ * rpage = ring_buffer_alloc_read_page(buffer, cpu);
+ * if (!rpage)
+ * return error;
+ * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * if (ret >= 0)
+ * process_page(rpage, ret);
+ *
+ * When @full is set, the function will not return true unless
+ * the writer is off the reader page.
+ *
+ * Note: it is up to the calling functions to handle sleeps and wakeups.
+ * The ring buffer can be used anywhere in the kernel and can not
+ * blindly call wake_up. The layer that uses the ring buffer must be
+ * responsible for that.
+ *
+ * Returns:
+ * >=0 if data has been transferred, returns the offset of consumed data.
+ * <0 if no data has been transferred.
+ */
+int ring_buffer_read_page(struct ring_buffer *buffer,
+ void **data_page, size_t len, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_event *event;
+ struct buffer_data_page *bpage;
+ struct buffer_page *reader;
+ unsigned long missed_events;
+ unsigned long flags;
+ unsigned int commit;
+ unsigned int read;
+ u64 save_timestamp;
+ int ret = -1;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ goto out;
+
+ /*
+ * If len is not big enough to hold the page header, then
+ * we can not copy anything.
+ */
+ if (len <= BUF_PAGE_HDR_SIZE)
+ goto out;
+
+ len -= BUF_PAGE_HDR_SIZE;
+
+ if (!data_page)
+ goto out;
+
+ bpage = *data_page;
+ if (!bpage)
+ goto out;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ reader = rb_get_reader_page(cpu_buffer);
+ if (!reader)
+ goto out_unlock;
+
+ event = rb_reader_event(cpu_buffer);
+
+ read = reader->read;
+ commit = rb_page_commit(reader);
+
+ /* Check if any events were dropped */
+ missed_events = cpu_buffer->lost_events;
+
+ /*
+ * If this page has been partially read or
+ * if len is not big enough to read the rest of the page or
+ * a writer is still on the page, then
+ * we must copy the data from the page to the buffer.
+ * Otherwise, we can simply swap the page with the one passed in.
+ */
+ if (read || (len < (commit - read)) ||
+ cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
+ unsigned int rpos = read;
+ unsigned int pos = 0;
+ unsigned int size;
+
+ if (full)
+ goto out_unlock;
+
+ if (len > (commit - read))
+ len = (commit - read);
+
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
+
+ if (len < size)
+ goto out_unlock;
+
+ /* save the current timestamp, since the user will need it */
+ save_timestamp = cpu_buffer->read_stamp;
+
+ /* Need to copy one event at a time */
+ do {
+ /* We need the size of one event, because
+ * rb_advance_reader only advances by one event,
+ * whereas rb_event_ts_length may include the size of
+ * one or two events.
+ * We have already ensured there's enough space if this
+ * is a time extend. */
+ size = rb_event_length(event);
+ memcpy(bpage->data + pos, rpage->data + rpos, size);
+
+ len -= size;
+
+ rb_advance_reader(cpu_buffer);
+ rpos = reader->read;
+ pos += size;
+
+ if (rpos >= commit)
+ break;
+
+ event = rb_reader_event(cpu_buffer);
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
+ } while (len >= size);
+
+ /* update bpage */
+ local_set(&bpage->commit, pos);
+ bpage->time_stamp = save_timestamp;
+
+ /* we copied everything to the beginning */
+ read = 0;
+ } else {
+ /* update the entry counter */
+ cpu_buffer->read += rb_page_entries(reader);
+ cpu_buffer->read_bytes += BUF_PAGE_SIZE;
+
+ /* swap the pages */
+ rb_init_page(bpage);
+ bpage = reader->page;
+ reader->page = *data_page;
+ local_set(&reader->write, 0);
+ local_set(&reader->entries, 0);
+ reader->read = 0;
+ *data_page = bpage;
+
+ /*
+ * Use the real_end for the data size,
+ * This gives us a chance to store the lost events
+ * on the page.
+ */
+ if (reader->real_end)
+ local_set(&bpage->commit, reader->real_end);
+ }
+ ret = read;
+
+ cpu_buffer->lost_events = 0;
+
+ commit = local_read(&bpage->commit);
+ /*
+ * Set a flag in the commit field if we lost events
+ */
+ if (missed_events) {
+ /* If there is room at the end of the page to save the
+ * missed events, then record it there.
+ */
+ if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ memcpy(&bpage->data[commit], &missed_events,
+ sizeof(missed_events));
+ local_add(RB_MISSED_STORED, &bpage->commit);
+ commit += sizeof(missed_events);
+ }
+ local_add(RB_MISSED_EVENTS, &bpage->commit);
+ }
+
+ /*
+ * This page may be off to user land. Zero it out here.
+ */
+ if (commit < BUF_PAGE_SIZE)
+ memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+
+ out_unlock:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int rb_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ struct ring_buffer *buffer =
+ container_of(self, struct ring_buffer, cpu_notify);
+ long cpu = (long)hcpu;
+ int cpu_i, nr_pages_same;
+ unsigned int nr_pages;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (cpumask_test_cpu(cpu, buffer->cpumask))
+ return NOTIFY_OK;
+
+ nr_pages = 0;
+ nr_pages_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_buffer_cpu(buffer, cpu_i) {
+ /* fill in the size from first enabled cpu */
+ if (nr_pages == 0)
+ nr_pages = buffer->buffers[cpu_i]->nr_pages;
+ if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
+ nr_pages_same = 0;
+ break;
+ }
+ }
+ /* allocate minimum pages, user can later expand it */
+ if (!nr_pages_same)
+ nr_pages = 2;
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+ if (!buffer->buffers[cpu]) {
+ WARN(1, "failed to allocate ring buffer on CPU %ld\n",
+ cpu);
+ return NOTIFY_OK;
+ }
+ smp_wmb();
+ cpumask_set_cpu(cpu, buffer->cpumask);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ /*
+ * Do nothing.
+ * If we were to free the buffer, then the user would
+ * lose any trace that was in the buffer.
+ */
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+#endif
+
+#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
+/*
+ * This is a basic integrity check of the ring buffer.
+ * Late in the boot cycle this test will run when configured in.
+ * It will kick off a thread per CPU that will go into a loop
+ * writing to the per cpu ring buffer various sizes of data.
+ * Some of the data will be large items, some small.
+ *
+ * Another thread is created that goes into a spin, sending out
+ * IPIs to the other CPUs to also write into the ring buffer.
+ * this is to test the nesting ability of the buffer.
+ *
+ * Basic stats are recorded and reported. If something in the
+ * ring buffer should happen that's not expected, a big warning
+ * is displayed and all ring buffers are disabled.
+ */
+static struct task_struct *rb_threads[NR_CPUS] __initdata;
+
+struct rb_test_data {
+ struct ring_buffer *buffer;
+ unsigned long events;
+ unsigned long bytes_written;
+ unsigned long bytes_alloc;
+ unsigned long bytes_dropped;
+ unsigned long events_nested;
+ unsigned long bytes_written_nested;
+ unsigned long bytes_alloc_nested;
+ unsigned long bytes_dropped_nested;
+ int min_size_nested;
+ int max_size_nested;
+ int max_size;
+ int min_size;
+ int cpu;
+ int cnt;
+};
+
+static struct rb_test_data rb_data[NR_CPUS] __initdata;
+
+/* 1 meg per cpu */
+#define RB_TEST_BUFFER_SIZE 1048576
+
+static char rb_string[] __initdata =
+ "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
+ "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
+ "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
+
+static bool rb_test_started __initdata;
+
+struct rb_item {
+ int size;
+ char str[];
+};
+
+static __init int rb_write_something(struct rb_test_data *data, bool nested)
+{
+ struct ring_buffer_event *event;
+ struct rb_item *item;
+ bool started;
+ int event_len;
+ int size;
+ int len;
+ int cnt;
+
+ /* Have nested writes different that what is written */
+ cnt = data->cnt + (nested ? 27 : 0);
+
+ /* Multiply cnt by ~e, to make some unique increment */
+ size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1);
+
+ len = size + sizeof(struct rb_item);
+
+ started = rb_test_started;
+ /* read rb_test_started before checking buffer enabled */
+ smp_rmb();
+
+ event = ring_buffer_lock_reserve(data->buffer, len);
+ if (!event) {
+ /* Ignore dropped events before test starts. */
+ if (started) {
+ if (nested)
+ data->bytes_dropped += len;
+ else
+ data->bytes_dropped_nested += len;
+ }
+ return len;
+ }
+
+ event_len = ring_buffer_event_length(event);
+
+ if (RB_WARN_ON(data->buffer, event_len < len))
+ goto out;
+
+ item = ring_buffer_event_data(event);
+ item->size = size;
+ memcpy(item->str, rb_string, size);
+
+ if (nested) {
+ data->bytes_alloc_nested += event_len;
+ data->bytes_written_nested += len;
+ data->events_nested++;
+ if (!data->min_size_nested || len < data->min_size_nested)
+ data->min_size_nested = len;
+ if (len > data->max_size_nested)
+ data->max_size_nested = len;
+ } else {
+ data->bytes_alloc += event_len;
+ data->bytes_written += len;
+ data->events++;
+ if (!data->min_size || len < data->min_size)
+ data->max_size = len;
+ if (len > data->max_size)
+ data->max_size = len;
+ }
+
+ out:
+ ring_buffer_unlock_commit(data->buffer, event);
+
+ return 0;
+}
+
+static __init int rb_test(void *arg)
+{
+ struct rb_test_data *data = arg;
+
+ while (!kthread_should_stop()) {
+ rb_write_something(data, false);
+ data->cnt++;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Now sleep between a min of 100-300us and a max of 1ms */
+ usleep_range(((data->cnt % 3) + 1) * 100, 1000);
+ }
+
+ return 0;
+}
+
+static __init void rb_ipi(void *ignore)
+{
+ struct rb_test_data *data;
+ int cpu = smp_processor_id();
+
+ data = &rb_data[cpu];
+ rb_write_something(data, true);
+}
+
+static __init int rb_hammer_test(void *arg)
+{
+ while (!kthread_should_stop()) {
+
+ /* Send an IPI to all cpus to write data! */
+ smp_call_function(rb_ipi, NULL, 1);
+ /* No sleep, but for non preempt, let others run */
+ schedule();
+ }
+
+ return 0;
+}
+
+static __init int test_ringbuffer(void)
+{
+ struct task_struct *rb_hammer;
+ struct ring_buffer *buffer;
+ int cpu;
+ int ret = 0;
+
+ pr_info("Running ring buffer tests...\n");
+
+ buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
+ if (WARN_ON(!buffer))
+ return 0;
+
+ /* Disable buffer so that threads can't write to it yet */
+ ring_buffer_record_off(buffer);
+
+ for_each_online_cpu(cpu) {
+ rb_data[cpu].buffer = buffer;
+ rb_data[cpu].cpu = cpu;
+ rb_data[cpu].cnt = cpu;
+ rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
+ "rbtester/%d", cpu);
+ if (WARN_ON(!rb_threads[cpu])) {
+ pr_cont("FAILED\n");
+ ret = -1;
+ goto out_free;
+ }
+
+ kthread_bind(rb_threads[cpu], cpu);
+ wake_up_process(rb_threads[cpu]);
+ }
+
+ /* Now create the rb hammer! */
+ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
+ if (WARN_ON(!rb_hammer)) {
+ pr_cont("FAILED\n");
+ ret = -1;
+ goto out_free;
+ }
+
+ ring_buffer_record_on(buffer);
+ /*
+ * Show buffer is enabled before setting rb_test_started.
+ * Yes there's a small race window where events could be
+ * dropped and the thread wont catch it. But when a ring
+ * buffer gets enabled, there will always be some kind of
+ * delay before other CPUs see it. Thus, we don't care about
+ * those dropped events. We care about events dropped after
+ * the threads see that the buffer is active.
+ */
+ smp_wmb();
+ rb_test_started = true;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Just run for 10 seconds */;
+ schedule_timeout(10 * HZ);
+
+ kthread_stop(rb_hammer);
+
+ out_free:
+ for_each_online_cpu(cpu) {
+ if (!rb_threads[cpu])
+ break;
+ kthread_stop(rb_threads[cpu]);
+ }
+ if (ret) {
+ ring_buffer_free(buffer);
+ return ret;
+ }
+
+ /* Report! */
+ pr_info("finished\n");
+ for_each_online_cpu(cpu) {
+ struct ring_buffer_event *event;
+ struct rb_test_data *data = &rb_data[cpu];
+ struct rb_item *item;
+ unsigned long total_events;
+ unsigned long total_dropped;
+ unsigned long total_written;
+ unsigned long total_alloc;
+ unsigned long total_read = 0;
+ unsigned long total_size = 0;
+ unsigned long total_len = 0;
+ unsigned long total_lost = 0;
+ unsigned long lost;
+ int big_event_size;
+ int small_event_size;
+
+ ret = -1;
+
+ total_events = data->events + data->events_nested;
+ total_written = data->bytes_written + data->bytes_written_nested;
+ total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
+ total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
+
+ big_event_size = data->max_size + data->max_size_nested;
+ small_event_size = data->min_size + data->min_size_nested;
+
+ pr_info("CPU %d:\n", cpu);
+ pr_info(" events: %ld\n", total_events);
+ pr_info(" dropped bytes: %ld\n", total_dropped);
+ pr_info(" alloced bytes: %ld\n", total_alloc);
+ pr_info(" written bytes: %ld\n", total_written);
+ pr_info(" biggest event: %d\n", big_event_size);
+ pr_info(" smallest event: %d\n", small_event_size);
+
+ if (RB_WARN_ON(buffer, total_dropped))
+ break;
+
+ ret = 0;
+
+ while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
+ total_lost += lost;
+ item = ring_buffer_event_data(event);
+ total_len += ring_buffer_event_length(event);
+ total_size += item->size + sizeof(struct rb_item);
+ if (memcmp(&item->str[0], rb_string, item->size) != 0) {
+ pr_info("FAILED!\n");
+ pr_info("buffer had: %.*s\n", item->size, item->str);
+ pr_info("expected: %.*s\n", item->size, rb_string);
+ RB_WARN_ON(buffer, 1);
+ ret = -1;
+ break;
+ }
+ total_read++;
+ }
+ if (ret)
+ break;
+
+ ret = -1;
+
+ pr_info(" read events: %ld\n", total_read);
+ pr_info(" lost events: %ld\n", total_lost);
+ pr_info(" total events: %ld\n", total_lost + total_read);
+ pr_info(" recorded len bytes: %ld\n", total_len);
+ pr_info(" recorded size bytes: %ld\n", total_size);
+ if (total_lost)
+ pr_info(" With dropped events, record len and size may not match\n"
+ " alloced and written from above\n");
+ if (!total_lost) {
+ if (RB_WARN_ON(buffer, total_len != total_alloc ||
+ total_size != total_written))
+ break;
+ }
+ if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
+ break;
+
+ ret = 0;
+ }
+ if (!ret)
+ pr_info("Ring buffer PASSED!\n");
+
+ ring_buffer_free(buffer);
+ return 0;
+}
+
+late_initcall(test_ringbuffer);
+#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 000000000..1b28df2d9
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,483 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/ktime.h>
+#include <asm/local.h>
+
+struct rb_page {
+ u64 ts;
+ local_t commit;
+ char data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME 10ULL
+#define SLEEP_TIME 10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int write_iteration = 50;
+module_param(write_iteration, uint, 0644);
+MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
+
+static int producer_nice = MAX_NICE;
+static int consumer_nice = MAX_NICE;
+
+static int producer_fifo = -1;
+static int consumer_fifo = -1;
+
+module_param(producer_nice, uint, 0644);
+MODULE_PARM_DESC(producer_nice, "nice prio for producer");
+
+module_param(consumer_nice, uint, 0644);
+MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
+
+module_param(producer_fifo, uint, 0644);
+MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+
+module_param(consumer_fifo, uint, 0644);
+MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
+
+static int read_events;
+
+static int kill_test;
+
+#define KILL_TEST() \
+ do { \
+ if (!kill_test) { \
+ kill_test = 1; \
+ WARN_ON(1); \
+ } \
+ } while (0)
+
+enum event_status {
+ EVENT_FOUND,
+ EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+ struct ring_buffer_event *event;
+ int *entry;
+ u64 ts;
+
+ event = ring_buffer_consume(buffer, cpu, &ts, NULL);
+ if (!event)
+ return EVENT_DROPPED;
+
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ return EVENT_DROPPED;
+ }
+
+ read++;
+ return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+ struct ring_buffer_event *event;
+ struct rb_page *rpage;
+ unsigned long commit;
+ void *bpage;
+ int *entry;
+ int ret;
+ int inc;
+ int i;
+
+ bpage = ring_buffer_alloc_read_page(buffer, cpu);
+ if (!bpage)
+ return EVENT_DROPPED;
+
+ ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ if (ret >= 0) {
+ rpage = bpage;
+ /* The commit may have missed event flags set, clear them */
+ commit = local_read(&rpage->commit) & 0xfffff;
+ for (i = 0; i < commit && !kill_test; i += inc) {
+
+ if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ KILL_TEST();
+ break;
+ }
+
+ inc = -1;
+ event = (void *)&rpage->data[i];
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ /* failed writes may be discarded events */
+ if (!event->time_delta)
+ KILL_TEST();
+ inc = event->array[0] + 4;
+ break;
+ case RINGBUF_TYPE_TIME_EXTEND:
+ inc = 8;
+ break;
+ case 0:
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ break;
+ }
+ read++;
+ if (!event->array[0]) {
+ KILL_TEST();
+ break;
+ }
+ inc = event->array[0] + 4;
+ break;
+ default:
+ entry = ring_buffer_event_data(event);
+ if (*entry != cpu) {
+ KILL_TEST();
+ break;
+ }
+ read++;
+ inc = ((event->type_len + 1) * 4);
+ }
+ if (kill_test)
+ break;
+
+ if (inc <= 0) {
+ KILL_TEST();
+ break;
+ }
+ }
+ }
+ ring_buffer_free_read_page(buffer, bpage);
+
+ if (ret < 0)
+ return EVENT_DROPPED;
+ return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+ /* toggle between reading pages and events */
+ read_events ^= 1;
+
+ read = 0;
+ while (!reader_finish && !kill_test) {
+ int found;
+
+ do {
+ int cpu;
+
+ found = 0;
+ for_each_online_cpu(cpu) {
+ enum event_status stat;
+
+ if (read_events)
+ stat = read_event(cpu);
+ else
+ stat = read_page(cpu);
+
+ if (kill_test)
+ break;
+ if (stat == EVENT_FOUND)
+ found = 1;
+ }
+ } while (found && !kill_test);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (reader_finish)
+ break;
+
+ schedule();
+ }
+ reader_finish = 0;
+ complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+ ktime_t start_time, end_time, timeout;
+ unsigned long long time;
+ unsigned long long entries;
+ unsigned long long overruns;
+ unsigned long missed = 0;
+ unsigned long hit = 0;
+ unsigned long avg;
+ int cnt = 0;
+
+ /*
+ * Hammer the buffer for 10 secs (this may
+ * make the system stall)
+ */
+ trace_printk("Starting ring buffer hammer\n");
+ start_time = ktime_get();
+ timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC);
+ do {
+ struct ring_buffer_event *event;
+ int *entry;
+ int i;
+
+ for (i = 0; i < write_iteration; i++) {
+ event = ring_buffer_lock_reserve(buffer, 10);
+ if (!event) {
+ missed++;
+ } else {
+ hit++;
+ entry = ring_buffer_event_data(event);
+ *entry = smp_processor_id();
+ ring_buffer_unlock_commit(buffer, event);
+ }
+ }
+ end_time = ktime_get();
+
+ cnt++;
+ if (consumer && !(cnt % wakeup_interval))
+ wake_up_process(consumer);
+
+#ifndef CONFIG_PREEMPT
+ /*
+ * If we are a non preempt kernel, the 10 second run will
+ * stop everything while it runs. Instead, we will call
+ * cond_resched and also add any time that was lost by a
+ * rescedule.
+ *
+ * Do a cond resched at the same frequency we would wake up
+ * the reader.
+ */
+ if (cnt % wakeup_interval)
+ cond_resched();
+#endif
+
+ } while (ktime_before(end_time, timeout) && !kill_test);
+ trace_printk("End ring buffer hammer\n");
+
+ if (consumer) {
+ /* Init both completions here to avoid races */
+ init_completion(&read_start);
+ init_completion(&read_done);
+ /* the completions must be visible before the finish var */
+ smp_wmb();
+ reader_finish = 1;
+ /* finish var visible before waking up the consumer */
+ smp_wmb();
+ wake_up_process(consumer);
+ wait_for_completion(&read_done);
+ }
+
+ time = ktime_us_delta(end_time, start_time);
+
+ entries = ring_buffer_entries(buffer);
+ overruns = ring_buffer_overruns(buffer);
+
+ if (kill_test)
+ trace_printk("ERROR!\n");
+
+ if (!disable_reader) {
+ if (consumer_fifo < 0)
+ trace_printk("Running Consumer at nice: %d\n",
+ consumer_nice);
+ else
+ trace_printk("Running Consumer at SCHED_FIFO %d\n",
+ consumer_fifo);
+ }
+ if (producer_fifo < 0)
+ trace_printk("Running Producer at nice: %d\n",
+ producer_nice);
+ else
+ trace_printk("Running Producer at SCHED_FIFO %d\n",
+ producer_fifo);
+
+ /* Let the user know that the test is running at low priority */
+ if (producer_fifo < 0 && consumer_fifo < 0 &&
+ producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
+ trace_printk("WARNING!!! This test is running at lowest priority.\n");
+
+ trace_printk("Time: %lld (usecs)\n", time);
+ trace_printk("Overruns: %lld\n", overruns);
+ if (disable_reader)
+ trace_printk("Read: (reader disabled)\n");
+ else
+ trace_printk("Read: %ld (by %s)\n", read,
+ read_events ? "events" : "pages");
+ trace_printk("Entries: %lld\n", entries);
+ trace_printk("Total: %lld\n", entries + overruns + read);
+ trace_printk("Missed: %ld\n", missed);
+ trace_printk("Hit: %ld\n", hit);
+
+ /* Convert time from usecs to millisecs */
+ do_div(time, USEC_PER_MSEC);
+ if (time)
+ hit /= (long)time;
+ else
+ trace_printk("TIME IS ZERO??\n");
+
+ trace_printk("Entries per millisec: %ld\n", hit);
+
+ if (hit) {
+ /* Calculate the average time in nanosecs */
+ avg = NSEC_PER_MSEC / hit;
+ trace_printk("%ld ns per entry\n", avg);
+ }
+
+ if (missed) {
+ if (time)
+ missed /= (long)time;
+
+ trace_printk("Total iterations per millisec: %ld\n",
+ hit + missed);
+
+ /* it is possible that hit + missed will overflow and be zero */
+ if (!(hit + missed)) {
+ trace_printk("hit + missed overflowed and totalled zero!\n");
+ hit--; /* make it non zero */
+ }
+
+ /* Caculate the average time in nanosecs */
+ avg = NSEC_PER_MSEC / (hit + missed);
+ trace_printk("%ld ns per entry\n", avg);
+ }
+}
+
+static void wait_to_die(void)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+ while (!kthread_should_stop() && !kill_test) {
+ complete(&read_start);
+
+ ring_buffer_consumer();
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop() || kill_test)
+ break;
+
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ if (kill_test)
+ wait_to_die();
+
+ return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+ init_completion(&read_start);
+
+ while (!kthread_should_stop() && !kill_test) {
+ ring_buffer_reset(buffer);
+
+ if (consumer) {
+ smp_wmb();
+ wake_up_process(consumer);
+ wait_for_completion(&read_start);
+ }
+
+ ring_buffer_producer();
+
+ trace_printk("Sleeping for 10 secs\n");
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ * SLEEP_TIME);
+ }
+
+ if (kill_test)
+ wait_to_die();
+
+ return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+ int ret;
+
+ /* make a one meg buffer in overwite mode */
+ buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+ if (!buffer)
+ return -ENOMEM;
+
+ if (!disable_reader) {
+ consumer = kthread_create(ring_buffer_consumer_thread,
+ NULL, "rb_consumer");
+ ret = PTR_ERR(consumer);
+ if (IS_ERR(consumer))
+ goto out_fail;
+ }
+
+ producer = kthread_run(ring_buffer_producer_thread,
+ NULL, "rb_producer");
+ ret = PTR_ERR(producer);
+
+ if (IS_ERR(producer))
+ goto out_kill;
+
+ /*
+ * Run them as low-prio background tasks by default:
+ */
+ if (!disable_reader) {
+ if (consumer_fifo >= 0) {
+ struct sched_param param = {
+ .sched_priority = consumer_fifo
+ };
+ sched_setscheduler(consumer, SCHED_FIFO, &param);
+ } else
+ set_user_nice(consumer, consumer_nice);
+ }
+
+ if (producer_fifo >= 0) {
+ struct sched_param param = {
+ .sched_priority = producer_fifo
+ };
+ sched_setscheduler(producer, SCHED_FIFO, &param);
+ } else
+ set_user_nice(producer, producer_nice);
+
+ return 0;
+
+ out_kill:
+ if (consumer)
+ kthread_stop(consumer);
+
+ out_fail:
+ ring_buffer_free(buffer);
+ return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+ kthread_stop(producer);
+ if (consumer)
+ kthread_stop(consumer);
+ ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 000000000..4b3b5eaf9
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/usb.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rpm.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 000000000..05330494a
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,7205 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ * Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 Nadia Yvette Chambers
+ */
+#include <linux/ring_buffer.h>
+#include <generated/utsrelease.h>
+#include <linux/stacktrace.h>
+#include <linux/writeback.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/irqflags.h>
+#include <linux/debugfs.h>
+#include <linux/tracefs.h>
+#include <linux/pagemap.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/splice.h>
+#include <linux/kdebug.h>
+#include <linux/string.h>
+#include <linux/mount.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/fs.h>
+#include <linux/sched/rt.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+/*
+ * On boot up, the ring buffer is set to the minimum size, so that
+ * we do not waste memory on systems that are not using tracing.
+ */
+bool ring_buffer_expanded;
+
+/*
+ * We need to change this state when a selftest is running.
+ * A selftest will lurk into the ring-buffer to count the
+ * entries inserted during the selftest although some concurrent
+ * insertions into the ring-buffer such as trace_printk could occurred
+ * at the same time, giving false positive or negative results.
+ */
+static bool __read_mostly tracing_selftest_running;
+
+/*
+ * If a tracer is running, we do not want to run SELFTEST.
+ */
+bool __read_mostly tracing_selftest_disabled;
+
+/* Pipe tracepoints to printk */
+struct trace_iterator *tracepoint_print_iter;
+int tracepoint_printk;
+
+/* For tracers that don't implement custom flags */
+static struct tracer_opt dummy_tracer_opt[] = {
+ { }
+};
+
+static struct tracer_flags dummy_tracer_flags = {
+ .val = 0,
+ .opts = dummy_tracer_opt
+};
+
+static int
+dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ return 0;
+}
+
+/*
+ * To prevent the comm cache from being overwritten when no
+ * tracing is active, only save the comm when a trace event
+ * occurred.
+ */
+static DEFINE_PER_CPU(bool, trace_cmdline_save);
+
+/*
+ * Kill all tracing for good (never come back).
+ * It is initialized to 1 but will turn to zero if the initialization
+ * of the tracer is successful. But that is the only place that sets
+ * this back to zero.
+ */
+static int tracing_disabled = 1;
+
+DEFINE_PER_CPU(int, ftrace_cpu_disabled);
+
+cpumask_var_t __read_mostly tracing_buffer_mask;
+
+/*
+ * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+ *
+ * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+ * is set, then ftrace_dump is called. This will output the contents
+ * of the ftrace buffers to the console. This is very useful for
+ * capturing traces that lead to crashes and outputing it to a
+ * serial console.
+ *
+ * It is default off, but you can enable it with either specifying
+ * "ftrace_dump_on_oops" in the kernel command line, or setting
+ * /proc/sys/kernel/ftrace_dump_on_oops
+ * Set 1 if you want to dump buffers of all CPUs
+ * Set 2 if you want to dump the buffer of the CPU that triggered oops
+ */
+
+enum ftrace_dump_mode ftrace_dump_on_oops;
+
+/* When set, tracing will stop when a WARN*() is hit */
+int __disable_trace_on_warning;
+
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+/* Map of enums to their values, for "enum_map" file */
+struct trace_enum_map_head {
+ struct module *mod;
+ unsigned long length;
+};
+
+union trace_enum_map_item;
+
+struct trace_enum_map_tail {
+ /*
+ * "end" is first and points to NULL as it must be different
+ * than "mod" or "enum_string"
+ */
+ union trace_enum_map_item *next;
+ const char *end; /* points to NULL */
+};
+
+static DEFINE_MUTEX(trace_enum_mutex);
+
+/*
+ * The trace_enum_maps are saved in an array with two extra elements,
+ * one at the beginning, and one at the end. The beginning item contains
+ * the count of the saved maps (head.length), and the module they
+ * belong to if not built in (head.mod). The ending item contains a
+ * pointer to the next array of saved enum_map items.
+ */
+union trace_enum_map_item {
+ struct trace_enum_map map;
+ struct trace_enum_map_head head;
+ struct trace_enum_map_tail tail;
+};
+
+static union trace_enum_map_item *trace_enum_maps;
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf);
+
+#define MAX_TRACER_SIZE 100
+static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
+static char *default_bootup_tracer;
+
+static bool allocate_snapshot;
+
+static int __init set_cmdline_ftrace(char *str)
+{
+ strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+ default_bootup_tracer = bootup_tracer_buf;
+ /* We are using ftrace early, expand it */
+ ring_buffer_expanded = true;
+ return 1;
+}
+__setup("ftrace=", set_cmdline_ftrace);
+
+static int __init set_ftrace_dump_on_oops(char *str)
+{
+ if (*str++ != '=' || !*str) {
+ ftrace_dump_on_oops = DUMP_ALL;
+ return 1;
+ }
+
+ if (!strcmp("orig_cpu", str)) {
+ ftrace_dump_on_oops = DUMP_ORIG;
+ return 1;
+ }
+
+ return 0;
+}
+__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
+
+static int __init stop_trace_on_warning(char *str)
+{
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ __disable_trace_on_warning = 1;
+ return 1;
+}
+__setup("traceoff_on_warning", stop_trace_on_warning);
+
+static int __init boot_alloc_snapshot(char *str)
+{
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ ring_buffer_expanded = true;
+ return 1;
+}
+__setup("alloc_snapshot", boot_alloc_snapshot);
+
+
+static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
+static char *trace_boot_options __initdata;
+
+static int __init set_trace_boot_options(char *str)
+{
+ strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ trace_boot_options = trace_boot_options_buf;
+ return 0;
+}
+__setup("trace_options=", set_trace_boot_options);
+
+static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
+static char *trace_boot_clock __initdata;
+
+static int __init set_trace_boot_clock(char *str)
+{
+ strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
+ trace_boot_clock = trace_boot_clock_buf;
+ return 0;
+}
+__setup("trace_clock=", set_trace_boot_clock);
+
+static int __init set_tracepoint_printk(char *str)
+{
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ tracepoint_printk = 1;
+ return 1;
+}
+__setup("tp_printk", set_tracepoint_printk);
+
+unsigned long long ns2usecs(cycle_t nsec)
+{
+ nsec += 500;
+ do_div(nsec, 1000);
+ return nsec;
+}
+
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a link list of pages that will store trace entries. The
+ * page descriptor of the pages in the memory is used to hold
+ * the link list by linking the lru item in the page descriptor
+ * to each of the pages in the buffer per CPU.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
+static struct trace_array global_trace;
+
+LIST_HEAD(ftrace_trace_arrays);
+
+int trace_array_get(struct trace_array *this_tr)
+{
+ struct trace_array *tr;
+ int ret = -ENODEV;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr == this_tr) {
+ tr->ref++;
+ ret = 0;
+ break;
+ }
+ }
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static void __trace_array_put(struct trace_array *this_tr)
+{
+ WARN_ON(!this_tr->ref);
+ this_tr->ref--;
+}
+
+void trace_array_put(struct trace_array *this_tr)
+{
+ mutex_lock(&trace_types_lock);
+ __trace_array_put(this_tr);
+ mutex_unlock(&trace_types_lock);
+}
+
+int filter_check_discard(struct ftrace_event_file *file, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
+ !filter_match_preds(file->filter, rec)) {
+ ring_buffer_discard_commit(buffer, event);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(filter_check_discard);
+
+int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
+ !filter_match_preds(call->filter, rec)) {
+ ring_buffer_discard_commit(buffer, event);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(call_filter_check_discard);
+
+static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
+{
+ u64 ts;
+
+ /* Early boot up does not have a buffer yet */
+ if (!buf->buffer)
+ return trace_clock_local();
+
+ ts = ring_buffer_time_stamp(buf->buffer, cpu);
+ ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
+
+ return ts;
+}
+
+cycle_t ftrace_now(int cpu)
+{
+ return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
+}
+
+/**
+ * tracing_is_enabled - Show if global_trace has been disabled
+ *
+ * Shows if the global trace has been enabled or not. It uses the
+ * mirror flag "buffer_disabled" to be used in fast paths such as for
+ * the irqsoff tracer. But it may be inaccurate due to races. If you
+ * need to know the accurate state, use tracing_is_on() which is a little
+ * slower, but accurate.
+ */
+int tracing_is_enabled(void)
+{
+ /*
+ * For quick access (irqsoff uses this in fast path), just
+ * return the mirror variable of the state of the ring buffer.
+ * It's a little racy, but we don't really care.
+ */
+ smp_rmb();
+ return !global_trace.buffer_disabled;
+}
+
+/*
+ * trace_buf_size is the size in bytes that is allocated
+ * for a buffer. Note, the number of bytes is always rounded
+ * to page size.
+ *
+ * This number is purposely set to a low number of 16384.
+ * If the dump on oops happens, it will be much appreciated
+ * to not have to wait for all that output. Anyway this can be
+ * boot time and run time configurable.
+ */
+#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */
+
+static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
+
+/* trace_types holds a link list of available tracers. */
+static struct tracer *trace_types __read_mostly;
+
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ */
+DEFINE_MUTEX(trace_types_lock);
+
+/*
+ * serialize the access of the ring buffer
+ *
+ * ring buffer serializes readers, but it is low level protection.
+ * The validity of the events (which returns by ring_buffer_peek() ..etc)
+ * are not protected by ring buffer.
+ *
+ * The content of events may become garbage if we allow other process consumes
+ * these events concurrently:
+ * A) the page of the consumed events may become a normal page
+ * (not reader page) in ring buffer, and this page will be rewrited
+ * by events producer.
+ * B) The page of the consumed events may become a page for splice_read,
+ * and this page will be returned to system.
+ *
+ * These primitives allow multi process access to different cpu ring buffer
+ * concurrently.
+ *
+ * These primitives don't distinguish read-only and read-consume access.
+ * Multi read-only access are also serialized.
+ */
+
+#ifdef CONFIG_SMP
+static DECLARE_RWSEM(all_cpu_access_lock);
+static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ /* gain it for accessing the whole ring buffer. */
+ down_write(&all_cpu_access_lock);
+ } else {
+ /* gain it for accessing a cpu ring buffer. */
+
+ /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
+ down_read(&all_cpu_access_lock);
+
+ /* Secondly block other access to this @cpu ring buffer. */
+ mutex_lock(&per_cpu(cpu_access_lock, cpu));
+ }
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ up_write(&all_cpu_access_lock);
+ } else {
+ mutex_unlock(&per_cpu(cpu_access_lock, cpu));
+ up_read(&all_cpu_access_lock);
+ }
+}
+
+static inline void trace_access_lock_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu(cpu_access_lock, cpu));
+}
+
+#else
+
+static DEFINE_MUTEX(access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+ (void)cpu;
+ mutex_lock(&access_lock);
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+ (void)cpu;
+ mutex_unlock(&access_lock);
+}
+
+static inline void trace_access_lock_init(void)
+{
+}
+
+#endif
+
+/* trace_flags holds trace_options default values */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+ TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
+
+static void tracer_tracing_on(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ ring_buffer_record_on(tr->trace_buffer.buffer);
+ /*
+ * This flag is looked at when buffers haven't been allocated
+ * yet, or by some tracers (like irqsoff), that just want to
+ * know if the ring buffer has been disabled, but it can handle
+ * races of where it gets disabled but we still do a record.
+ * As the check is in the fast path of the tracers, it is more
+ * important to be fast than accurate.
+ */
+ tr->buffer_disabled = 0;
+ /* Make the flag seen by readers */
+ smp_wmb();
+}
+
+/**
+ * tracing_on - enable tracing buffers
+ *
+ * This function enables tracing buffers that may have been
+ * disabled with tracing_off.
+ */
+void tracing_on(void)
+{
+ tracer_tracing_on(&global_trace);
+}
+EXPORT_SYMBOL_GPL(tracing_on);
+
+/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ * @size: The size of the string.
+ */
+int __trace_puts(unsigned long ip, const char *str, int size)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct print_entry *entry;
+ unsigned long irq_flags;
+ int alloc;
+ int pc;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ pc = preempt_count();
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ alloc = sizeof(*entry) + size + 2; /* possible \n added */
+
+ local_save_flags(irq_flags);
+ buffer = global_trace.trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ irq_flags, pc);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, str, size);
+
+ /* Add a newline if necessary */
+ if (entry->buf[size - 1] != '\n') {
+ entry->buf[size] = '\n';
+ entry->buf[size + 1] = '\0';
+ } else
+ entry->buf[size] = '\0';
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(__trace_puts);
+
+/**
+ * __trace_bputs - write the pointer to a constant string into trace buffer
+ * @ip: The address of the caller
+ * @str: The constant string to write to the buffer to
+ */
+int __trace_bputs(unsigned long ip, const char *str)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct bputs_entry *entry;
+ unsigned long irq_flags;
+ int size = sizeof(struct bputs_entry);
+ int pc;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ pc = preempt_count();
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ local_save_flags(irq_flags);
+ buffer = global_trace.trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+ irq_flags, pc);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->str = str;
+
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, irq_flags, 4, pc);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(__trace_bputs);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+/**
+ * trace_snapshot - take a snapshot of the current buffer.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ *
+ * Note, make sure to allocate the snapshot with either
+ * a tracing_snapshot_alloc(), or by doing it manually
+ * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ *
+ * If the snapshot buffer is not allocated, it will stop tracing.
+ * Basically making a permanent snapshot.
+ */
+void tracing_snapshot(void)
+{
+ struct trace_array *tr = &global_trace;
+ struct tracer *tracer = tr->current_trace;
+ unsigned long flags;
+
+ if (in_nmi()) {
+ internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ internal_trace_puts("*** snapshot is being ignored ***\n");
+ return;
+ }
+
+ if (!tr->allocated_snapshot) {
+ internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
+ internal_trace_puts("*** stopping trace here! ***\n");
+ tracing_off();
+ return;
+ }
+
+ /* Note, snapshot can not be used when the tracer uses it */
+ if (tracer->use_max_tr) {
+ internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
+ internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ update_max_tr(tr, current, smp_processor_id());
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot);
+
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+ struct trace_buffer *size_buf, int cpu_id);
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
+
+static int alloc_snapshot(struct trace_array *tr)
+{
+ int ret;
+
+ if (!tr->allocated_snapshot) {
+
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&tr->max_buffer,
+ &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ return ret;
+
+ tr->allocated_snapshot = true;
+ }
+
+ return 0;
+}
+
+static void free_snapshot(struct trace_array *tr)
+{
+ /*
+ * We don't free the ring buffer. instead, resize it because
+ * The max_tr ring buffer has some state (e.g. ring->clock) and
+ * we want preserve it.
+ */
+ ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&tr->max_buffer, 1);
+ tracing_reset_online_cpus(&tr->max_buffer);
+ tr->allocated_snapshot = false;
+}
+
+/**
+ * tracing_alloc_snapshot - allocate snapshot buffer.
+ *
+ * This only allocates the snapshot buffer if it isn't already
+ * allocated - it doesn't also take a snapshot.
+ *
+ * This is meant to be used in cases where the snapshot buffer needs
+ * to be set up for events that can't sleep but need to be able to
+ * trigger a snapshot.
+ */
+int tracing_alloc_snapshot(void)
+{
+ struct trace_array *tr = &global_trace;
+ int ret;
+
+ ret = alloc_snapshot(tr);
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
+
+/**
+ * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ *
+ * This is similar to trace_snapshot(), but it will allocate the
+ * snapshot buffer if it isn't already allocated. Use this only
+ * where it is safe to sleep, as the allocation may sleep.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ */
+void tracing_snapshot_alloc(void)
+{
+ int ret;
+
+ ret = tracing_alloc_snapshot();
+ if (ret < 0)
+ return;
+
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+#else
+void tracing_snapshot(void)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot);
+int tracing_alloc_snapshot(void)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
+void tracing_snapshot_alloc(void)
+{
+ /* Give warning */
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+static void tracer_tracing_off(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ ring_buffer_record_off(tr->trace_buffer.buffer);
+ /*
+ * This flag is looked at when buffers haven't been allocated
+ * yet, or by some tracers (like irqsoff), that just want to
+ * know if the ring buffer has been disabled, but it can handle
+ * races of where it gets disabled but we still do a record.
+ * As the check is in the fast path of the tracers, it is more
+ * important to be fast than accurate.
+ */
+ tr->buffer_disabled = 1;
+ /* Make the flag seen by readers */
+ smp_wmb();
+}
+
+/**
+ * tracing_off - turn off tracing buffers
+ *
+ * This function stops the tracing buffers from recording data.
+ * It does not disable any overhead the tracers themselves may
+ * be causing. This function simply causes all recording to
+ * the ring buffers to fail.
+ */
+void tracing_off(void)
+{
+ tracer_tracing_off(&global_trace);
+}
+EXPORT_SYMBOL_GPL(tracing_off);
+
+void disable_trace_on_warning(void)
+{
+ if (__disable_trace_on_warning)
+ tracing_off();
+}
+
+/**
+ * tracer_tracing_is_on - show real state of ring buffer enabled
+ * @tr : the trace array to know if ring buffer is enabled
+ *
+ * Shows real state of the ring buffer if it is enabled or not.
+ */
+static int tracer_tracing_is_on(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ return ring_buffer_record_is_on(tr->trace_buffer.buffer);
+ return !tr->buffer_disabled;
+}
+
+/**
+ * tracing_is_on - show state of ring buffers enabled
+ */
+int tracing_is_on(void)
+{
+ return tracer_tracing_is_on(&global_trace);
+}
+EXPORT_SYMBOL_GPL(tracing_is_on);
+
+static int __init set_buf_size(char *str)
+{
+ unsigned long buf_size;
+
+ if (!str)
+ return 0;
+ buf_size = memparse(str, &str);
+ /* nr_entries can not be zero */
+ if (buf_size == 0)
+ return 0;
+ trace_buf_size = buf_size;
+ return 1;
+}
+__setup("trace_buf_size=", set_buf_size);
+
+static int __init set_tracing_thresh(char *str)
+{
+ unsigned long threshold;
+ int ret;
+
+ if (!str)
+ return 0;
+ ret = kstrtoul(str, 0, &threshold);
+ if (ret < 0)
+ return 0;
+ tracing_thresh = threshold * 1000;
+ return 1;
+}
+__setup("tracing_thresh=", set_tracing_thresh);
+
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+ return nsecs / 1000;
+}
+
+/* These must match the bit postions in trace_iterator_flags */
+static const char *trace_options[] = {
+ "print-parent",
+ "sym-offset",
+ "sym-addr",
+ "verbose",
+ "raw",
+ "hex",
+ "bin",
+ "block",
+ "stacktrace",
+ "trace_printk",
+ "ftrace_preempt",
+ "branch",
+ "annotate",
+ "userstacktrace",
+ "sym-userobj",
+ "printk-msg-only",
+ "context-info",
+ "latency-format",
+ "sleep-time",
+ "graph-time",
+ "record-cmd",
+ "overwrite",
+ "disable_on_free",
+ "irq-info",
+ "markers",
+ "function-trace",
+ NULL
+};
+
+static struct {
+ u64 (*func)(void);
+ const char *name;
+ int in_ns; /* is this clock in nanoseconds? */
+} trace_clocks[] = {
+ { trace_clock_local, "local", 1 },
+ { trace_clock_global, "global", 1 },
+ { trace_clock_counter, "counter", 0 },
+ { trace_clock_jiffies, "uptime", 0 },
+ { trace_clock, "perf", 1 },
+ { ktime_get_mono_fast_ns, "mono", 1 },
+ ARCH_TRACE_CLOCKS
+};
+
+/*
+ * trace_parser_get_init - gets the buffer for trace parser
+ */
+int trace_parser_get_init(struct trace_parser *parser, int size)
+{
+ memset(parser, 0, sizeof(*parser));
+
+ parser->buffer = kmalloc(size, GFP_KERNEL);
+ if (!parser->buffer)
+ return 1;
+
+ parser->size = size;
+ return 0;
+}
+
+/*
+ * trace_parser_put - frees the buffer for trace parser
+ */
+void trace_parser_put(struct trace_parser *parser)
+{
+ kfree(parser->buffer);
+}
+
+/*
+ * trace_get_user - reads the user input string separated by space
+ * (matched by isspace(ch))
+ *
+ * For each string found the 'struct trace_parser' is updated,
+ * and the function returns.
+ *
+ * Returns number of bytes read.
+ *
+ * See kernel/trace/trace.h for 'struct trace_parser' details.
+ */
+int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char ch;
+ size_t read = 0;
+ ssize_t ret;
+
+ if (!*ppos)
+ trace_parser_clear(parser);
+
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+
+ read++;
+ cnt--;
+
+ /*
+ * The parser is not finished with the last write,
+ * continue reading the user input without skipping spaces.
+ */
+ if (!parser->cont) {
+ /* skip white space */
+ while (cnt && isspace(ch)) {
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+ read++;
+ cnt--;
+ }
+
+ /* only spaces were written */
+ if (isspace(ch)) {
+ *ppos += read;
+ ret = read;
+ goto out;
+ }
+
+ parser->idx = 0;
+ }
+
+ /* read the non-space input */
+ while (cnt && !isspace(ch)) {
+ if (parser->idx < parser->size - 1)
+ parser->buffer[parser->idx++] = ch;
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = get_user(ch, ubuf++);
+ if (ret)
+ goto out;
+ read++;
+ cnt--;
+ }
+
+ /* We either got finished input or we have to wait for another call. */
+ if (isspace(ch)) {
+ parser->buffer[parser->idx] = 0;
+ parser->cont = false;
+ } else if (parser->idx < parser->size - 1) {
+ parser->cont = true;
+ parser->buffer[parser->idx++] = ch;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *ppos += read;
+ ret = read;
+
+out:
+ return ret;
+}
+
+/* TODO add a seq_buf_to_buffer() */
+static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
+{
+ int len;
+
+ if (trace_seq_used(s) <= s->seq.readpos)
+ return -EBUSY;
+
+ len = trace_seq_used(s) - s->seq.readpos;
+ if (cnt > len)
+ cnt = len;
+ memcpy(buf, s->buffer + s->seq.readpos, cnt);
+
+ s->seq.readpos += cnt;
+ return cnt;
+}
+
+unsigned long __read_mostly tracing_thresh;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+ struct trace_buffer *max_buf = &tr->max_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
+ struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
+
+ max_buf->cpu = cpu;
+ max_buf->time_start = data->preempt_timestamp;
+
+ max_data->saved_latency = tr->max_latency;
+ max_data->critical_start = data->critical_start;
+ max_data->critical_end = data->critical_end;
+
+ memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+ max_data->pid = tsk->pid;
+ /*
+ * If tsk == current, then use current_uid(), as that does not use
+ * RCU. The irq tracer can be called out of RCU scope.
+ */
+ if (tsk == current)
+ max_data->uid = current_uid();
+ else
+ max_data->uid = task_uid(tsk);
+
+ max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+ max_data->policy = tsk->policy;
+ max_data->rt_priority = tsk->rt_priority;
+
+ /* record this tasks comm */
+ tracing_record_cmdline(tsk);
+}
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ struct ring_buffer *buf;
+
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ buf = tr->trace_buffer.buffer;
+ tr->trace_buffer.buffer = tr->max_buffer.buffer;
+ tr->max_buffer.buffer = buf;
+
+ __update_max_tr(tr, tsk, cpu);
+ arch_spin_unlock(&tr->max_lock);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr - tracer
+ * @tsk - task with the latency
+ * @cpu - the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+ int ret;
+
+ if (tr->stop_count)
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ if (!tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
+ return;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+
+ ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
+
+ if (ret == -EBUSY) {
+ /*
+ * We failed to swap the buffer due to a commit taking
+ * place on this CPU. We fail to record, but we reset
+ * the max trace buffer (no one writes directly to it)
+ * and flag that it failed.
+ */
+ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
+ "Failed to swap buffers due to commit in progress\n");
+ }
+
+ WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
+
+ __update_max_tr(tr, tsk, cpu);
+ arch_spin_unlock(&tr->max_lock);
+}
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+static int wait_on_pipe(struct trace_iterator *iter, bool full)
+{
+ /* Iterators are static, they should be filled or empty */
+ if (trace_buffer_iter(iter, iter->cpu_file))
+ return 0;
+
+ return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ full);
+}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int run_tracer_selftest(struct tracer *type)
+{
+ struct trace_array *tr = &global_trace;
+ struct tracer *saved_tracer = tr->current_trace;
+ int ret;
+
+ if (!type->selftest || tracing_selftest_disabled)
+ return 0;
+
+ /*
+ * Run a selftest on this tracer.
+ * Here we reset the trace buffer, and set the current
+ * tracer to be this tracer. The tracer can then run some
+ * internal tracing to verify that everything is in order.
+ * If we fail, we do not register this tracer.
+ */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+ tr->current_trace = type;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (type->use_max_tr) {
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ tr->allocated_snapshot = true;
+ }
+#endif
+
+ /* the test is responsible for initializing and enabling */
+ pr_info("Testing tracer %s: ", type->name);
+ ret = type->selftest(type, tr);
+ /* the test is responsible for resetting too */
+ tr->current_trace = saved_tracer;
+ if (ret) {
+ printk(KERN_CONT "FAILED!\n");
+ /* Add the warning after printing 'FAILED' */
+ WARN_ON(1);
+ return -1;
+ }
+ /* Only reset on passing, to avoid touching corrupted buffers */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (type->use_max_tr) {
+ tr->allocated_snapshot = false;
+
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(tr->max_buffer.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ }
+#endif
+
+ printk(KERN_CONT "PASSED\n");
+ return 0;
+}
+#else
+static inline int run_tracer_selftest(struct tracer *type)
+{
+ return 0;
+}
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type - the plugin for the tracer
+ *
+ * Register a new plugin tracer.
+ */
+int register_tracer(struct tracer *type)
+{
+ struct tracer *t;
+ int ret = 0;
+
+ if (!type->name) {
+ pr_info("Tracer must have a name\n");
+ return -1;
+ }
+
+ if (strlen(type->name) >= MAX_TRACER_SIZE) {
+ pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
+ return -1;
+ }
+
+ mutex_lock(&trace_types_lock);
+
+ tracing_selftest_running = true;
+
+ for (t = trace_types; t; t = t->next) {
+ if (strcmp(type->name, t->name) == 0) {
+ /* already found */
+ pr_info("Tracer %s already registered\n",
+ type->name);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (!type->set_flag)
+ type->set_flag = &dummy_set_flag;
+ if (!type->flags)
+ type->flags = &dummy_tracer_flags;
+ else
+ if (!type->flags->opts)
+ type->flags->opts = dummy_tracer_opt;
+
+ ret = run_tracer_selftest(type);
+ if (ret < 0)
+ goto out;
+
+ type->next = trace_types;
+ trace_types = type;
+
+ out:
+ tracing_selftest_running = false;
+ mutex_unlock(&trace_types_lock);
+
+ if (ret || !default_bootup_tracer)
+ goto out_unlock;
+
+ if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
+ goto out_unlock;
+
+ printk(KERN_INFO "Starting tracer '%s'\n", type->name);
+ /* Do we want this tracer to start on bootup? */
+ tracing_set_tracer(&global_trace, type->name);
+ default_bootup_tracer = NULL;
+ /* disable other selftests, since this will break it. */
+ tracing_selftest_disabled = true;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+ printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
+ type->name);
+#endif
+
+ out_unlock:
+ return ret;
+}
+
+void tracing_reset(struct trace_buffer *buf, int cpu)
+{
+ struct ring_buffer *buffer = buf->buffer;
+
+ if (!buffer)
+ return;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_sched();
+ ring_buffer_reset_cpu(buffer, cpu);
+
+ ring_buffer_record_enable(buffer);
+}
+
+void tracing_reset_online_cpus(struct trace_buffer *buf)
+{
+ struct ring_buffer *buffer = buf->buffer;
+ int cpu;
+
+ if (!buffer)
+ return;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_sched();
+
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
+
+ for_each_online_cpu(cpu)
+ ring_buffer_reset_cpu(buffer, cpu);
+
+ ring_buffer_record_enable(buffer);
+}
+
+/* Must have trace_types_lock held */
+void tracing_reset_all_online_cpus(void)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ tracing_reset_online_cpus(&tr->trace_buffer);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ tracing_reset_online_cpus(&tr->max_buffer);
+#endif
+ }
+}
+
+#define SAVED_CMDLINES_DEFAULT 128
+#define NO_CMDLINE_MAP UINT_MAX
+static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+struct saved_cmdlines_buffer {
+ unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+ unsigned *map_cmdline_to_pid;
+ unsigned cmdline_num;
+ int cmdline_idx;
+ char *saved_cmdlines;
+};
+static struct saved_cmdlines_buffer *savedcmd;
+
+/* temporary disable recording */
+static atomic_t trace_record_cmdline_disabled __read_mostly;
+
+static inline char *get_saved_cmdlines(int idx)
+{
+ return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
+}
+
+static inline void set_cmdline(int idx, const char *cmdline)
+{
+ memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+}
+
+static int allocate_cmdlines_buffer(unsigned int val,
+ struct saved_cmdlines_buffer *s)
+{
+ s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_pid)
+ return -ENOMEM;
+
+ s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
+ if (!s->saved_cmdlines) {
+ kfree(s->map_cmdline_to_pid);
+ return -ENOMEM;
+ }
+
+ s->cmdline_idx = 0;
+ s->cmdline_num = val;
+ memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
+ sizeof(s->map_pid_to_cmdline));
+ memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_pid));
+
+ return 0;
+}
+
+static int trace_create_savedcmd(void)
+{
+ int ret;
+
+ savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
+ if (!savedcmd)
+ return -ENOMEM;
+
+ ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
+ if (ret < 0) {
+ kfree(savedcmd);
+ savedcmd = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int is_tracing_stopped(void)
+{
+ return global_trace.stop_count;
+}
+
+/**
+ * tracing_start - quick start of the tracer
+ *
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
+ */
+void tracing_start(void)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ if (tracing_disabled)
+ return;
+
+ raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+ if (--global_trace.stop_count) {
+ if (global_trace.stop_count < 0) {
+ /* Someone screwed up their debugging */
+ WARN_ON_ONCE(1);
+ global_trace.stop_count = 0;
+ }
+ goto out;
+ }
+
+ /* Prevent the buffers from switching */
+ arch_spin_lock(&global_trace.max_lock);
+
+ buffer = global_trace.trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_enable(buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ buffer = global_trace.max_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_enable(buffer);
+#endif
+
+ arch_spin_unlock(&global_trace.max_lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_start_tr(struct trace_array *tr)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ if (tracing_disabled)
+ return;
+
+ /* If global, we need to also start the max tracer */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return tracing_start();
+
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+
+ if (--tr->stop_count) {
+ if (tr->stop_count < 0) {
+ /* Someone screwed up their debugging */
+ WARN_ON_ONCE(1);
+ tr->stop_count = 0;
+ }
+ goto out;
+ }
+
+ buffer = tr->trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_enable(buffer);
+
+ out:
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
+}
+
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+ if (global_trace.stop_count++)
+ goto out;
+
+ /* Prevent the buffers from switching */
+ arch_spin_lock(&global_trace.max_lock);
+
+ buffer = global_trace.trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_disable(buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ buffer = global_trace.max_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_disable(buffer);
+#endif
+
+ arch_spin_unlock(&global_trace.max_lock);
+
+ out:
+ raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ /* If global, we need to also stop the max tracer */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return tracing_stop();
+
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (tr->stop_count++)
+ goto out;
+
+ buffer = tr->trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_disable(buffer);
+
+ out:
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
+}
+
+void trace_stop_cmdline_recording(void);
+
+static int trace_save_cmdline(struct task_struct *tsk)
+{
+ unsigned pid, idx;
+
+ if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+ return 0;
+
+ /*
+ * It's not the end of the world if we don't get
+ * the lock, but we also don't want to spin
+ * nor do we want to disable interrupts,
+ * so if we miss here, then better luck next time.
+ */
+ if (!arch_spin_trylock(&trace_cmdline_lock))
+ return 0;
+
+ idx = savedcmd->map_pid_to_cmdline[tsk->pid];
+ if (idx == NO_CMDLINE_MAP) {
+ idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
+
+ /*
+ * Check whether the cmdline buffer at idx has a pid
+ * mapped. We are going to overwrite that entry so we
+ * need to clear the map_pid_to_cmdline. Otherwise we
+ * would read the new comm for the old pid.
+ */
+ pid = savedcmd->map_cmdline_to_pid[idx];
+ if (pid != NO_CMDLINE_MAP)
+ savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
+
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
+ savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
+
+ savedcmd->cmdline_idx = idx;
+ }
+
+ set_cmdline(idx, tsk->comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+
+ return 1;
+}
+
+static void __trace_find_cmdline(int pid, char comm[])
+{
+ unsigned map;
+
+ if (!pid) {
+ strcpy(comm, "<idle>");
+ return;
+ }
+
+ if (WARN_ON_ONCE(pid < 0)) {
+ strcpy(comm, "<XXX>");
+ return;
+ }
+
+ if (pid > PID_MAX_DEFAULT) {
+ strcpy(comm, "<...>");
+ return;
+ }
+
+ map = savedcmd->map_pid_to_cmdline[pid];
+ if (map != NO_CMDLINE_MAP)
+ strcpy(comm, get_saved_cmdlines(map));
+ else
+ strcpy(comm, "<...>");
+}
+
+void trace_find_cmdline(int pid, char comm[])
+{
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ __trace_find_cmdline(pid, comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+void tracing_record_cmdline(struct task_struct *tsk)
+{
+ if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
+ return;
+
+ if (!__this_cpu_read(trace_cmdline_save))
+ return;
+
+ if (trace_save_cmdline(tsk))
+ __this_cpu_write(trace_cmdline_save, false);
+}
+
+void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
+ int pc)
+{
+ struct task_struct *tsk = current;
+
+ entry->preempt_count = pc & 0xff;
+ entry->pid = (tsk) ? tsk->pid : 0;
+ entry->flags =
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+ (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+#else
+ TRACE_FLAG_IRQS_NOSUPPORT |
+#endif
+ ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+ ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+ (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+ (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+}
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
+
+struct ring_buffer_event *
+trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags, int pc)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_lock_reserve(buffer, len);
+ if (event != NULL) {
+ struct trace_entry *ent = ring_buffer_event_data(event);
+
+ tracing_generic_entry_update(ent, flags, pc);
+ ent->type = type;
+ }
+
+ return event;
+}
+
+void
+__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
+{
+ __this_cpu_write(trace_cmdline_save, true);
+ ring_buffer_unlock_commit(buffer, event);
+}
+
+static inline void
+__trace_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
+{
+ __buffer_unlock_commit(buffer, event);
+
+ ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_userstack(buffer, flags, pc);
+}
+
+void trace_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
+{
+ __trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
+
+static struct ring_buffer *temp_buffer;
+
+struct ring_buffer_event *
+trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
+ struct ftrace_event_file *ftrace_file,
+ int type, unsigned long len,
+ unsigned long flags, int pc)
+{
+ struct ring_buffer_event *entry;
+
+ *current_rb = ftrace_file->tr->trace_buffer.buffer;
+ entry = trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+ /*
+ * If tracing is off, but we have triggers enabled
+ * we still need to look at the event data. Use the temp_buffer
+ * to store the trace event for the tigger to use. It's recusive
+ * safe and will not be recorded anywhere.
+ */
+ if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+ *current_rb = temp_buffer;
+ entry = trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+ }
+ return entry;
+}
+EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
+
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
+ int type, unsigned long len,
+ unsigned long flags, int pc)
+{
+ *current_rb = global_trace.trace_buffer.buffer;
+ return trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
+
+void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
+{
+ __trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
+
+void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc,
+ struct pt_regs *regs)
+{
+ __buffer_unlock_commit(buffer, event);
+
+ ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
+ ftrace_trace_userstack(buffer, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
+
+void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ ring_buffer_discard_commit(buffer, event);
+}
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
+
+void
+trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip, unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_function;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct ftrace_entry *entry;
+
+ /* If we are reading the ring buffer, don't trace */
+ if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
+ return;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+}
+
+#ifdef CONFIG_STACKTRACE
+
+#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
+struct ftrace_stack {
+ unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
+};
+
+static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
+static DEFINE_PER_CPU(int, ftrace_stack_reserve);
+
+static void __ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
+{
+ struct ftrace_event_call *call = &event_kernel_stack;
+ struct ring_buffer_event *event;
+ struct stack_entry *entry;
+ struct stack_trace trace;
+ int use_stack;
+ int size = FTRACE_STACK_ENTRIES;
+
+ trace.nr_entries = 0;
+ trace.skip = skip;
+
+ /*
+ * Since events can happen in NMIs there's no safe way to
+ * use the per cpu ftrace_stacks. We reserve it and if an interrupt
+ * or NMI comes in, it will just have to use the default
+ * FTRACE_STACK_SIZE.
+ */
+ preempt_disable_notrace();
+
+ use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
+ /*
+ * We don't need any atomic variables, just a barrier.
+ * If an interrupt comes in, we don't care, because it would
+ * have exited and put the counter back to what we want.
+ * We just need a barrier to keep gcc from moving things
+ * around.
+ */
+ barrier();
+ if (use_stack == 1) {
+ trace.entries = this_cpu_ptr(ftrace_stack.calls);
+ trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
+
+ if (regs)
+ save_stack_trace_regs(regs, &trace);
+ else
+ save_stack_trace(&trace);
+
+ if (trace.nr_entries > size)
+ size = trace.nr_entries;
+ } else
+ /* From now on, use_stack is a boolean */
+ use_stack = 0;
+
+ size *= sizeof(unsigned long);
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
+ sizeof(*entry) + size, flags, pc);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+
+ memset(&entry->caller, 0, size);
+
+ if (use_stack)
+ memcpy(&entry->caller, trace.entries,
+ trace.nr_entries * sizeof(unsigned long));
+ else {
+ trace.max_entries = FTRACE_STACK_ENTRIES;
+ trace.entries = entry->caller;
+ if (regs)
+ save_stack_trace_regs(regs, &trace);
+ else
+ save_stack_trace(&trace);
+ }
+
+ entry->size = trace.nr_entries;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+
+ out:
+ /* Again, don't let gcc optimize things here */
+ barrier();
+ __this_cpu_dec(ftrace_stack_reserve);
+ preempt_enable_notrace();
+
+}
+
+void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
+{
+ if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ return;
+
+ __ftrace_trace_stack(buffer, flags, skip, pc, regs);
+}
+
+void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc)
+{
+ if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ return;
+
+ __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
+}
+
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc)
+{
+ __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
+}
+
+/**
+ * trace_dump_stack - record a stack back trace in the trace buffer
+ * @skip: Number of functions to skip (helper handlers)
+ */
+void trace_dump_stack(int skip)
+{
+ unsigned long flags;
+
+ if (tracing_disabled || tracing_selftest_running)
+ return;
+
+ local_save_flags(flags);
+
+ /*
+ * Skip 3 more, seems to get us at the caller of
+ * this function.
+ */
+ skip += 3;
+ __ftrace_trace_stack(global_trace.trace_buffer.buffer,
+ flags, skip, preempt_count(), NULL);
+}
+
+static DEFINE_PER_CPU(int, user_stack_count);
+
+void
+ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_user_stack;
+ struct ring_buffer_event *event;
+ struct userstack_entry *entry;
+ struct stack_trace trace;
+
+ if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+ return;
+
+ /*
+ * NMIs can not handle page faults, even with fix ups.
+ * The save user stack can (and often does) fault.
+ */
+ if (unlikely(in_nmi()))
+ return;
+
+ /*
+ * prevent recursion, since the user stack tracing may
+ * trigger other kernel events.
+ */
+ preempt_disable();
+ if (__this_cpu_read(user_stack_count))
+ goto out;
+
+ __this_cpu_inc(user_stack_count);
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ goto out_drop_count;
+ entry = ring_buffer_event_data(event);
+
+ entry->tgid = current->tgid;
+ memset(&entry->caller, 0, sizeof(entry->caller));
+
+ trace.nr_entries = 0;
+ trace.max_entries = FTRACE_STACK_ENTRIES;
+ trace.skip = 0;
+ trace.entries = entry->caller;
+
+ save_stack_trace_user(&trace);
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+
+ out_drop_count:
+ __this_cpu_dec(user_stack_count);
+ out:
+ preempt_enable();
+}
+
+#ifdef UNUSED
+static void __trace_userstack(struct trace_array *tr, unsigned long flags)
+{
+ ftrace_trace_userstack(tr, flags, preempt_count());
+}
+#endif /* UNUSED */
+
+#endif /* CONFIG_STACKTRACE */
+
+/* created for use with alloc_percpu */
+struct trace_buffer_struct {
+ char buffer[TRACE_BUF_SIZE];
+};
+
+static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct *trace_percpu_sirq_buffer;
+static struct trace_buffer_struct *trace_percpu_irq_buffer;
+static struct trace_buffer_struct *trace_percpu_nmi_buffer;
+
+/*
+ * The buffer used is dependent on the context. There is a per cpu
+ * buffer for normal context, softirq contex, hard irq context and
+ * for NMI context. Thise allows for lockless recording.
+ *
+ * Note, if the buffers failed to be allocated, then this returns NULL
+ */
+static char *get_trace_buf(void)
+{
+ struct trace_buffer_struct *percpu_buffer;
+
+ /*
+ * If we have allocated per cpu buffers, then we do not
+ * need to do any locking.
+ */
+ if (in_nmi())
+ percpu_buffer = trace_percpu_nmi_buffer;
+ else if (in_irq())
+ percpu_buffer = trace_percpu_irq_buffer;
+ else if (in_softirq())
+ percpu_buffer = trace_percpu_sirq_buffer;
+ else
+ percpu_buffer = trace_percpu_buffer;
+
+ if (!percpu_buffer)
+ return NULL;
+
+ return this_cpu_ptr(&percpu_buffer->buffer[0]);
+}
+
+static int alloc_percpu_trace_buffer(void)
+{
+ struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct *sirq_buffers;
+ struct trace_buffer_struct *irq_buffers;
+ struct trace_buffer_struct *nmi_buffers;
+
+ buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!buffers)
+ goto err_warn;
+
+ sirq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!sirq_buffers)
+ goto err_sirq;
+
+ irq_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!irq_buffers)
+ goto err_irq;
+
+ nmi_buffers = alloc_percpu(struct trace_buffer_struct);
+ if (!nmi_buffers)
+ goto err_nmi;
+
+ trace_percpu_buffer = buffers;
+ trace_percpu_sirq_buffer = sirq_buffers;
+ trace_percpu_irq_buffer = irq_buffers;
+ trace_percpu_nmi_buffer = nmi_buffers;
+
+ return 0;
+
+ err_nmi:
+ free_percpu(irq_buffers);
+ err_irq:
+ free_percpu(sirq_buffers);
+ err_sirq:
+ free_percpu(buffers);
+ err_warn:
+ WARN(1, "Could not allocate percpu trace_printk buffer");
+ return -ENOMEM;
+}
+
+static int buffers_allocated;
+
+void trace_printk_init_buffers(void)
+{
+ if (buffers_allocated)
+ return;
+
+ if (alloc_percpu_trace_buffer())
+ return;
+
+ /* trace_printk() is for debug use only. Don't use it in production. */
+
+ pr_warning("\n");
+ pr_warning("**********************************************************\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("** **\n");
+ pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
+ pr_warning("** **\n");
+ pr_warning("** This means that this is a DEBUG kernel and it is **\n");
+ pr_warning("** unsafe for production use. **\n");
+ pr_warning("** **\n");
+ pr_warning("** If you see this message and you are not debugging **\n");
+ pr_warning("** the kernel, report this immediately to your vendor! **\n");
+ pr_warning("** **\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("**********************************************************\n");
+
+ /* Expand the buffers to set size */
+ tracing_update_buffers();
+
+ buffers_allocated = 1;
+
+ /*
+ * trace_printk_init_buffers() can be called by modules.
+ * If that happens, then we need to start cmdline recording
+ * directly here. If the global_trace.buffer is already
+ * allocated here, then this was called by module code.
+ */
+ if (global_trace.trace_buffer.buffer)
+ tracing_start_cmdline_record();
+}
+
+void trace_printk_start_comm(void)
+{
+ /* Start tracing comms if trace printk is set */
+ if (!buffers_allocated)
+ return;
+ tracing_start_cmdline_record();
+}
+
+static void trace_printk_start_stop_comm(int enabled)
+{
+ if (!buffers_allocated)
+ return;
+
+ if (enabled)
+ tracing_start_cmdline_record();
+ else
+ tracing_stop_cmdline_record();
+}
+
+/**
+ * trace_vbprintk - write binary msg to tracing buffer
+ *
+ */
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ struct ftrace_event_call *call = &event_bprint;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct trace_array *tr = &global_trace;
+ struct bprint_entry *entry;
+ unsigned long flags;
+ char *tbuffer;
+ int len = 0, size, pc;
+
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
+ pc = preempt_count();
+ preempt_disable_notrace();
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
+ goto out;
+ }
+
+ len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
+
+ if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
+ goto out;
+
+ local_save_flags(flags);
+ size = sizeof(*entry) + sizeof(u32) * len;
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ flags, pc);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->fmt = fmt;
+
+ memcpy(entry->buf, tbuffer, sizeof(u32) * len);
+ if (!call_filter_check_discard(call, entry, buffer, event)) {
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, flags, 6, pc);
+ }
+
+out:
+ preempt_enable_notrace();
+ unpause_graph_tracing();
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_vbprintk);
+
+static int
+__trace_array_vprintk(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ struct ftrace_event_call *call = &event_print;
+ struct ring_buffer_event *event;
+ int len = 0, size, pc;
+ struct print_entry *entry;
+ unsigned long flags;
+ char *tbuffer;
+
+ if (tracing_disabled || tracing_selftest_running)
+ return 0;
+
+ /* Don't pollute graph traces with trace_vprintk internals */
+ pause_graph_tracing();
+
+ pc = preempt_count();
+ preempt_disable_notrace();
+
+
+ tbuffer = get_trace_buf();
+ if (!tbuffer) {
+ len = 0;
+ goto out;
+ }
+
+ len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
+
+ local_save_flags(flags);
+ size = sizeof(*entry) + len + 1;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ flags, pc);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, tbuffer, len + 1);
+ if (!call_filter_check_discard(call, entry, buffer, event)) {
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(buffer, flags, 6, pc);
+ }
+ out:
+ preempt_enable_notrace();
+ unpause_graph_tracing();
+
+ return len;
+}
+
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
+}
+
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_array_vprintk(tr, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int trace_array_printk_buf(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = __trace_array_vprintk(buffer, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+{
+ return trace_array_vprintk(&global_trace, ip, fmt, args);
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
+
+static void trace_iterator_increment(struct trace_iterator *iter)
+{
+ struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
+
+ iter->idx++;
+ if (buf_iter)
+ ring_buffer_read(buf_iter, NULL);
+}
+
+static struct trace_entry *
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
+ unsigned long *lost_events)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
+
+ if (buf_iter)
+ event = ring_buffer_iter_peek(buf_iter, ts);
+ else
+ event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
+ lost_events);
+
+ if (event) {
+ iter->ent_size = ring_buffer_event_length(event);
+ return ring_buffer_event_data(event);
+ }
+ iter->ent_size = 0;
+ return NULL;
+}
+
+static struct trace_entry *
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
+ unsigned long *missing_events, u64 *ent_ts)
+{
+ struct ring_buffer *buffer = iter->trace_buffer->buffer;
+ struct trace_entry *ent, *next = NULL;
+ unsigned long lost_events = 0, next_lost = 0;
+ int cpu_file = iter->cpu_file;
+ u64 next_ts = 0, ts;
+ int next_cpu = -1;
+ int next_size = 0;
+ int cpu;
+
+ /*
+ * If we are in a per_cpu trace file, don't bother by iterating over
+ * all cpu and peek directly.
+ */
+ if (cpu_file > RING_BUFFER_ALL_CPUS) {
+ if (ring_buffer_empty_cpu(buffer, cpu_file))
+ return NULL;
+ ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
+ if (ent_cpu)
+ *ent_cpu = cpu_file;
+
+ return ent;
+ }
+
+ for_each_tracing_cpu(cpu) {
+
+ if (ring_buffer_empty_cpu(buffer, cpu))
+ continue;
+
+ ent = peek_next_entry(iter, cpu, &ts, &lost_events);
+
+ /*
+ * Pick the entry with the smallest timestamp:
+ */
+ if (ent && (!next || ts < next_ts)) {
+ next = ent;
+ next_cpu = cpu;
+ next_ts = ts;
+ next_lost = lost_events;
+ next_size = iter->ent_size;
+ }
+ }
+
+ iter->ent_size = next_size;
+
+ if (ent_cpu)
+ *ent_cpu = next_cpu;
+
+ if (ent_ts)
+ *ent_ts = next_ts;
+
+ if (missing_events)
+ *missing_events = next_lost;
+
+ return next;
+}
+
+/* Find the next real entry, without updating the iterator itself */
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+ int *ent_cpu, u64 *ent_ts)
+{
+ return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
+}
+
+/* Find the next real entry, and increment the iterator to the next entry */
+void *trace_find_next_entry_inc(struct trace_iterator *iter)
+{
+ iter->ent = __find_next_entry(iter, &iter->cpu,
+ &iter->lost_events, &iter->ts);
+
+ if (iter->ent)
+ trace_iterator_increment(iter);
+
+ return iter->ent ? iter : NULL;
+}
+
+static void trace_consume(struct trace_iterator *iter)
+{
+ ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
+ &iter->lost_events);
+}
+
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_iterator *iter = m->private;
+ int i = (int)*pos;
+ void *ent;
+
+ WARN_ON_ONCE(iter->leftover);
+
+ (*pos)++;
+
+ /* can't go backwards */
+ if (iter->idx > i)
+ return NULL;
+
+ if (iter->idx < 0)
+ ent = trace_find_next_entry_inc(iter);
+ else
+ ent = iter;
+
+ while (ent && iter->idx < i)
+ ent = trace_find_next_entry_inc(iter);
+
+ iter->pos = *pos;
+
+ return ent;
+}
+
+void tracing_iter_reset(struct trace_iterator *iter, int cpu)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer_iter *buf_iter;
+ unsigned long entries = 0;
+ u64 ts;
+
+ per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
+
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (!buf_iter)
+ return;
+
+ ring_buffer_iter_reset(buf_iter);
+
+ /*
+ * We could have the case with the max latency tracers
+ * that a reset never took place on a cpu. This is evident
+ * by the timestamp being before the start of the buffer.
+ */
+ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
+ if (ts >= iter->trace_buffer->time_start)
+ break;
+ entries++;
+ ring_buffer_read(buf_iter, NULL);
+ }
+
+ per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
+}
+
+/*
+ * The current tracer is copied to avoid a global locking
+ * all around.
+ */
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+ int cpu_file = iter->cpu_file;
+ void *p = NULL;
+ loff_t l = 0;
+ int cpu;
+
+ /*
+ * copy the tracer to avoid using a global lock all around.
+ * iter->trace is a copy of current_trace, the pointer to the
+ * name may be used instead of a strcmp(), as iter->trace->name
+ * will point to the same string as current_trace->name.
+ */
+ mutex_lock(&trace_types_lock);
+ if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+ *iter->trace = *tr->current_trace;
+ mutex_unlock(&trace_types_lock);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return ERR_PTR(-EBUSY);
+#endif
+
+ if (!iter->snapshot)
+ atomic_inc(&trace_record_cmdline_disabled);
+
+ if (*pos != iter->pos) {
+ iter->ent = NULL;
+ iter->cpu = 0;
+ iter->idx = -1;
+
+ if (cpu_file == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu)
+ tracing_iter_reset(iter, cpu);
+ } else
+ tracing_iter_reset(iter, cpu_file);
+
+ iter->leftover = 0;
+ for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+ ;
+
+ } else {
+ /*
+ * If we overflowed the seq_file before, then we want
+ * to just reuse the trace_seq buffer again.
+ */
+ if (iter->leftover)
+ p = iter;
+ else {
+ l = *pos - 1;
+ p = s_next(m, p, &l);
+ }
+ }
+
+ trace_event_read_lock();
+ trace_access_lock(cpu_file);
+ return p;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ struct trace_iterator *iter = m->private;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->trace->use_max_tr)
+ return;
+#endif
+
+ if (!iter->snapshot)
+ atomic_dec(&trace_record_cmdline_disabled);
+
+ trace_access_unlock(iter->cpu_file);
+ trace_event_read_unlock();
+}
+
+static void
+get_total_entries(struct trace_buffer *buf,
+ unsigned long *total, unsigned long *entries)
+{
+ unsigned long count;
+ int cpu;
+
+ *total = 0;
+ *entries = 0;
+
+ for_each_tracing_cpu(cpu) {
+ count = ring_buffer_entries_cpu(buf->buffer, cpu);
+ /*
+ * If this buffer has skipped entries, then we hold all
+ * entries for the trace and we need to ignore the
+ * ones before the time stamp.
+ */
+ if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
+ count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
+ /* total is the same as the entries */
+ *total += count;
+ } else
+ *total += count +
+ ring_buffer_overrun_cpu(buf->buffer, cpu);
+ *entries += count;
+ }
+}
+
+static void print_lat_help_header(struct seq_file *m)
+{
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off \n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / delay \n"
+ "# cmd pid ||||| time | caller \n"
+ "# \\ / ||||| \\ | / \n");
+}
+
+static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
+{
+ unsigned long total;
+ unsigned long entries;
+
+ get_total_entries(buf, &total, &entries);
+ seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
+ entries, total, num_online_cpus());
+ seq_puts(m, "#\n");
+}
+
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
+ "# | | | | |\n");
+}
+
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# _-----=> irqs-off\n"
+ "# / _----=> need-resched\n"
+ "# | / _---=> hardirq/softirq\n"
+ "# || / _--=> preempt-depth\n"
+ "# ||| / delay\n"
+ "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
+ "# | | | |||| | |\n");
+}
+
+void
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct trace_buffer *buf = iter->trace_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
+ struct tracer *type = iter->trace;
+ unsigned long entries;
+ unsigned long total;
+ const char *name = "preemption";
+
+ name = type->name;
+
+ get_total_entries(buf, &total, &entries);
+
+ seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
+ name, UTS_RELEASE);
+ seq_puts(m, "# -----------------------------------"
+ "---------------------------------\n");
+ seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
+ " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+ nsecs_to_usecs(data->saved_latency),
+ entries,
+ total,
+ buf->cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+ "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+ "desktop",
+#elif defined(CONFIG_PREEMPT)
+ "preempt",
+#else
+ "unknown",
+#endif
+ /* These are reserved for later use */
+ 0, 0, 0, 0);
+#ifdef CONFIG_SMP
+ seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+ seq_puts(m, ")\n");
+#endif
+ seq_puts(m, "# -----------------\n");
+ seq_printf(m, "# | task: %.16s-%d "
+ "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+ data->comm, data->pid,
+ from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
+ data->policy, data->rt_priority);
+ seq_puts(m, "# -----------------\n");
+
+ if (data->critical_start) {
+ seq_puts(m, "# => started at: ");
+ seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+ trace_print_seq(m, &iter->seq);
+ seq_puts(m, "\n# => ended at: ");
+ seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+ trace_print_seq(m, &iter->seq);
+ seq_puts(m, "\n#\n");
+ }
+
+ seq_puts(m, "#\n");
+}
+
+static void test_cpu_buff_start(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+
+ if (!(trace_flags & TRACE_ITER_ANNOTATE))
+ return;
+
+ if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+ return;
+
+ if (cpumask_test_cpu(iter->cpu, iter->started))
+ return;
+
+ if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
+ return;
+
+ cpumask_set_cpu(iter->cpu, iter->started);
+
+ /* Don't print started cpu buffer for the first entry of the trace */
+ if (iter->idx > 1)
+ trace_seq_printf(s, "##### CPU %u buffer started ####\n",
+ iter->cpu);
+}
+
+static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct trace_entry *entry;
+ struct trace_event *event;
+
+ entry = iter->ent;
+
+ test_cpu_buff_start(iter);
+
+ event = ftrace_find_event(entry->type);
+
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ trace_print_lat_context(iter);
+ else
+ trace_print_context(iter);
+ }
+
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (event)
+ return event->funcs->trace(iter, sym_flags, event);
+
+ trace_seq_printf(s, "Unknown type %d\n", entry->type);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry;
+ struct trace_event *event;
+
+ entry = iter->ent;
+
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO)
+ trace_seq_printf(s, "%d %d %llu ",
+ entry->pid, iter->cpu, iter->ts);
+
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ event = ftrace_find_event(entry->type);
+ if (event)
+ return event->funcs->raw(iter, 0, event);
+
+ trace_seq_printf(s, "%d ?\n", entry->type);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ unsigned char newline = '\n';
+ struct trace_entry *entry;
+ struct trace_event *event;
+
+ entry = iter->ent;
+
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ SEQ_PUT_HEX_FIELD(s, entry->pid);
+ SEQ_PUT_HEX_FIELD(s, iter->cpu);
+ SEQ_PUT_HEX_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ event = ftrace_find_event(entry->type);
+ if (event) {
+ enum print_line_t ret = event->funcs->hex(iter, 0, event);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ }
+
+ SEQ_PUT_FIELD(s, newline);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry;
+ struct trace_event *event;
+
+ entry = iter->ent;
+
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ SEQ_PUT_FIELD(s, entry->pid);
+ SEQ_PUT_FIELD(s, iter->cpu);
+ SEQ_PUT_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ event = ftrace_find_event(entry->type);
+ return event ? event->funcs->binary(iter, 0, event) :
+ TRACE_TYPE_HANDLED;
+}
+
+int trace_empty(struct trace_iterator *iter)
+{
+ struct ring_buffer_iter *buf_iter;
+ int cpu;
+
+ /* If we are looking at one CPU buffer, only check that one */
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+ cpu = iter->cpu_file;
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (buf_iter) {
+ if (!ring_buffer_iter_empty(buf_iter))
+ return 0;
+ } else {
+ if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
+ return 0;
+ }
+ return 1;
+ }
+
+ for_each_tracing_cpu(cpu) {
+ buf_iter = trace_buffer_iter(iter, cpu);
+ if (buf_iter) {
+ if (!ring_buffer_iter_empty(buf_iter))
+ return 0;
+ } else {
+ if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/* Called with trace_event_read_lock() held. */
+enum print_line_t print_trace_line(struct trace_iterator *iter)
+{
+ enum print_line_t ret;
+
+ if (iter->lost_events) {
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ if (iter->trace && iter->trace->print_line) {
+ ret = iter->trace->print_line(iter);
+ if (ret != TRACE_TYPE_UNHANDLED)
+ return ret;
+ }
+
+ if (iter->ent->type == TRACE_BPUTS &&
+ trace_flags & TRACE_ITER_PRINTK &&
+ trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ return trace_print_bputs_msg_only(iter);
+
+ if (iter->ent->type == TRACE_BPRINT &&
+ trace_flags & TRACE_ITER_PRINTK &&
+ trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ return trace_print_bprintk_msg_only(iter);
+
+ if (iter->ent->type == TRACE_PRINT &&
+ trace_flags & TRACE_ITER_PRINTK &&
+ trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ return trace_print_printk_msg_only(iter);
+
+ if (trace_flags & TRACE_ITER_BIN)
+ return print_bin_fmt(iter);
+
+ if (trace_flags & TRACE_ITER_HEX)
+ return print_hex_fmt(iter);
+
+ if (trace_flags & TRACE_ITER_RAW)
+ return print_raw_fmt(iter);
+
+ return print_trace_fmt(iter);
+}
+
+void trace_latency_header(struct seq_file *m)
+{
+ struct trace_iterator *iter = m->private;
+
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ print_trace_header(m, iter);
+
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_lat_help_header(m);
+}
+
+void trace_default_header(struct seq_file *m)
+{
+ struct trace_iterator *iter = m->private;
+
+ if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return;
+
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+ print_trace_header(m, iter);
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_lat_help_header(m);
+ } else {
+ if (!(trace_flags & TRACE_ITER_VERBOSE)) {
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ print_func_help_header_irq(iter->trace_buffer, m);
+ else
+ print_func_help_header(iter->trace_buffer, m);
+ }
+ }
+}
+
+static void test_ftrace_alive(struct seq_file *m)
+{
+ if (!ftrace_is_dead())
+ return;
+ seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
+ "# MAY BE MISSING FUNCTION EVENTS\n");
+}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+static void show_snapshot_main_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
+#else
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
+#endif
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
+}
+
+static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ if (iter->tr->allocated_snapshot)
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
+ else
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
+
+ seq_puts(m, "# Snapshot commands:\n");
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ show_snapshot_main_help(m);
+ else
+ show_snapshot_percpu_help(m);
+}
+#else
+/* Should never be called */
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
+#endif
+
+static int s_show(struct seq_file *m, void *v)
+{
+ struct trace_iterator *iter = v;
+ int ret;
+
+ if (iter->ent == NULL) {
+ if (iter->tr) {
+ seq_printf(m, "# tracer: %s\n", iter->trace->name);
+ seq_puts(m, "#\n");
+ test_ftrace_alive(m);
+ }
+ if (iter->snapshot && trace_empty(iter))
+ print_snapshot_help(m, iter);
+ else if (iter->trace && iter->trace->print_header)
+ iter->trace->print_header(m);
+ else
+ trace_default_header(m);
+
+ } else if (iter->leftover) {
+ /*
+ * If we filled the seq_file buffer earlier, we
+ * want to just show it now.
+ */
+ ret = trace_print_seq(m, &iter->seq);
+
+ /* ret should this time be zero, but you never know */
+ iter->leftover = ret;
+
+ } else {
+ print_trace_line(iter);
+ ret = trace_print_seq(m, &iter->seq);
+ /*
+ * If we overflow the seq_file buffer, then it will
+ * ask us for this data again at start up.
+ * Use that instead.
+ * ret is 0 if seq_file write succeeded.
+ * -1 otherwise.
+ */
+ iter->leftover = ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Should be used after trace_array_get(), trace_types_lock
+ * ensures that i_cdev was already initialized.
+ */
+static inline int tracing_get_cpu(struct inode *inode)
+{
+ if (inode->i_cdev) /* See trace_create_cpu_file() */
+ return (long)inode->i_cdev - 1;
+ return RING_BUFFER_ALL_CPUS;
+}
+
+static const struct seq_operations tracer_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, bool snapshot)
+{
+ struct trace_array *tr = inode->i_private;
+ struct trace_iterator *iter;
+ int cpu;
+
+ if (tracing_disabled)
+ return ERR_PTR(-ENODEV);
+
+ iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+
+ iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+ GFP_KERNEL);
+ if (!iter->buffer_iter)
+ goto release;
+
+ /*
+ * We make a copy of the current tracer to avoid concurrent
+ * changes on it while we are reading.
+ */
+ mutex_lock(&trace_types_lock);
+ iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
+ if (!iter->trace)
+ goto fail;
+
+ *iter->trace = *tr->current_trace;
+
+ if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
+ goto fail;
+
+ iter->tr = tr;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /* Currently only the top directory has a snapshot */
+ if (tr->current_trace->print_max || snapshot)
+ iter->trace_buffer = &tr->max_buffer;
+ else
+#endif
+ iter->trace_buffer = &tr->trace_buffer;
+ iter->snapshot = snapshot;
+ iter->pos = -1;
+ iter->cpu_file = tracing_get_cpu(inode);
+ mutex_init(&iter->mutex);
+
+ /* Notify the tracer early; before we stop tracing. */
+ if (iter->trace && iter->trace->open)
+ iter->trace->open(iter);
+
+ /* Annotate start of buffers if we had overruns */
+ if (ring_buffer_overruns(iter->trace_buffer->buffer))
+ iter->iter_flags |= TRACE_FILE_ANNOTATE;
+
+ /* Output in nanoseconds only if we are using a clock in nanoseconds. */
+ if (trace_clocks[tr->clock_id].in_ns)
+ iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+
+ /* stop the trace while dumping if we are not opening "snapshot" */
+ if (!iter->snapshot)
+ tracing_stop_tr(tr);
+
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ iter->buffer_iter[cpu] =
+ ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ }
+ ring_buffer_read_prepare_sync();
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_read_start(iter->buffer_iter[cpu]);
+ tracing_iter_reset(iter, cpu);
+ }
+ } else {
+ cpu = iter->cpu_file;
+ iter->buffer_iter[cpu] =
+ ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare_sync();
+ ring_buffer_read_start(iter->buffer_iter[cpu]);
+ tracing_iter_reset(iter, cpu);
+ }
+
+ mutex_unlock(&trace_types_lock);
+
+ return iter;
+
+ fail:
+ mutex_unlock(&trace_types_lock);
+ kfree(iter->trace);
+ kfree(iter->buffer_iter);
+release:
+ seq_release_private(inode, file);
+ return ERR_PTR(-ENOMEM);
+}
+
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ filp->private_data = inode->i_private;
+ return 0;
+}
+
+bool tracing_is_disabled(void)
+{
+ return (tracing_disabled) ? true: false;
+}
+
+/*
+ * Open and update trace_array ref count.
+ * Must have the current trace_array passed to it.
+ */
+static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ filp->private_data = inode->i_private;
+
+ return 0;
+}
+
+static int tracing_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m = file->private_data;
+ struct trace_iterator *iter;
+ int cpu;
+
+ if (!(file->f_mode & FMODE_READ)) {
+ trace_array_put(tr);
+ return 0;
+ }
+
+ /* Writes do not use seq_file */
+ iter = m->private;
+ mutex_lock(&trace_types_lock);
+
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu])
+ ring_buffer_read_finish(iter->buffer_iter[cpu]);
+ }
+
+ if (iter->trace && iter->trace->close)
+ iter->trace->close(iter);
+
+ if (!iter->snapshot)
+ /* reenable tracing if it was previously enabled */
+ tracing_start_tr(tr);
+
+ __trace_array_put(tr);
+
+ mutex_unlock(&trace_types_lock);
+
+ mutex_destroy(&iter->mutex);
+ free_cpumask_var(iter->started);
+ kfree(iter->trace);
+ kfree(iter->buffer_iter);
+ seq_release_private(inode, file);
+
+ return 0;
+}
+
+static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+ return 0;
+}
+
+static int tracing_single_release_tr(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ return single_release(inode, file);
+}
+
+static int tracing_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct trace_iterator *iter;
+ int ret = 0;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ /* If this file was open for write, then erase contents */
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ int cpu = tracing_get_cpu(inode);
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->trace_buffer);
+ else
+ tracing_reset(&tr->trace_buffer, cpu);
+ }
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, false);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ else if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;
+ }
+
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+/*
+ * Some tracers are not suitable for instance buffers.
+ * A tracer is always available for the global array (toplevel)
+ * or if it explicitly states that it is.
+ */
+static bool
+trace_ok_for_array(struct tracer *t, struct trace_array *tr)
+{
+ return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
+}
+
+/* Find the next tracer that this trace array may use */
+static struct tracer *
+get_tracer_for_array(struct trace_array *tr, struct tracer *t)
+{
+ while (t && !trace_ok_for_array(t, tr))
+ t = t->next;
+
+ return t;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+ struct tracer *t = v;
+
+ (*pos)++;
+
+ if (t)
+ t = get_tracer_for_array(tr, t->next);
+
+ return t;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+ struct tracer *t;
+ loff_t l = 0;
+
+ mutex_lock(&trace_types_lock);
+
+ t = get_tracer_for_array(tr, trace_types);
+ for (; t && l < *pos; t = t_next(m, t, &l))
+ ;
+
+ return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&trace_types_lock);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct tracer *t = v;
+
+ if (!t)
+ return 0;
+
+ seq_puts(m, t->name);
+ if (t->next)
+ seq_putc(m, ' ');
+ else
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations show_traces_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ ret = seq_open(file, &show_traces_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = tr;
+
+ return 0;
+}
+
+static ssize_t
+tracing_write_stub(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
+{
+ int ret;
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_lseek(file, offset, whence);
+ else
+ file->f_pos = ret = 0;
+
+ return ret;
+}
+
+static const struct file_operations tracing_fops = {
+ .open = tracing_open,
+ .read = seq_read,
+ .write = tracing_write_stub,
+ .llseek = tracing_lseek,
+ .release = tracing_release,
+};
+
+static const struct file_operations show_traces_fops = {
+ .open = show_traces_open,
+ .read = seq_read,
+ .release = seq_release,
+ .llseek = seq_lseek,
+};
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask (and one more byte for the newline):
+ */
+static char mask_str[NR_CPUS + 1];
+
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_array *tr = file_inode(filp)->i_private;
+ int len;
+
+ mutex_lock(&tracing_cpumask_update_lock);
+
+ len = snprintf(mask_str, count, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask));
+ if (len >= count) {
+ count = -EINVAL;
+ goto out_err;
+ }
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+
+out_err:
+ mutex_unlock(&tracing_cpumask_update_lock);
+
+ return count;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct trace_array *tr = file_inode(filp)->i_private;
+ cpumask_var_t tracing_cpumask_new;
+ int err, cpu;
+
+ if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+ if (err)
+ goto err_unlock;
+
+ mutex_lock(&tracing_cpumask_update_lock);
+
+ local_irq_disable();
+ arch_spin_lock(&tr->max_lock);
+ for_each_tracing_cpu(cpu) {
+ /*
+ * Increase/decrease the disabled counter if we are
+ * about to flip a bit in the cpumask:
+ */
+ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
+ !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
+ atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+ ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
+ }
+ if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
+ cpumask_test_cpu(cpu, tracing_cpumask_new)) {
+ atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+ ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
+ }
+ }
+ arch_spin_unlock(&tr->max_lock);
+ local_irq_enable();
+
+ cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
+
+ mutex_unlock(&tracing_cpumask_update_lock);
+ free_cpumask_var(tracing_cpumask_new);
+
+ return count;
+
+err_unlock:
+ free_cpumask_var(tracing_cpumask_new);
+
+ return err;
+}
+
+static const struct file_operations tracing_cpumask_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_cpumask_read,
+ .write = tracing_cpumask_write,
+ .release = tracing_release_generic_tr,
+ .llseek = generic_file_llseek,
+};
+
+static int tracing_trace_options_show(struct seq_file *m, void *v)
+{
+ struct tracer_opt *trace_opts;
+ struct trace_array *tr = m->private;
+ u32 tracer_flags;
+ int i;
+
+ mutex_lock(&trace_types_lock);
+ tracer_flags = tr->current_trace->flags->val;
+ trace_opts = tr->current_trace->flags->opts;
+
+ for (i = 0; trace_options[i]; i++) {
+ if (trace_flags & (1 << i))
+ seq_printf(m, "%s\n", trace_options[i]);
+ else
+ seq_printf(m, "no%s\n", trace_options[i]);
+ }
+
+ for (i = 0; trace_opts[i].name; i++) {
+ if (tracer_flags & trace_opts[i].bit)
+ seq_printf(m, "%s\n", trace_opts[i].name);
+ else
+ seq_printf(m, "no%s\n", trace_opts[i].name);
+ }
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
+static int __set_tracer_option(struct trace_array *tr,
+ struct tracer_flags *tracer_flags,
+ struct tracer_opt *opts, int neg)
+{
+ struct tracer *trace = tr->current_trace;
+ int ret;
+
+ ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
+ if (ret)
+ return ret;
+
+ if (neg)
+ tracer_flags->val &= ~opts->bit;
+ else
+ tracer_flags->val |= opts->bit;
+ return 0;
+}
+
+/* Try to assign a tracer specific option */
+static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
+{
+ struct tracer *trace = tr->current_trace;
+ struct tracer_flags *tracer_flags = trace->flags;
+ struct tracer_opt *opts = NULL;
+ int i;
+
+ for (i = 0; tracer_flags->opts[i].name; i++) {
+ opts = &tracer_flags->opts[i];
+
+ if (strcmp(cmp, opts->name) == 0)
+ return __set_tracer_option(tr, trace->flags, opts, neg);
+ }
+
+ return -EINVAL;
+}
+
+/* Some tracers require overwrite to stay enabled */
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+{
+ if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ return -1;
+
+ return 0;
+}
+
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
+{
+ /* do nothing if flag is already set */
+ if (!!(trace_flags & mask) == !!enabled)
+ return 0;
+
+ /* Give the tracer a chance to approve the change */
+ if (tr->current_trace->flag_changed)
+ if (tr->current_trace->flag_changed(tr, mask, !!enabled))
+ return -EINVAL;
+
+ if (enabled)
+ trace_flags |= mask;
+ else
+ trace_flags &= ~mask;
+
+ if (mask == TRACE_ITER_RECORD_CMD)
+ trace_event_enable_cmd_record(enabled);
+
+ if (mask == TRACE_ITER_OVERWRITE) {
+ ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
+#endif
+ }
+
+ if (mask == TRACE_ITER_PRINTK)
+ trace_printk_start_stop_comm(enabled);
+
+ return 0;
+}
+
+static int trace_set_options(struct trace_array *tr, char *option)
+{
+ char *cmp;
+ int neg = 0;
+ int ret = -ENODEV;
+ int i;
+
+ cmp = strstrip(option);
+
+ if (strncmp(cmp, "no", 2) == 0) {
+ neg = 1;
+ cmp += 2;
+ }
+
+ mutex_lock(&trace_types_lock);
+
+ for (i = 0; trace_options[i]; i++) {
+ if (strcmp(cmp, trace_options[i]) == 0) {
+ ret = set_tracer_flag(tr, 1 << i, !neg);
+ break;
+ }
+ }
+
+ /* If no option could be set, test the specific tracer options */
+ if (!trace_options[i])
+ ret = set_tracer_option(tr, cmp, neg);
+
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_trace_options_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ char buf[64];
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = trace_set_options(tr, buf);
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int tracing_trace_options_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ ret = single_open(file, tracing_trace_options_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+static const struct file_operations tracing_iter_fops = {
+ .open = tracing_trace_options_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_single_release_tr,
+ .write = tracing_trace_options_write,
+};
+
+static const char readme_msg[] =
+ "tracing mini-HOWTO:\n\n"
+ "# echo 0 > tracing_on : quick way to disable tracing\n"
+ "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
+ " Important files:\n"
+ " trace\t\t\t- The static contents of the buffer\n"
+ "\t\t\t To clear the buffer write into this file: echo > trace\n"
+ " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
+ " current_tracer\t- function and latency tracers\n"
+ " available_tracers\t- list of configured tracers for current_tracer\n"
+ " buffer_size_kb\t- view and modify size of per cpu buffer\n"
+ " buffer_total_size_kb - view total size of all cpu buffers\n\n"
+ " trace_clock\t\t-change the clock used to order events\n"
+ " local: Per cpu clock but may not be synced across CPUs\n"
+ " global: Synced across CPUs but slows tracing down.\n"
+ " counter: Not a clock, but just an increment\n"
+ " uptime: Jiffy counter from time of boot\n"
+ " perf: Same clock that perf events use\n"
+#ifdef CONFIG_X86_64
+ " x86-tsc: TSC cycle counter\n"
+#endif
+ "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
+ " tracing_cpumask\t- Limit which CPUs to trace\n"
+ " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
+ "\t\t\t Remove sub-buffer with rmdir\n"
+ " trace_options\t\t- Set format or modify how tracing happens\n"
+ "\t\t\t Disable an option by adding a suffix 'no' to the\n"
+ "\t\t\t option name\n"
+ " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+ "\n available_filter_functions - list of functions that can be filtered on\n"
+ " set_ftrace_filter\t- echo function name in here to only trace these\n"
+ "\t\t\t functions\n"
+ "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ "\t modules: Can select a group via module\n"
+ "\t Format: :mod:<module-name>\n"
+ "\t example: echo :mod:ext3 > set_ftrace_filter\n"
+ "\t triggers: a command to perform when function is hit\n"
+ "\t Format: <function>:<trigger>[:count]\n"
+ "\t trigger: traceon, traceoff\n"
+ "\t\t enable_event:<system>:<event>\n"
+ "\t\t disable_event:<system>:<event>\n"
+#ifdef CONFIG_STACKTRACE
+ "\t\t stacktrace\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\t\t snapshot\n"
+#endif
+ "\t\t dump\n"
+ "\t\t cpudump\n"
+ "\t example: echo do_fault:traceoff > set_ftrace_filter\n"
+ "\t echo do_trap:traceoff:3 > set_ftrace_filter\n"
+ "\t The first one will disable tracing every time do_fault is hit\n"
+ "\t The second will disable tracing at most 3 times when do_trap is hit\n"
+ "\t The first time do trap is hit and it disables tracing, the\n"
+ "\t counter will decrement to 2. If tracing is already disabled,\n"
+ "\t the counter will not decrement. It only decrements when the\n"
+ "\t trigger did work\n"
+ "\t To remove trigger without count:\n"
+ "\t echo '!<function>:<trigger> > set_ftrace_filter\n"
+ "\t To remove trigger with a count:\n"
+ "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
+ " set_ftrace_notrace\t- echo function name in here to never trace.\n"
+ "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ "\t modules: Can select a group via module command :mod:\n"
+ "\t Does not accept triggers\n"
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#ifdef CONFIG_FUNCTION_TRACER
+ " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n"
+ "\t\t (function)\n"
+#endif
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+ " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
+ " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\n snapshot\t\t- Like 'trace' but shows the content of the static\n"
+ "\t\t\t snapshot buffer. Read the contents for more\n"
+ "\t\t\t information\n"
+#endif
+#ifdef CONFIG_STACK_TRACER
+ " stack_trace\t\t- Shows the max stack trace when active\n"
+ " stack_max_size\t- Shows current max stack size that was traced\n"
+ "\t\t\t Write into this file to reset the max size (trigger a\n"
+ "\t\t\t new trace)\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+ " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n"
+ "\t\t\t traces\n"
+#endif
+#endif /* CONFIG_STACK_TRACER */
+ " events/\t\t- Directory containing all trace event subsystems:\n"
+ " enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
+ " events/<system>/\t- Directory containing all trace events for <system>:\n"
+ " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n"
+ "\t\t\t events\n"
+ " filter\t\t- If set, only events passing filter are traced\n"
+ " events/<system>/<event>/\t- Directory containing control files for\n"
+ "\t\t\t <event>:\n"
+ " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n"
+ " filter\t\t- If set, only events passing filter are traced\n"
+ " trigger\t\t- If set, a command to perform when event is hit\n"
+ "\t Format: <trigger>[:count][if <filter>]\n"
+ "\t trigger: traceon, traceoff\n"
+ "\t enable_event:<system>:<event>\n"
+ "\t disable_event:<system>:<event>\n"
+#ifdef CONFIG_STACKTRACE
+ "\t\t stacktrace\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\t\t snapshot\n"
+#endif
+ "\t example: echo traceoff > events/block/block_unplug/trigger\n"
+ "\t echo traceoff:3 > events/block/block_unplug/trigger\n"
+ "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
+ "\t events/block/block_unplug/trigger\n"
+ "\t The first disables tracing every time block_unplug is hit.\n"
+ "\t The second disables tracing the first 3 times block_unplug is hit.\n"
+ "\t The third enables the kmalloc event the first 3 times block_unplug\n"
+ "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n"
+ "\t Like function triggers, the counter is only decremented if it\n"
+ "\t enabled or disabled tracing.\n"
+ "\t To remove a trigger without a count:\n"
+ "\t echo '!<trigger> > <system>/<event>/trigger\n"
+ "\t To remove a trigger with a count:\n"
+ "\t echo '!<trigger>:0 > <system>/<event>/trigger\n"
+ "\t Filters can be ignored when removing a trigger.\n"
+;
+
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations tracing_readme_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_readme_read,
+ .llseek = generic_file_llseek,
+};
+
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ unsigned int *ptr = v;
+
+ if (*pos || m->count)
+ ptr++;
+
+ (*pos)++;
+
+ for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
+ ptr++) {
+ if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
+ continue;
+
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ v = &savedcmd->map_cmdline_to_pid[0];
+ while (l <= *pos) {
+ v = saved_cmdlines_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_cmdlines_stop(struct seq_file *m, void *v)
+{
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int saved_cmdlines_show(struct seq_file *m, void *v)
+{
+ char buf[TASK_COMM_LEN];
+ unsigned int *pid = v;
+
+ __trace_find_cmdline(*pid, buf);
+ seq_printf(m, "%d %s\n", *pid, buf);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
+ .start = saved_cmdlines_start,
+ .next = saved_cmdlines_next,
+ .stop = saved_cmdlines_stop,
+ .show = saved_cmdlines_show,
+};
+
+static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+ .open = tracing_saved_cmdlines_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static ssize_t
+tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int r;
+
+ arch_spin_lock(&trace_cmdline_lock);
+ r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ arch_spin_unlock(&trace_cmdline_lock);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+ kfree(s->saved_cmdlines);
+ kfree(s->map_cmdline_to_pid);
+ kfree(s);
+}
+
+static int tracing_resize_saved_cmdlines(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s, *savedcmd_temp;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ if (allocate_cmdlines_buffer(val, s) < 0) {
+ kfree(s);
+ return -ENOMEM;
+ }
+
+ arch_spin_lock(&trace_cmdline_lock);
+ savedcmd_temp = savedcmd;
+ savedcmd = s;
+ arch_spin_unlock(&trace_cmdline_lock);
+ free_saved_cmdlines_buffer(savedcmd_temp);
+
+ return 0;
+}
+
+static ssize_t
+tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* must have at least 1 entry or less than PID_MAX_DEFAULT */
+ if (!val || val > PID_MAX_DEFAULT)
+ return -EINVAL;
+
+ ret = tracing_resize_saved_cmdlines((unsigned int)val);
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations tracing_saved_cmdlines_size_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_size_read,
+ .write = tracing_saved_cmdlines_size_write,
+};
+
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static union trace_enum_map_item *
+update_enum_map(union trace_enum_map_item *ptr)
+{
+ if (!ptr->map.enum_string) {
+ if (ptr->tail.next) {
+ ptr = ptr->tail.next;
+ /* Set ptr to the next real item (skip head) */
+ ptr++;
+ } else
+ return NULL;
+ }
+ return ptr;
+}
+
+static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ union trace_enum_map_item *ptr = v;
+
+ /*
+ * Paranoid! If ptr points to end, we don't want to increment past it.
+ * This really should never happen.
+ */
+ ptr = update_enum_map(ptr);
+ if (WARN_ON_ONCE(!ptr))
+ return NULL;
+
+ ptr++;
+
+ (*pos)++;
+
+ ptr = update_enum_map(ptr);
+
+ return ptr;
+}
+
+static void *enum_map_start(struct seq_file *m, loff_t *pos)
+{
+ union trace_enum_map_item *v;
+ loff_t l = 0;
+
+ mutex_lock(&trace_enum_mutex);
+
+ v = trace_enum_maps;
+ if (v)
+ v++;
+
+ while (v && l < *pos) {
+ v = enum_map_next(m, v, &l);
+ }
+
+ return v;
+}
+
+static void enum_map_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&trace_enum_mutex);
+}
+
+static int enum_map_show(struct seq_file *m, void *v)
+{
+ union trace_enum_map_item *ptr = v;
+
+ seq_printf(m, "%s %ld (%s)\n",
+ ptr->map.enum_string, ptr->map.enum_value,
+ ptr->map.system);
+
+ return 0;
+}
+
+static const struct seq_operations tracing_enum_map_seq_ops = {
+ .start = enum_map_start,
+ .next = enum_map_next,
+ .stop = enum_map_stop,
+ .show = enum_map_show,
+};
+
+static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_enum_map_seq_ops);
+}
+
+static const struct file_operations tracing_enum_map_fops = {
+ .open = tracing_enum_map_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static inline union trace_enum_map_item *
+trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+{
+ /* Return tail of array given the head */
+ return ptr + ptr->head.length + 1;
+}
+
+static void
+trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+ int len)
+{
+ struct trace_enum_map **stop;
+ struct trace_enum_map **map;
+ union trace_enum_map_item *map_array;
+ union trace_enum_map_item *ptr;
+
+ stop = start + len;
+
+ /*
+ * The trace_enum_maps contains the map plus a head and tail item,
+ * where the head holds the module and length of array, and the
+ * tail holds a pointer to the next list.
+ */
+ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
+ if (!map_array) {
+ pr_warning("Unable to allocate trace enum mapping\n");
+ return;
+ }
+
+ mutex_lock(&trace_enum_mutex);
+
+ if (!trace_enum_maps)
+ trace_enum_maps = map_array;
+ else {
+ ptr = trace_enum_maps;
+ for (;;) {
+ ptr = trace_enum_jmp_to_tail(ptr);
+ if (!ptr->tail.next)
+ break;
+ ptr = ptr->tail.next;
+
+ }
+ ptr->tail.next = map_array;
+ }
+ map_array->head.mod = mod;
+ map_array->head.length = len;
+ map_array++;
+
+ for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
+ map_array->map = **map;
+ map_array++;
+ }
+ memset(map_array, 0, sizeof(*map_array));
+
+ mutex_unlock(&trace_enum_mutex);
+}
+
+static void trace_create_enum_file(struct dentry *d_tracer)
+{
+ trace_create_file("enum_map", 0444, d_tracer,
+ NULL, &tracing_enum_map_fops);
+}
+
+#else /* CONFIG_TRACE_ENUM_MAP_FILE */
+static inline void trace_create_enum_file(struct dentry *d_tracer) { }
+static inline void trace_insert_enum_map_file(struct module *mod,
+ struct trace_enum_map **start, int len) { }
+#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+
+static void trace_insert_enum_map(struct module *mod,
+ struct trace_enum_map **start, int len)
+{
+ struct trace_enum_map **map;
+
+ if (len <= 0)
+ return;
+
+ map = start;
+
+ trace_event_enum_update(map, len);
+
+ trace_insert_enum_map_file(mod, start, len);
+}
+
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[MAX_TRACER_SIZE+2];
+ int r;
+
+ mutex_lock(&trace_types_lock);
+ r = sprintf(buf, "%s\n", tr->current_trace->name);
+ mutex_unlock(&trace_types_lock);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+int tracer_init(struct tracer *t, struct trace_array *tr)
+{
+ tracing_reset_online_cpus(&tr->trace_buffer);
+ return t->init(tr);
+}
+
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
+{
+ int cpu;
+
+ for_each_tracing_cpu(cpu)
+ per_cpu_ptr(buf->data, cpu)->entries = val;
+}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+/* resize @tr's buffer to the size of @size_tr's entries */
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+ struct trace_buffer *size_buf, int cpu_id)
+{
+ int cpu, ret = 0;
+
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
+ if (ret < 0)
+ break;
+ per_cpu_ptr(trace_buf->data, cpu)->entries =
+ per_cpu_ptr(size_buf->data, cpu)->entries;
+ }
+ } else {
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
+ if (ret == 0)
+ per_cpu_ptr(trace_buf->data, cpu_id)->entries =
+ per_cpu_ptr(size_buf->data, cpu_id)->entries;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+static int __tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu)
+{
+ int ret;
+
+ /*
+ * If kernel or user changes the size of the ring buffer
+ * we use the size that was given, and we can forget about
+ * expanding it later.
+ */
+ ring_buffer_expanded = true;
+
+ /* May be called before buffers are initialized */
+ if (!tr->trace_buffer.buffer)
+ return 0;
+
+ ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
+ if (ret < 0)
+ return ret;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
+ !tr->current_trace->use_max_tr)
+ goto out;
+
+ ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
+ if (ret < 0) {
+ int r = resize_buffer_duplicate_size(&tr->trace_buffer,
+ &tr->trace_buffer, cpu);
+ if (r < 0) {
+ /*
+ * AARGH! We are left with different
+ * size max buffer!!!!
+ * The max buffer is our "snapshot" buffer.
+ * When a tracer needs a snapshot (one of the
+ * latency tracers), it swaps the max buffer
+ * with the saved snap shot. We succeeded to
+ * update the size of the main buffer, but failed to
+ * update the size of the max buffer. But when we tried
+ * to reset the main buffer to the original size, we
+ * failed there too. This is very unlikely to
+ * happen, but if it does, warn and kill all
+ * tracing.
+ */
+ WARN_ON(1);
+ tracing_disabled = 1;
+ }
+ return ret;
+ }
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&tr->max_buffer, size);
+ else
+ per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
+
+ out:
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ set_buffer_entries(&tr->trace_buffer, size);
+ else
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
+
+ return ret;
+}
+
+static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu_id)
+{
+ int ret = size;
+
+ mutex_lock(&trace_types_lock);
+
+ if (cpu_id != RING_BUFFER_ALL_CPUS) {
+ /* make sure, this cpu is enabled in the mask */
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
+ if (ret < 0)
+ ret = -ENOMEM;
+
+out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+
+/**
+ * tracing_update_buffers - used by tracing facility to expand ring buffers
+ *
+ * To save on memory when the tracing is never used on a system with it
+ * configured in. The ring buffers are set to a minimum size. But once
+ * a user starts to use the tracing facility, then they need to grow
+ * to their default size.
+ *
+ * This function is to be called when a tracer is about to be used.
+ */
+int tracing_update_buffers(void)
+{
+ int ret = 0;
+
+ mutex_lock(&trace_types_lock);
+ if (!ring_buffer_expanded)
+ ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+struct trace_option_dentry;
+
+static struct trace_option_dentry *
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
+
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts);
+
+/*
+ * Used to clear out the tracer before deletion of an instance.
+ * Must have trace_types_lock held.
+ */
+static void tracing_set_nop(struct trace_array *tr)
+{
+ if (tr->current_trace == &nop_trace)
+ return;
+
+ tr->current_trace->enabled--;
+
+ if (tr->current_trace->reset)
+ tr->current_trace->reset(tr);
+
+ tr->current_trace = &nop_trace;
+}
+
+static void update_tracer_options(struct trace_array *tr, struct tracer *t)
+{
+ static struct trace_option_dentry *topts;
+
+ /* Only enable if the directory has been created already. */
+ if (!tr->dir)
+ return;
+
+ /* Currently, only the top instance has options */
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
+ return;
+
+ destroy_trace_option_files(topts);
+ topts = create_trace_option_files(tr, t);
+}
+
+static int tracing_set_tracer(struct trace_array *tr, const char *buf)
+{
+ struct tracer *t;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ bool had_max_tr;
+#endif
+ int ret = 0;
+
+ mutex_lock(&trace_types_lock);
+
+ if (!ring_buffer_expanded) {
+ ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ }
+
+ for (t = trace_types; t; t = t->next) {
+ if (strcmp(t->name, buf) == 0)
+ break;
+ }
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (t == tr->current_trace)
+ goto out;
+
+ /* Some tracers are only allowed for the top level buffer */
+ if (!trace_ok_for_array(t, tr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If trace pipe files are being read, we can't change the tracer */
+ if (tr->current_trace->ref) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ trace_branch_disable();
+
+ tr->current_trace->enabled--;
+
+ if (tr->current_trace->reset)
+ tr->current_trace->reset(tr);
+
+ /* Current trace needs to be nop_trace before synchronize_sched */
+ tr->current_trace = &nop_trace;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ had_max_tr = tr->allocated_snapshot;
+
+ if (had_max_tr && !t->use_max_tr) {
+ /*
+ * We need to make sure that the update_max_tr sees that
+ * current_trace changed to nop_trace to keep it from
+ * swapping the buffers after we resize it.
+ * The update_max_tr is called from interrupts disabled
+ * so a synchronized_sched() is sufficient.
+ */
+ synchronize_sched();
+ free_snapshot(tr);
+ }
+#endif
+ update_tracer_options(tr, t);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (t->use_max_tr && !had_max_tr) {
+ ret = alloc_snapshot(tr);
+ if (ret < 0)
+ goto out;
+ }
+#endif
+
+ if (t->init) {
+ ret = tracer_init(t, tr);
+ if (ret)
+ goto out;
+ }
+
+ tr->current_trace = t;
+ tr->current_trace->enabled++;
+ trace_branch_enable(tr);
+ out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[MAX_TRACER_SIZE+1];
+ int i;
+ size_t ret;
+ int err;
+
+ ret = cnt;
+
+ if (cnt > MAX_TRACER_SIZE)
+ cnt = MAX_TRACER_SIZE;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ /* strip ending whitespace. */
+ for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+ buf[i] = 0;
+
+ err = tracing_set_tracer(tr, buf);
+ if (err)
+ return err;
+
+ *ppos += ret;
+
+ return ret;
+}
+
+static ssize_t
+tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int r;
+
+ r = snprintf(buf, sizeof(buf), "%ld\n",
+ *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
+ if (r > sizeof(buf))
+ r = sizeof(buf);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ *ptr = val * 1000;
+
+ return cnt;
+}
+
+static ssize_t
+tracing_thresh_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_thresh_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
+ if (ret < 0)
+ goto out;
+
+ if (tr->current_trace->update_thresh) {
+ ret = tr->current_trace->update_thresh(tr);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = cnt;
+out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+}
+
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+ struct trace_iterator *iter;
+ int ret = 0;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ mutex_lock(&trace_types_lock);
+
+ /* create a buffer to store the information to pass to userspace */
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter) {
+ ret = -ENOMEM;
+ __trace_array_put(tr);
+ goto out;
+ }
+
+ trace_seq_init(&iter->seq);
+ iter->trace = tr->current_trace;
+
+ if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ /* trace pipe does not show start of buffer */
+ cpumask_setall(iter->started);
+
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+ /* Output in nanoseconds only if we are using a clock in nanoseconds. */
+ if (trace_clocks[tr->clock_id].in_ns)
+ iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+
+ iter->tr = tr;
+ iter->trace_buffer = &tr->trace_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
+ mutex_init(&iter->mutex);
+ filp->private_data = iter;
+
+ if (iter->trace->pipe_open)
+ iter->trace->pipe_open(iter);
+
+ nonseekable_open(inode, filp);
+
+ tr->current_trace->ref++;
+out:
+ mutex_unlock(&trace_types_lock);
+ return ret;
+
+fail:
+ kfree(iter->trace);
+ kfree(iter);
+ __trace_array_put(tr);
+ mutex_unlock(&trace_types_lock);
+ return ret;
+}
+
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+ struct trace_iterator *iter = file->private_data;
+ struct trace_array *tr = inode->i_private;
+
+ mutex_lock(&trace_types_lock);
+
+ tr->current_trace->ref--;
+
+ if (iter->trace->pipe_close)
+ iter->trace->pipe_close(iter);
+
+ mutex_unlock(&trace_types_lock);
+
+ free_cpumask_var(iter->started);
+ mutex_destroy(&iter->mutex);
+ kfree(iter);
+
+ trace_array_put(tr);
+
+ return 0;
+}
+
+static unsigned int
+trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
+{
+ /* Iterators are static, they should be filled or empty */
+ if (trace_buffer_iter(iter, iter->cpu_file))
+ return POLLIN | POLLRDNORM;
+
+ if (trace_flags & TRACE_ITER_BLOCK)
+ /*
+ * Always select as readable when in blocking mode
+ */
+ return POLLIN | POLLRDNORM;
+ else
+ return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ filp, poll_table);
+}
+
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+ struct trace_iterator *iter = filp->private_data;
+
+ return trace_poll(iter, filp, poll_table);
+}
+
+/* Must be called with iter->mutex held. */
+static int tracing_wait_pipe(struct file *filp)
+{
+ struct trace_iterator *iter = filp->private_data;
+ int ret;
+
+ while (trace_empty(iter)) {
+
+ if ((filp->f_flags & O_NONBLOCK)) {
+ return -EAGAIN;
+ }
+
+ /*
+ * We block until we read something and tracing is disabled.
+ * We still block if tracing is disabled, but we have never
+ * read anything. This allows a user to cat this file, and
+ * then enable tracing. But after we have read something,
+ * we give an EOF when tracing is again disabled.
+ *
+ * iter->pos will be 0 if we haven't read anything.
+ */
+ if (!tracing_is_on() && iter->pos)
+ break;
+
+ mutex_unlock(&iter->mutex);
+
+ ret = wait_on_pipe(iter, false);
+
+ mutex_lock(&iter->mutex);
+
+ if (ret)
+ return ret;
+ }
+
+ return 1;
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_iterator *iter = filp->private_data;
+ ssize_t sret;
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ return sret;
+
+ trace_seq_init(&iter->seq);
+
+ /*
+ * Avoid more than one consumer on a single file descriptor
+ * This is just a matter of traces coherency, the ring buffer itself
+ * is protected.
+ */
+ mutex_lock(&iter->mutex);
+ if (iter->trace->read) {
+ sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+ if (sret)
+ goto out;
+ }
+
+waitagain:
+ sret = tracing_wait_pipe(filp);
+ if (sret <= 0)
+ goto out;
+
+ /* stop when tracing is finished */
+ if (trace_empty(iter)) {
+ sret = 0;
+ goto out;
+ }
+
+ if (cnt >= PAGE_SIZE)
+ cnt = PAGE_SIZE - 1;
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter->seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ cpumask_clear(iter->started);
+ iter->pos = -1;
+
+ trace_event_read_lock();
+ trace_access_lock(iter->cpu_file);
+ while (trace_find_next_entry_inc(iter) != NULL) {
+ enum print_line_t ret;
+ int save_len = iter->seq.seq.len;
+
+ ret = print_trace_line(iter);
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ /* don't print partial lines */
+ iter->seq.seq.len = save_len;
+ break;
+ }
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(iter);
+
+ if (trace_seq_used(&iter->seq) >= cnt)
+ break;
+
+ /*
+ * Setting the full flag means we reached the trace_seq buffer
+ * size and we should leave by partial output condition above.
+ * One of the trace_seq_* functions is not used properly.
+ */
+ WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
+ iter->ent->type);
+ }
+ trace_access_unlock(iter->cpu_file);
+ trace_event_read_unlock();
+
+ /* Now copy what we have to the user */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
+ trace_seq_init(&iter->seq);
+
+ /*
+ * If there was nothing to send to user, in spite of consuming trace
+ * entries, go back to wait for more entries.
+ */
+ if (sret == -EBUSY)
+ goto waitagain;
+
+out:
+ mutex_unlock(&iter->mutex);
+
+ return sret;
+}
+
+static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
+ unsigned int idx)
+{
+ __free_page(spd->pages[idx]);
+}
+
+static const struct pipe_buf_operations tracing_pipe_buf_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = generic_pipe_buf_release,
+ .steal = generic_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static size_t
+tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
+{
+ size_t count;
+ int save_len;
+ int ret;
+
+ /* Seq buffer is page-sized, exactly what we need. */
+ for (;;) {
+ save_len = iter->seq.seq.len;
+ ret = print_trace_line(iter);
+
+ if (trace_seq_has_overflowed(&iter->seq)) {
+ iter->seq.seq.len = save_len;
+ break;
+ }
+
+ /*
+ * This should not be hit, because it should only
+ * be set if the iter->seq overflowed. But check it
+ * anyway to be safe.
+ */
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ iter->seq.seq.len = save_len;
+ break;
+ }
+
+ count = trace_seq_used(&iter->seq) - save_len;
+ if (rem < count) {
+ rem = 0;
+ iter->seq.seq.len = save_len;
+ break;
+ }
+
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(iter);
+ rem -= count;
+ if (!trace_find_next_entry_inc(iter)) {
+ rem = 0;
+ iter->ent = NULL;
+ break;
+ }
+ }
+
+ return rem;
+}
+
+static ssize_t tracing_splice_read_pipe(struct file *filp,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags)
+{
+ struct page *pages_def[PIPE_DEF_BUFFERS];
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
+ struct trace_iterator *iter = filp->private_data;
+ struct splice_pipe_desc spd = {
+ .pages = pages_def,
+ .partial = partial_def,
+ .nr_pages = 0, /* This gets updated below. */
+ .nr_pages_max = PIPE_DEF_BUFFERS,
+ .flags = flags,
+ .ops = &tracing_pipe_buf_ops,
+ .spd_release = tracing_spd_release_pipe,
+ };
+ ssize_t ret;
+ size_t rem;
+ unsigned int i;
+
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ mutex_lock(&iter->mutex);
+
+ if (iter->trace->splice_read) {
+ ret = iter->trace->splice_read(iter, filp,
+ ppos, pipe, len, flags);
+ if (ret)
+ goto out_err;
+ }
+
+ ret = tracing_wait_pipe(filp);
+ if (ret <= 0)
+ goto out_err;
+
+ if (!iter->ent && !trace_find_next_entry_inc(iter)) {
+ ret = -EFAULT;
+ goto out_err;
+ }
+
+ trace_event_read_lock();
+ trace_access_lock(iter->cpu_file);
+
+ /* Fill as many pages as possible. */
+ for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
+ spd.pages[i] = alloc_page(GFP_KERNEL);
+ if (!spd.pages[i])
+ break;
+
+ rem = tracing_fill_pipe_page(rem, iter);
+
+ /* Copy the data into the page, so we can start over. */
+ ret = trace_seq_to_buffer(&iter->seq,
+ page_address(spd.pages[i]),
+ trace_seq_used(&iter->seq));
+ if (ret < 0) {
+ __free_page(spd.pages[i]);
+ break;
+ }
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = trace_seq_used(&iter->seq);
+
+ trace_seq_init(&iter->seq);
+ }
+
+ trace_access_unlock(iter->cpu_file);
+ trace_event_read_unlock();
+ mutex_unlock(&iter->mutex);
+
+ spd.nr_pages = i;
+
+ ret = splice_to_pipe(pipe, &spd);
+out:
+ splice_shrink_spd(&spd);
+ return ret;
+
+out_err:
+ mutex_unlock(&iter->mutex);
+ goto out;
+}
+
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+ char buf[64];
+ int r = 0;
+ ssize_t ret;
+
+ mutex_lock(&trace_types_lock);
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ int cpu, buf_size_same;
+ unsigned long size;
+
+ size = 0;
+ buf_size_same = 1;
+ /* check if all cpu sizes are same */
+ for_each_tracing_cpu(cpu) {
+ /* fill in the size from first enabled cpu */
+ if (size == 0)
+ size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
+ if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
+ buf_size_same = 0;
+ break;
+ }
+ }
+
+ if (buf_size_same) {
+ if (!ring_buffer_expanded)
+ r = sprintf(buf, "%lu (expanded: %lu)\n",
+ size >> 10,
+ trace_buf_size >> 10);
+ else
+ r = sprintf(buf, "%lu\n", size >> 10);
+ } else
+ r = sprintf(buf, "X\n");
+ } else
+ r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
+
+ mutex_unlock(&trace_types_lock);
+
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ return ret;
+}
+
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* must have at least 1 entry */
+ if (!val)
+ return -EINVAL;
+
+ /* value is in KB */
+ val <<= 10;
+ ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+tracing_total_entries_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r, cpu;
+ unsigned long size = 0, expanded_size = 0;
+
+ mutex_lock(&trace_types_lock);
+ for_each_tracing_cpu(cpu) {
+ size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
+ if (!ring_buffer_expanded)
+ expanded_size += trace_buf_size >> 10;
+ }
+ if (ring_buffer_expanded)
+ r = sprintf(buf, "%lu\n", size);
+ else
+ r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
+ mutex_unlock(&trace_types_lock);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ /*
+ * There is no need to read what the user has written, this function
+ * is just to make sure that there is no error when "echo" is used
+ */
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int
+tracing_free_buffer_release(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+
+ /* disable tracing ? */
+ if (trace_flags & TRACE_ITER_STOP_ON_FREE)
+ tracer_tracing_off(tr);
+ /* resize the ring buffer to 0 */
+ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
+
+ trace_array_put(tr);
+
+ return 0;
+}
+
+static ssize_t
+tracing_mark_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ unsigned long addr = (unsigned long)ubuf;
+ struct trace_array *tr = filp->private_data;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct print_entry *entry;
+ unsigned long irq_flags;
+ struct page *pages[2];
+ void *map_page[2];
+ int nr_pages = 1;
+ ssize_t written;
+ int offset;
+ int size;
+ int len;
+ int ret;
+ int i;
+
+ if (tracing_disabled)
+ return -EINVAL;
+
+ if (!(trace_flags & TRACE_ITER_MARKERS))
+ return -EINVAL;
+
+ if (cnt > TRACE_BUF_SIZE)
+ cnt = TRACE_BUF_SIZE;
+
+ /*
+ * Userspace is injecting traces into the kernel trace buffer.
+ * We want to be as non intrusive as possible.
+ * To do so, we do not want to allocate any special buffers
+ * or take any locks, but instead write the userspace data
+ * straight into the ring buffer.
+ *
+ * First we need to pin the userspace buffer into memory,
+ * which, most likely it is, because it just referenced it.
+ * But there's no guarantee that it is. By using get_user_pages_fast()
+ * and kmap_atomic/kunmap_atomic() we can get access to the
+ * pages directly. We then write the data directly into the
+ * ring buffer.
+ */
+ BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+
+ /* check if we cross pages */
+ if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
+ nr_pages = 2;
+
+ offset = addr & (PAGE_SIZE - 1);
+ addr &= PAGE_MASK;
+
+ ret = get_user_pages_fast(addr, nr_pages, 0, pages);
+ if (ret < nr_pages) {
+ while (--ret >= 0)
+ put_page(pages[ret]);
+ written = -EFAULT;
+ goto out;
+ }
+
+ for (i = 0; i < nr_pages; i++)
+ map_page[i] = kmap_atomic(pages[i]);
+
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt + 2; /* possible \n added */
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, preempt_count());
+ if (!event) {
+ /* Ring buffer disabled, return as if not open for write */
+ written = -EBADF;
+ goto out_unlock;
+ }
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = _THIS_IP_;
+
+ if (nr_pages == 2) {
+ len = PAGE_SIZE - offset;
+ memcpy(&entry->buf, map_page[0] + offset, len);
+ memcpy(&entry->buf[len], map_page[1], cnt - len);
+ } else
+ memcpy(&entry->buf, map_page[0] + offset, cnt);
+
+ if (entry->buf[cnt - 1] != '\n') {
+ entry->buf[cnt] = '\n';
+ entry->buf[cnt + 1] = '\0';
+ } else
+ entry->buf[cnt] = '\0';
+
+ __buffer_unlock_commit(buffer, event);
+
+ written = cnt;
+
+ *fpos += written;
+
+ out_unlock:
+ for (i = nr_pages - 1; i >= 0; i--) {
+ kunmap_atomic(map_page[i]);
+ put_page(pages[i]);
+ }
+ out:
+ return written;
+}
+
+static int tracing_clock_show(struct seq_file *m, void *v)
+{
+ struct trace_array *tr = m->private;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
+ seq_printf(m,
+ "%s%s%s%s", i ? " " : "",
+ i == tr->clock_id ? "[" : "", trace_clocks[i].name,
+ i == tr->clock_id ? "]" : "");
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
+ if (strcmp(trace_clocks[i].name, clockstr) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(trace_clocks))
+ return -EINVAL;
+
+ mutex_lock(&trace_types_lock);
+
+ tr->clock_id = i;
+
+ ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
+
+ /*
+ * New clock may not be consistent with the previous clock.
+ * Reset the buffer so that it doesn't have incomparable timestamps.
+ */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
+ tracing_reset_online_cpus(&tr->max_buffer);
+#endif
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
+static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ char buf[64];
+ const char *clockstr;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ clockstr = strstrip(buf);
+
+ ret = tracing_set_clock(tr, clockstr);
+ if (ret)
+ return ret;
+
+ *fpos += cnt;
+
+ return cnt;
+}
+
+static int tracing_clock_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr))
+ return -ENODEV;
+
+ ret = single_open(file, tracing_clock_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+struct ftrace_buffer_info {
+ struct trace_iterator iter;
+ void *spare;
+ unsigned int read;
+};
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct trace_iterator *iter;
+ struct seq_file *m;
+ int ret = 0;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ if (file->f_mode & FMODE_READ) {
+ iter = __tracing_open(inode, file, true);
+ if (IS_ERR(iter))
+ ret = PTR_ERR(iter);
+ } else {
+ /* Writes still need the seq_file to hold the private data */
+ ret = -ENOMEM;
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m)
+ goto out;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter) {
+ kfree(m);
+ goto out;
+ }
+ ret = 0;
+
+ iter->tr = tr;
+ iter->trace_buffer = &tr->max_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
+ m->private = iter;
+ file->private_data = m;
+ }
+out:
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+ unsigned long val;
+ int ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ mutex_lock(&trace_types_lock);
+
+ if (tr->current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ switch (val) {
+ case 0:
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+ ret = -EINVAL;
+ break;
+ }
+ if (tr->allocated_snapshot)
+ free_snapshot(tr);
+ break;
+ case 1:
+/* Only allow per-cpu swap if the ring buffer supports it */
+#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+ ret = -EINVAL;
+ break;
+ }
+#endif
+ if (!tr->allocated_snapshot) {
+ ret = alloc_snapshot(tr);
+ if (ret < 0)
+ break;
+ }
+ local_irq_disable();
+ /* Now, we're going to swap */
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ update_max_tr(tr, current, smp_processor_id());
+ else
+ update_max_tr_single(tr, current, iter->cpu_file);
+ local_irq_enable();
+ break;
+ default:
+ if (tr->allocated_snapshot) {
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->max_buffer);
+ else
+ tracing_reset(&tr->max_buffer, iter->cpu_file);
+ }
+ break;
+ }
+
+ if (ret >= 0) {
+ *ppos += cnt;
+ ret = cnt;
+ }
+out:
+ mutex_unlock(&trace_types_lock);
+ return ret;
+}
+
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ int ret;
+
+ ret = tracing_release(inode, file);
+
+ if (file->f_mode & FMODE_READ)
+ return ret;
+
+ /* If write only, the seq_file is just a stub */
+ if (m)
+ kfree(m->private);
+ kfree(m);
+
+ return 0;
+}
+
+static int tracing_buffers_open(struct inode *inode, struct file *filp);
+static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos);
+static int tracing_buffers_release(struct inode *inode, struct file *file);
+static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+
+static int snapshot_raw_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_buffer_info *info;
+ int ret;
+
+ ret = tracing_buffers_open(inode, filp);
+ if (ret < 0)
+ return ret;
+
+ info = filp->private_data;
+
+ if (info->iter.trace->use_max_tr) {
+ tracing_buffers_release(inode, filp);
+ return -EBUSY;
+ }
+
+ info->iter.snapshot = true;
+ info->iter.trace_buffer = &info->iter.tr->max_buffer;
+
+ return ret;
+}
+
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+
+static const struct file_operations tracing_thresh_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_thresh_read,
+ .write = tracing_thresh_write,
+ .llseek = generic_file_llseek,
+};
+
+static const struct file_operations tracing_max_lat_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_max_lat_read,
+ .write = tracing_max_lat_write,
+ .llseek = generic_file_llseek,
+};
+
+static const struct file_operations set_tracer_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_set_trace_read,
+ .write = tracing_set_trace_write,
+ .llseek = generic_file_llseek,
+};
+
+static const struct file_operations tracing_pipe_fops = {
+ .open = tracing_open_pipe,
+ .poll = tracing_poll_pipe,
+ .read = tracing_read_pipe,
+ .splice_read = tracing_splice_read_pipe,
+ .release = tracing_release_pipe,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations tracing_entries_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_entries_read,
+ .write = tracing_entries_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations tracing_total_entries_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_total_entries_read,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations tracing_free_buffer_fops = {
+ .open = tracing_open_generic_tr,
+ .write = tracing_free_buffer_write,
+ .release = tracing_free_buffer_release,
+};
+
+static const struct file_operations tracing_mark_fops = {
+ .open = tracing_open_generic_tr,
+ .write = tracing_mark_write,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations trace_clock_fops = {
+ .open = tracing_clock_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_single_release_tr,
+ .write = tracing_clock_write,
+};
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+static const struct file_operations snapshot_fops = {
+ .open = tracing_snapshot_open,
+ .read = seq_read,
+ .write = tracing_snapshot_write,
+ .llseek = tracing_lseek,
+ .release = tracing_snapshot_release,
+};
+
+static const struct file_operations snapshot_raw_fops = {
+ .open = snapshot_raw_open,
+ .read = tracing_buffers_read,
+ .release = tracing_buffers_release,
+ .splice_read = tracing_buffers_splice_read,
+ .llseek = no_llseek,
+};
+
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+static int tracing_buffers_open(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+ struct ftrace_buffer_info *info;
+ int ret;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ trace_array_put(tr);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&trace_types_lock);
+
+ info->iter.tr = tr;
+ info->iter.cpu_file = tracing_get_cpu(inode);
+ info->iter.trace = tr->current_trace;
+ info->iter.trace_buffer = &tr->trace_buffer;
+ info->spare = NULL;
+ /* Force reading ring buffer for first read */
+ info->read = (unsigned int)-1;
+
+ filp->private_data = info;
+
+ tr->current_trace->ref++;
+
+ mutex_unlock(&trace_types_lock);
+
+ ret = nonseekable_open(inode, filp);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
+}
+
+static unsigned int
+tracing_buffers_poll(struct file *filp, poll_table *poll_table)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ return trace_poll(iter, filp, poll_table);
+}
+
+static ssize_t
+tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+ ssize_t ret;
+ ssize_t size;
+
+ if (!count)
+ return 0;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ return -EBUSY;
+#endif
+
+ if (!info->spare)
+ info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
+ iter->cpu_file);
+ if (!info->spare)
+ return -ENOMEM;
+
+ /* Do we have previous read data to read? */
+ if (info->read < PAGE_SIZE)
+ goto read;
+
+ again:
+ trace_access_lock(iter->cpu_file);
+ ret = ring_buffer_read_page(iter->trace_buffer->buffer,
+ &info->spare,
+ count,
+ iter->cpu_file, 0);
+ trace_access_unlock(iter->cpu_file);
+
+ if (ret < 0) {
+ if (trace_empty(iter)) {
+ if ((filp->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+
+ ret = wait_on_pipe(iter, false);
+ if (ret)
+ return ret;
+
+ goto again;
+ }
+ return 0;
+ }
+
+ info->read = 0;
+ read:
+ size = PAGE_SIZE - info->read;
+ if (size > count)
+ size = count;
+
+ ret = copy_to_user(ubuf, info->spare + info->read, size);
+ if (ret == size)
+ return -EFAULT;
+
+ size -= ret;
+
+ *ppos += size;
+ info->read += size;
+
+ return size;
+}
+
+static int tracing_buffers_release(struct inode *inode, struct file *file)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ mutex_lock(&trace_types_lock);
+
+ iter->tr->current_trace->ref--;
+
+ __trace_array_put(iter->tr);
+
+ if (info->spare)
+ ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
+ kfree(info);
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
+struct buffer_ref {
+ struct ring_buffer *buffer;
+ void *page;
+ int ref;
+};
+
+static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+ if (--ref->ref)
+ return;
+
+ ring_buffer_free_read_page(ref->buffer, ref->page);
+ kfree(ref);
+ buf->private = 0;
+}
+
+static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+ ref->ref++;
+}
+
+/* Pipe buffer operations for a buffer. */
+static const struct pipe_buf_operations buffer_pipe_buf_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = buffer_pipe_buf_release,
+ .steal = generic_pipe_buf_steal,
+ .get = buffer_pipe_buf_get,
+};
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+ struct buffer_ref *ref =
+ (struct buffer_ref *)spd->partial[i].private;
+
+ if (--ref->ref)
+ return;
+
+ ring_buffer_free_read_page(ref->buffer, ref->page);
+ kfree(ref);
+ spd->partial[i].private = 0;
+}
+
+static ssize_t
+tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
+ struct page *pages_def[PIPE_DEF_BUFFERS];
+ struct splice_pipe_desc spd = {
+ .pages = pages_def,
+ .partial = partial_def,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
+ .flags = flags,
+ .ops = &buffer_pipe_buf_ops,
+ .spd_release = buffer_spd_release,
+ };
+ struct buffer_ref *ref;
+ int entries, size, i;
+ ssize_t ret = 0;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+ return -EBUSY;
+#endif
+
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ if (*ppos & (PAGE_SIZE - 1))
+ return -EINVAL;
+
+ if (len & (PAGE_SIZE - 1)) {
+ if (len < PAGE_SIZE)
+ return -EINVAL;
+ len &= PAGE_MASK;
+ }
+
+ again:
+ trace_access_lock(iter->cpu_file);
+ entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
+
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+ struct page *page;
+ int r;
+
+ ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ if (!ref) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ref->ref = 1;
+ ref->buffer = iter->trace_buffer->buffer;
+ ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
+ if (!ref->page) {
+ ret = -ENOMEM;
+ kfree(ref);
+ break;
+ }
+
+ r = ring_buffer_read_page(ref->buffer, &ref->page,
+ len, iter->cpu_file, 1);
+ if (r < 0) {
+ ring_buffer_free_read_page(ref->buffer, ref->page);
+ kfree(ref);
+ break;
+ }
+
+ /*
+ * zero out any left over data, this is going to
+ * user land.
+ */
+ size = ring_buffer_page_len(ref->page);
+ if (size < PAGE_SIZE)
+ memset(ref->page + size, 0, PAGE_SIZE - size);
+
+ page = virt_to_page(ref->page);
+
+ spd.pages[i] = page;
+ spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].offset = 0;
+ spd.partial[i].private = (unsigned long)ref;
+ spd.nr_pages++;
+ *ppos += PAGE_SIZE;
+
+ entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
+ }
+
+ trace_access_unlock(iter->cpu_file);
+ spd.nr_pages = i;
+
+ /* did we read anything? */
+ if (!spd.nr_pages) {
+ if (ret)
+ return ret;
+
+ if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
+ return -EAGAIN;
+
+ ret = wait_on_pipe(iter, true);
+ if (ret)
+ return ret;
+
+ goto again;
+ }
+
+ ret = splice_to_pipe(pipe, &spd);
+ splice_shrink_spd(&spd);
+
+ return ret;
+}
+
+static const struct file_operations tracing_buffers_fops = {
+ .open = tracing_buffers_open,
+ .read = tracing_buffers_read,
+ .poll = tracing_buffers_poll,
+ .release = tracing_buffers_release,
+ .splice_read = tracing_buffers_splice_read,
+ .llseek = no_llseek,
+};
+
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+ int cpu = tracing_get_cpu(inode);
+ struct trace_seq *s;
+ unsigned long cnt;
+ unsigned long long t;
+ unsigned long usec_rem;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "entries: %ld\n", cnt);
+
+ cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+ cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+ cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "bytes: %ld\n", cnt);
+
+ if (trace_clocks[tr->clock_id].in_ns) {
+ /* local or global for trace_clock */
+ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
+ usec_rem = do_div(t, USEC_PER_SEC);
+ trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
+ t, usec_rem);
+
+ t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ usec_rem = do_div(t, USEC_PER_SEC);
+ trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
+ } else {
+ /* counter or tsc mode for trace_clock */
+ trace_seq_printf(s, "oldest event ts: %llu\n",
+ ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
+
+ trace_seq_printf(s, "now ts: %llu\n",
+ ring_buffer_time_stamp(trace_buf->buffer, cpu));
+ }
+
+ cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "dropped events: %ld\n", cnt);
+
+ cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
+ trace_seq_printf(s, "read events: %ld\n", cnt);
+
+ count = simple_read_from_buffer(ubuf, count, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_stats_read,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+{
+ return 0;
+}
+
+static ssize_t
+tracing_read_dyn_info(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ static char ftrace_dyn_info_buffer[1024];
+ static DEFINE_MUTEX(dyn_info_mutex);
+ unsigned long *p = filp->private_data;
+ char *buf = ftrace_dyn_info_buffer;
+ int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
+ int r;
+
+ mutex_lock(&dyn_info_mutex);
+ r = sprintf(buf, "%ld ", *p);
+
+ r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+ buf[r++] = '\n';
+
+ r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+ mutex_unlock(&dyn_info_mutex);
+
+ return r;
+}
+
+static const struct file_operations tracing_dyn_info_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_read_dyn_info,
+ .llseek = generic_file_llseek,
+};
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
+static void
+ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ tracing_snapshot();
+}
+
+static void
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return;
+
+ if (*count != -1)
+ (*count)--;
+
+ tracing_snapshot();
+}
+
+static int
+ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ long count = (long)data;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_puts(m, "snapshot");
+
+ if (count == -1)
+ seq_puts(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", count);
+
+ return 0;
+}
+
+static struct ftrace_probe_ops snapshot_probe_ops = {
+ .func = ftrace_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+static struct ftrace_probe_ops snapshot_count_probe_ops = {
+ .func = ftrace_count_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+static int
+ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+ void *count = (void *)-1;
+ char *number;
+ int ret;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enable)
+ return -EINVAL;
+
+ ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
+
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ return 0;
+ }
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ goto out_reg;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, (unsigned long *)&count);
+ if (ret)
+ return ret;
+
+ out_reg:
+ ret = register_ftrace_function_probe(glob, ops, count);
+
+ if (ret >= 0)
+ alloc_snapshot(&global_trace);
+
+ return ret < 0 ? ret : 0;
+}
+
+static struct ftrace_func_command ftrace_snapshot_cmd = {
+ .name = "snapshot",
+ .func = ftrace_trace_snapshot_callback,
+};
+
+static __init int register_snapshot_cmd(void)
+{
+ return register_ftrace_command(&ftrace_snapshot_cmd);
+}
+#else
+static inline __init int register_snapshot_cmd(void) { return 0; }
+#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
+
+static struct dentry *tracing_get_dentry(struct trace_array *tr)
+{
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+
+ /* Top directory uses NULL as the parent */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return NULL;
+
+ /* All sub buffers have a descriptor */
+ return tr->dir;
+}
+
+static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
+{
+ struct dentry *d_tracer;
+
+ if (tr->percpu_dir)
+ return tr->percpu_dir;
+
+ d_tracer = tracing_get_dentry(tr);
+ if (IS_ERR(d_tracer))
+ return NULL;
+
+ tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer);
+
+ WARN_ONCE(!tr->percpu_dir,
+ "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
+
+ return tr->percpu_dir;
+}
+
+static struct dentry *
+trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
+ void *data, long cpu, const struct file_operations *fops)
+{
+ struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
+
+ if (ret) /* See tracing_get_cpu() */
+ d_inode(ret)->i_cdev = (void *)(cpu + 1);
+ return ret;
+}
+
+static void
+tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
+{
+ struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
+ struct dentry *d_cpu;
+ char cpu_dir[30]; /* 30 characters should be more than enough */
+
+ if (!d_percpu)
+ return;
+
+ snprintf(cpu_dir, 30, "cpu%ld", cpu);
+ d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
+ if (!d_cpu) {
+ pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
+ return;
+ }
+
+ /* per cpu trace_pipe */
+ trace_create_cpu_file("trace_pipe", 0444, d_cpu,
+ tr, cpu, &tracing_pipe_fops);
+
+ /* per cpu trace */
+ trace_create_cpu_file("trace", 0644, d_cpu,
+ tr, cpu, &tracing_fops);
+
+ trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
+ tr, cpu, &tracing_buffers_fops);
+
+ trace_create_cpu_file("stats", 0444, d_cpu,
+ tr, cpu, &tracing_stats_fops);
+
+ trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
+ tr, cpu, &tracing_entries_fops);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_cpu_file("snapshot", 0644, d_cpu,
+ tr, cpu, &snapshot_fops);
+
+ trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+#endif
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+struct trace_option_dentry {
+ struct tracer_opt *opt;
+ struct tracer_flags *flags;
+ struct trace_array *tr;
+ struct dentry *entry;
+};
+
+static ssize_t
+trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_option_dentry *topt = filp->private_data;
+ char *buf;
+
+ if (topt->flags->val & topt->opt->bit)
+ buf = "1\n";
+ else
+ buf = "0\n";
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_option_dentry *topt = filp->private_data;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ if (!!(topt->flags->val & topt->opt->bit) != val) {
+ mutex_lock(&trace_types_lock);
+ ret = __set_tracer_option(topt->tr, topt->flags,
+ topt->opt, !val);
+ mutex_unlock(&trace_types_lock);
+ if (ret)
+ return ret;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+
+static const struct file_operations trace_options_fops = {
+ .open = tracing_open_generic,
+ .read = trace_options_read,
+ .write = trace_options_write,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t
+trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ long index = (long)filp->private_data;
+ char *buf;
+
+ if (trace_flags & (1 << index))
+ buf = "1\n";
+ else
+ buf = "0\n";
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct trace_array *tr = &global_trace;
+ long index = (long)filp->private_data;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ mutex_lock(&trace_types_lock);
+ ret = set_tracer_flag(tr, 1 << index, val);
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations trace_options_core_fops = {
+ .open = tracing_open_generic,
+ .read = trace_options_core_read,
+ .write = trace_options_core_write,
+ .llseek = generic_file_llseek,
+};
+
+struct dentry *trace_create_file(const char *name,
+ umode_t mode,
+ struct dentry *parent,
+ void *data,
+ const struct file_operations *fops)
+{
+ struct dentry *ret;
+
+ ret = tracefs_create_file(name, mode, parent, data, fops);
+ if (!ret)
+ pr_warning("Could not create tracefs '%s' entry\n", name);
+
+ return ret;
+}
+
+
+static struct dentry *trace_options_init_dentry(struct trace_array *tr)
+{
+ struct dentry *d_tracer;
+
+ if (tr->options)
+ return tr->options;
+
+ d_tracer = tracing_get_dentry(tr);
+ if (IS_ERR(d_tracer))
+ return NULL;
+
+ tr->options = tracefs_create_dir("options", d_tracer);
+ if (!tr->options) {
+ pr_warning("Could not create tracefs directory 'options'\n");
+ return NULL;
+ }
+
+ return tr->options;
+}
+
+static void
+create_trace_option_file(struct trace_array *tr,
+ struct trace_option_dentry *topt,
+ struct tracer_flags *flags,
+ struct tracer_opt *opt)
+{
+ struct dentry *t_options;
+
+ t_options = trace_options_init_dentry(tr);
+ if (!t_options)
+ return;
+
+ topt->flags = flags;
+ topt->opt = opt;
+ topt->tr = tr;
+
+ topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
+ &trace_options_fops);
+
+}
+
+static struct trace_option_dentry *
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
+{
+ struct trace_option_dentry *topts;
+ struct tracer_flags *flags;
+ struct tracer_opt *opts;
+ int cnt;
+
+ if (!tracer)
+ return NULL;
+
+ flags = tracer->flags;
+
+ if (!flags || !flags->opts)
+ return NULL;
+
+ opts = flags->opts;
+
+ for (cnt = 0; opts[cnt].name; cnt++)
+ ;
+
+ topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
+ if (!topts)
+ return NULL;
+
+ for (cnt = 0; opts[cnt].name; cnt++)
+ create_trace_option_file(tr, &topts[cnt], flags,
+ &opts[cnt]);
+
+ return topts;
+}
+
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts)
+{
+ int cnt;
+
+ if (!topts)
+ return;
+
+ for (cnt = 0; topts[cnt].opt; cnt++)
+ tracefs_remove(topts[cnt].entry);
+
+ kfree(topts);
+}
+
+static struct dentry *
+create_trace_option_core_file(struct trace_array *tr,
+ const char *option, long index)
+{
+ struct dentry *t_options;
+
+ t_options = trace_options_init_dentry(tr);
+ if (!t_options)
+ return NULL;
+
+ return trace_create_file(option, 0644, t_options, (void *)index,
+ &trace_options_core_fops);
+}
+
+static __init void create_trace_options_dir(struct trace_array *tr)
+{
+ struct dentry *t_options;
+ int i;
+
+ t_options = trace_options_init_dentry(tr);
+ if (!t_options)
+ return;
+
+ for (i = 0; trace_options[i]; i++)
+ create_trace_option_core_file(tr, trace_options[i], i);
+}
+
+static ssize_t
+rb_simple_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = tracer_tracing_is_on(tr);
+ r = sprintf(buf, "%d\n", r);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+rb_simple_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (buffer) {
+ mutex_lock(&trace_types_lock);
+ if (val) {
+ tracer_tracing_on(tr);
+ if (tr->current_trace->start)
+ tr->current_trace->start(tr);
+ } else {
+ tracer_tracing_off(tr);
+ if (tr->current_trace->stop)
+ tr->current_trace->stop(tr);
+ }
+ mutex_unlock(&trace_types_lock);
+ }
+
+ (*ppos)++;
+
+ return cnt;
+}
+
+static const struct file_operations rb_simple_fops = {
+ .open = tracing_open_generic_tr,
+ .read = rb_simple_read,
+ .write = rb_simple_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
+struct dentry *trace_instance_dir;
+
+static void
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+
+static int
+allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
+{
+ enum ring_buffer_flags rb_flags;
+
+ rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
+ buf->tr = tr;
+
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (!buf->buffer)
+ return -ENOMEM;
+
+ buf->data = alloc_percpu(struct trace_array_cpu);
+ if (!buf->data) {
+ ring_buffer_free(buf->buffer);
+ return -ENOMEM;
+ }
+
+ /* Allocate the first page for all buffers */
+ set_buffer_entries(&tr->trace_buffer,
+ ring_buffer_size(tr->trace_buffer.buffer, 0));
+
+ return 0;
+}
+
+static int allocate_trace_buffers(struct trace_array *tr, int size)
+{
+ int ret;
+
+ ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ret = allocate_trace_buffer(tr, &tr->max_buffer,
+ allocate_snapshot ? size : 1);
+ if (WARN_ON(ret)) {
+ ring_buffer_free(tr->trace_buffer.buffer);
+ free_percpu(tr->trace_buffer.data);
+ return -ENOMEM;
+ }
+ tr->allocated_snapshot = allocate_snapshot;
+
+ /*
+ * Only the top level trace array gets its snapshot allocated
+ * from the kernel command line.
+ */
+ allocate_snapshot = false;
+#endif
+ return 0;
+}
+
+static void free_trace_buffer(struct trace_buffer *buf)
+{
+ if (buf->buffer) {
+ ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
+ free_percpu(buf->data);
+ buf->data = NULL;
+ }
+}
+
+static void free_trace_buffers(struct trace_array *tr)
+{
+ if (!tr)
+ return;
+
+ free_trace_buffer(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ free_trace_buffer(&tr->max_buffer);
+#endif
+}
+
+static int instance_mkdir(const char *name)
+{
+ struct trace_array *tr;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = -EEXIST;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0)
+ goto out_unlock;
+ }
+
+ ret = -ENOMEM;
+ tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+ if (!tr)
+ goto out_unlock;
+
+ tr->name = kstrdup(name, GFP_KERNEL);
+ if (!tr->name)
+ goto out_free_tr;
+
+ if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
+ cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
+
+ raw_spin_lock_init(&tr->start_lock);
+
+ tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+ tr->current_trace = &nop_trace;
+
+ INIT_LIST_HEAD(&tr->systems);
+ INIT_LIST_HEAD(&tr->events);
+
+ if (allocate_trace_buffers(tr, trace_buf_size) < 0)
+ goto out_free_tr;
+
+ tr->dir = tracefs_create_dir(name, trace_instance_dir);
+ if (!tr->dir)
+ goto out_free_tr;
+
+ ret = event_trace_add_tracer(tr->dir, tr);
+ if (ret) {
+ tracefs_remove_recursive(tr->dir);
+ goto out_free_tr;
+ }
+
+ init_tracer_tracefs(tr, tr->dir);
+
+ list_add(&tr->list, &ftrace_trace_arrays);
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+
+ out_free_tr:
+ free_trace_buffers(tr);
+ free_cpumask_var(tr->tracing_cpumask);
+ kfree(tr->name);
+ kfree(tr);
+
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+
+}
+
+static int instance_rmdir(const char *name)
+{
+ struct trace_array *tr;
+ int found = 0;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = -ENODEV;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ goto out_unlock;
+
+ ret = -EBUSY;
+ if (tr->ref || (tr->current_trace && tr->current_trace->ref))
+ goto out_unlock;
+
+ list_del(&tr->list);
+
+ tracing_set_nop(tr);
+ event_trace_del_tracer(tr);
+ ftrace_destroy_function_files(tr);
+ debugfs_remove_recursive(tr->dir);
+ free_trace_buffers(tr);
+
+ kfree(tr->name);
+ kfree(tr);
+
+ ret = 0;
+
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static __init void create_trace_instances(struct dentry *d_tracer)
+{
+ trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
+ instance_mkdir,
+ instance_rmdir);
+ if (WARN_ON(!trace_instance_dir))
+ return;
+}
+
+static void
+init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
+{
+ int cpu;
+
+ trace_create_file("available_tracers", 0444, d_tracer,
+ tr, &show_traces_fops);
+
+ trace_create_file("current_tracer", 0644, d_tracer,
+ tr, &set_tracer_fops);
+
+ trace_create_file("tracing_cpumask", 0644, d_tracer,
+ tr, &tracing_cpumask_fops);
+
+ trace_create_file("trace_options", 0644, d_tracer,
+ tr, &tracing_iter_fops);
+
+ trace_create_file("trace", 0644, d_tracer,
+ tr, &tracing_fops);
+
+ trace_create_file("trace_pipe", 0444, d_tracer,
+ tr, &tracing_pipe_fops);
+
+ trace_create_file("buffer_size_kb", 0644, d_tracer,
+ tr, &tracing_entries_fops);
+
+ trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+ tr, &tracing_total_entries_fops);
+
+ trace_create_file("free_buffer", 0200, d_tracer,
+ tr, &tracing_free_buffer_fops);
+
+ trace_create_file("trace_marker", 0220, d_tracer,
+ tr, &tracing_mark_fops);
+
+ trace_create_file("trace_clock", 0644, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("tracing_on", 0644, d_tracer,
+ tr, &rb_simple_fops);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ trace_create_file("tracing_max_latency", 0644, d_tracer,
+ &tr->max_latency, &tracing_max_lat_fops);
+#endif
+
+ if (ftrace_create_function_files(tr, d_tracer))
+ WARN(1, "Could not allocate function filter files");
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_tracer,
+ tr, &snapshot_fops);
+#endif
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_tracefs_percpu(tr, cpu);
+
+}
+
+static struct vfsmount *trace_automount(void *ingore)
+{
+ struct vfsmount *mnt;
+ struct file_system_type *type;
+
+ /*
+ * To maintain backward compatibility for tools that mount
+ * debugfs to get to the tracing facility, tracefs is automatically
+ * mounted to the debugfs/tracing directory.
+ */
+ type = get_fs_type("tracefs");
+ if (!type)
+ return NULL;
+ mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ put_filesystem(type);
+ if (IS_ERR(mnt))
+ return NULL;
+ mntget(mnt);
+
+ return mnt;
+}
+
+/**
+ * tracing_init_dentry - initialize top level trace array
+ *
+ * This is called when creating files or directories in the tracing
+ * directory. It is called via fs_initcall() by any of the boot up code
+ * and expects to return the dentry of the top level tracing directory.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+ struct trace_array *tr = &global_trace;
+
+ /* The top level trace array uses NULL as parent */
+ if (tr->dir)
+ return NULL;
+
+ if (WARN_ON(!debugfs_initialized()))
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * As there may still be users that expect the tracing
+ * files to exist in debugfs/tracing, we must automount
+ * the tracefs file system there, so older tools still
+ * work with the newer kerenl.
+ */
+ tr->dir = debugfs_create_automount("tracing", NULL,
+ trace_automount, NULL);
+ if (!tr->dir) {
+ pr_warn_once("Could not create debugfs directory 'tracing'\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return NULL;
+}
+
+extern struct trace_enum_map *__start_ftrace_enum_maps[];
+extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+
+static void __init trace_enum_init(void)
+{
+ int len;
+
+ len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
+ trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+}
+
+#ifdef CONFIG_MODULES
+static void trace_module_add_enums(struct module *mod)
+{
+ if (!mod->num_trace_enums)
+ return;
+
+ /*
+ * Modules with bad taint do not have events created, do
+ * not bother with enums either.
+ */
+ if (trace_module_has_bad_taint(mod))
+ return;
+
+ trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
+}
+
+#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+static void trace_module_remove_enums(struct module *mod)
+{
+ union trace_enum_map_item *map;
+ union trace_enum_map_item **last = &trace_enum_maps;
+
+ if (!mod->num_trace_enums)
+ return;
+
+ mutex_lock(&trace_enum_mutex);
+
+ map = trace_enum_maps;
+
+ while (map) {
+ if (map->head.mod == mod)
+ break;
+ map = trace_enum_jmp_to_tail(map);
+ last = &map->tail.next;
+ map = map->tail.next;
+ }
+ if (!map)
+ goto out;
+
+ *last = trace_enum_jmp_to_tail(map)->tail.next;
+ kfree(map);
+ out:
+ mutex_unlock(&trace_enum_mutex);
+}
+#else
+static inline void trace_module_remove_enums(struct module *mod) { }
+#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ trace_module_add_enums(mod);
+ break;
+ case MODULE_STATE_GOING:
+ trace_module_remove_enums(mod);
+ break;
+ }
+
+ return 0;
+}
+
+static struct notifier_block trace_module_nb = {
+ .notifier_call = trace_module_notify,
+ .priority = 0,
+};
+#endif /* CONFIG_MODULES */
+
+static __init int tracer_init_tracefs(void)
+{
+ struct dentry *d_tracer;
+
+ trace_access_lock_init();
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ init_tracer_tracefs(&global_trace, d_tracer);
+
+ trace_create_file("tracing_thresh", 0644, d_tracer,
+ &global_trace, &tracing_thresh_fops);
+
+ trace_create_file("README", 0444, d_tracer,
+ NULL, &tracing_readme_fops);
+
+ trace_create_file("saved_cmdlines", 0444, d_tracer,
+ NULL, &tracing_saved_cmdlines_fops);
+
+ trace_create_file("saved_cmdlines_size", 0644, d_tracer,
+ NULL, &tracing_saved_cmdlines_size_fops);
+
+ trace_enum_init();
+
+ trace_create_enum_file(d_tracer);
+
+#ifdef CONFIG_MODULES
+ register_module_notifier(&trace_module_nb);
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+ &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
+#endif
+
+ create_trace_instances(d_tracer);
+
+ create_trace_options_dir(&global_trace);
+
+ /* If the tracer was started via cmdline, create options for it here */
+ if (global_trace.current_trace != &nop_trace)
+ update_tracer_options(&global_trace, global_trace.current_trace);
+
+ return 0;
+}
+
+static int trace_panic_handler(struct notifier_block *this,
+ unsigned long event, void *unused)
+{
+ if (ftrace_dump_on_oops)
+ ftrace_dump(ftrace_dump_on_oops);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block trace_panic_notifier = {
+ .notifier_call = trace_panic_handler,
+ .next = NULL,
+ .priority = 150 /* priority: INT_MAX >= x >= 0 */
+};
+
+static int trace_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ switch (val) {
+ case DIE_OOPS:
+ if (ftrace_dump_on_oops)
+ ftrace_dump(ftrace_dump_on_oops);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block trace_die_notifier = {
+ .notifier_call = trace_die_handler,
+ .priority = 200
+};
+
+/*
+ * printk is set to max of 1024, we really don't need it that big.
+ * Nothing should be printing 1000 characters anyway.
+ */
+#define TRACE_MAX_PRINT 1000
+
+/*
+ * Define here KERN_TRACE so that we have one place to modify
+ * it if we decide to change what log level the ftrace dump
+ * should be at.
+ */
+#define KERN_TRACE KERN_EMERG
+
+void
+trace_printk_seq(struct trace_seq *s)
+{
+ /* Probably should print a warning here. */
+ if (s->seq.len >= TRACE_MAX_PRINT)
+ s->seq.len = TRACE_MAX_PRINT;
+
+ /*
+ * More paranoid code. Although the buffer size is set to
+ * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
+ * an extra layer of protection.
+ */
+ if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
+ s->seq.len = s->seq.size - 1;
+
+ /* should be zero ended, but we are paranoid. */
+ s->buffer[s->seq.len] = 0;
+
+ printk(KERN_TRACE "%s", s->buffer);
+
+ trace_seq_init(s);
+}
+
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+ iter->tr = &global_trace;
+ iter->trace = iter->tr->current_trace;
+ iter->cpu_file = RING_BUFFER_ALL_CPUS;
+ iter->trace_buffer = &global_trace.trace_buffer;
+
+ if (iter->trace && iter->trace->open)
+ iter->trace->open(iter);
+
+ /* Annotate start of buffers if we had overruns */
+ if (ring_buffer_overruns(iter->trace_buffer->buffer))
+ iter->iter_flags |= TRACE_FILE_ANNOTATE;
+
+ /* Output in nanoseconds only if we are using a clock in nanoseconds. */
+ if (trace_clocks[iter->tr->clock_id].in_ns)
+ iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
+}
+
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
+{
+ /* use static because iter can be a bit big for the stack */
+ static struct trace_iterator iter;
+ static atomic_t dump_running;
+ unsigned int old_userobj;
+ unsigned long flags;
+ int cnt = 0, cpu;
+
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+
+ /*
+ * Always turn off tracing when we dump.
+ * We don't need to show trace output of what happens
+ * between multiple crashes.
+ *
+ * If the user does a sysrq-z, then they can re-enable
+ * tracing with echo 1 > tracing_on.
+ */
+ tracing_off();
+
+ local_irq_save(flags);
+
+ /* Simulate the iterator */
+ trace_init_global_iter(&iter);
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
+ }
+
+ old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
+
+ /* don't look at user memory in panic mode */
+ trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
+ switch (oops_dump_mode) {
+ case DUMP_ALL:
+ iter.cpu_file = RING_BUFFER_ALL_CPUS;
+ break;
+ case DUMP_ORIG:
+ iter.cpu_file = raw_smp_processor_id();
+ break;
+ case DUMP_NONE:
+ goto out_enable;
+ default:
+ printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ iter.cpu_file = RING_BUFFER_ALL_CPUS;
+ }
+
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
+
+ /* Did function tracer already get disabled? */
+ if (ftrace_is_dead()) {
+ printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+ printk("# MAY BE MISSING FUNCTION EVENTS\n");
+ }
+
+ /*
+ * We need to stop all tracing on all CPUS to read the
+ * the next buffer. This is a bit expensive, but is
+ * not done often. We fill all what we can read,
+ * and then release the locks again.
+ */
+
+ while (!trace_empty(&iter)) {
+
+ if (!cnt)
+ printk(KERN_TRACE "---------------------------------\n");
+
+ cnt++;
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter.iter_flags |= TRACE_FILE_LAT_FMT;
+ iter.pos = -1;
+
+ if (trace_find_next_entry_inc(&iter) != NULL) {
+ int ret;
+
+ ret = print_trace_line(&iter);
+ if (ret != TRACE_TYPE_NO_CONSUME)
+ trace_consume(&iter);
+ }
+ touch_nmi_watchdog();
+
+ trace_printk_seq(&iter.seq);
+ }
+
+ if (!cnt)
+ printk(KERN_TRACE " (ftrace buffer empty)\n");
+ else
+ printk(KERN_TRACE "---------------------------------\n");
+
+ out_enable:
+ trace_flags |= old_userobj;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ }
+ atomic_dec(&dump_running);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(ftrace_dump);
+
+__init static int tracer_alloc_buffers(void)
+{
+ int ring_buf_size;
+ int ret = -ENOMEM;
+
+ if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
+ goto out;
+
+ if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
+ goto out_free_buffer_mask;
+
+ /* Only allocate trace_printk buffers if a trace_printk exists */
+ if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
+ /* Must be called before global_trace.buffer is allocated */
+ trace_printk_init_buffers();
+
+ /* To save memory, keep the ring buffer size to its minimum */
+ if (ring_buffer_expanded)
+ ring_buf_size = trace_buf_size;
+ else
+ ring_buf_size = 1;
+
+ cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
+ cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);
+
+ raw_spin_lock_init(&global_trace.start_lock);
+
+ /* Used for event triggers */
+ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
+ if (!temp_buffer)
+ goto out_free_cpumask;
+
+ if (trace_create_savedcmd() < 0)
+ goto out_free_temp_buffer;
+
+ /* TODO: make the number of buffers hot pluggable with CPUS */
+ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
+ printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
+ WARN_ON(1);
+ goto out_free_savedcmd;
+ }
+
+ if (global_trace.buffer_disabled)
+ tracing_off();
+
+ if (trace_boot_clock) {
+ ret = tracing_set_clock(&global_trace, trace_boot_clock);
+ if (ret < 0)
+ pr_warning("Trace clock %s not defined, going back to default\n",
+ trace_boot_clock);
+ }
+
+ /*
+ * register_tracer() might reference current_trace, so it
+ * needs to be set before we register anything. This is
+ * just a bootstrap of current_trace anyway.
+ */
+ global_trace.current_trace = &nop_trace;
+
+ global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+ ftrace_init_global_array_ops(&global_trace);
+
+ register_tracer(&nop_trace);
+
+ /* All seems OK, enable tracing */
+ tracing_disabled = 0;
+
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &trace_panic_notifier);
+
+ register_die_notifier(&trace_die_notifier);
+
+ global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
+
+ INIT_LIST_HEAD(&global_trace.systems);
+ INIT_LIST_HEAD(&global_trace.events);
+ list_add(&global_trace.list, &ftrace_trace_arrays);
+
+ while (trace_boot_options) {
+ char *option;
+
+ option = strsep(&trace_boot_options, ",");
+ trace_set_options(&global_trace, option);
+ }
+
+ register_snapshot_cmd();
+
+ return 0;
+
+out_free_savedcmd:
+ free_saved_cmdlines_buffer(savedcmd);
+out_free_temp_buffer:
+ ring_buffer_free(temp_buffer);
+out_free_cpumask:
+ free_cpumask_var(global_trace.tracing_cpumask);
+out_free_buffer_mask:
+ free_cpumask_var(tracing_buffer_mask);
+out:
+ return ret;
+}
+
+void __init trace_init(void)
+{
+ if (tracepoint_printk) {
+ tracepoint_print_iter =
+ kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
+ if (WARN_ON(!tracepoint_print_iter))
+ tracepoint_printk = 0;
+ }
+ tracer_alloc_buffers();
+ trace_event_init();
+}
+
+__init static int clear_boot_tracer(void)
+{
+ /*
+ * The default tracer at boot buffer is an init section.
+ * This function is called in lateinit. If we did not
+ * find the boot tracer, then clear it out, to prevent
+ * later registration from accessing the buffer that is
+ * about to be freed.
+ */
+ if (!default_bootup_tracer)
+ return 0;
+
+ printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
+ default_bootup_tracer);
+ default_bootup_tracer = NULL;
+
+ return 0;
+}
+
+fs_initcall(tracer_init_tracefs);
+late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 000000000..921691c5c
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,1321 @@
+
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <linux/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/ring_buffer.h>
+#include <linux/mmiotrace.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
+#include <linux/compiler.h>
+#include <linux/trace_seq.h>
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+#include <asm/unistd.h> /* For NR_SYSCALLS */
+#include <asm/syscall.h> /* some archs define it here */
+#endif
+
+enum trace_type {
+ __TRACE_FIRST_TYPE = 0,
+
+ TRACE_FN,
+ TRACE_CTX,
+ TRACE_WAKE,
+ TRACE_STACK,
+ TRACE_PRINT,
+ TRACE_BPRINT,
+ TRACE_MMIO_RW,
+ TRACE_MMIO_MAP,
+ TRACE_BRANCH,
+ TRACE_GRAPH_RET,
+ TRACE_GRAPH_ENT,
+ TRACE_USER_STACK,
+ TRACE_BLK,
+ TRACE_BPUTS,
+
+ __TRACE_LAST_TYPE,
+};
+
+
+#undef __field
+#define __field(type, item) type item;
+
+#undef __field_struct
+#define __field_struct(type, item) __field(type, item)
+
+#undef __field_desc
+#define __field_desc(type, container, item)
+
+#undef __array
+#define __array(type, item, size) type item[size];
+
+#undef __array_desc
+#define __array_desc(type, container, item, size)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item) type item[];
+
+#undef F_STRUCT
+#define F_STRUCT(args...) args
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+ struct struct_name { \
+ struct trace_entry ent; \
+ tstruct \
+ }
+
+#undef TP_ARGS
+#define TP_ARGS(args...) args
+
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
+
+#undef FTRACE_ENTRY_REG
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
+ filter, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
+#include "trace_entries.h"
+
+/*
+ * syscalls are special, and need special handling, this is why
+ * they are not included in trace_entries.h
+ */
+struct syscall_trace_enter {
+ struct trace_entry ent;
+ int nr;
+ unsigned long args[];
+};
+
+struct syscall_trace_exit {
+ struct trace_entry ent;
+ int nr;
+ long ret;
+};
+
+struct kprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long ip;
+};
+
+struct kretprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long func;
+ unsigned long ret_ip;
+};
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ * IRQS_OFF - interrupts were disabled
+ * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
+ * NEED_RESCHED - reschedule is requested
+ * HARDIRQ - inside an interrupt handler
+ * SOFTIRQ - inside a softirq handler
+ */
+enum trace_flag_type {
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
+ TRACE_FLAG_NEED_RESCHED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_PREEMPT_RESCHED = 0x20,
+};
+
+#define TRACE_BUF_SIZE 1024
+
+struct trace_array;
+
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data: (for example which task started
+ * the trace, etc.)
+ */
+struct trace_array_cpu {
+ atomic_t disabled;
+ void *buffer_page; /* ring buffer spare */
+
+ unsigned long entries;
+ unsigned long saved_latency;
+ unsigned long critical_start;
+ unsigned long critical_end;
+ unsigned long critical_sequence;
+ unsigned long nice;
+ unsigned long policy;
+ unsigned long rt_priority;
+ unsigned long skipped_entries;
+ cycle_t preempt_timestamp;
+ pid_t pid;
+ kuid_t uid;
+ char comm[TASK_COMM_LEN];
+};
+
+struct tracer;
+
+struct trace_buffer {
+ struct trace_array *tr;
+ struct ring_buffer *buffer;
+ struct trace_array_cpu __percpu *data;
+ cycle_t time_start;
+ int cpu;
+};
+
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ */
+struct trace_array {
+ struct list_head list;
+ char *name;
+ struct trace_buffer trace_buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /*
+ * The max_buffer is used to snapshot the trace when a maximum
+ * latency is reached, or when the user initiates a snapshot.
+ * Some tracers will use this to store a maximum trace while
+ * it continues examining live traces.
+ *
+ * The buffers for the max_buffer are set up the same as the trace_buffer
+ * When a snapshot is taken, the buffer of the max_buffer is swapped
+ * with the buffer of the trace_buffer and the buffers are reset for
+ * the trace_buffer so the tracing can continue.
+ */
+ struct trace_buffer max_buffer;
+ bool allocated_snapshot;
+ unsigned long max_latency;
+#endif
+ /*
+ * max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a arch_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ *
+ * It is also used in other places outside the update_max_tr
+ * so it needs to be defined outside of the
+ * CONFIG_TRACER_MAX_TRACE.
+ */
+ arch_spinlock_t max_lock;
+ int buffer_disabled;
+#ifdef CONFIG_FTRACE_SYSCALLS
+ int sys_refcount_enter;
+ int sys_refcount_exit;
+ struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
+ struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
+#endif
+ int stop_count;
+ int clock_id;
+ struct tracer *current_trace;
+ unsigned int flags;
+ raw_spinlock_t start_lock;
+ struct dentry *dir;
+ struct dentry *options;
+ struct dentry *percpu_dir;
+ struct dentry *event_dir;
+ struct list_head systems;
+ struct list_head events;
+ cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ int ref;
+#ifdef CONFIG_FUNCTION_TRACER
+ struct ftrace_ops *ops;
+ /* function tracing enabled */
+ int function_enabled;
+#endif
+};
+
+enum {
+ TRACE_ARRAY_FL_GLOBAL = (1 << 0)
+};
+
+extern struct list_head ftrace_trace_arrays;
+
+extern struct mutex trace_types_lock;
+
+extern int trace_array_get(struct trace_array *tr);
+extern void trace_array_put(struct trace_array *tr);
+
+/*
+ * The global tracer (top) should be the first trace array added,
+ * but we check the flag anyway.
+ */
+static inline struct trace_array *top_trace_array(void)
+{
+ struct trace_array *tr;
+
+ if (list_empty(&ftrace_trace_arrays))
+ return NULL;
+
+ tr = list_entry(ftrace_trace_arrays.prev,
+ typeof(*tr), list);
+ WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
+ return tr;
+}
+
+#define FTRACE_CMP_TYPE(var, type) \
+ __builtin_types_compatible_p(typeof(var), type *)
+
+#undef IF_ASSIGN
+#define IF_ASSIGN(var, entry, etype, id) \
+ if (FTRACE_CMP_TYPE(var, etype)) { \
+ var = (typeof(var))(entry); \
+ WARN_ON(id && (entry)->type != id); \
+ break; \
+ }
+
+/* Will cause compile errors if type is not found. */
+extern void __ftrace_bad_type(void);
+
+/*
+ * The trace_assign_type is a verifier that the entry type is
+ * the same as the type being assigned. To add new types simply
+ * add a line with the following format:
+ *
+ * IF_ASSIGN(var, ent, type, id);
+ *
+ * Where "type" is the trace type that includes the trace_entry
+ * as the "ent" item. And "id" is the trace identifier that is
+ * used in the trace_type enum.
+ *
+ * If the type can have more than one id, then use zero.
+ */
+#define trace_assign_type(var, ent) \
+ do { \
+ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
+ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
+ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
+ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
+ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
+ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
+ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
+ TRACE_MMIO_RW); \
+ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
+ TRACE_MMIO_MAP); \
+ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
+ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
+ TRACE_GRAPH_ENT); \
+ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
+ TRACE_GRAPH_RET); \
+ __ftrace_bad_type(); \
+ } while (0)
+
+/*
+ * An option specific to a tracer. This is a boolean value.
+ * The bit is the bit index that sets its value on the
+ * flags value in struct tracer_flags.
+ */
+struct tracer_opt {
+ const char *name; /* Will appear on the trace_options file */
+ u32 bit; /* Mask assigned in val field in tracer_flags */
+};
+
+/*
+ * The set of specific options for a tracer. Your tracer
+ * have to set the initial value of the flags val.
+ */
+struct tracer_flags {
+ u32 val;
+ struct tracer_opt *opts;
+};
+
+/* Makes more easy to define a tracer opt */
+#define TRACER_OPT(s, b) .name = #s, .bit = b
+
+
+/**
+ * struct tracer - a specific tracer and its callbacks to interact with tracefs
+ * @name: the name chosen to select it on the available_tracers file
+ * @init: called when one switches to this tracer (echo name > current_tracer)
+ * @reset: called when one switches to another tracer
+ * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
+ * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @update_thresh: called when tracing_thresh is updated
+ * @open: called when the trace file is opened
+ * @pipe_open: called when the trace_pipe file is opened
+ * @close: called when the trace file is released
+ * @pipe_close: called when the trace_pipe file is released
+ * @read: override the default read callback on trace_pipe
+ * @splice_read: override the default splice_read callback on trace_pipe
+ * @selftest: selftest to run on boot (see trace_selftest.c)
+ * @print_headers: override the first lines that describe your columns
+ * @print_line: callback that prints a trace
+ * @set_flag: signals one of your private flags changed (trace_options file)
+ * @flags: your private flags
+ */
+struct tracer {
+ const char *name;
+ int (*init)(struct trace_array *tr);
+ void (*reset)(struct trace_array *tr);
+ void (*start)(struct trace_array *tr);
+ void (*stop)(struct trace_array *tr);
+ int (*update_thresh)(struct trace_array *tr);
+ void (*open)(struct trace_iterator *iter);
+ void (*pipe_open)(struct trace_iterator *iter);
+ void (*close)(struct trace_iterator *iter);
+ void (*pipe_close)(struct trace_iterator *iter);
+ ssize_t (*read)(struct trace_iterator *iter,
+ struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+ ssize_t (*splice_read)(struct trace_iterator *iter,
+ struct file *filp,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len,
+ unsigned int flags);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+ int (*selftest)(struct tracer *trace,
+ struct trace_array *tr);
+#endif
+ void (*print_header)(struct seq_file *m);
+ enum print_line_t (*print_line)(struct trace_iterator *iter);
+ /* If you handled the flag setting, return 0 */
+ int (*set_flag)(struct trace_array *tr,
+ u32 old_flags, u32 bit, int set);
+ /* Return 0 if OK with change, else return non-zero */
+ int (*flag_changed)(struct trace_array *tr,
+ u32 mask, int set);
+ struct tracer *next;
+ struct tracer_flags *flags;
+ int enabled;
+ int ref;
+ bool print_max;
+ bool allow_instances;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ bool use_max_tr;
+#endif
+};
+
+
+/* Only current can touch trace_recursion */
+
+/*
+ * For function tracing recursion:
+ * The order of these bits are important.
+ *
+ * When function tracing occurs, the following steps are made:
+ * If arch does not support a ftrace feature:
+ * call internal function (uses INTERNAL bits) which calls...
+ * If callback is registered to the "global" list, the list
+ * function is called and recursion checks the GLOBAL bits.
+ * then this function calls...
+ * The function callback, which can use the FTRACE bits to
+ * check for recursion.
+ *
+ * Now if the arch does not suppport a feature, and it calls
+ * the global list function which calls the ftrace callback
+ * all three of these steps will do a recursion protection.
+ * There's no reason to do one if the previous caller already
+ * did. The recursion that we are protecting against will
+ * go through the same steps again.
+ *
+ * To prevent the multiple recursion checks, if a recursion
+ * bit is set that is higher than the MAX bit of the current
+ * check, then we know that the check was made by the previous
+ * caller, and we can skip the current check.
+ */
+enum {
+ TRACE_BUFFER_BIT,
+ TRACE_BUFFER_NMI_BIT,
+ TRACE_BUFFER_IRQ_BIT,
+ TRACE_BUFFER_SIRQ_BIT,
+
+ /* Start of function recursion bits */
+ TRACE_FTRACE_BIT,
+ TRACE_FTRACE_NMI_BIT,
+ TRACE_FTRACE_IRQ_BIT,
+ TRACE_FTRACE_SIRQ_BIT,
+
+ /* INTERNAL_BITs must be greater than FTRACE_BITs */
+ TRACE_INTERNAL_BIT,
+ TRACE_INTERNAL_NMI_BIT,
+ TRACE_INTERNAL_IRQ_BIT,
+ TRACE_INTERNAL_SIRQ_BIT,
+
+ TRACE_CONTROL_BIT,
+
+ TRACE_BRANCH_BIT,
+/*
+ * Abuse of the trace_recursion.
+ * As we need a way to maintain state if we are tracing the function
+ * graph in irq because we want to trace a particular function that
+ * was called in irq context but we have irq tracing off. Since this
+ * can only be modified by current, we can reuse trace_recursion.
+ */
+ TRACE_IRQ_BIT,
+};
+
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
+
+#define TRACE_CONTEXT_BITS 4
+
+#define TRACE_FTRACE_START TRACE_FTRACE_BIT
+#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_LIST_START TRACE_INTERNAL_BIT
+#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
+
+#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
+
+static __always_inline int trace_get_context_bit(void)
+{
+ int bit;
+
+ if (in_interrupt()) {
+ if (in_nmi())
+ bit = 0;
+
+ else if (in_irq())
+ bit = 1;
+ else
+ bit = 2;
+ } else
+ bit = 3;
+
+ return bit;
+}
+
+static __always_inline int trace_test_and_set_recursion(int start, int max)
+{
+ unsigned int val = current->trace_recursion;
+ int bit;
+
+ /* A previous recursion check was made */
+ if ((val & TRACE_CONTEXT_MASK) > max)
+ return 0;
+
+ bit = trace_get_context_bit() + start;
+ if (unlikely(val & (1 << bit)))
+ return -1;
+
+ val |= 1 << bit;
+ current->trace_recursion = val;
+ barrier();
+
+ return bit;
+}
+
+static __always_inline void trace_clear_recursion(int bit)
+{
+ unsigned int val = current->trace_recursion;
+
+ if (!bit)
+ return;
+
+ bit = 1 << bit;
+ val &= ~bit;
+
+ barrier();
+ current->trace_recursion = val;
+}
+
+static inline struct ring_buffer_iter *
+trace_buffer_iter(struct trace_iterator *iter, int cpu)
+{
+ if (iter->buffer_iter && iter->buffer_iter[cpu])
+ return iter->buffer_iter[cpu];
+ return NULL;
+}
+
+int tracer_init(struct tracer *t, struct trace_array *tr);
+int tracing_is_enabled(void);
+void tracing_reset(struct trace_buffer *buf, int cpu);
+void tracing_reset_online_cpus(struct trace_buffer *buf);
+void tracing_reset_current(int cpu);
+void tracing_reset_all_online_cpus(void);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+bool tracing_is_disabled(void);
+struct dentry *trace_create_file(const char *name,
+ umode_t mode,
+ struct dentry *parent,
+ void *data,
+ const struct file_operations *fops);
+
+struct dentry *tracing_init_dentry(void);
+
+struct ring_buffer_event;
+
+struct ring_buffer_event *
+trace_buffer_lock_reserve(struct ring_buffer *buffer,
+ int type,
+ unsigned long len,
+ unsigned long flags,
+ int pc);
+
+struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
+ struct trace_array_cpu *data);
+
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+ int *ent_cpu, u64 *ent_ts);
+
+void __buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
+
+int trace_empty(struct trace_iterator *iter);
+
+void *trace_find_next_entry_inc(struct trace_iterator *iter);
+
+void trace_init_global_iter(struct trace_iterator *iter);
+
+void tracing_iter_reset(struct trace_iterator *iter, int cpu);
+
+void trace_function(struct trace_array *tr,
+ unsigned long ip,
+ unsigned long parent_ip,
+ unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+ unsigned long ip,
+ unsigned long parent_ip,
+ unsigned long flags, int pc);
+void trace_latency_header(struct seq_file *m);
+void trace_default_header(struct seq_file *m);
+void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
+int trace_empty(struct trace_iterator *iter);
+
+void trace_graph_return(struct ftrace_graph_ret *trace);
+int trace_graph_entry(struct ftrace_graph_ent *trace);
+void set_graph_array(struct trace_array *tr);
+
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
+int register_tracer(struct tracer *type);
+int is_tracing_stopped(void);
+
+loff_t tracing_lseek(struct file *file, loff_t offset, int whence);
+
+extern cpumask_var_t __read_mostly tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu) \
+ for_each_cpu(cpu, tracing_buffer_mask)
+
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+
+extern unsigned long tracing_thresh;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+ struct task_struct *tsk, int cpu);
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
+#ifdef CONFIG_STACKTRACE
+void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc);
+
+void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc, struct pt_regs *regs);
+
+void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
+ int pc);
+
+void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
+ int pc);
+#else
+static inline void ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags, int skip, int pc)
+{
+}
+
+static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
+ unsigned long flags, int skip,
+ int pc, struct pt_regs *regs)
+{
+}
+
+static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc)
+{
+}
+
+static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
+ int skip, int pc)
+{
+}
+#endif /* CONFIG_STACKTRACE */
+
+extern cycle_t ftrace_now(int cpu);
+
+extern void trace_find_cmdline(int pid, char comm[]);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+#endif
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
+#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
+extern int DYN_FTRACE_TEST_NAME2(void);
+
+extern bool ring_buffer_expanded;
+extern bool tracing_selftest_disabled;
+DECLARE_PER_CPU(int, ftrace_cpu_disabled);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+extern int trace_selftest_startup_function(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_function_graph(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_nop(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+ struct trace_array *tr);
+extern int trace_selftest_startup_branch(struct tracer *trace,
+ struct trace_array *tr);
+/*
+ * Tracer data references selftest functions that only occur
+ * on boot up. These can be __init functions. Thus, when selftests
+ * are enabled, then the tracers need to reference __init functions.
+ */
+#define __tracer_data __refdata
+#else
+/* Tracers are seldom changed. Optimize when selftests are disabled. */
+#define __tracer_data __read_mostly
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
+extern void *head_page(struct trace_array_cpu *data);
+extern unsigned long long ns2usecs(cycle_t nsec);
+extern int
+trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
+extern int
+trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+extern int
+trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args);
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...);
+int trace_array_printk_buf(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, ...);
+void trace_printk_seq(struct trace_seq *s);
+enum print_line_t print_trace_line(struct trace_iterator *iter);
+
+extern unsigned long trace_flags;
+
+extern char trace_find_mark(unsigned long long duration);
+
+/* Standard output formatting function used for function return traces */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/* Flag options */
+#define TRACE_GRAPH_PRINT_OVERRUN 0x1
+#define TRACE_GRAPH_PRINT_CPU 0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
+#define TRACE_GRAPH_PRINT_PROC 0x8
+#define TRACE_GRAPH_PRINT_DURATION 0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
+#define TRACE_GRAPH_PRINT_IRQS 0x40
+#define TRACE_GRAPH_PRINT_TAIL 0x80
+#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
+#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
+
+extern enum print_line_t
+print_graph_function_flags(struct trace_iterator *iter, u32 flags);
+extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
+extern void
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
+extern void graph_trace_open(struct trace_iterator *iter);
+extern void graph_trace_close(struct trace_iterator *iter);
+extern int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags, int pc);
+extern void __trace_graph_return(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags, int pc);
+
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/* TODO: make this variable */
+#define FTRACE_GRAPH_MAX_FUNCS 32
+extern int ftrace_graph_count;
+extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
+extern int ftrace_graph_notrace_count;
+extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
+
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+ int i;
+
+ if (!ftrace_graph_count)
+ return 1;
+
+ for (i = 0; i < ftrace_graph_count; i++) {
+ if (addr == ftrace_graph_funcs[i]) {
+ /*
+ * If no irqs are to be traced, but a set_graph_function
+ * is set, and called by an interrupt handler, we still
+ * want to trace it.
+ */
+ if (in_irq())
+ trace_recursion_set(TRACE_IRQ_BIT);
+ else
+ trace_recursion_clear(TRACE_IRQ_BIT);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int ftrace_graph_notrace_addr(unsigned long addr)
+{
+ int i;
+
+ if (!ftrace_graph_notrace_count)
+ return 0;
+
+ for (i = 0; i < ftrace_graph_notrace_count; i++) {
+ if (addr == ftrace_graph_notrace_funcs[i])
+ return 1;
+ }
+
+ return 0;
+}
+#else
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+ return 1;
+}
+
+static inline int ftrace_graph_notrace_addr(unsigned long addr)
+{
+ return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#else /* CONFIG_FUNCTION_GRAPH_TRACER */
+static inline enum print_line_t
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+extern struct list_head ftrace_pids;
+
+#ifdef CONFIG_FUNCTION_TRACER
+extern bool ftrace_filter_param __initdata;
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+ if (list_empty(&ftrace_pids))
+ return 1;
+
+ return test_tsk_trace_trace(task);
+}
+extern int ftrace_is_dead(void);
+int ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent);
+void ftrace_destroy_function_files(struct trace_array *tr);
+void ftrace_init_global_array_ops(struct trace_array *tr);
+void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
+void ftrace_reset_array_ops(struct trace_array *tr);
+int using_ftrace_ops_list_func(void);
+#else
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+ return 1;
+}
+static inline int ftrace_is_dead(void) { return 0; }
+static inline int
+ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent)
+{
+ return 0;
+}
+static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
+static inline __init void
+ftrace_init_global_array_ops(struct trace_array *tr) { }
+static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+/* ftace_func_t type is not defined, use macro instead of static inline */
+#define ftrace_init_array_ops(tr, func) do { } while (0)
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+void ftrace_create_filter_files(struct ftrace_ops *ops,
+ struct dentry *parent);
+void ftrace_destroy_filter_files(struct ftrace_ops *ops);
+#else
+/*
+ * The ops parameter passed in is usually undefined.
+ * This must be a macro.
+ */
+#define ftrace_create_filter_files(ops, parent) do { } while (0)
+#define ftrace_destroy_filter_files(ops) do { } while (0)
+#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
+
+int ftrace_event_is_function(struct ftrace_event_call *call);
+
+/*
+ * struct trace_parser - servers for reading the user input separated by spaces
+ * @cont: set if the input is not complete - no final space char was found
+ * @buffer: holds the parsed user input
+ * @idx: user input length
+ * @size: buffer size
+ */
+struct trace_parser {
+ bool cont;
+ char *buffer;
+ unsigned idx;
+ unsigned size;
+};
+
+static inline bool trace_parser_loaded(struct trace_parser *parser)
+{
+ return (parser->idx != 0);
+}
+
+static inline bool trace_parser_cont(struct trace_parser *parser)
+{
+ return parser->cont;
+}
+
+static inline void trace_parser_clear(struct trace_parser *parser)
+{
+ parser->cont = false;
+ parser->idx = 0;
+}
+
+extern int trace_parser_get_init(struct trace_parser *parser, int size);
+extern void trace_parser_put(struct trace_parser *parser);
+extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
+ size_t cnt, loff_t *ppos);
+
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that controls the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ * trace.c.
+ */
+enum trace_iterator_flags {
+ TRACE_ITER_PRINT_PARENT = 0x01,
+ TRACE_ITER_SYM_OFFSET = 0x02,
+ TRACE_ITER_SYM_ADDR = 0x04,
+ TRACE_ITER_VERBOSE = 0x08,
+ TRACE_ITER_RAW = 0x10,
+ TRACE_ITER_HEX = 0x20,
+ TRACE_ITER_BIN = 0x40,
+ TRACE_ITER_BLOCK = 0x80,
+ TRACE_ITER_STACKTRACE = 0x100,
+ TRACE_ITER_PRINTK = 0x200,
+ TRACE_ITER_PREEMPTONLY = 0x400,
+ TRACE_ITER_BRANCH = 0x800,
+ TRACE_ITER_ANNOTATE = 0x1000,
+ TRACE_ITER_USERSTACKTRACE = 0x2000,
+ TRACE_ITER_SYM_USEROBJ = 0x4000,
+ TRACE_ITER_PRINTK_MSGONLY = 0x8000,
+ TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
+ TRACE_ITER_LATENCY_FMT = 0x20000,
+ TRACE_ITER_SLEEP_TIME = 0x40000,
+ TRACE_ITER_GRAPH_TIME = 0x80000,
+ TRACE_ITER_RECORD_CMD = 0x100000,
+ TRACE_ITER_OVERWRITE = 0x200000,
+ TRACE_ITER_STOP_ON_FREE = 0x400000,
+ TRACE_ITER_IRQ_INFO = 0x800000,
+ TRACE_ITER_MARKERS = 0x1000000,
+ TRACE_ITER_FUNCTION = 0x2000000,
+};
+
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+extern struct tracer nop_trace;
+
+#ifdef CONFIG_BRANCH_TRACER
+extern int enable_branch_tracing(struct trace_array *tr);
+extern void disable_branch_tracing(void);
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+ if (trace_flags & TRACE_ITER_BRANCH)
+ return enable_branch_tracing(tr);
+ return 0;
+}
+static inline void trace_branch_disable(void)
+{
+ /* due to races, always disable */
+ disable_branch_tracing();
+}
+#else
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+ return 0;
+}
+static inline void trace_branch_disable(void)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
+/* set ring buffers to default size if not already done so */
+int tracing_update_buffers(void);
+
+struct ftrace_event_field {
+ struct list_head link;
+ const char *name;
+ const char *type;
+ int filter_type;
+ int offset;
+ int size;
+ int is_signed;
+};
+
+struct event_filter {
+ int n_preds; /* Number assigned */
+ int a_preds; /* allocated */
+ struct filter_pred *preds;
+ struct filter_pred *root;
+ char *filter_string;
+};
+
+struct event_subsystem {
+ struct list_head list;
+ const char *name;
+ struct event_filter *filter;
+ int ref_count;
+};
+
+struct ftrace_subsystem_dir {
+ struct list_head list;
+ struct event_subsystem *subsystem;
+ struct trace_array *tr;
+ struct dentry *entry;
+ int ref_count;
+ int nr_events;
+};
+
+#define FILTER_PRED_INVALID ((unsigned short)-1)
+#define FILTER_PRED_IS_RIGHT (1 << 15)
+#define FILTER_PRED_FOLD (1 << 15)
+
+/*
+ * The max preds is the size of unsigned short with
+ * two flags at the MSBs. One bit is used for both the IS_RIGHT
+ * and FOLD flags. The other is reserved.
+ *
+ * 2^14 preds is way more than enough.
+ */
+#define MAX_FILTER_PRED 16384
+
+struct filter_pred;
+struct regex;
+
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+
+typedef int (*regex_match_func)(char *str, struct regex *r, int len);
+
+enum regex_type {
+ MATCH_FULL = 0,
+ MATCH_FRONT_ONLY,
+ MATCH_MIDDLE_ONLY,
+ MATCH_END_ONLY,
+};
+
+struct regex {
+ char pattern[MAX_FILTER_STR_VAL];
+ int len;
+ int field_len;
+ regex_match_func match;
+};
+
+struct filter_pred {
+ filter_pred_fn_t fn;
+ u64 val;
+ struct regex regex;
+ unsigned short *ops;
+ struct ftrace_event_field *field;
+ int offset;
+ int not;
+ int op;
+ unsigned short index;
+ unsigned short parent;
+ unsigned short left;
+ unsigned short right;
+};
+
+extern enum regex_type
+filter_parse_regex(char *buff, int len, char **search, int *not);
+extern void print_event_filter(struct ftrace_event_file *file,
+ struct trace_seq *s);
+extern int apply_event_filter(struct ftrace_event_file *file,
+ char *filter_string);
+extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+ char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
+ struct trace_seq *s);
+extern int filter_assign_type(const char *type);
+extern int create_event_filter(struct ftrace_event_call *call,
+ char *filter_str, bool set_str,
+ struct event_filter **filterp);
+extern void free_event_filter(struct event_filter *filter);
+
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name);
+
+extern void trace_event_enable_cmd_record(bool enable);
+extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
+extern int event_trace_del_tracer(struct trace_array *tr);
+
+extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
+ const char *system,
+ const char *event);
+
+static inline void *event_file_data(struct file *filp)
+{
+ return ACCESS_ONCE(file_inode(filp)->i_private);
+}
+
+extern struct mutex event_mutex;
+extern struct list_head ftrace_events;
+
+extern const struct file_operations event_trigger_fops;
+
+extern int register_trigger_cmds(void);
+extern void clear_event_triggers(struct trace_array *tr);
+
+struct event_trigger_data {
+ unsigned long count;
+ int ref;
+ struct event_trigger_ops *ops;
+ struct event_command *cmd_ops;
+ struct event_filter __rcu *filter;
+ char *filter_str;
+ void *private_data;
+ struct list_head list;
+};
+
+/**
+ * struct event_trigger_ops - callbacks for trace event triggers
+ *
+ * The methods in this structure provide per-event trigger hooks for
+ * various trigger operations.
+ *
+ * All the methods below, except for @init() and @free(), must be
+ * implemented.
+ *
+ * @func: The trigger 'probe' function called when the triggering
+ * event occurs. The data passed into this callback is the data
+ * that was supplied to the event_command @reg() function that
+ * registered the trigger (see struct event_command).
+ *
+ * @init: An optional initialization function called for the trigger
+ * when the trigger is registered (via the event_command reg()
+ * function). This can be used to perform per-trigger
+ * initialization such as incrementing a per-trigger reference
+ * count, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_init() (see
+ * trace_event_triggers.c).
+ *
+ * @free: An optional de-initialization function called for the
+ * trigger when the trigger is unregistered (via the
+ * event_command @reg() function). This can be used to perform
+ * per-trigger de-initialization such as decrementing a
+ * per-trigger reference count and freeing corresponding trigger
+ * data, for instance. This is usually implemented by the
+ * generic utility function @event_trigger_free() (see
+ * trace_event_triggers.c).
+ *
+ * @print: The callback function invoked to have the trigger print
+ * itself. This is usually implemented by a wrapper function
+ * that calls the generic utility function @event_trigger_print()
+ * (see trace_event_triggers.c).
+ */
+struct event_trigger_ops {
+ void (*func)(struct event_trigger_data *data);
+ int (*init)(struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+ void (*free)(struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+ int (*print)(struct seq_file *m,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+};
+
+/**
+ * struct event_command - callbacks and data members for event commands
+ *
+ * Event commands are invoked by users by writing the command name
+ * into the 'trigger' file associated with a trace event. The
+ * parameters associated with a specific invocation of an event
+ * command are used to create an event trigger instance, which is
+ * added to the list of trigger instances associated with that trace
+ * event. When the event is hit, the set of triggers associated with
+ * that event is invoked.
+ *
+ * The data members in this structure provide per-event command data
+ * for various event commands.
+ *
+ * All the data members below, except for @post_trigger, must be set
+ * for each event command.
+ *
+ * @name: The unique name that identifies the event command. This is
+ * the name used when setting triggers via trigger files.
+ *
+ * @trigger_type: A unique id that identifies the event command
+ * 'type'. This value has two purposes, the first to ensure that
+ * only one trigger of the same type can be set at a given time
+ * for a particular event e.g. it doesn't make sense to have both
+ * a traceon and traceoff trigger attached to a single event at
+ * the same time, so traceon and traceoff have the same type
+ * though they have different names. The @trigger_type value is
+ * also used as a bit value for deferring the actual trigger
+ * action until after the current event is finished. Some
+ * commands need to do this if they themselves log to the trace
+ * buffer (see the @post_trigger() member below). @trigger_type
+ * values are defined by adding new values to the trigger_type
+ * enum in include/linux/ftrace_event.h.
+ *
+ * @post_trigger: A flag that says whether or not this command needs
+ * to have its action delayed until after the current event has
+ * been closed. Some triggers need to avoid being invoked while
+ * an event is currently in the process of being logged, since
+ * the trigger may itself log data into the trace buffer. Thus
+ * we make sure the current event is committed before invoking
+ * those triggers. To do that, the trigger invocation is split
+ * in two - the first part checks the filter using the current
+ * trace record; if a command has the @post_trigger flag set, it
+ * sets a bit for itself in the return value, otherwise it
+ * directly invokes the trigger. Once all commands have been
+ * either invoked or set their return flag, the current record is
+ * either committed or discarded. At that point, if any commands
+ * have deferred their triggers, those commands are finally
+ * invoked following the close of the current event. In other
+ * words, if the event_trigger_ops @func() probe implementation
+ * itself logs to the trace buffer, this flag should be set,
+ * otherwise it can be left unspecified.
+ *
+ * All the methods below, except for @set_filter(), must be
+ * implemented.
+ *
+ * @func: The callback function responsible for parsing and
+ * registering the trigger written to the 'trigger' file by the
+ * user. It allocates the trigger instance and registers it with
+ * the appropriate trace event. It makes use of the other
+ * event_command callback functions to orchestrate this, and is
+ * usually implemented by the generic utility function
+ * @event_trigger_callback() (see trace_event_triggers.c).
+ *
+ * @reg: Adds the trigger to the list of triggers associated with the
+ * event, and enables the event trigger itself, after
+ * initializing it (via the event_trigger_ops @init() function).
+ * This is also where commands can use the @trigger_type value to
+ * make the decision as to whether or not multiple instances of
+ * the trigger should be allowed. This is usually implemented by
+ * the generic utility function @register_trigger() (see
+ * trace_event_triggers.c).
+ *
+ * @unreg: Removes the trigger from the list of triggers associated
+ * with the event, and disables the event trigger itself, after
+ * initializing it (via the event_trigger_ops @free() function).
+ * This is usually implemented by the generic utility function
+ * @unregister_trigger() (see trace_event_triggers.c).
+ *
+ * @set_filter: An optional function called to parse and set a filter
+ * for the trigger. If no @set_filter() method is set for the
+ * event command, filters set by the user for the command will be
+ * ignored. This is usually implemented by the generic utility
+ * function @set_trigger_filter() (see trace_event_triggers.c).
+ *
+ * @get_trigger_ops: The callback function invoked to retrieve the
+ * event_trigger_ops implementation associated with the command.
+ */
+struct event_command {
+ struct list_head list;
+ char *name;
+ enum event_trigger_type trigger_type;
+ bool post_trigger;
+ int (*func)(struct event_command *cmd_ops,
+ struct ftrace_event_file *file,
+ char *glob, char *cmd, char *params);
+ int (*reg)(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file);
+ void (*unreg)(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file);
+ int (*set_filter)(char *filter_str,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file);
+ struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
+};
+
+extern int trace_event_enable_disable(struct ftrace_event_file *file,
+ int enable, int soft_disable);
+extern int tracing_alloc_snapshot(void);
+
+extern const char *__start___trace_bprintk_fmt[];
+extern const char *__stop___trace_bprintk_fmt[];
+
+extern const char *__start___tracepoint_str[];
+extern const char *__stop___tracepoint_str[];
+
+void trace_printk_init_buffers(void);
+void trace_printk_start_comm(void);
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+
+/*
+ * Normal trace_printk() and friends allocates special buffers
+ * to do the manipulation, as well as saves the print formats
+ * into sections to display. But the trace infrastructure wants
+ * to use these without the added overhead at the price of being
+ * a bit slower (used mainly for warnings, where we don't care
+ * about performance). The internal_trace_puts() is for such
+ * a purpose.
+ */
+#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
+ extern struct ftrace_event_call \
+ __aligned(4) event_##call;
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+#include "trace_entries.h"
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
+int perf_ftrace_event_register(struct ftrace_event_call *call,
+ enum trace_reg type, void *data);
+#else
+#define perf_ftrace_event_register NULL
+#endif
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+void init_ftrace_syscalls(void);
+#else
+static inline void init_ftrace_syscalls(void) { }
+#endif
+
+#ifdef CONFIG_EVENT_TRACING
+void trace_event_init(void);
+void trace_event_enum_update(struct trace_enum_map **map, int len);
+#else
+static inline void __init trace_event_init(void) { }
+static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
+#endif
+
+extern struct trace_iterator *tracepoint_print_iter;
+
+#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 000000000..40a14cbcf
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,198 @@
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/trace_clock.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_benchmark.h"
+
+static struct task_struct *bm_event_thread;
+
+static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
+
+static u64 bm_total;
+static u64 bm_totalsq;
+static u64 bm_last;
+static u64 bm_max;
+static u64 bm_min;
+static u64 bm_first;
+static u64 bm_cnt;
+static u64 bm_stddev;
+static unsigned int bm_avg;
+static unsigned int bm_std;
+
+/*
+ * This gets called in a loop recording the time it took to write
+ * the tracepoint. What it writes is the time statistics of the last
+ * tracepoint write. As there is nothing to write the first time
+ * it simply writes "START". As the first write is cold cache and
+ * the rest is hot, we save off that time in bm_first and it is
+ * reported as "first", which is shown in the second write to the
+ * tracepoint. The "first" field is writen within the statics from
+ * then on but never changes.
+ */
+static void trace_do_benchmark(void)
+{
+ u64 start;
+ u64 stop;
+ u64 delta;
+ u64 stddev;
+ u64 seed;
+ u64 last_seed;
+ unsigned int avg;
+ unsigned int std = 0;
+
+ /* Only run if the tracepoint is actually active */
+ if (!trace_benchmark_event_enabled())
+ return;
+
+ local_irq_disable();
+ start = trace_clock_local();
+ trace_benchmark_event(bm_str);
+ stop = trace_clock_local();
+ local_irq_enable();
+
+ bm_cnt++;
+
+ delta = stop - start;
+
+ /*
+ * The first read is cold cached, keep it separate from the
+ * other calculations.
+ */
+ if (bm_cnt == 1) {
+ bm_first = delta;
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "first=%llu [COLD CACHED]", bm_first);
+ return;
+ }
+
+ bm_last = delta;
+
+ if (delta > bm_max)
+ bm_max = delta;
+ if (!bm_min || delta < bm_min)
+ bm_min = delta;
+
+ /*
+ * When bm_cnt is greater than UINT_MAX, it breaks the statistics
+ * accounting. Freeze the statistics when that happens.
+ * We should have enough data for the avg and stddev anyway.
+ */
+ if (bm_cnt > UINT_MAX) {
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
+ bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
+ return;
+ }
+
+ bm_total += delta;
+ bm_totalsq += delta * delta;
+
+
+ if (bm_cnt > 1) {
+ /*
+ * Apply Welford's method to calculate standard deviation:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ */
+ stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
+ do_div(stddev, (u32)bm_cnt);
+ do_div(stddev, (u32)bm_cnt - 1);
+ } else
+ stddev = 0;
+
+ delta = bm_total;
+ do_div(delta, bm_cnt);
+ avg = delta;
+
+ if (stddev > 0) {
+ int i = 0;
+ /*
+ * stddev is the square of standard deviation but
+ * we want the actualy number. Use the average
+ * as our seed to find the std.
+ *
+ * The next try is:
+ * x = (x + N/x) / 2
+ *
+ * Where N is the squared number to find the square
+ * root of.
+ */
+ seed = avg;
+ do {
+ last_seed = seed;
+ seed = stddev;
+ if (!last_seed)
+ break;
+ do_div(seed, last_seed);
+ seed += last_seed;
+ do_div(seed, 2);
+ } while (i++ < 10 && last_seed != seed);
+
+ std = seed;
+ }
+
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
+ bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
+
+ bm_std = std;
+ bm_avg = avg;
+ bm_stddev = stddev;
+}
+
+static int benchmark_event_kthread(void *arg)
+{
+ /* sleep a bit to make sure the tracepoint gets activated */
+ msleep(100);
+
+ while (!kthread_should_stop()) {
+
+ trace_do_benchmark();
+
+ /*
+ * We don't go to sleep, but let others
+ * run as well.
+ */
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/*
+ * When the benchmark tracepoint is enabled, it calls this
+ * function and the thread that calls the tracepoint is created.
+ */
+void trace_benchmark_reg(void)
+{
+ bm_event_thread = kthread_run(benchmark_event_kthread,
+ NULL, "event_benchmark");
+ WARN_ON(!bm_event_thread);
+}
+
+/*
+ * When the benchmark tracepoint is disabled, it calls this
+ * function and the thread that calls the tracepoint is deleted
+ * and all the numbers are reset.
+ */
+void trace_benchmark_unreg(void)
+{
+ if (!bm_event_thread)
+ return;
+
+ kthread_stop(bm_event_thread);
+
+ strcpy(bm_str, "START");
+ bm_total = 0;
+ bm_totalsq = 0;
+ bm_last = 0;
+ bm_max = 0;
+ bm_min = 0;
+ bm_cnt = 0;
+ /* These don't need to be reset but reset them anyway */
+ bm_first = 0;
+ bm_std = 0;
+ bm_avg = 0;
+ bm_stddev = 0;
+}
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 000000000..3c1df1df4
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM benchmark
+
+#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BENCHMARK_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_benchmark_reg(void);
+extern void trace_benchmark_unreg(void);
+
+#define BENCHMARK_EVENT_STRLEN 128
+
+TRACE_EVENT_FN(benchmark_event,
+
+ TP_PROTO(const char *str),
+
+ TP_ARGS(str),
+
+ TP_STRUCT__entry(
+ __array( char, str, BENCHMARK_EVENT_STRLEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+ ),
+
+ TP_printk("%s", __entry->str),
+
+ trace_benchmark_reg, trace_benchmark_unreg
+);
+
+#endif /* _TRACE_BENCHMARK_H */
+
+#undef TRACE_INCLUDE_FILE
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_benchmark
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 000000000..1879980f0
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,414 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/irqflags.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+
+#include "trace.h"
+#include "trace_stat.h"
+#include "trace_output.h"
+
+#ifdef CONFIG_BRANCH_TRACER
+
+static struct tracer branch_trace;
+static int branch_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(branch_tracing_mutex);
+
+static struct trace_array *branch_tracer;
+
+static void
+probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+ struct ftrace_event_call *call = &event_branch;
+ struct trace_array *tr = branch_tracer;
+ struct trace_array_cpu *data;
+ struct ring_buffer_event *event;
+ struct trace_branch *entry;
+ struct ring_buffer *buffer;
+ unsigned long flags;
+ int pc;
+ const char *p;
+
+ if (current->trace_recursion & TRACE_BRANCH_BIT)
+ return;
+
+ /*
+ * I would love to save just the ftrace_likely_data pointer, but
+ * this code can also be used by modules. Ugly things can happen
+ * if the module is unloaded, and then we go and read the
+ * pointer. This is slower, but much safer.
+ */
+
+ if (unlikely(!tr))
+ return;
+
+ raw_local_irq_save(flags);
+ current->trace_recursion |= TRACE_BRANCH_BIT;
+ data = this_cpu_ptr(tr->trace_buffer.data);
+ if (atomic_read(&data->disabled))
+ goto out;
+
+ pc = preempt_count();
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ goto out;
+
+ entry = ring_buffer_event_data(event);
+
+ /* Strip off the path, only save the file */
+ p = f->file + strlen(f->file);
+ while (p >= f->file && *p != '/')
+ p--;
+ p++;
+
+ strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+ strncpy(entry->file, p, TRACE_FILE_SIZE);
+ entry->func[TRACE_FUNC_SIZE] = 0;
+ entry->file[TRACE_FILE_SIZE] = 0;
+ entry->line = f->line;
+ entry->correct = val == expect;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+
+ out:
+ current->trace_recursion &= ~TRACE_BRANCH_BIT;
+ raw_local_irq_restore(flags);
+}
+
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+ if (!branch_tracing_enabled)
+ return;
+
+ probe_likely_condition(f, val, expect);
+}
+
+int enable_branch_tracing(struct trace_array *tr)
+{
+ mutex_lock(&branch_tracing_mutex);
+ branch_tracer = tr;
+ /*
+ * Must be seen before enabling. The reader is a condition
+ * where we do not need a matching rmb()
+ */
+ smp_wmb();
+ branch_tracing_enabled++;
+ mutex_unlock(&branch_tracing_mutex);
+
+ return 0;
+}
+
+void disable_branch_tracing(void)
+{
+ mutex_lock(&branch_tracing_mutex);
+
+ if (!branch_tracing_enabled)
+ goto out_unlock;
+
+ branch_tracing_enabled--;
+
+ out_unlock:
+ mutex_unlock(&branch_tracing_mutex);
+}
+
+static void start_branch_trace(struct trace_array *tr)
+{
+ enable_branch_tracing(tr);
+}
+
+static void stop_branch_trace(struct trace_array *tr)
+{
+ disable_branch_tracing();
+}
+
+static int branch_trace_init(struct trace_array *tr)
+{
+ start_branch_trace(tr);
+ return 0;
+}
+
+static void branch_trace_reset(struct trace_array *tr)
+{
+ stop_branch_trace(tr);
+}
+
+static enum print_line_t trace_branch_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ struct trace_branch *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static void branch_print_header(struct seq_file *s)
+{
+ seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
+ " FUNC:FILE:LINE\n"
+ "# | | | | | "
+ " |\n");
+}
+
+static struct trace_event_functions trace_branch_funcs = {
+ .trace = trace_branch_print,
+};
+
+static struct trace_event trace_branch_event = {
+ .type = TRACE_BRANCH,
+ .funcs = &trace_branch_funcs,
+};
+
+static struct tracer branch_trace __read_mostly =
+{
+ .name = "branch",
+ .init = branch_trace_init,
+ .reset = branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_branch,
+#endif /* CONFIG_FTRACE_SELFTEST */
+ .print_header = branch_print_header,
+};
+
+__init static int init_branch_tracer(void)
+{
+ int ret;
+
+ ret = register_ftrace_event(&trace_branch_event);
+ if (!ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "branch events\n");
+ return 1;
+ }
+ return register_tracer(&branch_trace);
+}
+core_initcall(init_branch_tracer);
+
+#else
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+{
+ /*
+ * I would love to have a trace point here instead, but the
+ * trace point code is so inundated with unlikely and likely
+ * conditions that the recursive nightmare that exists is too
+ * much to try to get working. At least for now.
+ */
+ trace_likely_condition(f, val, expect);
+
+ /* FIXME: Make this atomic! */
+ if (val == expect)
+ f->correct++;
+ else
+ f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
+
+static int annotated_branch_stat_headers(struct seq_file *m)
+{
+ seq_puts(m, " correct incorrect % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
+}
+
+static inline long get_incorrect_percent(struct ftrace_branch_data *p)
+{
+ long percent;
+
+ if (p->correct) {
+ percent = p->incorrect * 100;
+ percent /= p->correct + p->incorrect;
+ } else
+ percent = p->incorrect ? 100 : -1;
+
+ return percent;
+}
+
+static int branch_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_branch_data *p = v;
+ const char *f;
+ long percent;
+
+ /* Only print the file, not the path */
+ f = p->file + strlen(p->file);
+ while (f >= p->file && *f != '/')
+ f--;
+ f++;
+
+ /*
+ * The miss is overlayed on correct, and hit on incorrect.
+ */
+ percent = get_incorrect_percent(p);
+
+ seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
+ if (percent < 0)
+ seq_puts(m, " X ");
+ else
+ seq_printf(m, "%3ld ", percent);
+ seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+ return 0;
+}
+
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
+{
+ return __start_annotated_branch_profile;
+}
+
+static void *
+annotated_branch_stat_next(void *v, int idx)
+{
+ struct ftrace_branch_data *p = v;
+
+ ++p;
+
+ if ((void *)p >= (void *)__stop_annotated_branch_profile)
+ return NULL;
+
+ return p;
+}
+
+static int annotated_branch_stat_cmp(void *p1, void *p2)
+{
+ struct ftrace_branch_data *a = p1;
+ struct ftrace_branch_data *b = p2;
+
+ long percent_a, percent_b;
+
+ percent_a = get_incorrect_percent(a);
+ percent_b = get_incorrect_percent(b);
+
+ if (percent_a < percent_b)
+ return -1;
+ if (percent_a > percent_b)
+ return 1;
+
+ if (a->incorrect < b->incorrect)
+ return -1;
+ if (a->incorrect > b->incorrect)
+ return 1;
+
+ /*
+ * Since the above shows worse (incorrect) cases
+ * first, we continue that by showing best (correct)
+ * cases last.
+ */
+ if (a->correct > b->correct)
+ return -1;
+ if (a->correct < b->correct)
+ return 1;
+
+ return 0;
+}
+
+static struct tracer_stat annotated_branch_stats = {
+ .name = "branch_annotated",
+ .stat_start = annotated_branch_stat_start,
+ .stat_next = annotated_branch_stat_next,
+ .stat_cmp = annotated_branch_stat_cmp,
+ .stat_headers = annotated_branch_stat_headers,
+ .stat_show = branch_stat_show
+};
+
+__init static int init_annotated_branch_stats(void)
+{
+ int ret;
+
+ ret = register_stat_tracer(&annotated_branch_stats);
+ if (!ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "annotated branches stats\n");
+ return 1;
+ }
+ return 0;
+}
+fs_initcall(init_annotated_branch_stats);
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+
+extern unsigned long __start_branch_profile[];
+extern unsigned long __stop_branch_profile[];
+
+static int all_branch_stat_headers(struct seq_file *m)
+{
+ seq_puts(m, " miss hit % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
+ return 0;
+}
+
+static void *all_branch_stat_start(struct tracer_stat *trace)
+{
+ return __start_branch_profile;
+}
+
+static void *
+all_branch_stat_next(void *v, int idx)
+{
+ struct ftrace_branch_data *p = v;
+
+ ++p;
+
+ if ((void *)p >= (void *)__stop_branch_profile)
+ return NULL;
+
+ return p;
+}
+
+static struct tracer_stat all_branch_stats = {
+ .name = "branch_all",
+ .stat_start = all_branch_stat_start,
+ .stat_next = all_branch_stat_next,
+ .stat_headers = all_branch_stat_headers,
+ .stat_show = branch_stat_show
+};
+
+__init static int all_annotated_branch_stats(void)
+{
+ int ret;
+
+ ret = register_stat_tracer(&all_branch_stats);
+ if (!ret) {
+ printk(KERN_WARNING "Warning: could not register "
+ "all branches stats\n");
+ return 1;
+ }
+ return 0;
+}
+fs_initcall(all_annotated_branch_stats);
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
new file mode 100644
index 000000000..57b67b1f2
--- /dev/null
+++ b/kernel/trace/trace_clock.c
@@ -0,0 +1,137 @@
+/*
+ * tracing clocks
+ *
+ * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Implements 3 trace clock variants, with differing scalability/precision
+ * tradeoffs:
+ *
+ * - local: CPU-local trace clock
+ * - medium: scalable global clock with some jitter
+ * - global: globally monotonic, serialized clock
+ *
+ * Tracer plugins will chose a default from these clocks.
+ */
+#include <linux/spinlock.h>
+#include <linux/irqflags.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/ktime.h>
+#include <linux/trace_clock.h>
+
+/*
+ * trace_clock_local(): the simplest and least coherent tracing clock.
+ *
+ * Useful for tracing that does not cross to other CPUs nor
+ * does it go through idle events.
+ */
+u64 notrace trace_clock_local(void)
+{
+ u64 clock;
+
+ /*
+ * sched_clock() is an architecture implemented, fast, scalable,
+ * lockless clock. It is not guaranteed to be coherent across
+ * CPUs, nor across CPU idle events.
+ */
+ preempt_disable_notrace();
+ clock = sched_clock();
+ preempt_enable_notrace();
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(trace_clock_local);
+
+/*
+ * trace_clock(): 'between' trace clock. Not completely serialized,
+ * but not completely incorrect when crossing CPUs either.
+ *
+ * This is based on cpu_clock(), which will allow at most ~1 jiffy of
+ * jitter between CPUs. So it's a pretty scalable clock, but there
+ * can be offsets in the trace data.
+ */
+u64 notrace trace_clock(void)
+{
+ return local_clock();
+}
+
+/*
+ * trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
+ */
+u64 notrace trace_clock_jiffies(void)
+{
+ return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
+}
+
+/*
+ * trace_clock_global(): special globally coherent trace clock
+ *
+ * It has higher overhead than the other trace clocks but is still
+ * an order of magnitude faster than GTOD derived hardware clocks.
+ *
+ * Used by plugins that need globally coherent timestamps.
+ */
+
+/* keep prev_time and lock in the same cacheline. */
+static struct {
+ u64 prev_time;
+ arch_spinlock_t lock;
+} trace_clock_struct ____cacheline_aligned_in_smp =
+ {
+ .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
+ };
+
+u64 notrace trace_clock_global(void)
+{
+ unsigned long flags;
+ int this_cpu;
+ u64 now;
+
+ local_irq_save(flags);
+
+ this_cpu = raw_smp_processor_id();
+ now = sched_clock_cpu(this_cpu);
+ /*
+ * If in an NMI context then dont risk lockups and return the
+ * cpu_clock() time:
+ */
+ if (unlikely(in_nmi()))
+ goto out;
+
+ arch_spin_lock(&trace_clock_struct.lock);
+
+ /*
+ * TODO: if this happens often then maybe we should reset
+ * my_scd->clock to prev_time+1, to make sure
+ * we start ticking with the local clock from now on?
+ */
+ if ((s64)(now - trace_clock_struct.prev_time) < 0)
+ now = trace_clock_struct.prev_time + 1;
+
+ trace_clock_struct.prev_time = now;
+
+ arch_spin_unlock(&trace_clock_struct.lock);
+
+ out:
+ local_irq_restore(flags);
+
+ return now;
+}
+
+static atomic64_t trace_counter;
+
+/*
+ * trace_clock_counter(): simply an atomic counter.
+ * Use the trace_counter "counter" for cases where you do not care
+ * about timings, but are interested in strict ordering.
+ */
+u64 notrace trace_clock_counter(void)
+{
+ return atomic64_add_return(1, &trace_counter);
+}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000..ee7b94a48
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,324 @@
+/*
+ * This file defines the trace event structures that go into the ring
+ * buffer directly. They are created via macros so that changes for them
+ * appear in the format file. Using macros will automate this process.
+ *
+ * The macro used to create a ftrace data structure is:
+ *
+ * FTRACE_ENTRY( name, struct_name, id, structure, print )
+ *
+ * @name: the name used the event name, as well as the name of
+ * the directory that holds the format file.
+ *
+ * @struct_name: the name of the structure that is created.
+ *
+ * @id: The event identifier that is used to detect what event
+ * this is from the ring buffer.
+ *
+ * @structure: the structure layout
+ *
+ * - __field( type, item )
+ * This is equivalent to declaring
+ * type item;
+ * in the structure.
+ * - __array( type, item, size )
+ * This is equivalent to declaring
+ * type item[size];
+ * in the structure.
+ *
+ * * for structures within structures, the format of the internal
+ * structure is laid out. This allows the internal structure
+ * to be deciphered for the format file. Although these macros
+ * may become out of sync with the internal structure, they
+ * will create a compile error if it happens. Since the
+ * internel structures are just tracing helpers, this is not
+ * an issue.
+ *
+ * When an internal structure is used, it should use:
+ *
+ * __field_struct( type, item )
+ *
+ * instead of __field. This will prevent it from being shown in
+ * the output file. The fields in the structure should use.
+ *
+ * __field_desc( type, container, item )
+ * __array_desc( type, container, item, len )
+ *
+ * type, item and len are the same as __field and __array, but
+ * container is added. This is the name of the item in
+ * __field_struct that this is describing.
+ *
+ *
+ * @print: the print format shown to users in the format file.
+ */
+
+/*
+ * Function trace entry - function address and parent function address:
+ */
+FTRACE_ENTRY_REG(function, ftrace_entry,
+
+ TRACE_FN,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( unsigned long, parent_ip )
+ ),
+
+ F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
+
+ FILTER_TRACE_FN,
+
+ perf_ftrace_event_register
+);
+
+/* Function call entry */
+FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+
+ TRACE_GRAPH_ENT,
+
+ F_STRUCT(
+ __field_struct( struct ftrace_graph_ent, graph_ent )
+ __field_desc( unsigned long, graph_ent, func )
+ __field_desc( int, graph_ent, depth )
+ ),
+
+ F_printk("--> %lx (%d)", __entry->func, __entry->depth),
+
+ FILTER_OTHER
+);
+
+/* Function return entry */
+FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+
+ TRACE_GRAPH_RET,
+
+ F_STRUCT(
+ __field_struct( struct ftrace_graph_ret, ret )
+ __field_desc( unsigned long, ret, func )
+ __field_desc( unsigned long long, ret, calltime)
+ __field_desc( unsigned long long, ret, rettime )
+ __field_desc( unsigned long, ret, overrun )
+ __field_desc( int, ret, depth )
+ ),
+
+ F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
+ __entry->func, __entry->depth,
+ __entry->calltime, __entry->rettime,
+ __entry->depth),
+
+ FILTER_OTHER
+);
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ *
+ * This is used for both wakeup and context switches. We only want
+ * to create one structure, but we need two outputs for it.
+ */
+#define FTRACE_CTX_FIELDS \
+ __field( unsigned int, prev_pid ) \
+ __field( unsigned int, next_pid ) \
+ __field( unsigned int, next_cpu ) \
+ __field( unsigned char, prev_prio ) \
+ __field( unsigned char, prev_state ) \
+ __field( unsigned char, next_prio ) \
+ __field( unsigned char, next_state )
+
+FTRACE_ENTRY(context_switch, ctx_switch_entry,
+
+ TRACE_CTX,
+
+ F_STRUCT(
+ FTRACE_CTX_FIELDS
+ ),
+
+ F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
+ __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
+ __entry->next_pid, __entry->next_prio, __entry->next_state,
+ __entry->next_cpu),
+
+ FILTER_OTHER
+);
+
+/*
+ * FTRACE_ENTRY_DUP only creates the format file, it will not
+ * create another structure.
+ */
+FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
+
+ TRACE_WAKE,
+
+ F_STRUCT(
+ FTRACE_CTX_FIELDS
+ ),
+
+ F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
+ __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
+ __entry->next_pid, __entry->next_prio, __entry->next_state,
+ __entry->next_cpu),
+
+ FILTER_OTHER
+);
+
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES 8
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+FTRACE_ENTRY(kernel_stack, stack_entry,
+
+ TRACE_STACK,
+
+ F_STRUCT(
+ __field( int, size )
+ __dynamic_array(unsigned long, caller )
+ ),
+
+ F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
+ __entry->caller[0], __entry->caller[1], __entry->caller[2],
+ __entry->caller[3], __entry->caller[4], __entry->caller[5],
+ __entry->caller[6], __entry->caller[7]),
+
+ FILTER_OTHER
+);
+
+FTRACE_ENTRY(user_stack, userstack_entry,
+
+ TRACE_USER_STACK,
+
+ F_STRUCT(
+ __field( unsigned int, tgid )
+ __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
+ ),
+
+ F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
+ __entry->caller[0], __entry->caller[1], __entry->caller[2],
+ __entry->caller[3], __entry->caller[4], __entry->caller[5],
+ __entry->caller[6], __entry->caller[7]),
+
+ FILTER_OTHER
+);
+
+/*
+ * trace_printk entry:
+ */
+FTRACE_ENTRY(bprint, bprint_entry,
+
+ TRACE_BPRINT,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( const char *, fmt )
+ __dynamic_array( u32, buf )
+ ),
+
+ F_printk("%ps: %s",
+ (void *)__entry->ip, __entry->fmt),
+
+ FILTER_OTHER
+);
+
+FTRACE_ENTRY(print, print_entry,
+
+ TRACE_PRINT,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __dynamic_array( char, buf )
+ ),
+
+ F_printk("%ps: %s",
+ (void *)__entry->ip, __entry->buf),
+
+ FILTER_OTHER
+);
+
+FTRACE_ENTRY(bputs, bputs_entry,
+
+ TRACE_BPUTS,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( const char *, str )
+ ),
+
+ F_printk("%ps: %s",
+ (void *)__entry->ip, __entry->str),
+
+ FILTER_OTHER
+);
+
+FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
+
+ TRACE_MMIO_RW,
+
+ F_STRUCT(
+ __field_struct( struct mmiotrace_rw, rw )
+ __field_desc( resource_size_t, rw, phys )
+ __field_desc( unsigned long, rw, value )
+ __field_desc( unsigned long, rw, pc )
+ __field_desc( int, rw, map_id )
+ __field_desc( unsigned char, rw, opcode )
+ __field_desc( unsigned char, rw, width )
+ ),
+
+ F_printk("%lx %lx %lx %d %x %x",
+ (unsigned long)__entry->phys, __entry->value, __entry->pc,
+ __entry->map_id, __entry->opcode, __entry->width),
+
+ FILTER_OTHER
+);
+
+FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
+
+ TRACE_MMIO_MAP,
+
+ F_STRUCT(
+ __field_struct( struct mmiotrace_map, map )
+ __field_desc( resource_size_t, map, phys )
+ __field_desc( unsigned long, map, virt )
+ __field_desc( unsigned long, map, len )
+ __field_desc( int, map, map_id )
+ __field_desc( unsigned char, map, opcode )
+ ),
+
+ F_printk("%lx %lx %lx %d %x",
+ (unsigned long)__entry->phys, __entry->virt, __entry->len,
+ __entry->map_id, __entry->opcode),
+
+ FILTER_OTHER
+);
+
+
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+
+FTRACE_ENTRY(branch, trace_branch,
+
+ TRACE_BRANCH,
+
+ F_STRUCT(
+ __field( unsigned int, line )
+ __array( char, func, TRACE_FUNC_SIZE+1 )
+ __array( char, file, TRACE_FILE_SIZE+1 )
+ __field( char, correct )
+ ),
+
+ F_printk("%u:%s:%s (%u)",
+ __entry->line,
+ __entry->func, __entry->file, __entry->correct),
+
+ FILTER_OTHER
+);
+
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
new file mode 100644
index 000000000..6fa484de2
--- /dev/null
+++ b/kernel/trace/trace_event_perf.c
@@ -0,0 +1,384 @@
+/*
+ * trace event based perf event profiling/tracing
+ *
+ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include "trace.h"
+
+static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
+
+/*
+ * Force it to be aligned to unsigned long to avoid misaligned accesses
+ * suprises
+ */
+typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
+ perf_trace_t;
+
+/* Count the events in use (per event id, not per instance) */
+static int total_ref_count;
+
+static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ if (tp_event->perf_perm) {
+ int ret = tp_event->perf_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * We checked and allowed to create parent,
+ * allow children without checking.
+ */
+ if (p_event->parent)
+ return 0;
+
+ /*
+ * It's ok to check current process (owner) permissions in here,
+ * because code below is called only via perf_event_open syscall.
+ */
+
+ /* The ftrace function trace is allowed only for root. */
+ if (ftrace_event_is_function(tp_event)) {
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * We don't allow user space callchains for function trace
+ * event, due to issues with page faults while tracing page
+ * fault handler and its overall trickiness nature.
+ */
+ if (!p_event->attr.exclude_callchain_user)
+ return -EINVAL;
+
+ /*
+ * Same reason to disable user stack dump as for user space
+ * callchains above.
+ */
+ if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
+ return -EINVAL;
+ }
+
+ /* No tracing, just counting, so no obvious leak */
+ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+ return 0;
+
+ /* Some events are ok to be traced by non-root users... */
+ if (p_event->attach_state == PERF_ATTACH_TASK) {
+ if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
+ return 0;
+ }
+
+ /*
+ * ...otherwise raw tracepoint data can be a severe data leak,
+ * only allow root to have these.
+ */
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return 0;
+}
+
+static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ struct hlist_head __percpu *list;
+ int ret = -ENOMEM;
+ int cpu;
+
+ p_event->tp_event = tp_event;
+ if (tp_event->perf_refcount++ > 0)
+ return 0;
+
+ list = alloc_percpu(struct hlist_head);
+ if (!list)
+ goto fail;
+
+ for_each_possible_cpu(cpu)
+ INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
+
+ tp_event->perf_events = list;
+
+ if (!total_ref_count) {
+ char __percpu *buf;
+ int i;
+
+ for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+ buf = (char __percpu *)alloc_percpu(perf_trace_t);
+ if (!buf)
+ goto fail;
+
+ perf_trace_buf[i] = buf;
+ }
+ }
+
+ ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
+ if (ret)
+ goto fail;
+
+ total_ref_count++;
+ return 0;
+
+fail:
+ if (!total_ref_count) {
+ int i;
+
+ for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
+ }
+ }
+
+ if (!--tp_event->perf_refcount) {
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
+ }
+
+ return ret;
+}
+
+static void perf_trace_event_unreg(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ int i;
+
+ if (--tp_event->perf_refcount > 0)
+ goto out;
+
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
+
+ /*
+ * Ensure our callback won't be called anymore. The buffers
+ * will be freed after that.
+ */
+ tracepoint_synchronize_unregister();
+
+ free_percpu(tp_event->perf_events);
+ tp_event->perf_events = NULL;
+
+ if (!--total_ref_count) {
+ for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+ free_percpu(perf_trace_buf[i]);
+ perf_trace_buf[i] = NULL;
+ }
+ }
+out:
+ module_put(tp_event->mod);
+}
+
+static int perf_trace_event_open(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
+}
+
+static void perf_trace_event_close(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
+}
+
+static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ int ret;
+
+ ret = perf_trace_event_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+
+ ret = perf_trace_event_reg(tp_event, p_event);
+ if (ret)
+ return ret;
+
+ ret = perf_trace_event_open(p_event);
+ if (ret) {
+ perf_trace_event_unreg(p_event);
+ return ret;
+ }
+
+ return 0;
+}
+
+int perf_trace_init(struct perf_event *p_event)
+{
+ struct ftrace_event_call *tp_event;
+ u64 event_id = p_event->attr.config;
+ int ret = -EINVAL;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(tp_event, &ftrace_events, list) {
+ if (tp_event->event.type == event_id &&
+ tp_event->class && tp_event->class->reg &&
+ try_module_get(tp_event->mod)) {
+ ret = perf_trace_event_init(tp_event, p_event);
+ if (ret)
+ module_put(tp_event->mod);
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+void perf_trace_destroy(struct perf_event *p_event)
+{
+ mutex_lock(&event_mutex);
+ perf_trace_event_close(p_event);
+ perf_trace_event_unreg(p_event);
+ mutex_unlock(&event_mutex);
+}
+
+int perf_trace_add(struct perf_event *p_event, int flags)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ struct hlist_head __percpu *pcpu_list;
+ struct hlist_head *list;
+
+ pcpu_list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!pcpu_list))
+ return -EINVAL;
+
+ if (!(flags & PERF_EF_START))
+ p_event->hw.state = PERF_HES_STOPPED;
+
+ list = this_cpu_ptr(pcpu_list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
+
+ return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+}
+
+void perf_trace_del(struct perf_event *p_event, int flags)
+{
+ struct ftrace_event_call *tp_event = p_event->tp_event;
+ hlist_del_rcu(&p_event->hlist_entry);
+ tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+}
+
+void *perf_trace_buf_prepare(int size, unsigned short type,
+ struct pt_regs **regs, int *rctxp)
+{
+ struct trace_entry *entry;
+ unsigned long flags;
+ char *raw_data;
+ int pc;
+
+ BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
+
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "perf buffer not large enough"))
+ return NULL;
+
+ pc = preempt_count();
+
+ *rctxp = perf_swevent_get_recursion_context();
+ if (*rctxp < 0)
+ return NULL;
+
+ if (regs)
+ *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
+ raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+
+ entry = (struct trace_entry *)raw_data;
+ local_save_flags(flags);
+ tracing_generic_entry_update(entry, flags, pc);
+ entry->type = type;
+
+ return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void
+perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *pt_regs)
+{
+ struct ftrace_entry *entry;
+ struct hlist_head *head;
+ struct pt_regs regs;
+ int rctx;
+
+ head = this_cpu_ptr(event_function.perf_events);
+ if (hlist_empty(head))
+ return;
+
+#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
+ sizeof(u64)) - sizeof(u32))
+
+ BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
+
+ perf_fetch_caller_regs(&regs);
+
+ entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+ if (!entry)
+ return;
+
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+ perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+ 1, &regs, head, NULL);
+
+#undef ENTRY_SIZE
+}
+
+static int perf_ftrace_function_register(struct perf_event *event)
+{
+ struct ftrace_ops *ops = &event->ftrace_ops;
+
+ ops->flags |= FTRACE_OPS_FL_CONTROL;
+ ops->func = perf_ftrace_function_call;
+ return register_ftrace_function(ops);
+}
+
+static int perf_ftrace_function_unregister(struct perf_event *event)
+{
+ struct ftrace_ops *ops = &event->ftrace_ops;
+ int ret = unregister_ftrace_function(ops);
+ ftrace_free_filter(ops);
+ return ret;
+}
+
+static void perf_ftrace_function_enable(struct perf_event *event)
+{
+ ftrace_function_local_enable(&event->ftrace_ops);
+}
+
+static void perf_ftrace_function_disable(struct perf_event *event)
+{
+ ftrace_function_local_disable(&event->ftrace_ops);
+}
+
+int perf_ftrace_event_register(struct ftrace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ case TRACE_REG_UNREGISTER:
+ break;
+ case TRACE_REG_PERF_REGISTER:
+ case TRACE_REG_PERF_UNREGISTER:
+ return 0;
+ case TRACE_REG_PERF_OPEN:
+ return perf_ftrace_function_register(data);
+ case TRACE_REG_PERF_CLOSE:
+ return perf_ftrace_function_unregister(data);
+ case TRACE_REG_PERF_ADD:
+ perf_ftrace_function_enable(data);
+ return 0;
+ case TRACE_REG_PERF_DEL:
+ perf_ftrace_function_disable(data);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
new file mode 100644
index 000000000..c4de47fc5
--- /dev/null
+++ b/kernel/trace/trace_events.c
@@ -0,0 +1,2943 @@
+/*
+ * event tracer
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * - Added format output of fields of the trace point.
+ * This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
+ *
+ */
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/tracefs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+
+#include <asm/setup.h>
+
+#include "trace_output.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM "TRACE_SYSTEM"
+
+DEFINE_MUTEX(event_mutex);
+
+LIST_HEAD(ftrace_events);
+static LIST_HEAD(ftrace_common_fields);
+
+#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
+
+static struct kmem_cache *field_cachep;
+static struct kmem_cache *file_cachep;
+
+#define SYSTEM_FL_FREE_NAME (1 << 31)
+
+static inline int system_refcount(struct event_subsystem *system)
+{
+ return system->ref_count & ~SYSTEM_FL_FREE_NAME;
+}
+
+static int system_refcount_inc(struct event_subsystem *system)
+{
+ return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
+}
+
+static int system_refcount_dec(struct event_subsystem *system)
+{
+ return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
+}
+
+/* Double loops, do not use break, only goto's work */
+#define do_for_each_event_file(tr, file) \
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
+ list_for_each_entry(file, &tr->events, list)
+
+#define do_for_each_event_file_safe(tr, file) \
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
+ struct ftrace_event_file *___n; \
+ list_for_each_entry_safe(file, ___n, &tr->events, list)
+
+#define while_for_each_event_file() \
+ }
+
+static struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call)
+{
+ if (!event_call->class->get_fields)
+ return &event_call->class->fields;
+ return event_call->class->get_fields(event_call);
+}
+
+static struct ftrace_event_field *
+__find_event_field(struct list_head *head, char *name)
+{
+ struct ftrace_event_field *field;
+
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(field->name, name))
+ return field;
+ }
+
+ return NULL;
+}
+
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ field = __find_event_field(&ftrace_common_fields, name);
+ if (field)
+ return field;
+
+ head = trace_get_fields(call);
+ return __find_event_field(head, name);
+}
+
+static int __trace_define_field(struct list_head *head, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct ftrace_event_field *field;
+
+ field = kmem_cache_alloc(field_cachep, GFP_TRACE);
+ if (!field)
+ return -ENOMEM;
+
+ field->name = name;
+ field->type = type;
+
+ if (filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(type);
+ else
+ field->filter_type = filter_type;
+
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+
+ list_add(&field->link, head);
+
+ return 0;
+}
+
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type)
+{
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+ is_signed, filter_type);
+}
+EXPORT_SYMBOL_GPL(trace_define_field);
+
+#define __common_field(type, item) \
+ ret = __trace_define_field(&ftrace_common_fields, #type, \
+ "common_" #item, \
+ offsetof(typeof(ent), item), \
+ sizeof(ent.item), \
+ is_signed_type(type), FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+static int trace_define_common_fields(void)
+{
+ int ret;
+ struct trace_entry ent;
+
+ __common_field(unsigned short, type);
+ __common_field(unsigned char, flags);
+ __common_field(unsigned char, preempt_count);
+ __common_field(int, pid);
+
+ return ret;
+}
+
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head;
+
+ head = trace_get_fields(call);
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kmem_cache_free(field_cachep, field);
+ }
+}
+
+int trace_event_raw_init(struct ftrace_event_call *call)
+{
+ int id;
+
+ id = register_ftrace_event(&call->event);
+ if (!id)
+ return -ENODEV;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(trace_event_raw_init);
+
+void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
+ struct ftrace_event_file *ftrace_file,
+ unsigned long len)
+{
+ struct ftrace_event_call *event_call = ftrace_file->event_call;
+
+ local_save_flags(fbuffer->flags);
+ fbuffer->pc = preempt_count();
+ fbuffer->ftrace_file = ftrace_file;
+
+ fbuffer->event =
+ trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
+ event_call->event.type, len,
+ fbuffer->flags, fbuffer->pc);
+ if (!fbuffer->event)
+ return NULL;
+
+ fbuffer->entry = ring_buffer_event_data(fbuffer->event);
+ return fbuffer->entry;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+
+static DEFINE_SPINLOCK(tracepoint_iter_lock);
+
+static void output_printk(struct ftrace_event_buffer *fbuffer)
+{
+ struct ftrace_event_call *event_call;
+ struct trace_event *event;
+ unsigned long flags;
+ struct trace_iterator *iter = tracepoint_print_iter;
+
+ if (!iter)
+ return;
+
+ event_call = fbuffer->ftrace_file->event_call;
+ if (!event_call || !event_call->event.funcs ||
+ !event_call->event.funcs->trace)
+ return;
+
+ event = &fbuffer->ftrace_file->event_call->event;
+
+ spin_lock_irqsave(&tracepoint_iter_lock, flags);
+ trace_seq_init(&iter->seq);
+ iter->ent = fbuffer->entry;
+ event_call->event.funcs->trace(iter, 0, event);
+ trace_seq_putc(&iter->seq, 0);
+ printk("%s", iter->seq.buffer);
+
+ spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
+}
+
+void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
+{
+ if (tracepoint_printk)
+ output_printk(fbuffer);
+
+ event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
+ fbuffer->event, fbuffer->entry,
+ fbuffer->flags, fbuffer->pc);
+}
+EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
+
+int ftrace_event_reg(struct ftrace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ struct ftrace_event_file *file = data;
+
+ WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return tracepoint_probe_register(call->tp,
+ call->class->probe,
+ file);
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ file);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ call);
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ call);
+ return 0;
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_event_reg);
+
+void trace_event_enable_cmd_record(bool enable)
+{
+ struct ftrace_event_file *file;
+ struct trace_array *tr;
+
+ mutex_lock(&event_mutex);
+ do_for_each_event_file(tr, file) {
+
+ if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
+ continue;
+
+ if (enable) {
+ tracing_start_cmdline_record();
+ set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ } else {
+ tracing_stop_cmdline_record();
+ clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ }
+ } while_for_each_event_file();
+ mutex_unlock(&event_mutex);
+}
+
+static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+ int enable, int soft_disable)
+{
+ struct ftrace_event_call *call = file->event_call;
+ int ret = 0;
+ int disable;
+
+ switch (enable) {
+ case 0:
+ /*
+ * When soft_disable is set and enable is cleared, the sm_ref
+ * reference counter is decremented. If it reaches 0, we want
+ * to clear the SOFT_DISABLED flag but leave the event in the
+ * state that it was. That is, if the event was enabled and
+ * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
+ * is set we do not want the event to be enabled before we
+ * clear the bit.
+ *
+ * When soft_disable is not set but the SOFT_MODE flag is,
+ * we do nothing. Do not disable the tracepoint, otherwise
+ * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
+ */
+ if (soft_disable) {
+ if (atomic_dec_return(&file->sm_ref) > 0)
+ break;
+ disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
+ clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ } else
+ disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+
+ if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
+ clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+ if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
+ tracing_stop_cmdline_record();
+ clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ }
+ call->class->reg(call, TRACE_REG_UNREGISTER, file);
+ }
+ /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
+ if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ else
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ break;
+ case 1:
+ /*
+ * When soft_disable is set and enable is set, we want to
+ * register the tracepoint for the event, but leave the event
+ * as is. That means, if the event was already enabled, we do
+ * nothing (but set SOFT_MODE). If the event is disabled, we
+ * set SOFT_DISABLED before enabling the event tracepoint, so
+ * it still seems to be disabled.
+ */
+ if (!soft_disable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ else {
+ if (atomic_inc_return(&file->sm_ref) > 1)
+ break;
+ set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ }
+
+ if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+
+ /* Keep the event disabled, when going to SOFT_MODE. */
+ if (soft_disable)
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+
+ if (trace_flags & TRACE_ITER_RECORD_CMD) {
+ tracing_start_cmdline_record();
+ set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+ }
+ ret = call->class->reg(call, TRACE_REG_REGISTER, file);
+ if (ret) {
+ tracing_stop_cmdline_record();
+ pr_info("event trace: Could not enable event "
+ "%s\n", ftrace_event_name(call));
+ break;
+ }
+ set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+
+ /* WAS_ENABLED gets set but never cleared. */
+ call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
+ }
+ break;
+ }
+
+ return ret;
+}
+
+int trace_event_enable_disable(struct ftrace_event_file *file,
+ int enable, int soft_disable)
+{
+ return __ftrace_event_enable_disable(file, enable, soft_disable);
+}
+
+static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+ int enable)
+{
+ return __ftrace_event_enable_disable(file, enable, 0);
+}
+
+static void ftrace_clear_events(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(file, &tr->events, list) {
+ ftrace_event_enable_disable(file, 0);
+ }
+ mutex_unlock(&event_mutex);
+}
+
+static void __put_system(struct event_subsystem *system)
+{
+ struct event_filter *filter = system->filter;
+
+ WARN_ON_ONCE(system_refcount(system) == 0);
+ if (system_refcount_dec(system))
+ return;
+
+ list_del(&system->list);
+
+ if (filter) {
+ kfree(filter->filter_string);
+ kfree(filter);
+ }
+ if (system->ref_count & SYSTEM_FL_FREE_NAME)
+ kfree(system->name);
+ kfree(system);
+}
+
+static void __get_system(struct event_subsystem *system)
+{
+ WARN_ON_ONCE(system_refcount(system) == 0);
+ system_refcount_inc(system);
+}
+
+static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+{
+ WARN_ON_ONCE(dir->ref_count == 0);
+ dir->ref_count++;
+ __get_system(dir->subsystem);
+}
+
+static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+{
+ WARN_ON_ONCE(dir->ref_count == 0);
+ /* If the subsystem is about to be freed, the dir must be too */
+ WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
+
+ __put_system(dir->subsystem);
+ if (!--dir->ref_count)
+ kfree(dir);
+}
+
+static void put_system(struct ftrace_subsystem_dir *dir)
+{
+ mutex_lock(&event_mutex);
+ __put_system_dir(dir);
+ mutex_unlock(&event_mutex);
+}
+
+static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+{
+ if (!dir)
+ return;
+
+ if (!--dir->nr_events) {
+ tracefs_remove_recursive(dir->entry);
+ list_del(&dir->list);
+ __put_system_dir(dir);
+ }
+}
+
+static void remove_event_file_dir(struct ftrace_event_file *file)
+{
+ struct dentry *dir = file->dir;
+ struct dentry *child;
+
+ if (dir) {
+ spin_lock(&dir->d_lock); /* probably unneeded */
+ list_for_each_entry(child, &dir->d_subdirs, d_child) {
+ if (d_really_is_positive(child)) /* probably unneeded */
+ d_inode(child)->i_private = NULL;
+ }
+ spin_unlock(&dir->d_lock);
+
+ tracefs_remove_recursive(dir);
+ }
+
+ list_del(&file->list);
+ remove_subsystem(file->system);
+ free_event_filter(file->filter);
+ kmem_cache_free(file_cachep, file);
+}
+
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int
+__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
+ const char *sub, const char *event, int set)
+{
+ struct ftrace_event_file *file;
+ struct ftrace_event_call *call;
+ const char *name;
+ int ret = -EINVAL;
+
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
+ name = ftrace_event_name(call);
+
+ if (!name || !call->class || !call->class->reg)
+ continue;
+
+ if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ continue;
+
+ if (match &&
+ strcmp(match, name) != 0 &&
+ strcmp(match, call->class->system) != 0)
+ continue;
+
+ if (sub && strcmp(sub, call->class->system) != 0)
+ continue;
+
+ if (event && strcmp(event, name) != 0)
+ continue;
+
+ ftrace_event_enable_disable(file, set);
+
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
+ const char *sub, const char *event, int set)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+ ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
+{
+ char *event = NULL, *sub = NULL, *match;
+ int ret;
+
+ /*
+ * The buf format can be <subsystem>:<event-name>
+ * *:<event-name> means any event by that name.
+ * :<event-name> is the same.
+ *
+ * <subsystem>:* means all events in that subsystem
+ * <subsystem>: means the same.
+ *
+ * <name> (no ':') means all events in a subsystem with
+ * the name <name> or any event that matches <name>
+ */
+
+ match = strsep(&buf, ":");
+ if (buf) {
+ sub = match;
+ event = buf;
+ match = NULL;
+
+ if (!strlen(sub) || strcmp(sub, "*") == 0)
+ sub = NULL;
+ if (!strlen(event) || strcmp(event, "*") == 0)
+ event = NULL;
+ }
+
+ ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+
+ /* Put back the colon to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ':';
+
+ return ret;
+}
+
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+ struct trace_array *tr = top_trace_array();
+
+ if (!tr)
+ return -ENODEV;
+
+ return __ftrace_set_clr_event(tr, NULL, system, event, set);
+}
+EXPORT_SYMBOL_GPL(trace_set_clr_event);
+
+/* 128 should be much more than enough */
+#define EVENT_BUF_SIZE 127
+
+static ssize_t
+ftrace_event_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_parser parser;
+ struct seq_file *m = file->private_data;
+ struct trace_array *tr = m->private;
+ ssize_t read, ret;
+
+ if (!cnt)
+ return 0;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ read = trace_get_user(&parser, ubuf, cnt, ppos);
+
+ if (read >= 0 && trace_parser_loaded((&parser))) {
+ int set = 1;
+
+ if (*parser.buffer == '!')
+ set = 0;
+
+ parser.buffer[parser.idx] = 0;
+
+ ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
+ if (ret)
+ goto out_put;
+ }
+
+ ret = read;
+
+ out_put:
+ trace_parser_put(&parser);
+
+ return ret;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_event_file *file = v;
+ struct ftrace_event_call *call;
+ struct trace_array *tr = m->private;
+
+ (*pos)++;
+
+ list_for_each_entry_continue(file, &tr->events, list) {
+ call = file->event_call;
+ /*
+ * The ftrace subsystem is for showing formats only.
+ * They can not be enabled or disabled via the event files.
+ */
+ if (call->class && call->class->reg)
+ return file;
+ }
+
+ return NULL;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_event_file *file;
+ struct trace_array *tr = m->private;
+ loff_t l;
+
+ mutex_lock(&event_mutex);
+
+ file = list_entry(&tr->events, struct ftrace_event_file, list);
+ for (l = 0; l <= *pos; ) {
+ file = t_next(m, file, &l);
+ if (!file)
+ break;
+ }
+ return file;
+}
+
+static void *
+s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_event_file *file = v;
+ struct trace_array *tr = m->private;
+
+ (*pos)++;
+
+ list_for_each_entry_continue(file, &tr->events, list) {
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return file;
+ }
+
+ return NULL;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_event_file *file;
+ struct trace_array *tr = m->private;
+ loff_t l;
+
+ mutex_lock(&event_mutex);
+
+ file = list_entry(&tr->events, struct ftrace_event_file, list);
+ for (l = 0; l <= *pos; ) {
+ file = s_next(m, file, &l);
+ if (!file)
+ break;
+ }
+ return file;
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ struct ftrace_event_file *file = v;
+ struct ftrace_event_call *call = file->event_call;
+
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ seq_printf(m, "%s:", call->class->system);
+ seq_printf(m, "%s\n", ftrace_event_name(call));
+
+ return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&event_mutex);
+}
+
+static ssize_t
+event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_file *file;
+ unsigned long flags;
+ char buf[4] = "0";
+
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (likely(file))
+ flags = file->flags;
+ mutex_unlock(&event_mutex);
+
+ if (!file)
+ return -ENODEV;
+
+ if (flags & FTRACE_EVENT_FL_ENABLED &&
+ !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ strcpy(buf, "1");
+
+ if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
+ flags & FTRACE_EVENT_FL_SOFT_MODE)
+ strcat(buf, "*");
+
+ strcat(buf, "\n");
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
+}
+
+static ssize_t
+event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_file *file;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ switch (val) {
+ case 0:
+ case 1:
+ ret = -ENODEV;
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (likely(file))
+ ret = ftrace_event_enable_disable(file, val);
+ mutex_unlock(&event_mutex);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ *ppos += cnt;
+
+ return ret ? ret : cnt;
+}
+
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char set_to_char[4] = { '?', '0', '1', 'X' };
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
+ struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr = dir->tr;
+ char buf[2];
+ int set = 0;
+ int ret;
+
+ mutex_lock(&event_mutex);
+ list_for_each_entry(file, &tr->events, list) {
+ call = file->event_call;
+ if (!ftrace_event_name(call) || !call->class || !call->class->reg)
+ continue;
+
+ if (system && strcmp(call->class->system, system->name) != 0)
+ continue;
+
+ /*
+ * We need to find out if all the events are set
+ * or if all events or cleared, or if we have
+ * a mixture.
+ */
+ set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
+
+ /*
+ * If we have a mixture, no need to look further.
+ */
+ if (set == 3)
+ break;
+ }
+ mutex_unlock(&event_mutex);
+
+ buf[0] = set_to_char[set];
+ buf[1] = '\n';
+
+ ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+ return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
+ const char *name = NULL;
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ /*
+ * Opening of "enable" adds a ref count to system,
+ * so the name is safe to use.
+ */
+ if (system)
+ name = system->name;
+
+ ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
+ if (ret)
+ goto out;
+
+ ret = cnt;
+
+out:
+ *ppos += cnt;
+
+ return ret;
+}
+
+enum {
+ FORMAT_HEADER = 1,
+ FORMAT_FIELD_SEPERATOR = 2,
+ FORMAT_PRINTFMT = 3,
+};
+
+static void *f_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct ftrace_event_call *call = event_file_data(m->private);
+ struct list_head *common_head = &ftrace_common_fields;
+ struct list_head *head = trace_get_fields(call);
+ struct list_head *node = v;
+
+ (*pos)++;
+
+ switch ((unsigned long)v) {
+ case FORMAT_HEADER:
+ node = common_head;
+ break;
+
+ case FORMAT_FIELD_SEPERATOR:
+ node = head;
+ break;
+
+ case FORMAT_PRINTFMT:
+ /* all done */
+ return NULL;
+ }
+
+ node = node->prev;
+ if (node == common_head)
+ return (void *)FORMAT_FIELD_SEPERATOR;
+ else if (node == head)
+ return (void *)FORMAT_PRINTFMT;
+ else
+ return node;
+}
+
+static int f_show(struct seq_file *m, void *v)
+{
+ struct ftrace_event_call *call = event_file_data(m->private);
+ struct ftrace_event_field *field;
+ const char *array_descriptor;
+
+ switch ((unsigned long)v) {
+ case FORMAT_HEADER:
+ seq_printf(m, "name: %s\n", ftrace_event_name(call));
+ seq_printf(m, "ID: %d\n", call->event.type);
+ seq_puts(m, "format:\n");
+ return 0;
+
+ case FORMAT_FIELD_SEPERATOR:
+ seq_putc(m, '\n');
+ return 0;
+
+ case FORMAT_PRINTFMT:
+ seq_printf(m, "\nprint fmt: %s\n",
+ call->print_fmt);
+ return 0;
+ }
+
+ field = list_entry(v, struct ftrace_event_field, link);
+ /*
+ * Smartly shows the array type(except dynamic array).
+ * Normal:
+ * field:TYPE VAR
+ * If TYPE := TYPE[LEN], it is shown:
+ * field:TYPE VAR[LEN]
+ */
+ array_descriptor = strchr(field->type, '[');
+
+ if (!strncmp(field->type, "__data_loc", 10))
+ array_descriptor = NULL;
+
+ if (!array_descriptor)
+ seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ field->type, field->name, field->offset,
+ field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ array_descriptor, field->offset,
+ field->size, !!field->is_signed);
+
+ return 0;
+}
+
+static void *f_start(struct seq_file *m, loff_t *pos)
+{
+ void *p = (void *)FORMAT_HEADER;
+ loff_t l = 0;
+
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ if (!event_file_data(m->private))
+ return ERR_PTR(-ENODEV);
+
+ while (l < *pos && p)
+ p = f_next(m, p, &l);
+
+ return p;
+}
+
+static void f_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&event_mutex);
+}
+
+static const struct seq_operations trace_format_seq_ops = {
+ .start = f_start,
+ .next = f_next,
+ .stop = f_stop,
+ .show = f_show,
+};
+
+static int trace_format_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &trace_format_seq_ops);
+ if (ret < 0)
+ return ret;
+
+ m = file->private_data;
+ m->private = file;
+
+ return 0;
+}
+
+static ssize_t
+event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ int id = (long)event_file_data(filp);
+ char buf[32];
+ int len;
+
+ if (*ppos)
+ return 0;
+
+ if (unlikely(!id))
+ return -ENODEV;
+
+ len = sprintf(buf, "%d\n", id);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+static ssize_t
+event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_file *file;
+ struct trace_seq *s;
+ int r = -ENODEV;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (file)
+ print_event_filter(file, s);
+ mutex_unlock(&event_mutex);
+
+ if (file)
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_event_file *file;
+ char *buf;
+ int err = -ENODEV;
+
+ if (cnt >= PAGE_SIZE)
+ return -EINVAL;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long) buf);
+ return -EFAULT;
+ }
+ buf[cnt] = '\0';
+
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (file)
+ err = apply_event_filter(file, buf);
+ mutex_unlock(&event_mutex);
+
+ free_page((unsigned long) buf);
+ if (err < 0)
+ return err;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static LIST_HEAD(event_subsystems);
+
+static int subsystem_open(struct inode *inode, struct file *filp)
+{
+ struct event_subsystem *system = NULL;
+ struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+ struct trace_array *tr;
+ int ret;
+
+ if (tracing_is_disabled())
+ return -ENODEV;
+
+ /* Make sure the system still exists */
+ mutex_lock(&trace_types_lock);
+ mutex_lock(&event_mutex);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+ if (dir == inode->i_private) {
+ /* Don't open systems with no events */
+ if (dir->nr_events) {
+ __get_system_dir(dir);
+ system = dir->subsystem;
+ }
+ goto exit_loop;
+ }
+ }
+ }
+ exit_loop:
+ mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
+
+ if (!system)
+ return -ENODEV;
+
+ /* Some versions of gcc think dir can be uninitialized here */
+ WARN_ON(!dir);
+
+ /* Still need to increment the ref count of the system */
+ if (trace_array_get(tr) < 0) {
+ put_system(dir);
+ return -ENODEV;
+ }
+
+ ret = tracing_open_generic(inode, filp);
+ if (ret < 0) {
+ trace_array_put(tr);
+ put_system(dir);
+ }
+
+ return ret;
+}
+
+static int system_tr_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_subsystem_dir *dir;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (tracing_is_disabled())
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ /* Make a temporary dir that has no system but points to tr */
+ dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+ if (!dir) {
+ trace_array_put(tr);
+ return -ENOMEM;
+ }
+
+ dir->tr = tr;
+
+ ret = tracing_open_generic(inode, filp);
+ if (ret < 0) {
+ trace_array_put(tr);
+ kfree(dir);
+ return ret;
+ }
+
+ filp->private_data = dir;
+
+ return 0;
+}
+
+static int subsystem_release(struct inode *inode, struct file *file)
+{
+ struct ftrace_subsystem_dir *dir = file->private_data;
+
+ trace_array_put(dir->tr);
+
+ /*
+ * If dir->subsystem is NULL, then this is a temporary
+ * descriptor that was made for a trace_array to enable
+ * all subsystems.
+ */
+ if (dir->subsystem)
+ put_system(dir);
+ else
+ kfree(dir);
+
+ return 0;
+}
+
+static ssize_t
+subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ print_subsystem_event_filter(system, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ char *buf;
+ int err;
+
+ if (cnt >= PAGE_SIZE)
+ return -EINVAL;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long) buf);
+ return -EFAULT;
+ }
+ buf[cnt] = '\0';
+
+ err = apply_subsystem_event_filter(dir, buf);
+ free_page((unsigned long) buf);
+ if (err < 0)
+ return err;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ int (*func)(struct trace_seq *s) = filp->private_data;
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ func(s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static int ftrace_event_avail_open(struct inode *inode, struct file *file);
+static int ftrace_event_set_open(struct inode *inode, struct file *file);
+static int ftrace_event_release(struct inode *inode, struct file *file);
+
+static const struct seq_operations show_event_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show,
+ .stop = t_stop,
+};
+
+static const struct seq_operations show_set_event_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .show = t_show,
+ .stop = t_stop,
+};
+
+static const struct file_operations ftrace_avail_fops = {
+ .open = ftrace_event_avail_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ftrace_set_event_fops = {
+ .open = ftrace_event_set_open,
+ .read = seq_read,
+ .write = ftrace_event_write,
+ .llseek = seq_lseek,
+ .release = ftrace_event_release,
+};
+
+static const struct file_operations ftrace_enable_fops = {
+ .open = tracing_open_generic,
+ .read = event_enable_read,
+ .write = event_enable_write,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations ftrace_event_format_fops = {
+ .open = trace_format_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations ftrace_event_id_fops = {
+ .read = event_id_read,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations ftrace_event_filter_fops = {
+ .open = tracing_open_generic,
+ .read = event_filter_read,
+ .write = event_filter_write,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations ftrace_subsystem_filter_fops = {
+ .open = subsystem_open,
+ .read = subsystem_filter_read,
+ .write = subsystem_filter_write,
+ .llseek = default_llseek,
+ .release = subsystem_release,
+};
+
+static const struct file_operations ftrace_system_enable_fops = {
+ .open = subsystem_open,
+ .read = system_enable_read,
+ .write = system_enable_write,
+ .llseek = default_llseek,
+ .release = subsystem_release,
+};
+
+static const struct file_operations ftrace_tr_enable_fops = {
+ .open = system_tr_open,
+ .read = system_enable_read,
+ .write = system_enable_write,
+ .llseek = default_llseek,
+ .release = subsystem_release,
+};
+
+static const struct file_operations ftrace_show_header_fops = {
+ .open = tracing_open_generic,
+ .read = show_header,
+ .llseek = default_llseek,
+};
+
+static int
+ftrace_event_open(struct inode *inode, struct file *file,
+ const struct seq_operations *seq_ops)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, seq_ops);
+ if (ret < 0)
+ return ret;
+ m = file->private_data;
+ /* copy tr over to seq ops */
+ m->private = inode->i_private;
+
+ return ret;
+}
+
+static int ftrace_event_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ return seq_release(inode, file);
+}
+
+static int
+ftrace_event_avail_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops = &show_event_seq_ops;
+
+ return ftrace_event_open(inode, file, seq_ops);
+}
+
+static int
+ftrace_event_set_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops = &show_set_event_seq_ops;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_clear_events(tr);
+
+ ret = ftrace_event_open(inode, file, seq_ops);
+ if (ret < 0)
+ trace_array_put(tr);
+ return ret;
+}
+
+static struct event_subsystem *
+create_new_subsystem(const char *name)
+{
+ struct event_subsystem *system;
+
+ /* need to create new entry */
+ system = kmalloc(sizeof(*system), GFP_KERNEL);
+ if (!system)
+ return NULL;
+
+ system->ref_count = 1;
+
+ /* Only allocate if dynamic (kprobes and modules) */
+ if (!core_kernel_data((unsigned long)name)) {
+ system->ref_count |= SYSTEM_FL_FREE_NAME;
+ system->name = kstrdup(name, GFP_KERNEL);
+ if (!system->name)
+ goto out_free;
+ } else
+ system->name = name;
+
+ system->filter = NULL;
+
+ system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+ if (!system->filter)
+ goto out_free;
+
+ list_add(&system->list, &event_subsystems);
+
+ return system;
+
+ out_free:
+ if (system->ref_count & SYSTEM_FL_FREE_NAME)
+ kfree(system->name);
+ kfree(system);
+ return NULL;
+}
+
+static struct dentry *
+event_subsystem_dir(struct trace_array *tr, const char *name,
+ struct ftrace_event_file *file, struct dentry *parent)
+{
+ struct ftrace_subsystem_dir *dir;
+ struct event_subsystem *system;
+ struct dentry *entry;
+
+ /* First see if we did not already create this dir */
+ list_for_each_entry(dir, &tr->systems, list) {
+ system = dir->subsystem;
+ if (strcmp(system->name, name) == 0) {
+ dir->nr_events++;
+ file->system = dir;
+ return dir->entry;
+ }
+ }
+
+ /* Now see if the system itself exists. */
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0)
+ break;
+ }
+ /* Reset system variable when not found */
+ if (&system->list == &event_subsystems)
+ system = NULL;
+
+ dir = kmalloc(sizeof(*dir), GFP_KERNEL);
+ if (!dir)
+ goto out_fail;
+
+ if (!system) {
+ system = create_new_subsystem(name);
+ if (!system)
+ goto out_free;
+ } else
+ __get_system(system);
+
+ dir->entry = tracefs_create_dir(name, parent);
+ if (!dir->entry) {
+ pr_warn("Failed to create system directory %s\n", name);
+ __put_system(system);
+ goto out_free;
+ }
+
+ dir->tr = tr;
+ dir->ref_count = 1;
+ dir->nr_events = 1;
+ dir->subsystem = system;
+ file->system = dir;
+
+ entry = tracefs_create_file("filter", 0644, dir->entry, dir,
+ &ftrace_subsystem_filter_fops);
+ if (!entry) {
+ kfree(system->filter);
+ system->filter = NULL;
+ pr_warn("Could not create tracefs '%s/filter' entry\n", name);
+ }
+
+ trace_create_file("enable", 0644, dir->entry, dir,
+ &ftrace_system_enable_fops);
+
+ list_add(&dir->list, &tr->systems);
+
+ return dir->entry;
+
+ out_free:
+ kfree(dir);
+ out_fail:
+ /* Only print this message if failed on memory allocation */
+ if (!dir || !system)
+ pr_warn("No memory to create event subsystem %s\n", name);
+ return NULL;
+}
+
+static int
+event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+ struct trace_array *tr = file->tr;
+ struct list_head *head;
+ struct dentry *d_events;
+ const char *name;
+ int ret;
+
+ /*
+ * If the trace point header did not define TRACE_SYSTEM
+ * then the system would be called "TRACE_SYSTEM".
+ */
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
+ d_events = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!d_events)
+ return -ENOMEM;
+ } else
+ d_events = parent;
+
+ name = ftrace_event_name(call);
+ file->dir = tracefs_create_dir(name, d_events);
+ if (!file->dir) {
+ pr_warn("Could not create tracefs '%s' directory\n", name);
+ return -1;
+ }
+
+ if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ trace_create_file("enable", 0644, file->dir, file,
+ &ftrace_enable_fops);
+
+#ifdef CONFIG_PERF_EVENTS
+ if (call->event.type && call->class->reg)
+ trace_create_file("id", 0444, file->dir,
+ (void *)(long)call->event.type,
+ &ftrace_event_id_fops);
+#endif
+
+ /*
+ * Other events may have the same class. Only update
+ * the fields if they are not already defined.
+ */
+ head = trace_get_fields(call);
+ if (list_empty(head)) {
+ ret = call->class->define_fields(call);
+ if (ret < 0) {
+ pr_warn("Could not initialize trace point events/%s\n",
+ name);
+ return -1;
+ }
+ }
+ trace_create_file("filter", 0644, file->dir, file,
+ &ftrace_event_filter_fops);
+
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
+
+ trace_create_file("format", 0444, file->dir, call,
+ &ftrace_event_format_fops);
+
+ return 0;
+}
+
+static void remove_event_from_tracers(struct ftrace_event_call *call)
+{
+ struct ftrace_event_file *file;
+ struct trace_array *tr;
+
+ do_for_each_event_file_safe(tr, file) {
+ if (file->event_call != call)
+ continue;
+
+ remove_event_file_dir(file);
+ /*
+ * The do_for_each_event_file_safe() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+}
+
+static void event_remove(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
+
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ ftrace_event_enable_disable(file, 0);
+ /*
+ * The do_for_each_event_file() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+
+ if (call->event.funcs)
+ __unregister_ftrace_event(&call->event);
+ remove_event_from_tracers(call);
+ list_del(&call->list);
+}
+
+static int event_init(struct ftrace_event_call *call)
+{
+ int ret = 0;
+ const char *name;
+
+ name = ftrace_event_name(call);
+ if (WARN_ON(!name))
+ return -EINVAL;
+
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
+ if (ret < 0 && ret != -ENOSYS)
+ pr_warn("Could not initialize trace events/%s\n", name);
+ }
+
+ return ret;
+}
+
+static int
+__register_event(struct ftrace_event_call *call, struct module *mod)
+{
+ int ret;
+
+ ret = event_init(call);
+ if (ret < 0)
+ return ret;
+
+ list_add(&call->list, &ftrace_events);
+ call->mod = mod;
+
+ return 0;
+}
+
+static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+{
+ int rlen;
+ int elen;
+
+ /* Find the length of the enum value as a string */
+ elen = snprintf(ptr, 0, "%ld", map->enum_value);
+ /* Make sure there's enough room to replace the string with the value */
+ if (len < elen)
+ return NULL;
+
+ snprintf(ptr, elen + 1, "%ld", map->enum_value);
+
+ /* Get the rest of the string of ptr */
+ rlen = strlen(ptr + len);
+ memmove(ptr + elen, ptr + len, rlen);
+ /* Make sure we end the new string */
+ ptr[elen + rlen] = 0;
+
+ return ptr + elen;
+}
+
+static void update_event_printk(struct ftrace_event_call *call,
+ struct trace_enum_map *map)
+{
+ char *ptr;
+ int quote = 0;
+ int len = strlen(map->enum_string);
+
+ for (ptr = call->print_fmt; *ptr; ptr++) {
+ if (*ptr == '\\') {
+ ptr++;
+ /* paranoid */
+ if (!*ptr)
+ break;
+ continue;
+ }
+ if (*ptr == '"') {
+ quote ^= 1;
+ continue;
+ }
+ if (quote)
+ continue;
+ if (isdigit(*ptr)) {
+ /* skip numbers */
+ do {
+ ptr++;
+ /* Check for alpha chars like ULL */
+ } while (isalnum(*ptr));
+ if (!*ptr)
+ break;
+ /*
+ * A number must have some kind of delimiter after
+ * it, and we can ignore that too.
+ */
+ continue;
+ }
+ if (isalpha(*ptr) || *ptr == '_') {
+ if (strncmp(map->enum_string, ptr, len) == 0 &&
+ !isalnum(ptr[len]) && ptr[len] != '_') {
+ ptr = enum_replace(ptr, map, len);
+ /* Hmm, enum string smaller than value */
+ if (WARN_ON_ONCE(!ptr))
+ return;
+ /*
+ * No need to decrement here, as enum_replace()
+ * returns the pointer to the character passed
+ * the enum, and two enums can not be placed
+ * back to back without something in between.
+ * We can skip that something in between.
+ */
+ continue;
+ }
+ skip_more:
+ do {
+ ptr++;
+ } while (isalnum(*ptr) || *ptr == '_');
+ if (!*ptr)
+ break;
+ /*
+ * If what comes after this variable is a '.' or
+ * '->' then we can continue to ignore that string.
+ */
+ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
+ ptr += *ptr == '.' ? 1 : 2;
+ if (!*ptr)
+ break;
+ goto skip_more;
+ }
+ /*
+ * Once again, we can skip the delimiter that came
+ * after the string.
+ */
+ continue;
+ }
+ }
+}
+
+void trace_event_enum_update(struct trace_enum_map **map, int len)
+{
+ struct ftrace_event_call *call, *p;
+ const char *last_system = NULL;
+ int last_i;
+ int i;
+
+ down_write(&trace_event_sem);
+ list_for_each_entry_safe(call, p, &ftrace_events, list) {
+ /* events are usually grouped together with systems */
+ if (!last_system || call->class->system != last_system) {
+ last_i = 0;
+ last_system = call->class->system;
+ }
+
+ for (i = last_i; i < len; i++) {
+ if (call->class->system == map[i]->system) {
+ /* Save the first system if need be */
+ if (!last_i)
+ last_i = i;
+ update_event_printk(call, map[i]);
+ }
+ }
+ }
+ up_write(&trace_event_sem);
+}
+
+static struct ftrace_event_file *
+trace_create_new_event(struct ftrace_event_call *call,
+ struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ file = kmem_cache_alloc(file_cachep, GFP_TRACE);
+ if (!file)
+ return NULL;
+
+ file->event_call = call;
+ file->tr = tr;
+ atomic_set(&file->sm_ref, 0);
+ atomic_set(&file->tm_ref, 0);
+ INIT_LIST_HEAD(&file->triggers);
+ list_add(&file->list, &tr->events);
+
+ return file;
+}
+
+/* Add an event to a trace directory */
+static int
+__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ file = trace_create_new_event(call, tr);
+ if (!file)
+ return -ENOMEM;
+
+ return event_create_dir(tr->event_dir, file);
+}
+
+/*
+ * Just create a decriptor for early init. A descriptor is required
+ * for enabling events at boot. We want to enable events before
+ * the filesystem is initialized.
+ */
+static __init int
+__trace_early_add_new_event(struct ftrace_event_call *call,
+ struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ file = trace_create_new_event(call, tr);
+ if (!file)
+ return -ENOMEM;
+
+ return 0;
+}
+
+struct ftrace_module_file_ops;
+static void __add_event_to_tracers(struct ftrace_event_call *call);
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+ int ret;
+ mutex_lock(&trace_types_lock);
+ mutex_lock(&event_mutex);
+
+ ret = __register_event(call, NULL);
+ if (ret >= 0)
+ __add_event_to_tracers(call);
+
+ mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
+ return ret;
+}
+
+/*
+ * Must be called under locking of trace_types_lock, event_mutex and
+ * trace_event_sem.
+ */
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+ event_remove(call);
+ trace_destroy_fields(call);
+ free_event_filter(call->filter);
+ call->filter = NULL;
+}
+
+static int probe_remove_event_call(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
+
+#ifdef CONFIG_PERF_EVENTS
+ if (call->perf_refcount)
+ return -EBUSY;
+#endif
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ /*
+ * We can't rely on ftrace_event_enable_disable(enable => 0)
+ * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+ * TRACE_REG_UNREGISTER.
+ */
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return -EBUSY;
+ /*
+ * The do_for_each_event_file_safe() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+
+ __trace_remove_event_call(call);
+
+ return 0;
+}
+
+/* Remove an event_call */
+int trace_remove_event_call(struct ftrace_event_call *call)
+{
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ mutex_lock(&event_mutex);
+ down_write(&trace_event_sem);
+ ret = probe_remove_event_call(call);
+ up_write(&trace_event_sem);
+ mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+#define for_each_event(event, start, end) \
+ for (event = start; \
+ (unsigned long)event < (unsigned long)end; \
+ event++)
+
+#ifdef CONFIG_MODULES
+
+static void trace_module_add_events(struct module *mod)
+{
+ struct ftrace_event_call **call, **start, **end;
+
+ if (!mod->num_trace_events)
+ return;
+
+ /* Don't add infrastructure for mods without tracepoints */
+ if (trace_module_has_bad_taint(mod)) {
+ pr_err("%s: module has bad taint, not creating trace events\n",
+ mod->name);
+ return;
+ }
+
+ start = mod->trace_events;
+ end = mod->trace_events + mod->num_trace_events;
+
+ for_each_event(call, start, end) {
+ __register_event(*call, mod);
+ __add_event_to_tracers(*call);
+ }
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+ struct ftrace_event_call *call, *p;
+ bool clear_trace = false;
+
+ down_write(&trace_event_sem);
+ list_for_each_entry_safe(call, p, &ftrace_events, list) {
+ if (call->mod == mod) {
+ if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
+ clear_trace = true;
+ __trace_remove_event_call(call);
+ }
+ }
+ up_write(&trace_event_sem);
+
+ /*
+ * It is safest to reset the ring buffer if the module being unloaded
+ * registered any events that were used. The only worry is if
+ * a new module gets loaded, and takes on the same id as the events
+ * of this module. When printing out the buffer, traced events left
+ * over from this module may be passed to the new module events and
+ * unexpected results may occur.
+ */
+ if (clear_trace)
+ tracing_reset_all_online_cpus();
+}
+
+static int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ mutex_lock(&trace_types_lock);
+ mutex_lock(&event_mutex);
+ switch (val) {
+ case MODULE_STATE_COMING:
+ trace_module_add_events(mod);
+ break;
+ case MODULE_STATE_GOING:
+ trace_module_remove_events(mod);
+ break;
+ }
+ mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+}
+
+static struct notifier_block trace_module_nb = {
+ .notifier_call = trace_module_notify,
+ .priority = 1, /* higher than trace.c module notify */
+};
+#endif /* CONFIG_MODULES */
+
+/* Create a new event directory structure for a trace directory. */
+static void
+__trace_add_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_call *call;
+ int ret;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ ret = __trace_add_new_event(call, tr);
+ if (ret < 0)
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(call));
+ }
+}
+
+struct ftrace_event_file *
+find_event_file(struct trace_array *tr, const char *system, const char *event)
+{
+ struct ftrace_event_file *file;
+ struct ftrace_event_call *call;
+ const char *name;
+
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
+ name = ftrace_event_name(call);
+
+ if (!name || !call->class || !call->class->reg)
+ continue;
+
+ if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ continue;
+
+ if (strcmp(event, name) == 0 &&
+ strcmp(system, call->class->system) == 0)
+ return file;
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* Avoid typos */
+#define ENABLE_EVENT_STR "enable_event"
+#define DISABLE_EVENT_STR "disable_event"
+
+struct event_probe_data {
+ struct ftrace_event_file *file;
+ unsigned long count;
+ int ref;
+ bool enable;
+};
+
+static void
+event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (!data)
+ return;
+
+ if (data->enable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+ else
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+}
+
+static void
+event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (!data)
+ return;
+
+ if (!data->count)
+ return;
+
+ /* Skip if the event is in a state we want to switch to */
+ if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ event_enable_probe(ip, parent_ip, _data);
+}
+
+static int
+event_enable_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *_data)
+{
+ struct event_probe_data *data = _data;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_printf(m, "%s:%s:%s",
+ data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+ data->file->event_call->class->system,
+ ftrace_event_name(data->file->event_call));
+
+ if (data->count == -1)
+ seq_puts(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", data->count);
+
+ return 0;
+}
+
+static int
+event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
+ void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ data->ref++;
+ return 0;
+}
+
+static void
+event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
+ void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref) {
+ /* Remove the SOFT_MODE flag */
+ __ftrace_event_enable_disable(data->file, 0, 1);
+ module_put(data->file->event_call->mod);
+ kfree(data);
+ }
+ *pdata = NULL;
+}
+
+static struct ftrace_probe_ops event_enable_probe_ops = {
+ .func = event_enable_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_enable_count_probe_ops = {
+ .func = event_enable_count_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_probe_ops = {
+ .func = event_enable_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_count_probe_ops = {
+ .func = event_enable_count_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static int
+event_enable_func(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enabled)
+{
+ struct trace_array *tr = top_trace_array();
+ struct ftrace_event_file *file;
+ struct ftrace_probe_ops *ops;
+ struct event_probe_data *data;
+ const char *system;
+ const char *event;
+ char *number;
+ bool enable;
+ int ret;
+
+ if (!tr)
+ return -ENODEV;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enabled || !param)
+ return -EINVAL;
+
+ system = strsep(&param, ":");
+ if (!param)
+ return -EINVAL;
+
+ event = strsep(&param, ":");
+
+ mutex_lock(&event_mutex);
+
+ ret = -EINVAL;
+ file = find_event_file(tr, system, event);
+ if (!file)
+ goto out;
+
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+ if (enable)
+ ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
+ else
+ ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
+
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ ret = 0;
+ goto out;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ data->enable = enable;
+ data->count = -1;
+ data->file = file;
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ ret = -EINVAL;
+ if (!strlen(number))
+ goto out_free;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &data->count);
+ if (ret)
+ goto out_free;
+
+ out_reg:
+ /* Don't let event modules unload while probe registered */
+ ret = try_module_get(file->event_call->mod);
+ if (!ret) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
+ ret = __ftrace_event_enable_disable(file, 1, 1);
+ if (ret < 0)
+ goto out_put;
+ ret = register_ftrace_function_probe(glob, ops, data);
+ /*
+ * The above returns on success the # of functions enabled,
+ * but if it didn't find any functions it returns zero.
+ * Consider no functions a failure too.
+ */
+ if (!ret) {
+ ret = -ENOENT;
+ goto out_disable;
+ } else if (ret < 0)
+ goto out_disable;
+ /* Just return zero, not the number of enabled functions */
+ ret = 0;
+ out:
+ mutex_unlock(&event_mutex);
+ return ret;
+
+ out_disable:
+ __ftrace_event_enable_disable(file, 0, 1);
+ out_put:
+ module_put(file->event_call->mod);
+ out_free:
+ kfree(data);
+ goto out;
+}
+
+static struct ftrace_func_command event_enable_cmd = {
+ .name = ENABLE_EVENT_STR,
+ .func = event_enable_func,
+};
+
+static struct ftrace_func_command event_disable_cmd = {
+ .name = DISABLE_EVENT_STR,
+ .func = event_enable_func,
+};
+
+static __init int register_event_cmds(void)
+{
+ int ret;
+
+ ret = register_ftrace_command(&event_enable_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_ftrace_command(&event_disable_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_ftrace_command(&event_enable_cmd);
+ return ret;
+}
+#else
+static inline int register_event_cmds(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * The top level array has already had its ftrace_event_file
+ * descriptors created in order to allow for early events to
+ * be recorded. This function is called after the tracefs has been
+ * initialized, and we now have to create the files associated
+ * to the events.
+ */
+static __init void
+__trace_early_add_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+ int ret;
+
+
+ list_for_each_entry(file, &tr->events, list) {
+ ret = event_create_dir(tr->event_dir, file);
+ if (ret < 0)
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(file->event_call));
+ }
+}
+
+/*
+ * For early boot up, the top trace array requires to have
+ * a list of events that can be enabled. This must be done before
+ * the filesystem is set up in order to allow events to be traced
+ * early.
+ */
+static __init void
+__trace_early_add_events(struct trace_array *tr)
+{
+ struct ftrace_event_call *call;
+ int ret;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ /* Early boot up should not have any modules loaded */
+ if (WARN_ON_ONCE(call->mod))
+ continue;
+
+ ret = __trace_early_add_new_event(call, tr);
+ if (ret < 0)
+ pr_warn("Could not create early event %s\n",
+ ftrace_event_name(call));
+ }
+}
+
+/* Remove the event directory structure for a trace directory. */
+static void
+__trace_remove_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_file *file, *next;
+
+ list_for_each_entry_safe(file, next, &tr->events, list)
+ remove_event_file_dir(file);
+}
+
+static void __add_event_to_tracers(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ __trace_add_new_event(call, tr);
+}
+
+extern struct ftrace_event_call *__start_ftrace_events[];
+extern struct ftrace_event_call *__stop_ftrace_events[];
+
+static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
+
+static __init int setup_trace_event(char *str)
+{
+ strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = true;
+ tracing_selftest_disabled = true;
+
+ return 1;
+}
+__setup("trace_event=", setup_trace_event);
+
+/* Expects to have event_mutex held when called */
+static int
+create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
+{
+ struct dentry *d_events;
+ struct dentry *entry;
+
+ entry = tracefs_create_file("set_event", 0644, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry) {
+ pr_warn("Could not create tracefs 'set_event' entry\n");
+ return -ENOMEM;
+ }
+
+ d_events = tracefs_create_dir("events", parent);
+ if (!d_events) {
+ pr_warn("Could not create tracefs 'events' directory\n");
+ return -ENOMEM;
+ }
+
+ /* ring buffer internal formats */
+ trace_create_file("header_page", 0444, d_events,
+ ring_buffer_print_page_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("header_event", 0444, d_events,
+ ring_buffer_print_entry_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("enable", 0644, d_events,
+ tr, &ftrace_tr_enable_fops);
+
+ tr->event_dir = d_events;
+
+ return 0;
+}
+
+/**
+ * event_trace_add_tracer - add a instance of a trace_array to events
+ * @parent: The parent dentry to place the files/directories for events in
+ * @tr: The trace array associated with these events
+ *
+ * When a new instance is created, it needs to set up its events
+ * directory, as well as other files associated with events. It also
+ * creates the event hierachry in the @parent/events directory.
+ *
+ * Returns 0 on success.
+ */
+int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = create_event_toplevel_files(parent, tr);
+ if (ret)
+ goto out_unlock;
+
+ down_write(&trace_event_sem);
+ __trace_add_event_dirs(tr);
+ up_write(&trace_event_sem);
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+/*
+ * The top trace array already had its file descriptors created.
+ * Now the files themselves need to be created.
+ */
+static __init int
+early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = create_event_toplevel_files(parent, tr);
+ if (ret)
+ goto out_unlock;
+
+ down_write(&trace_event_sem);
+ __trace_early_add_event_dirs(tr);
+ up_write(&trace_event_sem);
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+int event_trace_del_tracer(struct trace_array *tr)
+{
+ mutex_lock(&event_mutex);
+
+ /* Disable any event triggers and associated soft-disabled events */
+ clear_event_triggers(tr);
+
+ /* Disable any running events */
+ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+
+ /* Access to events are within rcu_read_lock_sched() */
+ synchronize_sched();
+
+ down_write(&trace_event_sem);
+ __trace_remove_event_dirs(tr);
+ tracefs_remove_recursive(tr->event_dir);
+ up_write(&trace_event_sem);
+
+ tr->event_dir = NULL;
+
+ mutex_unlock(&event_mutex);
+
+ return 0;
+}
+
+static __init int event_trace_memsetup(void)
+{
+ field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
+ file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+ return 0;
+}
+
+static __init void
+early_enable_events(struct trace_array *tr, bool disable_first)
+{
+ char *buf = bootup_event_buf;
+ char *token;
+ int ret;
+
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
+ continue;
+
+ /* Restarting syscalls requires that we stop them first */
+ if (disable_first)
+ ftrace_set_clr_event(tr, token, 0);
+
+ ret = ftrace_set_clr_event(tr, token, 1);
+ if (ret)
+ pr_warn("Failed to enable trace event: %s\n", token);
+
+ /* Put back the comma to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ',';
+ }
+}
+
+static __init int event_trace_enable(void)
+{
+ struct trace_array *tr = top_trace_array();
+ struct ftrace_event_call **iter, *call;
+ int ret;
+
+ if (!tr)
+ return -ENODEV;
+
+ for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
+
+ call = *iter;
+ ret = event_init(call);
+ if (!ret)
+ list_add(&call->list, &ftrace_events);
+ }
+
+ /*
+ * We need the top trace array to have a working set of trace
+ * points at early init, before the debug files and directories
+ * are created. Create the file entries now, and attach them
+ * to the actual file dentries later.
+ */
+ __trace_early_add_events(tr);
+
+ early_enable_events(tr, false);
+
+ trace_printk_start_comm();
+
+ register_event_cmds();
+
+ register_trigger_cmds();
+
+ return 0;
+}
+
+/*
+ * event_trace_enable() is called from trace_event_init() first to
+ * initialize events and perhaps start any events that are on the
+ * command line. Unfortunately, there are some events that will not
+ * start this early, like the system call tracepoints that need
+ * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
+ * is called before pid 1 starts, and this flag is never set, making
+ * the syscall tracepoint never get reached, but the event is enabled
+ * regardless (and not doing anything).
+ */
+static __init int event_trace_enable_again(void)
+{
+ struct trace_array *tr;
+
+ tr = top_trace_array();
+ if (!tr)
+ return -ENODEV;
+
+ early_enable_events(tr, true);
+
+ return 0;
+}
+
+early_initcall(event_trace_enable_again);
+
+static __init int event_trace_init(void)
+{
+ struct trace_array *tr;
+ struct dentry *d_tracer;
+ struct dentry *entry;
+ int ret;
+
+ tr = top_trace_array();
+ if (!tr)
+ return -ENODEV;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ entry = tracefs_create_file("available_events", 0444, d_tracer,
+ tr, &ftrace_avail_fops);
+ if (!entry)
+ pr_warn("Could not create tracefs 'available_events' entry\n");
+
+ if (trace_define_common_fields())
+ pr_warn("tracing: Failed to allocate common fields");
+
+ ret = early_event_add_tracer(d_tracer, tr);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_MODULES
+ ret = register_module_notifier(&trace_module_nb);
+ if (ret)
+ pr_warn("Failed to register trace events module notifier\n");
+#endif
+ return 0;
+}
+
+void __init trace_event_init(void)
+{
+ event_trace_memsetup();
+ init_ftrace_syscalls();
+ event_trace_enable();
+}
+
+fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+ spin_lock(&test_spinlock);
+ spin_lock_irq(&test_spinlock_irq);
+ udelay(1);
+ spin_unlock_irq(&test_spinlock_irq);
+ spin_unlock(&test_spinlock);
+
+ mutex_lock(&test_mutex);
+ msleep(1);
+ mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+ void *test_malloc;
+
+ test_malloc = kmalloc(1234, GFP_KERNEL);
+ if (!test_malloc)
+ pr_info("failed to kmalloc\n");
+
+ schedule_on_each_cpu(test_work);
+
+ kfree(test_malloc);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+ struct task_struct *test_thread;
+
+ test_thread = kthread_run(event_test_thread, NULL, "test-events");
+ msleep(1);
+ kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ */
+static __init void event_trace_self_tests(void)
+{
+ struct ftrace_subsystem_dir *dir;
+ struct ftrace_event_file *file;
+ struct ftrace_event_call *call;
+ struct event_subsystem *system;
+ struct trace_array *tr;
+ int ret;
+
+ tr = top_trace_array();
+ if (!tr)
+ return;
+
+ pr_info("Running tests on trace events:\n");
+
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
+
+ /* Only test those that have a probe */
+ if (!call->class || !call->class->probe)
+ continue;
+
+/*
+ * Testing syscall events here is pretty useless, but
+ * we still do it if configured. But this is time consuming.
+ * What we really need is a user thread to perform the
+ * syscalls as we test.
+ */
+#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
+ if (call->class->system &&
+ strcmp(call->class->system, "syscalls") == 0)
+ continue;
+#endif
+
+ pr_info("Testing event %s: ", ftrace_event_name(call));
+
+ /*
+ * If an event is already enabled, someone is using
+ * it and the self test should not be on.
+ */
+ if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+ pr_warn("Enabled event during self test!\n");
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ ftrace_event_enable_disable(file, 1);
+ event_test_stuff();
+ ftrace_event_enable_disable(file, 0);
+
+ pr_cont("OK\n");
+ }
+
+ /* Now test at the sub system level */
+
+ pr_info("Running tests on trace event systems:\n");
+
+ list_for_each_entry(dir, &tr->systems, list) {
+
+ system = dir->subsystem;
+
+ /* the ftrace system is special, skip it */
+ if (strcmp(system->name, "ftrace") == 0)
+ continue;
+
+ pr_info("Testing event system %s: ", system->name);
+
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error enabling system %s\n",
+ system->name);
+ continue;
+ }
+
+ event_test_stuff();
+
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error disabling system %s\n",
+ system->name);
+ continue;
+ }
+
+ pr_cont("OK\n");
+ }
+
+ /* Test with all events enabled */
+
+ pr_info("Running tests on all trace events:\n");
+ pr_info("Testing all events: ");
+
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error enabling all events\n");
+ return;
+ }
+
+ event_test_stuff();
+
+ /* reset sysname */
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error disabling all events\n");
+ return;
+ }
+
+ pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct ftrace_entry *entry;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ pc = preempt_count();
+ preempt_disable_notrace();
+ cpu = raw_smp_processor_id();
+ disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
+
+ if (disabled != 1)
+ goto out;
+
+ local_save_flags(flags);
+
+ event = trace_current_buffer_lock_reserve(&buffer,
+ TRACE_FN, sizeof(*entry),
+ flags, pc);
+ if (!event)
+ goto out;
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->parent_ip = parent_ip;
+
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+
+ out:
+ atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __initdata =
+{
+ .func = function_test_events_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+ int ret;
+ ret = register_ftrace_function(&trace_ops);
+ if (WARN_ON(ret < 0)) {
+ pr_info("Failed to enable function tracer for event tests\n");
+ return;
+ }
+ pr_info("Running tests again, along with the function tracer\n");
+ event_trace_self_tests();
+ unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+ if (!tracing_selftest_disabled) {
+ event_trace_self_tests();
+ event_trace_self_test_with_function();
+ }
+
+ return 0;
+}
+
+late_initcall(event_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
new file mode 100644
index 000000000..52adf02d7
--- /dev/null
+++ b/kernel/trace/trace_events_filter.c
@@ -0,0 +1,2440 @@
+/*
+ * trace_events_filter - generic event filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define DEFAULT_SYS_FILTER_MESSAGE \
+ "### global filter ###\n" \
+ "# Use this to set filters for multiple events.\n" \
+ "# Only events with the given fields will be affected.\n" \
+ "# If no events are modified, an error message will be displayed here"
+
+enum filter_op_ids
+{
+ OP_OR,
+ OP_AND,
+ OP_GLOB,
+ OP_NE,
+ OP_EQ,
+ OP_LT,
+ OP_LE,
+ OP_GT,
+ OP_GE,
+ OP_BAND,
+ OP_NOT,
+ OP_NONE,
+ OP_OPEN_PAREN,
+};
+
+struct filter_op {
+ int id;
+ char *string;
+ int precedence;
+};
+
+/* Order must be the same as enum filter_op_ids above */
+static struct filter_op filter_ops[] = {
+ { OP_OR, "||", 1 },
+ { OP_AND, "&&", 2 },
+ { OP_GLOB, "~", 4 },
+ { OP_NE, "!=", 4 },
+ { OP_EQ, "==", 4 },
+ { OP_LT, "<", 5 },
+ { OP_LE, "<=", 5 },
+ { OP_GT, ">", 5 },
+ { OP_GE, ">=", 5 },
+ { OP_BAND, "&", 6 },
+ { OP_NOT, "!", 6 },
+ { OP_NONE, "OP_NONE", 0 },
+ { OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+ FILT_ERR_NONE,
+ FILT_ERR_INVALID_OP,
+ FILT_ERR_UNBALANCED_PAREN,
+ FILT_ERR_TOO_MANY_OPERANDS,
+ FILT_ERR_OPERAND_TOO_LONG,
+ FILT_ERR_FIELD_NOT_FOUND,
+ FILT_ERR_ILLEGAL_FIELD_OP,
+ FILT_ERR_ILLEGAL_INTVAL,
+ FILT_ERR_BAD_SUBSYS_FILTER,
+ FILT_ERR_TOO_MANY_PREDS,
+ FILT_ERR_MISSING_FIELD,
+ FILT_ERR_INVALID_FILTER,
+ FILT_ERR_IP_FIELD_ONLY,
+ FILT_ERR_ILLEGAL_NOT_OP,
+};
+
+static char *err_text[] = {
+ "No error",
+ "Invalid operator",
+ "Unbalanced parens",
+ "Too many operands",
+ "Operand too long",
+ "Field not found",
+ "Illegal operation for field type",
+ "Illegal integer value",
+ "Couldn't find or set field in one of a subsystem's events",
+ "Too many terms in predicate expression",
+ "Missing field name and/or value",
+ "Meaningless filter expression",
+ "Only 'ip' field is supported for function trace",
+ "Illegal use of '!'",
+};
+
+struct opstack_op {
+ int op;
+ struct list_head list;
+};
+
+struct postfix_elt {
+ int op;
+ char *operand;
+ struct list_head list;
+};
+
+struct filter_parse_state {
+ struct filter_op *ops;
+ struct list_head opstack;
+ struct list_head postfix;
+ int lasterr;
+ int lasterr_pos;
+
+ struct {
+ char *string;
+ unsigned int cnt;
+ unsigned int tail;
+ } infix;
+
+ struct {
+ char string[MAX_FILTER_STR_VAL];
+ int pos;
+ unsigned int tail;
+ } operand;
+};
+
+struct pred_stack {
+ struct filter_pred **preds;
+ int index;
+};
+
+/* If not of not match is equal to not of not, then it is a match */
+#define DEFINE_COMPARISON_PRED(type) \
+static int filter_pred_##type(struct filter_pred *pred, void *event) \
+{ \
+ type *addr = (type *)(event + pred->offset); \
+ type val = (type)pred->val; \
+ int match = 0; \
+ \
+ switch (pred->op) { \
+ case OP_LT: \
+ match = (*addr < val); \
+ break; \
+ case OP_LE: \
+ match = (*addr <= val); \
+ break; \
+ case OP_GT: \
+ match = (*addr > val); \
+ break; \
+ case OP_GE: \
+ match = (*addr >= val); \
+ break; \
+ case OP_BAND: \
+ match = (*addr & val); \
+ break; \
+ default: \
+ break; \
+ } \
+ \
+ return !!match == !pred->not; \
+}
+
+#define DEFINE_EQUALITY_PRED(size) \
+static int filter_pred_##size(struct filter_pred *pred, void *event) \
+{ \
+ u##size *addr = (u##size *)(event + pred->offset); \
+ u##size val = (u##size)pred->val; \
+ int match; \
+ \
+ match = (val == *addr) ^ pred->not; \
+ \
+ return match; \
+}
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+/* Filter predicate for fixed sized arrays of characters */
+static int filter_pred_string(struct filter_pred *pred, void *event)
+{
+ char *addr = (char *)(event + pred->offset);
+ int cmp, match;
+
+ cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
+
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
+/* Filter predicate for char * pointers */
+static int filter_pred_pchar(struct filter_pred *pred, void *event)
+{
+ char **addr = (char **)(event + pred->offset);
+ int cmp, match;
+ int len = strlen(*addr) + 1; /* including tailing '\0' */
+
+ cmp = pred->regex.match(*addr, &pred->regex, len);
+
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event)
+{
+ u32 str_item = *(u32 *)(event + pred->offset);
+ int str_loc = str_item & 0xffff;
+ int str_len = str_item >> 16;
+ char *addr = (char *)(event + str_loc);
+ int cmp, match;
+
+ cmp = pred->regex.match(addr, &pred->regex, str_len);
+
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
+static int filter_pred_none(struct filter_pred *pred, void *event)
+{
+ return 0;
+}
+
+/*
+ * regex_match_foo - Basic regex callbacks
+ *
+ * @str: the string to be searched
+ * @r: the regex structure containing the pattern string
+ * @len: the length of the string to be searched (including '\0')
+ *
+ * Note:
+ * - @str might not be NULL-terminated if it's of type DYN_STRING
+ * or STATIC_STRING
+ */
+
+static int regex_match_full(char *str, struct regex *r, int len)
+{
+ if (strncmp(str, r->pattern, len) == 0)
+ return 1;
+ return 0;
+}
+
+static int regex_match_front(char *str, struct regex *r, int len)
+{
+ if (strncmp(str, r->pattern, r->len) == 0)
+ return 1;
+ return 0;
+}
+
+static int regex_match_middle(char *str, struct regex *r, int len)
+{
+ if (strnstr(str, r->pattern, len))
+ return 1;
+ return 0;
+}
+
+static int regex_match_end(char *str, struct regex *r, int len)
+{
+ int strlen = len - 1;
+
+ if (strlen >= r->len &&
+ memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
+ return 1;
+ return 0;
+}
+
+/**
+ * filter_parse_regex - parse a basic regex
+ * @buff: the raw regex
+ * @len: length of the regex
+ * @search: will point to the beginning of the string to compare
+ * @not: tell whether the match will have to be inverted
+ *
+ * This passes in a buffer containing a regex and this function will
+ * set search to point to the search part of the buffer and
+ * return the type of search it is (see enum above).
+ * This does modify buff.
+ *
+ * Returns enum type.
+ * search returns the pointer to use for comparison.
+ * not returns 1 if buff started with a '!'
+ * 0 otherwise.
+ */
+enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
+{
+ int type = MATCH_FULL;
+ int i;
+
+ if (buff[0] == '!') {
+ *not = 1;
+ buff++;
+ len--;
+ } else
+ *not = 0;
+
+ *search = buff;
+
+ for (i = 0; i < len; i++) {
+ if (buff[i] == '*') {
+ if (!i) {
+ *search = buff + 1;
+ type = MATCH_END_ONLY;
+ } else {
+ if (type == MATCH_END_ONLY)
+ type = MATCH_MIDDLE_ONLY;
+ else
+ type = MATCH_FRONT_ONLY;
+ buff[i] = 0;
+ break;
+ }
+ }
+ }
+
+ return type;
+}
+
+static void filter_build_regex(struct filter_pred *pred)
+{
+ struct regex *r = &pred->regex;
+ char *search;
+ enum regex_type type = MATCH_FULL;
+ int not = 0;
+
+ if (pred->op == OP_GLOB) {
+ type = filter_parse_regex(r->pattern, r->len, &search, &not);
+ r->len = strlen(search);
+ memmove(r->pattern, search, r->len+1);
+ }
+
+ switch (type) {
+ case MATCH_FULL:
+ r->match = regex_match_full;
+ break;
+ case MATCH_FRONT_ONLY:
+ r->match = regex_match_front;
+ break;
+ case MATCH_MIDDLE_ONLY:
+ r->match = regex_match_middle;
+ break;
+ case MATCH_END_ONLY:
+ r->match = regex_match_end;
+ break;
+ }
+
+ pred->not ^= not;
+}
+
+enum move_type {
+ MOVE_DOWN,
+ MOVE_UP_FROM_LEFT,
+ MOVE_UP_FROM_RIGHT
+};
+
+static struct filter_pred *
+get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
+ int index, enum move_type *move)
+{
+ if (pred->parent & FILTER_PRED_IS_RIGHT)
+ *move = MOVE_UP_FROM_RIGHT;
+ else
+ *move = MOVE_UP_FROM_LEFT;
+ pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
+
+ return pred;
+}
+
+enum walk_return {
+ WALK_PRED_ABORT,
+ WALK_PRED_PARENT,
+ WALK_PRED_DEFAULT,
+};
+
+typedef int (*filter_pred_walkcb_t) (enum move_type move,
+ struct filter_pred *pred,
+ int *err, void *data);
+
+static int walk_pred_tree(struct filter_pred *preds,
+ struct filter_pred *root,
+ filter_pred_walkcb_t cb, void *data)
+{
+ struct filter_pred *pred = root;
+ enum move_type move = MOVE_DOWN;
+ int done = 0;
+
+ if (!preds)
+ return -EINVAL;
+
+ do {
+ int err = 0, ret;
+
+ ret = cb(move, pred, &err, data);
+ if (ret == WALK_PRED_ABORT)
+ return err;
+ if (ret == WALK_PRED_PARENT)
+ goto get_parent;
+
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ goto get_parent;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ get_parent:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent,
+ &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ /* We are fine. */
+ return 0;
+}
+
+/*
+ * A series of AND or ORs where found together. Instead of
+ * climbing up and down the tree branches, an array of the
+ * ops were made in order of checks. We can just move across
+ * the array and short circuit if needed.
+ */
+static int process_ops(struct filter_pred *preds,
+ struct filter_pred *op, void *rec)
+{
+ struct filter_pred *pred;
+ int match = 0;
+ int type;
+ int i;
+
+ /*
+ * Micro-optimization: We set type to true if op
+ * is an OR and false otherwise (AND). Then we
+ * just need to test if the match is equal to
+ * the type, and if it is, we can short circuit the
+ * rest of the checks:
+ *
+ * if ((match && op->op == OP_OR) ||
+ * (!match && op->op == OP_AND))
+ * return match;
+ */
+ type = op->op == OP_OR;
+
+ for (i = 0; i < op->val; i++) {
+ pred = &preds[op->ops[i]];
+ if (!WARN_ON_ONCE(!pred->fn))
+ match = pred->fn(pred, rec);
+ if (!!match == type)
+ break;
+ }
+ /* If not of not match is equal to not of not, then it is a match */
+ return !!match == !op->not;
+}
+
+struct filter_match_preds_data {
+ struct filter_pred *preds;
+ int match;
+ void *rec;
+};
+
+static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct filter_match_preds_data *d = data;
+
+ *err = 0;
+ switch (move) {
+ case MOVE_DOWN:
+ /* only AND and OR have children */
+ if (pred->left != FILTER_PRED_INVALID) {
+ /* If ops is set, then it was folded. */
+ if (!pred->ops)
+ return WALK_PRED_DEFAULT;
+ /* We can treat folded ops as a leaf node */
+ d->match = process_ops(d->preds, pred, d->rec);
+ } else {
+ if (!WARN_ON_ONCE(!pred->fn))
+ d->match = pred->fn(pred, d->rec);
+ }
+
+ return WALK_PRED_PARENT;
+ case MOVE_UP_FROM_LEFT:
+ /*
+ * Check for short circuits.
+ *
+ * Optimization: !!match == (pred->op == OP_OR)
+ * is the same as:
+ * if ((match && pred->op == OP_OR) ||
+ * (!match && pred->op == OP_AND))
+ */
+ if (!!d->match == (pred->op == OP_OR))
+ return WALK_PRED_PARENT;
+ break;
+ case MOVE_UP_FROM_RIGHT:
+ break;
+ }
+
+ return WALK_PRED_DEFAULT;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct event_filter *filter, void *rec)
+{
+ struct filter_pred *preds;
+ struct filter_pred *root;
+ struct filter_match_preds_data data = {
+ /* match is currently meaningless */
+ .match = -1,
+ .rec = rec,
+ };
+ int n_preds, ret;
+
+ /* no filter is considered a match */
+ if (!filter)
+ return 1;
+
+ n_preds = filter->n_preds;
+ if (!n_preds)
+ return 1;
+
+ /*
+ * n_preds, root and filter->preds are protect with preemption disabled.
+ */
+ root = rcu_dereference_sched(filter->root);
+ if (!root)
+ return 1;
+
+ data.preds = preds = rcu_dereference_sched(filter->preds);
+ ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
+ WARN_ON(ret);
+ return data.match;
+}
+EXPORT_SYMBOL_GPL(filter_match_preds);
+
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
+{
+ ps->lasterr = err;
+ ps->lasterr_pos = pos;
+}
+
+static void remove_filter_string(struct event_filter *filter)
+{
+ if (!filter)
+ return;
+
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+}
+
+static int replace_filter_string(struct event_filter *filter,
+ char *filter_string)
+{
+ kfree(filter->filter_string);
+ filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+ if (!filter->filter_string)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int append_filter_string(struct event_filter *filter,
+ char *string)
+{
+ int newlen;
+ char *new_filter_string;
+
+ BUG_ON(!filter->filter_string);
+ newlen = strlen(filter->filter_string) + strlen(string) + 1;
+ new_filter_string = kmalloc(newlen, GFP_KERNEL);
+ if (!new_filter_string)
+ return -ENOMEM;
+
+ strcpy(new_filter_string, filter->filter_string);
+ strcat(new_filter_string, string);
+ kfree(filter->filter_string);
+ filter->filter_string = new_filter_string;
+
+ return 0;
+}
+
+static void append_filter_err(struct filter_parse_state *ps,
+ struct event_filter *filter)
+{
+ int pos = ps->lasterr_pos;
+ char *buf, *pbuf;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return;
+
+ append_filter_string(filter, "\n");
+ memset(buf, ' ', PAGE_SIZE);
+ if (pos > PAGE_SIZE - 128)
+ pos = 0;
+ buf[pos] = '^';
+ pbuf = &buf[pos] + 1;
+
+ sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+ append_filter_string(filter, buf);
+ free_page((unsigned long) buf);
+}
+
+static inline struct event_filter *event_filter(struct ftrace_event_file *file)
+{
+ if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ return file->event_call->filter;
+ else
+ return file->filter;
+}
+
+/* caller must hold event_mutex */
+void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
+{
+ struct event_filter *filter = event_filter(file);
+
+ if (filter && filter->filter_string)
+ trace_seq_printf(s, "%s\n", filter->filter_string);
+ else
+ trace_seq_puts(s, "none\n");
+}
+
+void print_subsystem_event_filter(struct event_subsystem *system,
+ struct trace_seq *s)
+{
+ struct event_filter *filter;
+
+ mutex_lock(&event_mutex);
+ filter = system->filter;
+ if (filter && filter->filter_string)
+ trace_seq_printf(s, "%s\n", filter->filter_string);
+ else
+ trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
+ mutex_unlock(&event_mutex);
+}
+
+static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
+{
+ stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
+ if (!stack->preds)
+ return -ENOMEM;
+ stack->index = n_preds;
+ return 0;
+}
+
+static void __free_pred_stack(struct pred_stack *stack)
+{
+ kfree(stack->preds);
+ stack->index = 0;
+}
+
+static int __push_pred_stack(struct pred_stack *stack,
+ struct filter_pred *pred)
+{
+ int index = stack->index;
+
+ if (WARN_ON(index == 0))
+ return -ENOSPC;
+
+ stack->preds[--index] = pred;
+ stack->index = index;
+ return 0;
+}
+
+static struct filter_pred *
+__pop_pred_stack(struct pred_stack *stack)
+{
+ struct filter_pred *pred;
+ int index = stack->index;
+
+ pred = stack->preds[index++];
+ if (!pred)
+ return NULL;
+
+ stack->index = index;
+ return pred;
+}
+
+static int filter_set_pred(struct event_filter *filter,
+ int idx,
+ struct pred_stack *stack,
+ struct filter_pred *src)
+{
+ struct filter_pred *dest = &filter->preds[idx];
+ struct filter_pred *left;
+ struct filter_pred *right;
+
+ *dest = *src;
+ dest->index = idx;
+
+ if (dest->op == OP_OR || dest->op == OP_AND) {
+ right = __pop_pred_stack(stack);
+ left = __pop_pred_stack(stack);
+ if (!left || !right)
+ return -EINVAL;
+ /*
+ * If both children can be folded
+ * and they are the same op as this op or a leaf,
+ * then this op can be folded.
+ */
+ if (left->index & FILTER_PRED_FOLD &&
+ ((left->op == dest->op && !left->not) ||
+ left->left == FILTER_PRED_INVALID) &&
+ right->index & FILTER_PRED_FOLD &&
+ ((right->op == dest->op && !right->not) ||
+ right->left == FILTER_PRED_INVALID))
+ dest->index |= FILTER_PRED_FOLD;
+
+ dest->left = left->index & ~FILTER_PRED_FOLD;
+ dest->right = right->index & ~FILTER_PRED_FOLD;
+ left->parent = dest->index & ~FILTER_PRED_FOLD;
+ right->parent = dest->index | FILTER_PRED_IS_RIGHT;
+ } else {
+ /*
+ * Make dest->left invalid to be used as a quick
+ * way to know this is a leaf node.
+ */
+ dest->left = FILTER_PRED_INVALID;
+
+ /* All leafs allow folding the parent ops. */
+ dest->index |= FILTER_PRED_FOLD;
+ }
+
+ return __push_pred_stack(stack, dest);
+}
+
+static void __free_preds(struct event_filter *filter)
+{
+ int i;
+
+ if (filter->preds) {
+ for (i = 0; i < filter->n_preds; i++)
+ kfree(filter->preds[i].ops);
+ kfree(filter->preds);
+ filter->preds = NULL;
+ }
+ filter->a_preds = 0;
+ filter->n_preds = 0;
+}
+
+static void filter_disable(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
+ else
+ file->flags &= ~FTRACE_EVENT_FL_FILTERED;
+}
+
+static void __free_filter(struct event_filter *filter)
+{
+ if (!filter)
+ return;
+
+ __free_preds(filter);
+ kfree(filter->filter_string);
+ kfree(filter);
+}
+
+void free_event_filter(struct event_filter *filter)
+{
+ __free_filter(filter);
+}
+
+static struct event_filter *__alloc_filter(void)
+{
+ struct event_filter *filter;
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ return filter;
+}
+
+static int __alloc_preds(struct event_filter *filter, int n_preds)
+{
+ struct filter_pred *pred;
+ int i;
+
+ if (filter->preds)
+ __free_preds(filter);
+
+ filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
+
+ if (!filter->preds)
+ return -ENOMEM;
+
+ filter->a_preds = n_preds;
+ filter->n_preds = 0;
+
+ for (i = 0; i < n_preds; i++) {
+ pred = &filter->preds[i];
+ pred->fn = filter_pred_none;
+ }
+
+ return 0;
+}
+
+static inline void __remove_filter(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ filter_disable(file);
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ remove_filter_string(call->filter);
+ else
+ remove_filter_string(file->filter);
+}
+
+static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
+ struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ list_for_each_entry(file, &tr->events, list) {
+ if (file->system != dir)
+ continue;
+ __remove_filter(file);
+ }
+}
+
+static inline void __free_subsystem_filter(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
+ __free_filter(call->filter);
+ call->filter = NULL;
+ } else {
+ __free_filter(file->filter);
+ file->filter = NULL;
+ }
+}
+
+static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
+ struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ list_for_each_entry(file, &tr->events, list) {
+ if (file->system != dir)
+ continue;
+ __free_subsystem_filter(file);
+ }
+}
+
+static int filter_add_pred(struct filter_parse_state *ps,
+ struct event_filter *filter,
+ struct filter_pred *pred,
+ struct pred_stack *stack)
+{
+ int err;
+
+ if (WARN_ON(filter->n_preds == filter->a_preds)) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
+ err = filter_set_pred(filter, filter->n_preds, stack, pred);
+ if (err)
+ return err;
+
+ filter->n_preds++;
+
+ return 0;
+}
+
+int filter_assign_type(const char *type)
+{
+ if (strstr(type, "__data_loc") && strstr(type, "char"))
+ return FILTER_DYN_STRING;
+
+ if (strchr(type, '[') && strstr(type, "char"))
+ return FILTER_STATIC_STRING;
+
+ return FILTER_OTHER;
+}
+
+static bool is_function_field(struct ftrace_event_field *field)
+{
+ return field->filter_type == FILTER_TRACE_FN;
+}
+
+static bool is_string_field(struct ftrace_event_field *field)
+{
+ return field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_STATIC_STRING ||
+ field->filter_type == FILTER_PTR_STRING;
+}
+
+static int is_legal_op(struct ftrace_event_field *field, int op)
+{
+ if (is_string_field(field) &&
+ (op != OP_EQ && op != OP_NE && op != OP_GLOB))
+ return 0;
+ if (!is_string_field(field) && op == OP_GLOB)
+ return 0;
+
+ return 1;
+}
+
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+ int field_is_signed)
+{
+ filter_pred_fn_t fn = NULL;
+
+ switch (field_size) {
+ case 8:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_64;
+ else if (field_is_signed)
+ fn = filter_pred_s64;
+ else
+ fn = filter_pred_u64;
+ break;
+ case 4:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_32;
+ else if (field_is_signed)
+ fn = filter_pred_s32;
+ else
+ fn = filter_pred_u32;
+ break;
+ case 2:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_16;
+ else if (field_is_signed)
+ fn = filter_pred_s16;
+ else
+ fn = filter_pred_u16;
+ break;
+ case 1:
+ if (op == OP_EQ || op == OP_NE)
+ fn = filter_pred_8;
+ else if (field_is_signed)
+ fn = filter_pred_s8;
+ else
+ fn = filter_pred_u8;
+ break;
+ }
+
+ return fn;
+}
+
+static int init_pred(struct filter_parse_state *ps,
+ struct ftrace_event_field *field,
+ struct filter_pred *pred)
+
+{
+ filter_pred_fn_t fn = filter_pred_none;
+ unsigned long long val;
+ int ret;
+
+ pred->offset = field->offset;
+
+ if (!is_legal_op(field, pred->op)) {
+ parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+ return -EINVAL;
+ }
+
+ if (is_string_field(field)) {
+ filter_build_regex(pred);
+
+ if (field->filter_type == FILTER_STATIC_STRING) {
+ fn = filter_pred_string;
+ pred->regex.field_len = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING)
+ fn = filter_pred_strloc;
+ else
+ fn = filter_pred_pchar;
+ } else if (is_function_field(field)) {
+ if (strcmp(field->name, "ip")) {
+ parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
+ return -EINVAL;
+ }
+ } else {
+ if (field->is_signed)
+ ret = kstrtoll(pred->regex.pattern, 0, &val);
+ else
+ ret = kstrtoull(pred->regex.pattern, 0, &val);
+ if (ret) {
+ parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
+ return -EINVAL;
+ }
+ pred->val = val;
+
+ fn = select_comparison_fn(pred->op, field->size,
+ field->is_signed);
+ if (!fn) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
+ return -EINVAL;
+ }
+ }
+
+ if (pred->op == OP_NE)
+ pred->not ^= 1;
+
+ pred->fn = fn;
+ return 0;
+}
+
+static void parse_init(struct filter_parse_state *ps,
+ struct filter_op *ops,
+ char *infix_string)
+{
+ memset(ps, '\0', sizeof(*ps));
+
+ ps->infix.string = infix_string;
+ ps->infix.cnt = strlen(infix_string);
+ ps->ops = ops;
+
+ INIT_LIST_HEAD(&ps->opstack);
+ INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+ if (!ps->infix.cnt)
+ return 0;
+
+ ps->infix.cnt--;
+
+ return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+ if (ps->infix.tail == strlen(ps->infix.string))
+ return 0;
+
+ return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+ if (!ps->infix.cnt)
+ return;
+
+ ps->infix.cnt--;
+ ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+ int a, int b)
+{
+ return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+ int i;
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (ps->ops[i].string[0] == c)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+ char nextc = infix_peek(ps);
+ char opstr[3];
+ int i;
+
+ opstr[0] = firstc;
+ opstr[1] = nextc;
+ opstr[2] = '\0';
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (!strcmp(opstr, ps->ops[i].string)) {
+ infix_advance(ps);
+ return ps->ops[i].id;
+ }
+ }
+
+ opstr[1] = '\0';
+
+ for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+ if (!strcmp(opstr, ps->ops[i].string))
+ return ps->ops[i].id;
+ }
+
+ return OP_NONE;
+}
+
+static inline void clear_operand_string(struct filter_parse_state *ps)
+{
+ memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+ ps->operand.tail = 0;
+}
+
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+ if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
+ return -EINVAL;
+
+ ps->operand.string[ps->operand.tail++] = c;
+
+ return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+ struct opstack_op *opstack_op;
+
+ opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+ if (!opstack_op)
+ return -ENOMEM;
+
+ opstack_op->op = op;
+ list_add(&opstack_op->list, &ps->opstack);
+
+ return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+ return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+ struct opstack_op *opstack_op;
+
+ if (filter_opstack_empty(ps))
+ return OP_NONE;
+
+ opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+ return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+ struct opstack_op *opstack_op;
+ int op;
+
+ if (filter_opstack_empty(ps))
+ return OP_NONE;
+
+ opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+ op = opstack_op->op;
+ list_del(&opstack_op->list);
+
+ kfree(opstack_op);
+
+ return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+ while (!filter_opstack_empty(ps))
+ filter_opstack_pop(ps);
+}
+
+static char *curr_operand(struct filter_parse_state *ps)
+{
+ return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+ struct postfix_elt *elt;
+
+ elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+ if (!elt)
+ return -ENOMEM;
+
+ elt->op = OP_NONE;
+ elt->operand = kstrdup(operand, GFP_KERNEL);
+ if (!elt->operand) {
+ kfree(elt);
+ return -ENOMEM;
+ }
+
+ list_add_tail(&elt->list, &ps->postfix);
+
+ return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+ struct postfix_elt *elt;
+
+ elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+ if (!elt)
+ return -ENOMEM;
+
+ elt->op = op;
+ elt->operand = NULL;
+
+ list_add_tail(&elt->list, &ps->postfix);
+
+ return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+ struct postfix_elt *elt;
+
+ while (!list_empty(&ps->postfix)) {
+ elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+ list_del(&elt->list);
+ kfree(elt->operand);
+ kfree(elt);
+ }
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+ int in_string = 0;
+ int op, top_op;
+ char ch;
+
+ while ((ch = infix_next(ps))) {
+ if (ch == '"') {
+ in_string ^= 1;
+ continue;
+ }
+
+ if (in_string)
+ goto parse_operand;
+
+ if (isspace(ch))
+ continue;
+
+ if (is_op_char(ps, ch)) {
+ op = infix_get_op(ps, ch);
+ if (op == OP_NONE) {
+ parse_error(ps, FILT_ERR_INVALID_OP, 0);
+ return -EINVAL;
+ }
+
+ if (strlen(curr_operand(ps))) {
+ postfix_append_operand(ps, curr_operand(ps));
+ clear_operand_string(ps);
+ }
+
+ while (!filter_opstack_empty(ps)) {
+ top_op = filter_opstack_top(ps);
+ if (!is_precedence_lower(ps, top_op, op)) {
+ top_op = filter_opstack_pop(ps);
+ postfix_append_op(ps, top_op);
+ continue;
+ }
+ break;
+ }
+
+ filter_opstack_push(ps, op);
+ continue;
+ }
+
+ if (ch == '(') {
+ filter_opstack_push(ps, OP_OPEN_PAREN);
+ continue;
+ }
+
+ if (ch == ')') {
+ if (strlen(curr_operand(ps))) {
+ postfix_append_operand(ps, curr_operand(ps));
+ clear_operand_string(ps);
+ }
+
+ top_op = filter_opstack_pop(ps);
+ while (top_op != OP_NONE) {
+ if (top_op == OP_OPEN_PAREN)
+ break;
+ postfix_append_op(ps, top_op);
+ top_op = filter_opstack_pop(ps);
+ }
+ if (top_op == OP_NONE) {
+ parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+ return -EINVAL;
+ }
+ continue;
+ }
+parse_operand:
+ if (append_operand_char(ps, ch)) {
+ parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+ return -EINVAL;
+ }
+ }
+
+ if (strlen(curr_operand(ps)))
+ postfix_append_operand(ps, curr_operand(ps));
+
+ while (!filter_opstack_empty(ps)) {
+ top_op = filter_opstack_pop(ps);
+ if (top_op == OP_NONE)
+ break;
+ if (top_op == OP_OPEN_PAREN) {
+ parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+ return -EINVAL;
+ }
+ postfix_append_op(ps, top_op);
+ }
+
+ return 0;
+}
+
+static struct filter_pred *create_pred(struct filter_parse_state *ps,
+ struct ftrace_event_call *call,
+ int op, char *operand1, char *operand2)
+{
+ struct ftrace_event_field *field;
+ static struct filter_pred pred;
+
+ memset(&pred, 0, sizeof(pred));
+ pred.op = op;
+
+ if (op == OP_AND || op == OP_OR)
+ return &pred;
+
+ if (!operand1 || !operand2) {
+ parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+ return NULL;
+ }
+
+ field = trace_find_event_field(call, operand1);
+ if (!field) {
+ parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
+ return NULL;
+ }
+
+ strcpy(pred.regex.pattern, operand2);
+ pred.regex.len = strlen(pred.regex.pattern);
+ pred.field = field;
+ return init_pred(ps, field, &pred) ? NULL : &pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+ int n_normal_preds = 0, n_logical_preds = 0;
+ struct postfix_elt *elt;
+ int cnt = 0;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE) {
+ cnt++;
+ continue;
+ }
+
+ if (elt->op == OP_AND || elt->op == OP_OR) {
+ n_logical_preds++;
+ cnt--;
+ continue;
+ }
+ if (elt->op != OP_NOT)
+ cnt--;
+ n_normal_preds++;
+ /* all ops should have operands */
+ if (cnt < 0)
+ break;
+ }
+
+ if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
+ parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int count_preds(struct filter_parse_state *ps)
+{
+ struct postfix_elt *elt;
+ int n_preds = 0;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE)
+ continue;
+ n_preds++;
+ }
+
+ return n_preds;
+}
+
+struct check_pred_data {
+ int count;
+ int max;
+};
+
+static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct check_pred_data *d = data;
+
+ if (WARN_ON(d->count++ > d->max)) {
+ *err = -EINVAL;
+ return WALK_PRED_ABORT;
+ }
+ return WALK_PRED_DEFAULT;
+}
+
+/*
+ * The tree is walked at filtering of an event. If the tree is not correctly
+ * built, it may cause an infinite loop. Check here that the tree does
+ * indeed terminate.
+ */
+static int check_pred_tree(struct event_filter *filter,
+ struct filter_pred *root)
+{
+ struct check_pred_data data = {
+ /*
+ * The max that we can hit a node is three times.
+ * Once going down, once coming up from left, and
+ * once coming up from right. This is more than enough
+ * since leafs are only hit a single time.
+ */
+ .max = 3 * filter->n_preds,
+ .count = 0,
+ };
+
+ return walk_pred_tree(filter->preds, root,
+ check_pred_tree_cb, &data);
+}
+
+static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ int *count = data;
+
+ if ((move == MOVE_DOWN) &&
+ (pred->left == FILTER_PRED_INVALID))
+ (*count)++;
+
+ return WALK_PRED_DEFAULT;
+}
+
+static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
+{
+ int count = 0, ret;
+
+ ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
+ WARN_ON(ret);
+ return count;
+}
+
+struct fold_pred_data {
+ struct filter_pred *root;
+ int count;
+ int children;
+};
+
+static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct fold_pred_data *d = data;
+ struct filter_pred *root = d->root;
+
+ if (move != MOVE_DOWN)
+ return WALK_PRED_DEFAULT;
+ if (pred->left != FILTER_PRED_INVALID)
+ return WALK_PRED_DEFAULT;
+
+ if (WARN_ON(d->count == d->children)) {
+ *err = -EINVAL;
+ return WALK_PRED_ABORT;
+ }
+
+ pred->index &= ~FILTER_PRED_FOLD;
+ root->ops[d->count++] = pred->index;
+ return WALK_PRED_DEFAULT;
+}
+
+static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
+{
+ struct fold_pred_data data = {
+ .root = root,
+ .count = 0,
+ };
+ int children;
+
+ /* No need to keep the fold flag */
+ root->index &= ~FILTER_PRED_FOLD;
+
+ /* If the root is a leaf then do nothing */
+ if (root->left == FILTER_PRED_INVALID)
+ return 0;
+
+ /* count the children */
+ children = count_leafs(preds, &preds[root->left]);
+ children += count_leafs(preds, &preds[root->right]);
+
+ root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
+ if (!root->ops)
+ return -ENOMEM;
+
+ root->val = children;
+ data.children = children;
+ return walk_pred_tree(preds, root, fold_pred_cb, &data);
+}
+
+static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct filter_pred *preds = data;
+
+ if (move != MOVE_DOWN)
+ return WALK_PRED_DEFAULT;
+ if (!(pred->index & FILTER_PRED_FOLD))
+ return WALK_PRED_DEFAULT;
+
+ *err = fold_pred(preds, pred);
+ if (*err)
+ return WALK_PRED_ABORT;
+
+ /* eveyrhing below is folded, continue with parent */
+ return WALK_PRED_PARENT;
+}
+
+/*
+ * To optimize the processing of the ops, if we have several "ors" or
+ * "ands" together, we can put them in an array and process them all
+ * together speeding up the filter logic.
+ */
+static int fold_pred_tree(struct event_filter *filter,
+ struct filter_pred *root)
+{
+ return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
+ filter->preds);
+}
+
+static int replace_preds(struct ftrace_event_call *call,
+ struct event_filter *filter,
+ struct filter_parse_state *ps,
+ bool dry_run)
+{
+ char *operand1 = NULL, *operand2 = NULL;
+ struct filter_pred *pred;
+ struct filter_pred *root;
+ struct postfix_elt *elt;
+ struct pred_stack stack = { }; /* init to NULL */
+ int err;
+ int n_preds = 0;
+
+ n_preds = count_preds(ps);
+ if (n_preds >= MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
+ err = check_preds(ps);
+ if (err)
+ return err;
+
+ if (!dry_run) {
+ err = __alloc_pred_stack(&stack, n_preds);
+ if (err)
+ return err;
+ err = __alloc_preds(filter, n_preds);
+ if (err)
+ goto fail;
+ }
+
+ n_preds = 0;
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE) {
+ if (!operand1)
+ operand1 = elt->operand;
+ else if (!operand2)
+ operand2 = elt->operand;
+ else {
+ parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+ err = -EINVAL;
+ goto fail;
+ }
+ continue;
+ }
+
+ if (elt->op == OP_NOT) {
+ if (!n_preds || operand1 || operand2) {
+ parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
+ err = -EINVAL;
+ goto fail;
+ }
+ if (!dry_run)
+ filter->preds[n_preds - 1].not ^= 1;
+ continue;
+ }
+
+ if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ err = -ENOSPC;
+ goto fail;
+ }
+
+ pred = create_pred(ps, call, elt->op, operand1, operand2);
+ if (!pred) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (!dry_run) {
+ err = filter_add_pred(ps, filter, pred, &stack);
+ if (err)
+ goto fail;
+ }
+
+ operand1 = operand2 = NULL;
+ }
+
+ if (!dry_run) {
+ /* We should have one item left on the stack */
+ pred = __pop_pred_stack(&stack);
+ if (!pred)
+ return -EINVAL;
+ /* This item is where we start from in matching */
+ root = pred;
+ /* Make sure the stack is empty */
+ pred = __pop_pred_stack(&stack);
+ if (WARN_ON(pred)) {
+ err = -EINVAL;
+ filter->root = NULL;
+ goto fail;
+ }
+ err = check_pred_tree(filter, root);
+ if (err)
+ goto fail;
+
+ /* Optimize the tree */
+ err = fold_pred_tree(filter, root);
+ if (err)
+ goto fail;
+
+ /* We don't set root until we know it works */
+ barrier();
+ filter->root = root;
+ }
+
+ err = 0;
+fail:
+ __free_pred_stack(&stack);
+ return err;
+}
+
+static inline void event_set_filtered_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call->flags |= TRACE_EVENT_FL_FILTERED;
+ else
+ file->flags |= FTRACE_EVENT_FL_FILTERED;
+}
+
+static inline void event_set_filter(struct ftrace_event_file *file,
+ struct event_filter *filter)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ rcu_assign_pointer(call->filter, filter);
+ else
+ rcu_assign_pointer(file->filter, filter);
+}
+
+static inline void event_clear_filter(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ RCU_INIT_POINTER(call->filter, NULL);
+ else
+ RCU_INIT_POINTER(file->filter, NULL);
+}
+
+static inline void
+event_set_no_set_filter_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
+ else
+ file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
+}
+
+static inline void
+event_clear_no_set_filter_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
+ call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
+ else
+ file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
+}
+
+static inline bool
+event_no_set_filter_flag(struct ftrace_event_file *file)
+{
+ struct ftrace_event_call *call = file->event_call;
+
+ if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
+ return true;
+
+ if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
+ (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
+ return true;
+
+ return false;
+}
+
+struct filter_list {
+ struct list_head list;
+ struct event_filter *filter;
+};
+
+static int replace_system_preds(struct ftrace_subsystem_dir *dir,
+ struct trace_array *tr,
+ struct filter_parse_state *ps,
+ char *filter_string)
+{
+ struct ftrace_event_file *file;
+ struct filter_list *filter_item;
+ struct filter_list *tmp;
+ LIST_HEAD(filter_list);
+ bool fail = true;
+ int err;
+
+ list_for_each_entry(file, &tr->events, list) {
+ if (file->system != dir)
+ continue;
+
+ /*
+ * Try to see if the filter can be applied
+ * (filter arg is ignored on dry_run)
+ */
+ err = replace_preds(file->event_call, NULL, ps, true);
+ if (err)
+ event_set_no_set_filter_flag(file);
+ else
+ event_clear_no_set_filter_flag(file);
+ }
+
+ list_for_each_entry(file, &tr->events, list) {
+ struct event_filter *filter;
+
+ if (file->system != dir)
+ continue;
+
+ if (event_no_set_filter_flag(file))
+ continue;
+
+ filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+ if (!filter_item)
+ goto fail_mem;
+
+ list_add_tail(&filter_item->list, &filter_list);
+
+ filter_item->filter = __alloc_filter();
+ if (!filter_item->filter)
+ goto fail_mem;
+ filter = filter_item->filter;
+
+ /* Can only fail on no memory */
+ err = replace_filter_string(filter, filter_string);
+ if (err)
+ goto fail_mem;
+
+ err = replace_preds(file->event_call, filter, ps, false);
+ if (err) {
+ filter_disable(file);
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ append_filter_err(ps, filter);
+ } else
+ event_set_filtered_flag(file);
+ /*
+ * Regardless of if this returned an error, we still
+ * replace the filter for the call.
+ */
+ filter = event_filter(file);
+ event_set_filter(file, filter_item->filter);
+ filter_item->filter = filter;
+
+ fail = false;
+ }
+
+ if (fail)
+ goto fail;
+
+ /*
+ * The calls can still be using the old filters.
+ * Do a synchronize_sched() to ensure all calls are
+ * done with them before we free them.
+ */
+ synchronize_sched();
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ return 0;
+ fail:
+ /* No call succeeded */
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return -EINVAL;
+ fail_mem:
+ /* If any call succeeded, we still need to sync */
+ if (!fail)
+ synchronize_sched();
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ return -ENOMEM;
+}
+
+static int create_filter_start(char *filter_str, bool set_str,
+ struct filter_parse_state **psp,
+ struct event_filter **filterp)
+{
+ struct event_filter *filter;
+ struct filter_parse_state *ps = NULL;
+ int err = 0;
+
+ WARN_ON_ONCE(*psp || *filterp);
+
+ /* allocate everything, and if any fails, free all and fail */
+ filter = __alloc_filter();
+ if (filter && set_str)
+ err = replace_filter_string(filter, filter_str);
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+
+ if (!filter || !ps || err) {
+ kfree(ps);
+ __free_filter(filter);
+ return -ENOMEM;
+ }
+
+ /* we're committed to creating a new filter */
+ *filterp = filter;
+ *psp = ps;
+
+ parse_init(ps, filter_ops, filter_str);
+ err = filter_parse(ps);
+ if (err && set_str)
+ append_filter_err(ps, filter);
+ return err;
+}
+
+static void create_filter_finish(struct filter_parse_state *ps)
+{
+ if (ps) {
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+ }
+}
+
+/**
+ * create_filter - create a filter for a ftrace_event_call
+ * @call: ftrace_event_call to create a filter for
+ * @filter_str: filter string
+ * @set_str: remember @filter_str and enable detailed error in filter
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Creates a filter for @call with @filter_str. If @set_str is %true,
+ * @filter_str is copied and recorded in the new filter.
+ *
+ * On success, returns 0 and *@filterp points to the new filter. On
+ * failure, returns -errno and *@filterp may point to %NULL or to a new
+ * filter. In the latter case, the returned filter contains error
+ * information if @set_str is %true and the caller is responsible for
+ * freeing it.
+ */
+static int create_filter(struct ftrace_event_call *call,
+ char *filter_str, bool set_str,
+ struct event_filter **filterp)
+{
+ struct event_filter *filter = NULL;
+ struct filter_parse_state *ps = NULL;
+ int err;
+
+ err = create_filter_start(filter_str, set_str, &ps, &filter);
+ if (!err) {
+ err = replace_preds(call, filter, ps, false);
+ if (err && set_str)
+ append_filter_err(ps, filter);
+ }
+ create_filter_finish(ps);
+
+ *filterp = filter;
+ return err;
+}
+
+int create_event_filter(struct ftrace_event_call *call,
+ char *filter_str, bool set_str,
+ struct event_filter **filterp)
+{
+ return create_filter(call, filter_str, set_str, filterp);
+}
+
+/**
+ * create_system_filter - create a filter for an event_subsystem
+ * @system: event_subsystem to create a filter for
+ * @filter_str: filter string
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Identical to create_filter() except that it creates a subsystem filter
+ * and always remembers @filter_str.
+ */
+static int create_system_filter(struct ftrace_subsystem_dir *dir,
+ struct trace_array *tr,
+ char *filter_str, struct event_filter **filterp)
+{
+ struct event_filter *filter = NULL;
+ struct filter_parse_state *ps = NULL;
+ int err;
+
+ err = create_filter_start(filter_str, true, &ps, &filter);
+ if (!err) {
+ err = replace_system_preds(dir, tr, ps, filter_str);
+ if (!err) {
+ /* System filters just show a default message */
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+ } else {
+ append_filter_err(ps, filter);
+ }
+ }
+ create_filter_finish(ps);
+
+ *filterp = filter;
+ return err;
+}
+
+/* caller must hold event_mutex */
+int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
+{
+ struct ftrace_event_call *call = file->event_call;
+ struct event_filter *filter;
+ int err;
+
+ if (!strcmp(strstrip(filter_string), "0")) {
+ filter_disable(file);
+ filter = event_filter(file);
+
+ if (!filter)
+ return 0;
+
+ event_clear_filter(file);
+
+ /* Make sure the filter is not being used */
+ synchronize_sched();
+ __free_filter(filter);
+
+ return 0;
+ }
+
+ err = create_filter(call, filter_string, true, &filter);
+
+ /*
+ * Always swap the call filter with the new filter
+ * even if there was an error. If there was an error
+ * in the filter, we disable the filter and show the error
+ * string
+ */
+ if (filter) {
+ struct event_filter *tmp;
+
+ tmp = event_filter(file);
+ if (!err)
+ event_set_filtered_flag(file);
+ else
+ filter_disable(file);
+
+ event_set_filter(file, filter);
+
+ if (tmp) {
+ /* Make sure the call is done with the filter */
+ synchronize_sched();
+ __free_filter(tmp);
+ }
+ }
+
+ return err;
+}
+
+int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+ char *filter_string)
+{
+ struct event_subsystem *system = dir->subsystem;
+ struct trace_array *tr = dir->tr;
+ struct event_filter *filter;
+ int err = 0;
+
+ mutex_lock(&event_mutex);
+
+ /* Make sure the system still has events */
+ if (!dir->nr_events) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (!strcmp(strstrip(filter_string), "0")) {
+ filter_free_subsystem_preds(dir, tr);
+ remove_filter_string(system->filter);
+ filter = system->filter;
+ system->filter = NULL;
+ /* Ensure all filters are no longer used */
+ synchronize_sched();
+ filter_free_subsystem_filters(dir, tr);
+ __free_filter(filter);
+ goto out_unlock;
+ }
+
+ err = create_system_filter(dir, tr, filter_string, &filter);
+ if (filter) {
+ /*
+ * No event actually uses the system filter
+ * we can free it without synchronize_sched().
+ */
+ __free_filter(system->filter);
+ system->filter = filter;
+ }
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
+}
+
+#ifdef CONFIG_PERF_EVENTS
+
+void ftrace_profile_free_filter(struct perf_event *event)
+{
+ struct event_filter *filter = event->filter;
+
+ event->filter = NULL;
+ __free_filter(filter);
+}
+
+struct function_filter_data {
+ struct ftrace_ops *ops;
+ int first_filter;
+ int first_notrace;
+};
+
+#ifdef CONFIG_FUNCTION_TRACER
+static char **
+ftrace_function_filter_re(char *buf, int len, int *count)
+{
+ char *str, *sep, **re;
+
+ str = kstrndup(buf, len, GFP_KERNEL);
+ if (!str)
+ return NULL;
+
+ /*
+ * The argv_split function takes white space
+ * as a separator, so convert ',' into spaces.
+ */
+ while ((sep = strchr(str, ',')))
+ *sep = ' ';
+
+ re = argv_split(GFP_KERNEL, str, count);
+ kfree(str);
+ return re;
+}
+
+static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
+ int reset, char *re, int len)
+{
+ int ret;
+
+ if (filter)
+ ret = ftrace_set_filter(ops, re, len, reset);
+ else
+ ret = ftrace_set_notrace(ops, re, len, reset);
+
+ return ret;
+}
+
+static int __ftrace_function_set_filter(int filter, char *buf, int len,
+ struct function_filter_data *data)
+{
+ int i, re_cnt, ret = -EINVAL;
+ int *reset;
+ char **re;
+
+ reset = filter ? &data->first_filter : &data->first_notrace;
+
+ /*
+ * The 'ip' field could have multiple filters set, separated
+ * either by space or comma. We first cut the filter and apply
+ * all pieces separatelly.
+ */
+ re = ftrace_function_filter_re(buf, len, &re_cnt);
+ if (!re)
+ return -EINVAL;
+
+ for (i = 0; i < re_cnt; i++) {
+ ret = ftrace_function_set_regexp(data->ops, filter, *reset,
+ re[i], strlen(re[i]));
+ if (ret)
+ break;
+
+ if (*reset)
+ *reset = 0;
+ }
+
+ argv_free(re);
+ return ret;
+}
+
+static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
+{
+ struct ftrace_event_field *field = pred->field;
+
+ if (leaf) {
+ /*
+ * Check the leaf predicate for function trace, verify:
+ * - only '==' and '!=' is used
+ * - the 'ip' field is used
+ */
+ if ((pred->op != OP_EQ) && (pred->op != OP_NE))
+ return -EINVAL;
+
+ if (strcmp(field->name, "ip"))
+ return -EINVAL;
+ } else {
+ /*
+ * Check the non leaf predicate for function trace, verify:
+ * - only '||' is used
+ */
+ if (pred->op != OP_OR)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ftrace_function_set_filter_cb(enum move_type move,
+ struct filter_pred *pred,
+ int *err, void *data)
+{
+ /* Checking the node is valid for function trace. */
+ if ((move != MOVE_DOWN) ||
+ (pred->left != FILTER_PRED_INVALID)) {
+ *err = ftrace_function_check_pred(pred, 0);
+ } else {
+ *err = ftrace_function_check_pred(pred, 1);
+ if (*err)
+ return WALK_PRED_ABORT;
+
+ *err = __ftrace_function_set_filter(pred->op == OP_EQ,
+ pred->regex.pattern,
+ pred->regex.len,
+ data);
+ }
+
+ return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
+}
+
+static int ftrace_function_set_filter(struct perf_event *event,
+ struct event_filter *filter)
+{
+ struct function_filter_data data = {
+ .first_filter = 1,
+ .first_notrace = 1,
+ .ops = &event->ftrace_ops,
+ };
+
+ return walk_pred_tree(filter->preds, filter->root,
+ ftrace_function_set_filter_cb, &data);
+}
+#else
+static int ftrace_function_set_filter(struct perf_event *event,
+ struct event_filter *filter)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
+int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+ char *filter_str)
+{
+ int err;
+ struct event_filter *filter;
+ struct ftrace_event_call *call;
+
+ mutex_lock(&event_mutex);
+
+ call = event->tp_event;
+
+ err = -EINVAL;
+ if (!call)
+ goto out_unlock;
+
+ err = -EEXIST;
+ if (event->filter)
+ goto out_unlock;
+
+ err = create_filter(call, filter_str, false, &filter);
+ if (err)
+ goto free_filter;
+
+ if (ftrace_event_is_function(call))
+ err = ftrace_function_set_filter(event, filter);
+ else
+ event->filter = filter;
+
+free_filter:
+ if (err || ftrace_event_is_function(call))
+ __free_filter(filter);
+
+out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return err;
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_events_filter_test.h"
+
+#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
+{ \
+ .filter = FILTER, \
+ .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
+ .e = ve, .f = vf, .g = vg, .h = vh }, \
+ .match = m, \
+ .not_visited = nvisit, \
+}
+#define YES 1
+#define NO 0
+
+static struct test_filter_data_t {
+ char *filter;
+ struct ftrace_raw_ftrace_test_filter rec;
+ int match;
+ char *not_visited;
+} test_filter_data[] = {
+#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
+ "e == 1 && f == 1 && g == 1 && h == 1"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
+ DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
+ DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
+#undef FILTER
+#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
+ "e == 1 || f == 1 || g == 1 || h == 1"
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
+ DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+ DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
+#undef FILTER
+#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
+ "(e == 1 || f == 1) && (g == 1 || h == 1)"
+ DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
+ DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+ DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
+ DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
+ "(e == 1 && f == 1) || (g == 1 && h == 1)"
+ DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
+ DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
+ "(e == 1 && f == 1) || (g == 1 && h == 1)"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+ DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
+#undef FILTER
+#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
+ "(e == 1 || f == 1)) && (g == 1 || h == 1)"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
+ DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
+#undef FILTER
+#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
+ "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
+ DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+ DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
+#undef FILTER
+#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
+ "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
+ DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+ DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
+};
+
+#undef DATA_REC
+#undef FILTER
+#undef YES
+#undef NO
+
+#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
+
+static int test_pred_visited;
+
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+ struct ftrace_event_field *field = pred->field;
+
+ test_pred_visited = 1;
+ printk(KERN_INFO "\npred visited %s\n", field->name);
+ return 1;
+}
+
+static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ char *fields = data;
+
+ if ((move == MOVE_DOWN) &&
+ (pred->left == FILTER_PRED_INVALID)) {
+ struct ftrace_event_field *field = pred->field;
+
+ if (!field) {
+ WARN(1, "all leafs should have field defined");
+ return WALK_PRED_DEFAULT;
+ }
+ if (!strchr(fields, *field->name))
+ return WALK_PRED_DEFAULT;
+
+ WARN_ON(!pred->fn);
+ pred->fn = test_pred_visited_fn;
+ }
+ return WALK_PRED_DEFAULT;
+}
+
+static __init int ftrace_test_event_filter(void)
+{
+ int i;
+
+ printk(KERN_INFO "Testing ftrace filter: ");
+
+ for (i = 0; i < DATA_CNT; i++) {
+ struct event_filter *filter = NULL;
+ struct test_filter_data_t *d = &test_filter_data[i];
+ int err;
+
+ err = create_filter(&event_ftrace_test_filter, d->filter,
+ false, &filter);
+ if (err) {
+ printk(KERN_INFO
+ "Failed to get filter for '%s', err %d\n",
+ d->filter, err);
+ __free_filter(filter);
+ break;
+ }
+
+ /*
+ * The preemption disabling is not really needed for self
+ * tests, but the rcu dereference will complain without it.
+ */
+ preempt_disable();
+ if (*d->not_visited)
+ walk_pred_tree(filter->preds, filter->root,
+ test_walk_pred_cb,
+ d->not_visited);
+
+ test_pred_visited = 0;
+ err = filter_match_preds(filter, &d->rec);
+ preempt_enable();
+
+ __free_filter(filter);
+
+ if (test_pred_visited) {
+ printk(KERN_INFO
+ "Failed, unwanted pred visited for filter %s\n",
+ d->filter);
+ break;
+ }
+
+ if (err != d->match) {
+ printk(KERN_INFO
+ "Failed to match filter '%s', expected %d\n",
+ d->filter, d->match);
+ break;
+ }
+ }
+
+ if (i == DATA_CNT)
+ printk(KERN_CONT "OK\n");
+
+ return 0;
+}
+
+late_initcall(ftrace_test_event_filter);
+
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 000000000..bfd4dba0d
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM test
+
+#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TEST_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(ftrace_test_filter,
+
+ TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
+
+ TP_ARGS(a, b, c, d, e, f, g, h),
+
+ TP_STRUCT__entry(
+ __field(int, a)
+ __field(int, b)
+ __field(int, c)
+ __field(int, d)
+ __field(int, e)
+ __field(int, f)
+ __field(int, g)
+ __field(int, h)
+ ),
+
+ TP_fast_assign(
+ __entry->a = a;
+ __entry->b = b;
+ __entry->c = c;
+ __entry->d = d;
+ __entry->e = e;
+ __entry->f = f;
+ __entry->g = g;
+ __entry->h = h;
+ ),
+
+ TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
+ __entry->a, __entry->b, __entry->c, __entry->d,
+ __entry->e, __entry->f, __entry->g, __entry->h)
+);
+
+#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_events_filter_test
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
new file mode 100644
index 000000000..8712df9de
--- /dev/null
+++ b/kernel/trace/trace_events_trigger.c
@@ -0,0 +1,1437 @@
+/*
+ * trace_events_trigger - trace event triggers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include "trace.h"
+
+static LIST_HEAD(trigger_commands);
+static DEFINE_MUTEX(trigger_cmd_mutex);
+
+static void
+trigger_data_free(struct event_trigger_data *data)
+{
+ if (data->cmd_ops->set_filter)
+ data->cmd_ops->set_filter(NULL, data, NULL);
+
+ synchronize_sched(); /* make sure current triggers exit before free */
+ kfree(data);
+}
+
+/**
+ * event_triggers_call - Call triggers associated with a trace event
+ * @file: The ftrace_event_file associated with the event
+ * @rec: The trace entry for the event, NULL for unconditional invocation
+ *
+ * For each trigger associated with an event, invoke the trigger
+ * function registered with the associated trigger command. If rec is
+ * non-NULL, it means that the trigger requires further processing and
+ * shouldn't be unconditionally invoked. If rec is non-NULL and the
+ * trigger has a filter associated with it, rec will checked against
+ * the filter and if the record matches the trigger will be invoked.
+ * If the trigger is a 'post_trigger', meaning it shouldn't be invoked
+ * in any case until the current event is written, the trigger
+ * function isn't invoked but the bit associated with the deferred
+ * trigger is set in the return value.
+ *
+ * Returns an enum event_trigger_type value containing a set bit for
+ * any trigger that should be deferred, ETT_NONE if nothing to defer.
+ *
+ * Called from tracepoint handlers (with rcu_read_lock_sched() held).
+ *
+ * Return: an enum event_trigger_type value containing a set bit for
+ * any trigger that should be deferred, ETT_NONE if nothing to defer.
+ */
+enum event_trigger_type
+event_triggers_call(struct ftrace_event_file *file, void *rec)
+{
+ struct event_trigger_data *data;
+ enum event_trigger_type tt = ETT_NONE;
+ struct event_filter *filter;
+
+ if (list_empty(&file->triggers))
+ return tt;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (!rec) {
+ data->ops->func(data);
+ continue;
+ }
+ filter = rcu_dereference_sched(data->filter);
+ if (filter && !filter_match_preds(filter, rec))
+ continue;
+ if (data->cmd_ops->post_trigger) {
+ tt |= data->cmd_ops->trigger_type;
+ continue;
+ }
+ data->ops->func(data);
+ }
+ return tt;
+}
+EXPORT_SYMBOL_GPL(event_triggers_call);
+
+/**
+ * event_triggers_post_call - Call 'post_triggers' for a trace event
+ * @file: The ftrace_event_file associated with the event
+ * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
+ *
+ * For each trigger associated with an event, invoke the trigger
+ * function registered with the associated trigger command, if the
+ * corresponding bit is set in the tt enum passed into this function.
+ * See @event_triggers_call for details on how those bits are set.
+ *
+ * Called from tracepoint handlers (with rcu_read_lock_sched() held).
+ */
+void
+event_triggers_post_call(struct ftrace_event_file *file,
+ enum event_trigger_type tt)
+{
+ struct event_trigger_data *data;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->cmd_ops->trigger_type & tt)
+ data->ops->func(data);
+ }
+}
+EXPORT_SYMBOL_GPL(event_triggers_post_call);
+
+#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL)
+
+static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
+{
+ struct ftrace_event_file *event_file = event_file_data(m->private);
+
+ if (t == SHOW_AVAILABLE_TRIGGERS)
+ return NULL;
+
+ return seq_list_next(t, &event_file->triggers, pos);
+}
+
+static void *trigger_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_event_file *event_file;
+
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ event_file = event_file_data(m->private);
+ if (unlikely(!event_file))
+ return ERR_PTR(-ENODEV);
+
+ if (list_empty(&event_file->triggers))
+ return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL;
+
+ return seq_list_start(&event_file->triggers, *pos);
+}
+
+static void trigger_stop(struct seq_file *m, void *t)
+{
+ mutex_unlock(&event_mutex);
+}
+
+static int trigger_show(struct seq_file *m, void *v)
+{
+ struct event_trigger_data *data;
+ struct event_command *p;
+
+ if (v == SHOW_AVAILABLE_TRIGGERS) {
+ seq_puts(m, "# Available triggers:\n");
+ seq_putc(m, '#');
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry_reverse(p, &trigger_commands, list)
+ seq_printf(m, " %s", p->name);
+ seq_putc(m, '\n');
+ mutex_unlock(&trigger_cmd_mutex);
+ return 0;
+ }
+
+ data = list_entry(v, struct event_trigger_data, list);
+ data->ops->print(m, data->ops, data);
+
+ return 0;
+}
+
+static const struct seq_operations event_triggers_seq_ops = {
+ .start = trigger_start,
+ .next = trigger_next,
+ .stop = trigger_stop,
+ .show = trigger_show,
+};
+
+static int event_trigger_regex_open(struct inode *inode, struct file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&event_mutex);
+
+ if (unlikely(!event_file_data(file))) {
+ mutex_unlock(&event_mutex);
+ return -ENODEV;
+ }
+
+ if (file->f_mode & FMODE_READ) {
+ ret = seq_open(file, &event_triggers_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = file;
+ }
+ }
+
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
+{
+ char *command, *next = buff;
+ struct event_command *p;
+ int ret = -EINVAL;
+
+ command = strsep(&next, ": \t");
+ command = (command[0] != '!') ? command : command + 1;
+
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry(p, &trigger_commands, list) {
+ if (strcmp(p->name, command) == 0) {
+ ret = p->func(p, file, buff, command, next);
+ goto out_unlock;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&trigger_cmd_mutex);
+
+ return ret;
+}
+
+static ssize_t event_trigger_regex_write(struct file *file,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct ftrace_event_file *event_file;
+ ssize_t ret;
+ char *buf;
+
+ if (!cnt)
+ return 0;
+
+ if (cnt >= PAGE_SIZE)
+ return -EINVAL;
+
+ buf = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, ubuf, cnt)) {
+ free_page((unsigned long)buf);
+ return -EFAULT;
+ }
+ buf[cnt] = '\0';
+ strim(buf);
+
+ mutex_lock(&event_mutex);
+ event_file = event_file_data(file);
+ if (unlikely(!event_file)) {
+ mutex_unlock(&event_mutex);
+ free_page((unsigned long)buf);
+ return -ENODEV;
+ }
+ ret = trigger_process_regex(event_file, buf);
+ mutex_unlock(&event_mutex);
+
+ free_page((unsigned long)buf);
+ if (ret < 0)
+ goto out;
+
+ *ppos += cnt;
+ ret = cnt;
+ out:
+ return ret;
+}
+
+static int event_trigger_regex_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&event_mutex);
+
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+
+ mutex_unlock(&event_mutex);
+
+ return 0;
+}
+
+static ssize_t
+event_trigger_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return event_trigger_regex_write(filp, ubuf, cnt, ppos);
+}
+
+static int
+event_trigger_open(struct inode *inode, struct file *filp)
+{
+ return event_trigger_regex_open(inode, filp);
+}
+
+static int
+event_trigger_release(struct inode *inode, struct file *file)
+{
+ return event_trigger_regex_release(inode, file);
+}
+
+const struct file_operations event_trigger_fops = {
+ .open = event_trigger_open,
+ .read = seq_read,
+ .write = event_trigger_write,
+ .llseek = tracing_lseek,
+ .release = event_trigger_release,
+};
+
+/*
+ * Currently we only register event commands from __init, so mark this
+ * __init too.
+ */
+static __init int register_event_command(struct event_command *cmd)
+{
+ struct event_command *p;
+ int ret = 0;
+
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry(p, &trigger_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ list_add(&cmd->list, &trigger_commands);
+ out_unlock:
+ mutex_unlock(&trigger_cmd_mutex);
+
+ return ret;
+}
+
+/*
+ * Currently we only unregister event commands from __init, so mark
+ * this __init too.
+ */
+static __init int unregister_event_command(struct event_command *cmd)
+{
+ struct event_command *p, *n;
+ int ret = -ENODEV;
+
+ mutex_lock(&trigger_cmd_mutex);
+ list_for_each_entry_safe(p, n, &trigger_commands, list) {
+ if (strcmp(cmd->name, p->name) == 0) {
+ ret = 0;
+ list_del_init(&p->list);
+ goto out_unlock;
+ }
+ }
+ out_unlock:
+ mutex_unlock(&trigger_cmd_mutex);
+
+ return ret;
+}
+
+/**
+ * event_trigger_print - Generic event_trigger_ops @print implementation
+ * @name: The name of the event trigger
+ * @m: The seq_file being printed to
+ * @data: Trigger-specific data
+ * @filter_str: filter_str to print, if present
+ *
+ * Common implementation for event triggers to print themselves.
+ *
+ * Usually wrapped by a function that simply sets the @name of the
+ * trigger command and then invokes this.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int
+event_trigger_print(const char *name, struct seq_file *m,
+ void *data, char *filter_str)
+{
+ long count = (long)data;
+
+ seq_puts(m, name);
+
+ if (count == -1)
+ seq_puts(m, ":unlimited");
+ else
+ seq_printf(m, ":count=%ld", count);
+
+ if (filter_str)
+ seq_printf(m, " if %s\n", filter_str);
+ else
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+/**
+ * event_trigger_init - Generic event_trigger_ops @init implementation
+ * @ops: The trigger ops associated with the trigger
+ * @data: Trigger-specific data
+ *
+ * Common implementation of event trigger initialization.
+ *
+ * Usually used directly as the @init method in event trigger
+ * implementations.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int
+event_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ data->ref++;
+ return 0;
+}
+
+/**
+ * event_trigger_free - Generic event_trigger_ops @free implementation
+ * @ops: The trigger ops associated with the trigger
+ * @data: Trigger-specific data
+ *
+ * Common implementation of event trigger de-initialization.
+ *
+ * Usually used directly as the @free method in event trigger
+ * implementations.
+ */
+static void
+event_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref)
+ trigger_data_free(data);
+}
+
+static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
+ int trigger_enable)
+{
+ int ret = 0;
+
+ if (trigger_enable) {
+ if (atomic_inc_return(&file->tm_ref) > 1)
+ return ret;
+ set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ ret = trace_event_enable_disable(file, 1, 1);
+ } else {
+ if (atomic_dec_return(&file->tm_ref) > 0)
+ return ret;
+ clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+ ret = trace_event_enable_disable(file, 0, 1);
+ }
+
+ return ret;
+}
+
+/**
+ * clear_event_triggers - Clear all triggers associated with a trace array
+ * @tr: The trace array to clear
+ *
+ * For each trigger, the triggering event has its tm_ref decremented
+ * via trace_event_trigger_enable_disable(), and any associated event
+ * (in the case of enable/disable_event triggers) will have its sm_ref
+ * decremented via free()->trace_event_enable_disable(). That
+ * combination effectively reverses the soft-mode/trigger state added
+ * by trigger registration.
+ *
+ * Must be called with event_mutex held.
+ */
+void
+clear_event_triggers(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ list_for_each_entry(file, &tr->events, list) {
+ struct event_trigger_data *data;
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ trace_event_trigger_enable_disable(file, 0);
+ if (data->ops->free)
+ data->ops->free(data->ops, data);
+ }
+ }
+}
+
+/**
+ * update_cond_flag - Set or reset the TRIGGER_COND bit
+ * @file: The ftrace_event_file associated with the event
+ *
+ * If an event has triggers and any of those triggers has a filter or
+ * a post_trigger, trigger invocation needs to be deferred until after
+ * the current event has logged its data, and the event should have
+ * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
+ * cleared.
+ */
+static void update_cond_flag(struct ftrace_event_file *file)
+{
+ struct event_trigger_data *data;
+ bool set_cond = false;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->filter || data->cmd_ops->post_trigger) {
+ set_cond = true;
+ break;
+ }
+ }
+
+ if (set_cond)
+ set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+ else
+ clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+}
+
+/**
+ * register_trigger - Generic event_command @reg implementation
+ * @glob: The raw string used to register the trigger
+ * @ops: The trigger ops associated with the trigger
+ * @data: Trigger-specific data to associate with the trigger
+ * @file: The ftrace_event_file associated with the event
+ *
+ * Common implementation for event trigger registration.
+ *
+ * Usually used directly as the @reg method in event command
+ * implementations.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int register_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file)
+{
+ struct event_trigger_data *test;
+ int ret = 0;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
+ if (data->ops->init) {
+ ret = data->ops->init(data->ops, data);
+ if (ret < 0)
+ goto out;
+ }
+
+ list_add_rcu(&data->list, &file->triggers);
+ ret++;
+
+ if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ list_del_rcu(&data->list);
+ ret--;
+ }
+ update_cond_flag(file);
+out:
+ return ret;
+}
+
+/**
+ * unregister_trigger - Generic event_command @unreg implementation
+ * @glob: The raw string used to register the trigger
+ * @ops: The trigger ops associated with the trigger
+ * @test: Trigger-specific data used to find the trigger to remove
+ * @file: The ftrace_event_file associated with the event
+ *
+ * Common implementation for event trigger unregistration.
+ *
+ * Usually used directly as the @unreg method in event command
+ * implementations.
+ */
+static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct ftrace_event_file *file)
+{
+ struct event_trigger_data *data;
+ bool unregistered = false;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) {
+ unregistered = true;
+ list_del_rcu(&data->list);
+ update_cond_flag(file);
+ trace_event_trigger_enable_disable(file, 0);
+ break;
+ }
+ }
+
+ if (unregistered && data->ops->free)
+ data->ops->free(data->ops, data);
+}
+
+/**
+ * event_trigger_callback - Generic event_command @func implementation
+ * @cmd_ops: The command ops, used for trigger registration
+ * @file: The ftrace_event_file associated with the event
+ * @glob: The raw string used to register the trigger
+ * @cmd: The cmd portion of the string used to register the trigger
+ * @param: The params portion of the string used to register the trigger
+ *
+ * Common implementation for event command parsing and trigger
+ * instantiation.
+ *
+ * Usually used directly as the @func method in event command
+ * implementations.
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int
+event_trigger_callback(struct event_command *cmd_ops,
+ struct ftrace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ struct event_trigger_data *trigger_data;
+ struct event_trigger_ops *trigger_ops;
+ char *trigger = NULL;
+ char *number;
+ int ret;
+
+ /* separate the trigger from the filter (t:n [if filter]) */
+ if (param && isdigit(param[0]))
+ trigger = strsep(&param, " \t");
+
+ trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+ ret = -ENOMEM;
+ trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ if (!trigger_data)
+ goto out;
+
+ trigger_data->count = -1;
+ trigger_data->ops = trigger_ops;
+ trigger_data->cmd_ops = cmd_ops;
+ INIT_LIST_HEAD(&trigger_data->list);
+
+ if (glob[0] == '!') {
+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ kfree(trigger_data);
+ ret = 0;
+ goto out;
+ }
+
+ if (trigger) {
+ number = strsep(&trigger, ":");
+
+ ret = -EINVAL;
+ if (!strlen(number))
+ goto out_free;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &trigger_data->count);
+ if (ret)
+ goto out_free;
+ }
+
+ if (!param) /* if param is non-empty, it's supposed to be a filter */
+ goto out_reg;
+
+ if (!cmd_ops->set_filter)
+ goto out_reg;
+
+ ret = cmd_ops->set_filter(param, trigger_data, file);
+ if (ret < 0)
+ goto out_free;
+
+ out_reg:
+ ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ /*
+ * The above returns on success the # of functions enabled,
+ * but if it didn't find any functions it returns zero.
+ * Consider no functions a failure too.
+ */
+ if (!ret) {
+ ret = -ENOENT;
+ goto out_free;
+ } else if (ret < 0)
+ goto out_free;
+ ret = 0;
+ out:
+ return ret;
+
+ out_free:
+ if (cmd_ops->set_filter)
+ cmd_ops->set_filter(NULL, trigger_data, NULL);
+ kfree(trigger_data);
+ goto out;
+}
+
+/**
+ * set_trigger_filter - Generic event_command @set_filter implementation
+ * @filter_str: The filter string for the trigger, NULL to remove filter
+ * @trigger_data: Trigger-specific data
+ * @file: The ftrace_event_file associated with the event
+ *
+ * Common implementation for event command filter parsing and filter
+ * instantiation.
+ *
+ * Usually used directly as the @set_filter method in event command
+ * implementations.
+ *
+ * Also used to remove a filter (if filter_str = NULL).
+ *
+ * Return: 0 on success, errno otherwise
+ */
+static int set_trigger_filter(char *filter_str,
+ struct event_trigger_data *trigger_data,
+ struct ftrace_event_file *file)
+{
+ struct event_trigger_data *data = trigger_data;
+ struct event_filter *filter = NULL, *tmp;
+ int ret = -EINVAL;
+ char *s;
+
+ if (!filter_str) /* clear the current filter */
+ goto assign;
+
+ s = strsep(&filter_str, " \t");
+
+ if (!strlen(s) || strcmp(s, "if") != 0)
+ goto out;
+
+ if (!filter_str)
+ goto out;
+
+ /* The filter is for the 'trigger' event, not the triggered event */
+ ret = create_event_filter(file->event_call, filter_str, false, &filter);
+ if (ret)
+ goto out;
+ assign:
+ tmp = rcu_access_pointer(data->filter);
+
+ rcu_assign_pointer(data->filter, filter);
+
+ if (tmp) {
+ /* Make sure the call is done with the filter */
+ synchronize_sched();
+ free_event_filter(tmp);
+ }
+
+ kfree(data->filter_str);
+ data->filter_str = NULL;
+
+ if (filter_str) {
+ data->filter_str = kstrdup(filter_str, GFP_KERNEL);
+ if (!data->filter_str) {
+ free_event_filter(rcu_access_pointer(data->filter));
+ data->filter = NULL;
+ ret = -ENOMEM;
+ }
+ }
+ out:
+ return ret;
+}
+
+static void
+traceon_trigger(struct event_trigger_data *data)
+{
+ if (tracing_is_on())
+ return;
+
+ tracing_on();
+}
+
+static void
+traceon_count_trigger(struct event_trigger_data *data)
+{
+ if (tracing_is_on())
+ return;
+
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ tracing_on();
+}
+
+static void
+traceoff_trigger(struct event_trigger_data *data)
+{
+ if (!tracing_is_on())
+ return;
+
+ tracing_off();
+}
+
+static void
+traceoff_count_trigger(struct event_trigger_data *data)
+{
+ if (!tracing_is_on())
+ return;
+
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ tracing_off();
+}
+
+static int
+traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("traceon", m, (void *)data->count,
+ data->filter_str);
+}
+
+static int
+traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("traceoff", m, (void *)data->count,
+ data->filter_str);
+}
+
+static struct event_trigger_ops traceon_trigger_ops = {
+ .func = traceon_trigger,
+ .print = traceon_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops traceon_count_trigger_ops = {
+ .func = traceon_count_trigger,
+ .print = traceon_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops traceoff_trigger_ops = {
+ .func = traceoff_trigger,
+ .print = traceoff_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops traceoff_count_trigger_ops = {
+ .func = traceoff_count_trigger,
+ .print = traceoff_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops *
+onoff_get_trigger_ops(char *cmd, char *param)
+{
+ struct event_trigger_ops *ops;
+
+ /* we register both traceon and traceoff to this callback */
+ if (strcmp(cmd, "traceon") == 0)
+ ops = param ? &traceon_count_trigger_ops :
+ &traceon_trigger_ops;
+ else
+ ops = param ? &traceoff_count_trigger_ops :
+ &traceoff_trigger_ops;
+
+ return ops;
+}
+
+static struct event_command trigger_traceon_cmd = {
+ .name = "traceon",
+ .trigger_type = ETT_TRACE_ONOFF,
+ .func = event_trigger_callback,
+ .reg = register_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = onoff_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static struct event_command trigger_traceoff_cmd = {
+ .name = "traceoff",
+ .trigger_type = ETT_TRACE_ONOFF,
+ .func = event_trigger_callback,
+ .reg = register_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = onoff_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+static void
+snapshot_trigger(struct event_trigger_data *data)
+{
+ tracing_snapshot();
+}
+
+static void
+snapshot_count_trigger(struct event_trigger_data *data)
+{
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ snapshot_trigger(data);
+}
+
+static int
+register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file)
+{
+ int ret = register_trigger(glob, ops, data, file);
+
+ if (ret > 0 && tracing_alloc_snapshot() != 0) {
+ unregister_trigger(glob, ops, data, file);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int
+snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("snapshot", m, (void *)data->count,
+ data->filter_str);
+}
+
+static struct event_trigger_ops snapshot_trigger_ops = {
+ .func = snapshot_trigger,
+ .print = snapshot_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops snapshot_count_trigger_ops = {
+ .func = snapshot_count_trigger,
+ .print = snapshot_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops *
+snapshot_get_trigger_ops(char *cmd, char *param)
+{
+ return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
+}
+
+static struct event_command trigger_snapshot_cmd = {
+ .name = "snapshot",
+ .trigger_type = ETT_SNAPSHOT,
+ .func = event_trigger_callback,
+ .reg = register_snapshot_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = snapshot_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static __init int register_trigger_snapshot_cmd(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_snapshot_cmd);
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+#else
+static __init int register_trigger_snapshot_cmd(void) { return 0; }
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+#ifdef CONFIG_STACKTRACE
+/*
+ * Skip 3:
+ * stacktrace_trigger()
+ * event_triggers_post_call()
+ * ftrace_raw_event_xxx()
+ */
+#define STACK_SKIP 3
+
+static void
+stacktrace_trigger(struct event_trigger_data *data)
+{
+ trace_dump_stack(STACK_SKIP);
+}
+
+static void
+stacktrace_count_trigger(struct event_trigger_data *data)
+{
+ if (!data->count)
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ stacktrace_trigger(data);
+}
+
+static int
+stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ return event_trigger_print("stacktrace", m, (void *)data->count,
+ data->filter_str);
+}
+
+static struct event_trigger_ops stacktrace_trigger_ops = {
+ .func = stacktrace_trigger,
+ .print = stacktrace_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops stacktrace_count_trigger_ops = {
+ .func = stacktrace_count_trigger,
+ .print = stacktrace_trigger_print,
+ .init = event_trigger_init,
+ .free = event_trigger_free,
+};
+
+static struct event_trigger_ops *
+stacktrace_get_trigger_ops(char *cmd, char *param)
+{
+ return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
+}
+
+static struct event_command trigger_stacktrace_cmd = {
+ .name = "stacktrace",
+ .trigger_type = ETT_STACKTRACE,
+ .post_trigger = true,
+ .func = event_trigger_callback,
+ .reg = register_trigger,
+ .unreg = unregister_trigger,
+ .get_trigger_ops = stacktrace_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static __init int register_trigger_stacktrace_cmd(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_stacktrace_cmd);
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+#else
+static __init int register_trigger_stacktrace_cmd(void) { return 0; }
+#endif /* CONFIG_STACKTRACE */
+
+static __init void unregister_trigger_traceon_traceoff_cmds(void)
+{
+ unregister_event_command(&trigger_traceon_cmd);
+ unregister_event_command(&trigger_traceoff_cmd);
+}
+
+/* Avoid typos */
+#define ENABLE_EVENT_STR "enable_event"
+#define DISABLE_EVENT_STR "disable_event"
+
+struct enable_trigger_data {
+ struct ftrace_event_file *file;
+ bool enable;
+};
+
+static void
+event_enable_trigger(struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ if (enable_data->enable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+ else
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+}
+
+static void
+event_enable_count_trigger(struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ if (!data->count)
+ return;
+
+ /* Skip if the event is in a state we want to switch to */
+ if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ event_enable_trigger(data);
+}
+
+static int
+event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ seq_printf(m, "%s:%s:%s",
+ enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+ enable_data->file->event_call->class->system,
+ ftrace_event_name(enable_data->file->event_call));
+
+ if (data->count == -1)
+ seq_puts(m, ":unlimited");
+ else
+ seq_printf(m, ":count=%ld", data->count);
+
+ if (data->filter_str)
+ seq_printf(m, " if %s\n", data->filter_str);
+ else
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static void
+event_enable_trigger_free(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref) {
+ /* Remove the SOFT_MODE flag */
+ trace_event_enable_disable(enable_data->file, 0, 1);
+ module_put(enable_data->file->event_call->mod);
+ trigger_data_free(data);
+ kfree(enable_data);
+ }
+}
+
+static struct event_trigger_ops event_enable_trigger_ops = {
+ .func = event_enable_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops event_enable_count_trigger_ops = {
+ .func = event_enable_count_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops event_disable_trigger_ops = {
+ .func = event_enable_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static struct event_trigger_ops event_disable_count_trigger_ops = {
+ .func = event_enable_count_trigger,
+ .print = event_enable_trigger_print,
+ .init = event_trigger_init,
+ .free = event_enable_trigger_free,
+};
+
+static int
+event_enable_trigger_func(struct event_command *cmd_ops,
+ struct ftrace_event_file *file,
+ char *glob, char *cmd, char *param)
+{
+ struct ftrace_event_file *event_enable_file;
+ struct enable_trigger_data *enable_data;
+ struct event_trigger_data *trigger_data;
+ struct event_trigger_ops *trigger_ops;
+ struct trace_array *tr = file->tr;
+ const char *system;
+ const char *event;
+ char *trigger;
+ char *number;
+ bool enable;
+ int ret;
+
+ if (!param)
+ return -EINVAL;
+
+ /* separate the trigger from the filter (s:e:n [if filter]) */
+ trigger = strsep(&param, " \t");
+ if (!trigger)
+ return -EINVAL;
+
+ system = strsep(&trigger, ":");
+ if (!trigger)
+ return -EINVAL;
+
+ event = strsep(&trigger, ":");
+
+ ret = -EINVAL;
+ event_enable_file = find_event_file(tr, system, event);
+ if (!event_enable_file)
+ goto out;
+
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+ trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+ ret = -ENOMEM;
+ trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+ if (!trigger_data)
+ goto out;
+
+ enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
+ if (!enable_data) {
+ kfree(trigger_data);
+ goto out;
+ }
+
+ trigger_data->count = -1;
+ trigger_data->ops = trigger_ops;
+ trigger_data->cmd_ops = cmd_ops;
+ INIT_LIST_HEAD(&trigger_data->list);
+ RCU_INIT_POINTER(trigger_data->filter, NULL);
+
+ enable_data->enable = enable;
+ enable_data->file = event_enable_file;
+ trigger_data->private_data = enable_data;
+
+ if (glob[0] == '!') {
+ cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+ kfree(trigger_data);
+ kfree(enable_data);
+ ret = 0;
+ goto out;
+ }
+
+ if (trigger) {
+ number = strsep(&trigger, ":");
+
+ ret = -EINVAL;
+ if (!strlen(number))
+ goto out_free;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &trigger_data->count);
+ if (ret)
+ goto out_free;
+ }
+
+ if (!param) /* if param is non-empty, it's supposed to be a filter */
+ goto out_reg;
+
+ if (!cmd_ops->set_filter)
+ goto out_reg;
+
+ ret = cmd_ops->set_filter(param, trigger_data, file);
+ if (ret < 0)
+ goto out_free;
+
+ out_reg:
+ /* Don't let event modules unload while probe registered */
+ ret = try_module_get(event_enable_file->event_call->mod);
+ if (!ret) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
+ ret = trace_event_enable_disable(event_enable_file, 1, 1);
+ if (ret < 0)
+ goto out_put;
+ ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+ /*
+ * The above returns on success the # of functions enabled,
+ * but if it didn't find any functions it returns zero.
+ * Consider no functions a failure too.
+ */
+ if (!ret) {
+ ret = -ENOENT;
+ goto out_disable;
+ } else if (ret < 0)
+ goto out_disable;
+ /* Just return zero, not the number of enabled functions */
+ ret = 0;
+ out:
+ return ret;
+
+ out_disable:
+ trace_event_enable_disable(event_enable_file, 0, 1);
+ out_put:
+ module_put(event_enable_file->event_call->mod);
+ out_free:
+ if (cmd_ops->set_filter)
+ cmd_ops->set_filter(NULL, trigger_data, NULL);
+ kfree(trigger_data);
+ kfree(enable_data);
+ goto out;
+}
+
+static int event_enable_register_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *data,
+ struct ftrace_event_file *file)
+{
+ struct enable_trigger_data *enable_data = data->private_data;
+ struct enable_trigger_data *test_enable_data;
+ struct event_trigger_data *test;
+ int ret = 0;
+
+ list_for_each_entry_rcu(test, &file->triggers, list) {
+ test_enable_data = test->private_data;
+ if (test_enable_data &&
+ (test_enable_data->file == enable_data->file)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
+ if (data->ops->init) {
+ ret = data->ops->init(data->ops, data);
+ if (ret < 0)
+ goto out;
+ }
+
+ list_add_rcu(&data->list, &file->triggers);
+ ret++;
+
+ if (trace_event_trigger_enable_disable(file, 1) < 0) {
+ list_del_rcu(&data->list);
+ ret--;
+ }
+ update_cond_flag(file);
+out:
+ return ret;
+}
+
+static void event_enable_unregister_trigger(char *glob,
+ struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct ftrace_event_file *file)
+{
+ struct enable_trigger_data *test_enable_data = test->private_data;
+ struct enable_trigger_data *enable_data;
+ struct event_trigger_data *data;
+ bool unregistered = false;
+
+ list_for_each_entry_rcu(data, &file->triggers, list) {
+ enable_data = data->private_data;
+ if (enable_data &&
+ (enable_data->file == test_enable_data->file)) {
+ unregistered = true;
+ list_del_rcu(&data->list);
+ update_cond_flag(file);
+ trace_event_trigger_enable_disable(file, 0);
+ break;
+ }
+ }
+
+ if (unregistered && data->ops->free)
+ data->ops->free(data->ops, data);
+}
+
+static struct event_trigger_ops *
+event_enable_get_trigger_ops(char *cmd, char *param)
+{
+ struct event_trigger_ops *ops;
+ bool enable;
+
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+ if (enable)
+ ops = param ? &event_enable_count_trigger_ops :
+ &event_enable_trigger_ops;
+ else
+ ops = param ? &event_disable_count_trigger_ops :
+ &event_disable_trigger_ops;
+
+ return ops;
+}
+
+static struct event_command trigger_enable_cmd = {
+ .name = ENABLE_EVENT_STR,
+ .trigger_type = ETT_EVENT_ENABLE,
+ .func = event_enable_trigger_func,
+ .reg = event_enable_register_trigger,
+ .unreg = event_enable_unregister_trigger,
+ .get_trigger_ops = event_enable_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static struct event_command trigger_disable_cmd = {
+ .name = DISABLE_EVENT_STR,
+ .trigger_type = ETT_EVENT_ENABLE,
+ .func = event_enable_trigger_func,
+ .reg = event_enable_register_trigger,
+ .unreg = event_enable_unregister_trigger,
+ .get_trigger_ops = event_enable_get_trigger_ops,
+ .set_filter = set_trigger_filter,
+};
+
+static __init void unregister_trigger_enable_disable_cmds(void)
+{
+ unregister_event_command(&trigger_enable_cmd);
+ unregister_event_command(&trigger_disable_cmd);
+}
+
+static __init int register_trigger_enable_disable_cmds(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_enable_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_event_command(&trigger_disable_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_trigger_enable_disable_cmds();
+
+ return ret;
+}
+
+static __init int register_trigger_traceon_traceoff_cmds(void)
+{
+ int ret;
+
+ ret = register_event_command(&trigger_traceon_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_event_command(&trigger_traceoff_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_trigger_traceon_traceoff_cmds();
+
+ return ret;
+}
+
+__init int register_trigger_cmds(void)
+{
+ register_trigger_traceon_traceoff_cmds();
+ register_trigger_snapshot_cmd();
+ register_trigger_stacktrace_cmd();
+ register_trigger_enable_disable_cmds();
+
+ return 0;
+}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
new file mode 100644
index 000000000..174a6a711
--- /dev/null
+++ b/kernel/trace/trace_export.c
@@ -0,0 +1,195 @@
+/*
+ * trace_export.c - export basic ftrace utilities to user space
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/stringify.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "trace_output.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ftrace
+
+/*
+ * The FTRACE_ENTRY_REG macro allows ftrace entry to define register
+ * function and thus become accesible via perf.
+ */
+#undef FTRACE_ENTRY_REG
+#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
+ filter, regfn) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
+/* not needed for this file */
+#undef __field_struct
+#define __field_struct(type, item)
+
+#undef __field
+#define __field(type, item) type item;
+
+#undef __field_desc
+#define __field_desc(type, container, item) type item;
+
+#undef __array
+#define __array(type, item, size) type item[size];
+
+#undef __array_desc
+#define __array_desc(type, container, item, size) type item[size];
+
+#undef __dynamic_array
+#define __dynamic_array(type, item) type item[];
+
+#undef F_STRUCT
+#define F_STRUCT(args...) args
+
+#undef F_printk
+#define F_printk(fmt, args...) fmt, args
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+struct ____ftrace_##name { \
+ tstruct \
+}; \
+static void __always_unused ____ftrace_check_##name(void) \
+{ \
+ struct ____ftrace_##name *__entry = NULL; \
+ \
+ /* force compile-time check on F_printk() */ \
+ printk(print); \
+}
+
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
+ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+ filter)
+
+#include "trace_entries.h"
+
+#undef __field
+#define __field(type, item) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), \
+ is_signed_type(type), filter_type); \
+ if (ret) \
+ return ret;
+
+#undef __field_desc
+#define __field_desc(type, container, item) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), \
+ container.item), \
+ sizeof(field.container.item), \
+ is_signed_type(type), filter_type); \
+ if (ret) \
+ return ret;
+
+#undef __array
+#define __array(type, item, len) \
+ do { \
+ char *type_str = #type"["__stringify(len)"]"; \
+ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
+ ret = trace_define_field(event_call, type_str, #item, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), \
+ is_signed_type(type), filter_type); \
+ if (ret) \
+ return ret; \
+ } while (0);
+
+#undef __array_desc
+#define __array_desc(type, container, item, len) \
+ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
+ ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+ offsetof(typeof(field), \
+ container.item), \
+ sizeof(field.container.item), \
+ is_signed_type(type), filter_type); \
+ if (ret) \
+ return ret;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item) \
+ ret = trace_define_field(event_call, #type, #item, \
+ offsetof(typeof(field), item), \
+ 0, is_signed_type(type), filter_type);\
+ if (ret) \
+ return ret;
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
+static int __init \
+ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
+{ \
+ struct struct_name field; \
+ int ret; \
+ int filter_type = filter; \
+ \
+ tstruct; \
+ \
+ return ret; \
+}
+
+#include "trace_entries.h"
+
+#undef __entry
+#define __entry REC
+
+#undef __field
+#define __field(type, item)
+
+#undef __field_desc
+#define __field_desc(type, container, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __array_desc
+#define __array_desc(type, container, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item)
+
+#undef F_printk
+#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
+
+#undef FTRACE_ENTRY_REG
+#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
+ regfn) \
+ \
+struct ftrace_event_class __refdata event_class_ftrace_##call = { \
+ .system = __stringify(TRACE_SYSTEM), \
+ .define_fields = ftrace_define_fields_##call, \
+ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
+ .reg = regfn, \
+}; \
+ \
+struct ftrace_event_call __used event_##call = { \
+ .class = &event_class_ftrace_##call, \
+ { \
+ .name = #call, \
+ }, \
+ .event.type = etype, \
+ .print_fmt = print, \
+ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \
+}; \
+struct ftrace_event_call __used \
+__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
+ FTRACE_ENTRY_REG(call, struct_name, etype, \
+ PARAMS(tstruct), PARAMS(print), filter, NULL)
+
+int ftrace_event_is_function(struct ftrace_event_call *call)
+{
+ return call == &event_function;
+}
+
+#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 000000000..fcd41a166
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,689 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 Nadia Yvette Chambers
+ */
+#include <linux/ring_buffer.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static void tracing_start_function_trace(struct trace_array *tr);
+static void tracing_stop_function_trace(struct trace_array *tr);
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs);
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs);
+static struct tracer_flags func_flags;
+
+/* Our option */
+enum {
+ TRACE_FUNC_OPT_STACK = 0x1,
+};
+
+static int allocate_ftrace_ops(struct trace_array *tr)
+{
+ struct ftrace_ops *ops;
+
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ /* Currently only the non stack verision is supported */
+ ops->func = function_trace_call;
+ ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+
+ tr->ops = ops;
+ ops->private = tr;
+ return 0;
+}
+
+
+int ftrace_create_function_files(struct trace_array *tr,
+ struct dentry *parent)
+{
+ int ret;
+
+ /*
+ * The top level array uses the "global_ops", and the files are
+ * created on boot up.
+ */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return 0;
+
+ ret = allocate_ftrace_ops(tr);
+ if (ret)
+ return ret;
+
+ ftrace_create_filter_files(tr->ops, parent);
+
+ return 0;
+}
+
+void ftrace_destroy_function_files(struct trace_array *tr)
+{
+ ftrace_destroy_filter_files(tr->ops);
+ kfree(tr->ops);
+ tr->ops = NULL;
+}
+
+static int function_trace_init(struct trace_array *tr)
+{
+ ftrace_func_t func;
+
+ /*
+ * Instance trace_arrays get their ops allocated
+ * at instance creation. Unless it failed
+ * the allocation.
+ */
+ if (!tr->ops)
+ return -ENOMEM;
+
+ /* Currently only the global instance can do stack tracing */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
+ func_flags.val & TRACE_FUNC_OPT_STACK)
+ func = function_stack_trace_call;
+ else
+ func = function_trace_call;
+
+ ftrace_init_array_ops(tr, func);
+
+ tr->trace_buffer.cpu = get_cpu();
+ put_cpu();
+
+ tracing_start_cmdline_record();
+ tracing_start_function_trace(tr);
+ return 0;
+}
+
+static void function_trace_reset(struct trace_array *tr)
+{
+ tracing_stop_function_trace(tr);
+ tracing_stop_cmdline_record();
+ ftrace_reset_array_ops(tr);
+}
+
+static void function_trace_start(struct trace_array *tr)
+{
+ tracing_reset_online_cpus(&tr->trace_buffer);
+}
+
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+{
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int bit;
+ int cpu;
+ int pc;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ pc = preempt_count();
+ preempt_disable_notrace();
+
+ bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ if (bit < 0)
+ goto out;
+
+ cpu = smp_processor_id();
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ if (!atomic_read(&data->disabled)) {
+ local_save_flags(flags);
+ trace_function(tr, ip, parent_ip, flags, pc);
+ }
+ trace_clear_recursion(bit);
+
+ out:
+ preempt_enable_notrace();
+}
+
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+{
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ /*
+ * Need to use raw, since this must be called before the
+ * recursive protection is performed.
+ */
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ trace_function(tr, ip, parent_ip, flags, pc);
+ /*
+ * skip over 5 funcs:
+ * __ftrace_trace_stack,
+ * __trace_stack,
+ * function_stack_trace_call
+ * ftrace_list_func
+ * ftrace_call
+ */
+ __trace_stack(tr, flags, 5, pc);
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+static struct tracer_opt func_opts[] = {
+#ifdef CONFIG_STACKTRACE
+ { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
+#endif
+ { } /* Always set a last empty entry */
+};
+
+static struct tracer_flags func_flags = {
+ .val = 0, /* By default: all flags disabled */
+ .opts = func_opts
+};
+
+static void tracing_start_function_trace(struct trace_array *tr)
+{
+ tr->function_enabled = 0;
+ register_ftrace_function(tr->ops);
+ tr->function_enabled = 1;
+}
+
+static void tracing_stop_function_trace(struct trace_array *tr)
+{
+ tr->function_enabled = 0;
+ unregister_ftrace_function(tr->ops);
+}
+
+static int
+func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ switch (bit) {
+ case TRACE_FUNC_OPT_STACK:
+ /* do nothing if already set */
+ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
+ break;
+
+ unregister_ftrace_function(tr->ops);
+
+ if (set) {
+ tr->ops->func = function_stack_trace_call;
+ register_ftrace_function(tr->ops);
+ } else {
+ tr->ops->func = function_trace_call;
+ register_ftrace_function(tr->ops);
+ }
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct tracer function_trace __tracer_data =
+{
+ .name = "function",
+ .init = function_trace_init,
+ .reset = function_trace_reset,
+ .start = function_trace_start,
+ .flags = &func_flags,
+ .set_flag = func_set_flag,
+ .allow_instances = true,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_function,
+#endif
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void update_traceon_count(void **data, bool on)
+{
+ long *count = (long *)data;
+ long old_count = *count;
+
+ /*
+ * Tracing gets disabled (or enabled) once per count.
+ * This function can be called at the same time on multiple CPUs.
+ * It is fine if both disable (or enable) tracing, as disabling
+ * (or enabling) the second time doesn't do anything as the
+ * state of the tracer is already disabled (or enabled).
+ * What needs to be synchronized in this case is that the count
+ * only gets decremented once, even if the tracer is disabled
+ * (or enabled) twice, as the second one is really a nop.
+ *
+ * The memory barriers guarantee that we only decrement the
+ * counter once. First the count is read to a local variable
+ * and a read barrier is used to make sure that it is loaded
+ * before checking if the tracer is in the state we want.
+ * If the tracer is not in the state we want, then the count
+ * is guaranteed to be the old count.
+ *
+ * Next the tracer is set to the state we want (disabled or enabled)
+ * then a write memory barrier is used to make sure that
+ * the new state is visible before changing the counter by
+ * one minus the old counter. This guarantees that another CPU
+ * executing this code will see the new state before seeing
+ * the new counter value, and would not do anything if the new
+ * counter is seen.
+ *
+ * Note, there is no synchronization between this and a user
+ * setting the tracing_on file. But we currently don't care
+ * about that.
+ */
+ if (!old_count)
+ return;
+
+ /* Make sure we see count before checking tracing state */
+ smp_rmb();
+
+ if (on == !!tracing_is_on())
+ return;
+
+ if (on)
+ tracing_on();
+ else
+ tracing_off();
+
+ /* unlimited? */
+ if (old_count == -1)
+ return;
+
+ /* Make sure tracing state is visible before updating count */
+ smp_wmb();
+
+ *count = old_count - 1;
+}
+
+static void
+ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ update_traceon_count(data, 1);
+}
+
+static void
+ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ update_traceon_count(data, 0);
+}
+
+static void
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (tracing_is_on())
+ return;
+
+ tracing_on();
+}
+
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (!tracing_is_on())
+ return;
+
+ tracing_off();
+}
+
+/*
+ * Skip 4:
+ * ftrace_stacktrace()
+ * function_trace_probe_call()
+ * ftrace_ops_list_func()
+ * ftrace_call()
+ */
+#define STACK_SKIP 4
+
+static void
+ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ trace_dump_stack(STACK_SKIP);
+}
+
+static void
+ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ long *count = (long *)data;
+ long old_count;
+ long new_count;
+
+ /*
+ * Stack traces should only execute the number of times the
+ * user specified in the counter.
+ */
+ do {
+
+ if (!tracing_is_on())
+ return;
+
+ old_count = *count;
+
+ if (!old_count)
+ return;
+
+ /* unlimited? */
+ if (old_count == -1) {
+ trace_dump_stack(STACK_SKIP);
+ return;
+ }
+
+ new_count = old_count - 1;
+ new_count = cmpxchg(count, old_count, new_count);
+ if (new_count == old_count)
+ trace_dump_stack(STACK_SKIP);
+
+ } while (new_count != old_count);
+}
+
+static int update_count(void **data)
+{
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return 0;
+
+ if (*count != -1)
+ (*count)--;
+
+ return 1;
+}
+
+static void
+ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (update_count(data))
+ ftrace_dump(DUMP_ALL);
+}
+
+/* Only dump the current CPU buffer. */
+static void
+ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (update_count(data))
+ ftrace_dump(DUMP_ORIG);
+}
+
+static int
+ftrace_probe_print(const char *name, struct seq_file *m,
+ unsigned long ip, void *data)
+{
+ long count = (long)data;
+
+ seq_printf(m, "%ps:%s", (void *)ip, name);
+
+ if (count == -1)
+ seq_puts(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", count);
+
+ return 0;
+}
+
+static int
+ftrace_traceon_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("traceon", m, ip, data);
+}
+
+static int
+ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("traceoff", m, ip, data);
+}
+
+static int
+ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("stacktrace", m, ip, data);
+}
+
+static int
+ftrace_dump_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("dump", m, ip, data);
+}
+
+static int
+ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("cpudump", m, ip, data);
+}
+
+static struct ftrace_probe_ops traceon_count_probe_ops = {
+ .func = ftrace_traceon_count,
+ .print = ftrace_traceon_print,
+};
+
+static struct ftrace_probe_ops traceoff_count_probe_ops = {
+ .func = ftrace_traceoff_count,
+ .print = ftrace_traceoff_print,
+};
+
+static struct ftrace_probe_ops stacktrace_count_probe_ops = {
+ .func = ftrace_stacktrace_count,
+ .print = ftrace_stacktrace_print,
+};
+
+static struct ftrace_probe_ops dump_probe_ops = {
+ .func = ftrace_dump_probe,
+ .print = ftrace_dump_print,
+};
+
+static struct ftrace_probe_ops cpudump_probe_ops = {
+ .func = ftrace_cpudump_probe,
+ .print = ftrace_cpudump_print,
+};
+
+static struct ftrace_probe_ops traceon_probe_ops = {
+ .func = ftrace_traceon,
+ .print = ftrace_traceon_print,
+};
+
+static struct ftrace_probe_ops traceoff_probe_ops = {
+ .func = ftrace_traceoff,
+ .print = ftrace_traceoff_print,
+};
+
+static struct ftrace_probe_ops stacktrace_probe_ops = {
+ .func = ftrace_stacktrace,
+ .print = ftrace_stacktrace_print,
+};
+
+static int
+ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
+ struct ftrace_hash *hash, char *glob,
+ char *cmd, char *param, int enable)
+{
+ void *count = (void *)-1;
+ char *number;
+ int ret;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enable)
+ return -EINVAL;
+
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ return 0;
+ }
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ goto out_reg;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, (unsigned long *)&count);
+ if (ret)
+ return ret;
+
+ out_reg:
+ ret = register_ftrace_function_probe(glob, ops, count);
+
+ return ret < 0 ? ret : 0;
+}
+
+static int
+ftrace_trace_onoff_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ /* we register both traceon and traceoff to this callback */
+ if (strcmp(cmd, "traceon") == 0)
+ ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
+ else
+ ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
+
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ param, enable);
+}
+
+static int
+ftrace_stacktrace_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
+
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ param, enable);
+}
+
+static int
+ftrace_dump_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ ops = &dump_probe_ops;
+
+ /* Only dump once. */
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ "1", enable);
+}
+
+static int
+ftrace_cpudump_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ ops = &cpudump_probe_ops;
+
+ /* Only dump once. */
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ "1", enable);
+}
+
+static struct ftrace_func_command ftrace_traceon_cmd = {
+ .name = "traceon",
+ .func = ftrace_trace_onoff_callback,
+};
+
+static struct ftrace_func_command ftrace_traceoff_cmd = {
+ .name = "traceoff",
+ .func = ftrace_trace_onoff_callback,
+};
+
+static struct ftrace_func_command ftrace_stacktrace_cmd = {
+ .name = "stacktrace",
+ .func = ftrace_stacktrace_callback,
+};
+
+static struct ftrace_func_command ftrace_dump_cmd = {
+ .name = "dump",
+ .func = ftrace_dump_callback,
+};
+
+static struct ftrace_func_command ftrace_cpudump_cmd = {
+ .name = "cpudump",
+ .func = ftrace_cpudump_callback,
+};
+
+static int __init init_func_cmd_traceon(void)
+{
+ int ret;
+
+ ret = register_ftrace_command(&ftrace_traceoff_cmd);
+ if (ret)
+ return ret;
+
+ ret = register_ftrace_command(&ftrace_traceon_cmd);
+ if (ret)
+ goto out_free_traceoff;
+
+ ret = register_ftrace_command(&ftrace_stacktrace_cmd);
+ if (ret)
+ goto out_free_traceon;
+
+ ret = register_ftrace_command(&ftrace_dump_cmd);
+ if (ret)
+ goto out_free_stacktrace;
+
+ ret = register_ftrace_command(&ftrace_cpudump_cmd);
+ if (ret)
+ goto out_free_dump;
+
+ return 0;
+
+ out_free_dump:
+ unregister_ftrace_command(&ftrace_dump_cmd);
+ out_free_stacktrace:
+ unregister_ftrace_command(&ftrace_stacktrace_cmd);
+ out_free_traceon:
+ unregister_ftrace_command(&ftrace_traceon_cmd);
+ out_free_traceoff:
+ unregister_ftrace_command(&ftrace_traceoff_cmd);
+
+ return ret;
+}
+#else
+static inline int init_func_cmd_traceon(void)
+{
+ return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+static __init int init_function_trace(void)
+{
+ init_func_cmd_traceon();
+ return register_tracer(&function_trace);
+}
+core_initcall(init_function_trace);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 000000000..a51e79688
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,1470 @@
+/*
+ *
+ * Function graph tracer.
+ * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+static bool kill_ftrace_graph;
+
+/**
+ * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
+ *
+ * ftrace_graph_stop() is called when a severe error is detected in
+ * the function graph tracing. This function is called by the critical
+ * paths of function graph to keep those paths from doing any more harm.
+ */
+bool ftrace_graph_is_dead(void)
+{
+ return kill_ftrace_graph;
+}
+
+/**
+ * ftrace_graph_stop - set to permanently disable function graph tracincg
+ *
+ * In case of an error int function graph tracing, this is called
+ * to try to keep function graph tracing from causing any more harm.
+ * Usually this is pretty severe and this is called to try to at least
+ * get a warning out to the user.
+ */
+void ftrace_graph_stop(void)
+{
+ kill_ftrace_graph = true;
+}
+
+/* When set, irq functions will be ignored */
+static int ftrace_graph_skip_irqs;
+
+struct fgraph_cpu_data {
+ pid_t last_pid;
+ int depth;
+ int depth_irq;
+ int ignore;
+ unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
+};
+
+struct fgraph_data {
+ struct fgraph_cpu_data __percpu *cpu_data;
+
+ /* Place to preserve last processed entry. */
+ struct ftrace_graph_ent_entry ent;
+ struct ftrace_graph_ret_entry ret;
+ int failed;
+ int cpu;
+};
+
+#define TRACE_GRAPH_INDENT 2
+
+static unsigned int max_depth;
+
+static struct tracer_opt trace_opts[] = {
+ /* Display overruns? (for self-debug purpose) */
+ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
+ /* Display CPU ? */
+ { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
+ /* Display Overhead ? */
+ { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
+ /* Display proc name/pid */
+ { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
+ /* Display duration of execution */
+ { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
+ /* Display absolute time of an entry */
+ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
+ /* Display interrupts */
+ { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
+ /* Display function name after trailing } */
+ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ /* Don't display overruns, proc, or tail by default */
+ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
+ TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
+ .opts = trace_opts
+};
+
+static struct trace_array *graph_array;
+
+/*
+ * DURATION column is being also used to display IRQ signs,
+ * following values are used by print_graph_irq and others
+ * to fill in space into DURATION column.
+ */
+enum {
+ FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT,
+ FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT,
+ FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
+};
+
+static void
+print_graph_duration(unsigned long long duration, struct trace_seq *s,
+ u32 flags);
+
+/* Add a function return address to the trace stack on thread info.*/
+int
+ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
+ unsigned long frame_pointer)
+{
+ unsigned long long calltime;
+ int index;
+
+ if (unlikely(ftrace_graph_is_dead()))
+ return -EBUSY;
+
+ if (!current->ret_stack)
+ return -EBUSY;
+
+ /*
+ * We must make sure the ret_stack is tested before we read
+ * anything else.
+ */
+ smp_rmb();
+
+ /* The return trace stack is full */
+ if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ atomic_inc(&current->trace_overrun);
+ return -EBUSY;
+ }
+
+ /*
+ * The curr_ret_stack is an index to ftrace return stack of
+ * current task. Its value should be in [0, FTRACE_RETFUNC_
+ * DEPTH) when the function graph tracer is used. To support
+ * filtering out specific functions, it makes the index
+ * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH)
+ * so when it sees a negative index the ftrace will ignore
+ * the record. And the index gets recovered when returning
+ * from the filtered function by adding the FTRACE_NOTRACE_
+ * DEPTH and then it'll continue to record functions normally.
+ *
+ * The curr_ret_stack is initialized to -1 and get increased
+ * in this function. So it can be less than -1 only if it was
+ * filtered out via ftrace_graph_notrace_addr() which can be
+ * set from set_graph_notrace file in tracefs by user.
+ */
+ if (current->curr_ret_stack < -1)
+ return -EBUSY;
+
+ calltime = trace_clock_local();
+
+ index = ++current->curr_ret_stack;
+ if (ftrace_graph_notrace_addr(func))
+ current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;
+ barrier();
+ current->ret_stack[index].ret = ret;
+ current->ret_stack[index].func = func;
+ current->ret_stack[index].calltime = calltime;
+ current->ret_stack[index].subtime = 0;
+ current->ret_stack[index].fp = frame_pointer;
+ *depth = current->curr_ret_stack;
+
+ return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
+ unsigned long frame_pointer)
+{
+ int index;
+
+ index = current->curr_ret_stack;
+
+ /*
+ * A negative index here means that it's just returned from a
+ * notrace'd function. Recover index to get an original
+ * return address. See ftrace_push_return_trace().
+ *
+ * TODO: Need to check whether the stack gets corrupted.
+ */
+ if (index < 0)
+ index += FTRACE_NOTRACE_DEPTH;
+
+ if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic, otherwise we have no where to go */
+ *ret = (unsigned long)panic;
+ return;
+ }
+
+#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
+ /*
+ * The arch may choose to record the frame pointer used
+ * and check it here to make sure that it is what we expect it
+ * to be. If gcc does not set the place holder of the return
+ * address in the frame pointer, and does a copy instead, then
+ * the function graph trace will fail. This test detects this
+ * case.
+ *
+ * Currently, x86_32 with optimize for size (-Os) makes the latest
+ * gcc do the above.
+ *
+ * Note, -mfentry does not use frame pointers, and this test
+ * is not needed if CC_USING_FENTRY is set.
+ */
+ if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ ftrace_graph_stop();
+ WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
+ " from func %ps return to %lx\n",
+ current->ret_stack[index].fp,
+ frame_pointer,
+ (void *)current->ret_stack[index].func,
+ current->ret_stack[index].ret);
+ *ret = (unsigned long)panic;
+ return;
+ }
+#endif
+
+ *ret = current->ret_stack[index].ret;
+ trace->func = current->ret_stack[index].func;
+ trace->calltime = current->ret_stack[index].calltime;
+ trace->overrun = atomic_read(&current->trace_overrun);
+ trace->depth = index;
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
+{
+ struct ftrace_graph_ret trace;
+ unsigned long ret;
+
+ ftrace_pop_return_trace(&trace, &ret, frame_pointer);
+ trace.rettime = trace_clock_local();
+ barrier();
+ current->curr_ret_stack--;
+ /*
+ * The curr_ret_stack can be less than -1 only if it was
+ * filtered out and it's about to return from the function.
+ * Recover the index and continue to trace normal functions.
+ */
+ if (current->curr_ret_stack < -1) {
+ current->curr_ret_stack += FTRACE_NOTRACE_DEPTH;
+ return ret;
+ }
+
+ /*
+ * The trace should run after decrementing the ret counter
+ * in case an interrupt were to come in. We don't want to
+ * lose the interrupt if max_depth is set.
+ */
+ ftrace_graph_return(&trace);
+
+ if (unlikely(!ret)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ ret = (unsigned long)panic;
+ }
+
+ return ret;
+}
+
+int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ftrace_graph_ent_entry *entry;
+
+ if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
+ return 0;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return 0;
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent = *trace;
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+
+ return 1;
+}
+
+static inline int ftrace_graph_ignore_irqs(void)
+{
+ if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
+ return 0;
+
+ return in_irq();
+}
+
+int trace_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int ret;
+ int cpu;
+ int pc;
+
+ if (!ftrace_trace_task(current))
+ return 0;
+
+ /* trace it when it is-nested-in or is a function enabled. */
+ if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
+ ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
+ (max_depth && trace->depth >= max_depth))
+ return 0;
+
+ /*
+ * Do not trace a function if it's filtered by set_graph_notrace.
+ * Make the index of ret stack negative to indicate that it should
+ * ignore further functions. But it needs its own ret stack entry
+ * to recover the original index in order to continue tracing after
+ * returning from the function.
+ */
+ if (ftrace_graph_notrace_addr(trace->func))
+ return 1;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ } else {
+ ret = 0;
+ }
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+{
+ if (tracing_thresh)
+ return 1;
+ else
+ return trace_graph_entry(trace);
+}
+
+static void
+__trace_graph_function(struct trace_array *tr,
+ unsigned long ip, unsigned long flags, int pc)
+{
+ u64 time = trace_clock_local();
+ struct ftrace_graph_ent ent = {
+ .func = ip,
+ .depth = 0,
+ };
+ struct ftrace_graph_ret ret = {
+ .func = ip,
+ .depth = 0,
+ .calltime = time,
+ .rettime = time,
+ };
+
+ __trace_graph_entry(tr, &ent, flags, pc);
+ __trace_graph_return(tr, &ret, flags, pc);
+}
+
+void
+trace_graph_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ __trace_graph_function(tr, ip, flags, pc);
+}
+
+void __trace_graph_return(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags,
+ int pc)
+{
+ struct ftrace_event_call *call = &event_funcgraph_exit;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ftrace_graph_ret_entry *entry;
+
+ if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
+ return;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->ret = *trace;
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ __buffer_unlock_commit(buffer, event);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = graph_array;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ disabled = atomic_inc_return(&data->disabled);
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
+ }
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
+void set_graph_array(struct trace_array *tr)
+{
+ graph_array = tr;
+
+ /* Make graph_array visible before we start tracing */
+
+ smp_mb();
+}
+
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+{
+ if (tracing_thresh &&
+ (trace->rettime - trace->calltime < tracing_thresh))
+ return;
+ else
+ trace_graph_return(trace);
+}
+
+static int graph_trace_init(struct trace_array *tr)
+{
+ int ret;
+
+ set_graph_array(tr);
+ if (tracing_thresh)
+ ret = register_ftrace_graph(&trace_graph_thresh_return,
+ &trace_graph_thresh_entry);
+ else
+ ret = register_ftrace_graph(&trace_graph_return,
+ &trace_graph_entry);
+ if (ret)
+ return ret;
+ tracing_start_cmdline_record();
+
+ return 0;
+}
+
+static void graph_trace_reset(struct trace_array *tr)
+{
+ tracing_stop_cmdline_record();
+ unregister_ftrace_graph();
+}
+
+static int graph_trace_update_thresh(struct trace_array *tr)
+{
+ graph_trace_reset(tr);
+ return graph_trace_init(tr);
+}
+
+static int max_bytes_for_cpu;
+
+static void print_graph_cpu(struct trace_seq *s, int cpu)
+{
+ /*
+ * Start with a space character - to make it stand out
+ * to the right a bit when trace output is pasted into
+ * email:
+ */
+ trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
+}
+
+#define TRACE_GRAPH_PROCINFO_LENGTH 14
+
+static void print_graph_proc(struct trace_seq *s, pid_t pid)
+{
+ char comm[TASK_COMM_LEN];
+ /* sign + log10(MAX_INT) + '\0' */
+ char pid_str[11];
+ int spaces = 0;
+ int len;
+ int i;
+
+ trace_find_cmdline(pid, comm);
+ comm[7] = '\0';
+ sprintf(pid_str, "%d", pid);
+
+ /* 1 stands for the "-" character */
+ len = strlen(comm) + strlen(pid_str) + 1;
+
+ if (len < TRACE_GRAPH_PROCINFO_LENGTH)
+ spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
+
+ /* First spaces to align center */
+ for (i = 0; i < spaces / 2; i++)
+ trace_seq_putc(s, ' ');
+
+ trace_seq_printf(s, "%s-%s", comm, pid_str);
+
+ /* Last spaces to align center */
+ for (i = 0; i < spaces - (spaces / 2); i++)
+ trace_seq_putc(s, ' ');
+}
+
+
+static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+{
+ trace_seq_putc(s, ' ');
+ trace_print_lat_fmt(s, entry);
+}
+
+/* If the pid changed since the last trace, output this event */
+static void
+verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
+{
+ pid_t prev_pid;
+ pid_t *last_pid;
+
+ if (!data)
+ return;
+
+ last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
+
+ if (*last_pid == pid)
+ return;
+
+ prev_pid = *last_pid;
+ *last_pid = pid;
+
+ if (prev_pid == -1)
+ return;
+/*
+ * Context-switch trace line:
+
+ ------------------------------------------
+ | 1) migration/0--1 => sshd-1755
+ ------------------------------------------
+
+ */
+ trace_seq_puts(s, " ------------------------------------------\n");
+ print_graph_cpu(s, cpu);
+ print_graph_proc(s, prev_pid);
+ trace_seq_puts(s, " => ");
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, "\n ------------------------------------------\n\n");
+}
+
+static struct ftrace_graph_ret_entry *
+get_return_for_leaf(struct trace_iterator *iter,
+ struct ftrace_graph_ent_entry *curr)
+{
+ struct fgraph_data *data = iter->private;
+ struct ring_buffer_iter *ring_iter = NULL;
+ struct ring_buffer_event *event;
+ struct ftrace_graph_ret_entry *next;
+
+ /*
+ * If the previous output failed to write to the seq buffer,
+ * then we just reuse the data from before.
+ */
+ if (data && data->failed) {
+ curr = &data->ent;
+ next = &data->ret;
+ } else {
+
+ ring_iter = trace_buffer_iter(iter, iter->cpu);
+
+ /* First peek to compare current entry and the next one */
+ if (ring_iter)
+ event = ring_buffer_iter_peek(ring_iter, NULL);
+ else {
+ /*
+ * We need to consume the current entry to see
+ * the next one.
+ */
+ ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
+ NULL, NULL);
+ event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
+ NULL, NULL);
+ }
+
+ if (!event)
+ return NULL;
+
+ next = ring_buffer_event_data(event);
+
+ if (data) {
+ /*
+ * Save current and next entries for later reference
+ * if the output fails.
+ */
+ data->ent = *curr;
+ /*
+ * If the next event is not a return type, then
+ * we only care about what type it is. Otherwise we can
+ * safely copy the entire event.
+ */
+ if (next->ent.type == TRACE_GRAPH_RET)
+ data->ret = *next;
+ else
+ data->ret.ent.type = next->ent.type;
+ }
+ }
+
+ if (next->ent.type != TRACE_GRAPH_RET)
+ return NULL;
+
+ if (curr->ent.pid != next->ent.pid ||
+ curr->graph_ent.func != next->ret.func)
+ return NULL;
+
+ /* this is a leaf, now advance the iterator */
+ if (ring_iter)
+ ring_buffer_read(ring_iter, NULL);
+
+ return next;
+}
+
+static void print_graph_abs_time(u64 t, struct trace_seq *s)
+{
+ unsigned long usecs_rem;
+
+ usecs_rem = do_div(t, NSEC_PER_SEC);
+ usecs_rem /= 1000;
+
+ trace_seq_printf(s, "%5lu.%06lu | ",
+ (unsigned long)t, usecs_rem);
+}
+
+static void
+print_graph_irq(struct trace_iterator *iter, unsigned long addr,
+ enum trace_type type, int cpu, pid_t pid, u32 flags)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
+
+ if (addr < (unsigned long)__irqentry_text_start ||
+ addr >= (unsigned long)__irqentry_text_end)
+ return;
+
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ /* Absolute time */
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
+
+ /* Cpu */
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
+
+ /* Proc */
+ if (flags & TRACE_GRAPH_PRINT_PROC) {
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, " | ");
+ }
+
+ /* Latency format */
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
+ }
+
+ /* No overhead */
+ print_graph_duration(0, s, flags | FLAGS_FILL_START);
+
+ if (type == TRACE_GRAPH_ENT)
+ trace_seq_puts(s, "==========>");
+ else
+ trace_seq_puts(s, "<==========");
+
+ print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ trace_seq_putc(s, '\n');
+}
+
+void
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+ unsigned long nsecs_rem = do_div(duration, 1000);
+ /* log10(ULONG_MAX) + '\0' */
+ char usecs_str[21];
+ char nsecs_str[5];
+ int len;
+ int i;
+
+ sprintf(usecs_str, "%lu", (unsigned long) duration);
+
+ /* Print msecs */
+ trace_seq_printf(s, "%s", usecs_str);
+
+ len = strlen(usecs_str);
+
+ /* Print nsecs (we don't want to exceed 7 numbers) */
+ if (len < 7) {
+ size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
+
+ snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
+ trace_seq_printf(s, ".%s", nsecs_str);
+ len += strlen(nsecs_str);
+ }
+
+ trace_seq_puts(s, " us ");
+
+ /* Print remaining spaces to fit the row's width */
+ for (i = len; i < 7; i++)
+ trace_seq_putc(s, ' ');
+}
+
+static void
+print_graph_duration(unsigned long long duration, struct trace_seq *s,
+ u32 flags)
+{
+ if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
+ !(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return;
+
+ /* No real adata, just filling the column with spaces */
+ switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
+ case FLAGS_FILL_FULL:
+ trace_seq_puts(s, " | ");
+ return;
+ case FLAGS_FILL_START:
+ trace_seq_puts(s, " ");
+ return;
+ case FLAGS_FILL_END:
+ trace_seq_puts(s, " |");
+ return;
+ }
+
+ /* Signal a overhead of time execution to the output */
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
+ trace_seq_printf(s, "%c ", trace_find_mark(duration));
+ else
+ trace_seq_puts(s, " ");
+
+ trace_print_graph_duration(duration, s);
+ trace_seq_puts(s, "| ");
+}
+
+/* Case of a leaf function on its call entry */
+static enum print_line_t
+print_graph_entry_leaf(struct trace_iterator *iter,
+ struct ftrace_graph_ent_entry *entry,
+ struct ftrace_graph_ret_entry *ret_entry,
+ struct trace_seq *s, u32 flags)
+{
+ struct fgraph_data *data = iter->private;
+ struct ftrace_graph_ret *graph_ret;
+ struct ftrace_graph_ent *call;
+ unsigned long long duration;
+ int i;
+
+ graph_ret = &ret_entry->ret;
+ call = &entry->graph_ent;
+ duration = graph_ret->rettime - graph_ret->calltime;
+
+ if (data) {
+ struct fgraph_cpu_data *cpu_data;
+ int cpu = iter->cpu;
+
+ cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+
+ /*
+ * Comments display at + 1 to depth. Since
+ * this is a leaf function, keep the comments
+ * equal to this depth.
+ */
+ cpu_data->depth = call->depth - 1;
+
+ /* No need to keep this function around for this depth */
+ if (call->depth < FTRACE_RETFUNC_DEPTH)
+ cpu_data->enter_funcs[call->depth] = 0;
+ }
+
+ /* Overhead and duration */
+ print_graph_duration(duration, s, flags);
+
+ /* Function */
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
+
+ trace_seq_printf(s, "%ps();\n", (void *)call->func);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+print_graph_entry_nested(struct trace_iterator *iter,
+ struct ftrace_graph_ent_entry *entry,
+ struct trace_seq *s, int cpu, u32 flags)
+{
+ struct ftrace_graph_ent *call = &entry->graph_ent;
+ struct fgraph_data *data = iter->private;
+ int i;
+
+ if (data) {
+ struct fgraph_cpu_data *cpu_data;
+ int cpu = iter->cpu;
+
+ cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+ cpu_data->depth = call->depth;
+
+ /* Save this function pointer to see if the exit matches */
+ if (call->depth < FTRACE_RETFUNC_DEPTH)
+ cpu_data->enter_funcs[call->depth] = call->func;
+ }
+
+ /* No time */
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+
+ /* Function */
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
+
+ trace_seq_printf(s, "%ps() {\n", (void *)call->func);
+
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ /*
+ * we already consumed the current entry to check the next one
+ * and see if this is a leaf.
+ */
+ return TRACE_TYPE_NO_CONSUME;
+}
+
+static void
+print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
+ int type, unsigned long addr, u32 flags)
+{
+ struct fgraph_data *data = iter->private;
+ struct trace_entry *ent = iter->ent;
+ int cpu = iter->cpu;
+
+ /* Pid */
+ verif_pid(s, ent->pid, cpu, data);
+
+ if (type)
+ /* Interrupt */
+ print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
+
+ if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return;
+
+ /* Absolute time */
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
+
+ /* Cpu */
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
+
+ /* Proc */
+ if (flags & TRACE_GRAPH_PRINT_PROC) {
+ print_graph_proc(s, ent->pid);
+ trace_seq_puts(s, " | ");
+ }
+
+ /* Latency format */
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
+
+ return;
+}
+
+/*
+ * Entry check for irq code
+ *
+ * returns 1 if
+ * - we are inside irq code
+ * - we just entered irq code
+ *
+ * retunns 0 if
+ * - funcgraph-interrupts option is set
+ * - we are not inside irq code
+ */
+static int
+check_irq_entry(struct trace_iterator *iter, u32 flags,
+ unsigned long addr, int depth)
+{
+ int cpu = iter->cpu;
+ int *depth_irq;
+ struct fgraph_data *data = iter->private;
+
+ /*
+ * If we are either displaying irqs, or we got called as
+ * a graph event and private data does not exist,
+ * then we bypass the irq check.
+ */
+ if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+ (!data))
+ return 0;
+
+ depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+ /*
+ * We are inside the irq code
+ */
+ if (*depth_irq >= 0)
+ return 1;
+
+ if ((addr < (unsigned long)__irqentry_text_start) ||
+ (addr >= (unsigned long)__irqentry_text_end))
+ return 0;
+
+ /*
+ * We are entering irq code.
+ */
+ *depth_irq = depth;
+ return 1;
+}
+
+/*
+ * Return check for irq code
+ *
+ * returns 1 if
+ * - we are inside irq code
+ * - we just left irq code
+ *
+ * returns 0 if
+ * - funcgraph-interrupts option is set
+ * - we are not inside irq code
+ */
+static int
+check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
+{
+ int cpu = iter->cpu;
+ int *depth_irq;
+ struct fgraph_data *data = iter->private;
+
+ /*
+ * If we are either displaying irqs, or we got called as
+ * a graph event and private data does not exist,
+ * then we bypass the irq check.
+ */
+ if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+ (!data))
+ return 0;
+
+ depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+ /*
+ * We are not inside the irq code.
+ */
+ if (*depth_irq == -1)
+ return 0;
+
+ /*
+ * We are inside the irq code, and this is returning entry.
+ * Let's not trace it and clear the entry depth, since
+ * we are out of irq code.
+ *
+ * This condition ensures that we 'leave the irq code' once
+ * we are out of the entry depth. Thus protecting us from
+ * the RETURN entry loss.
+ */
+ if (*depth_irq >= depth) {
+ *depth_irq = -1;
+ return 1;
+ }
+
+ /*
+ * We are inside the irq code, and this is not the entry.
+ */
+ return 1;
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
+ struct trace_iterator *iter, u32 flags)
+{
+ struct fgraph_data *data = iter->private;
+ struct ftrace_graph_ent *call = &field->graph_ent;
+ struct ftrace_graph_ret_entry *leaf_ret;
+ static enum print_line_t ret;
+ int cpu = iter->cpu;
+
+ if (check_irq_entry(iter, flags, call->func, call->depth))
+ return TRACE_TYPE_HANDLED;
+
+ print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
+
+ leaf_ret = get_return_for_leaf(iter, field);
+ if (leaf_ret)
+ ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
+ else
+ ret = print_graph_entry_nested(iter, field, s, cpu, flags);
+
+ if (data) {
+ /*
+ * If we failed to write our output, then we need to make
+ * note of it. Because we already consumed our entry.
+ */
+ if (s->full) {
+ data->failed = 1;
+ data->cpu = cpu;
+ } else
+ data->failed = 0;
+ }
+
+ return ret;
+}
+
+static enum print_line_t
+print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+ struct trace_entry *ent, struct trace_iterator *iter,
+ u32 flags)
+{
+ unsigned long long duration = trace->rettime - trace->calltime;
+ struct fgraph_data *data = iter->private;
+ pid_t pid = ent->pid;
+ int cpu = iter->cpu;
+ int func_match = 1;
+ int i;
+
+ if (check_irq_return(iter, flags, trace->depth))
+ return TRACE_TYPE_HANDLED;
+
+ if (data) {
+ struct fgraph_cpu_data *cpu_data;
+ int cpu = iter->cpu;
+
+ cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+
+ /*
+ * Comments display at + 1 to depth. This is the
+ * return from a function, we now want the comments
+ * to display at the same level of the bracket.
+ */
+ cpu_data->depth = trace->depth - 1;
+
+ if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+ if (cpu_data->enter_funcs[trace->depth] != trace->func)
+ func_match = 0;
+ cpu_data->enter_funcs[trace->depth] = 0;
+ }
+ }
+
+ print_graph_prologue(iter, s, 0, 0, flags);
+
+ /* Overhead and duration */
+ print_graph_duration(duration, s, flags);
+
+ /* Closing brace */
+ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
+
+ /*
+ * If the return function does not have a matching entry,
+ * then the entry was lost. Instead of just printing
+ * the '}' and letting the user guess what function this
+ * belongs to, write out the function name. Always do
+ * that if the funcgraph-tail option is enabled.
+ */
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
+ trace_seq_puts(s, "}\n");
+ else
+ trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+
+ /* Overrun */
+ if (flags & TRACE_GRAPH_PRINT_OVERRUN)
+ trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace->overrun);
+
+ print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+ cpu, pid, flags);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
+ struct trace_iterator *iter, u32 flags)
+{
+ unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct fgraph_data *data = iter->private;
+ struct trace_event *event;
+ int depth = 0;
+ int ret;
+ int i;
+
+ if (data)
+ depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
+
+ print_graph_prologue(iter, s, 0, 0, flags);
+
+ /* No time */
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+
+ /* Indentation */
+ if (depth > 0)
+ for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
+
+ /* The comment */
+ trace_seq_puts(s, "/* ");
+
+ switch (iter->ent->type) {
+ case TRACE_BPRINT:
+ ret = trace_print_bprintk_msg_only(iter);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ break;
+ case TRACE_PRINT:
+ ret = trace_print_printk_msg_only(iter);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ break;
+ default:
+ event = ftrace_find_event(ent->type);
+ if (!event)
+ return TRACE_TYPE_UNHANDLED;
+
+ ret = event->funcs->trace(iter, sym_flags, event);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+ }
+
+ if (trace_seq_has_overflowed(s))
+ goto out;
+
+ /* Strip ending newline */
+ if (s->buffer[s->seq.len - 1] == '\n') {
+ s->buffer[s->seq.len - 1] = '\0';
+ s->seq.len--;
+ }
+
+ trace_seq_puts(s, " */\n");
+ out:
+ return trace_handle_return(s);
+}
+
+
+enum print_line_t
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct fgraph_data *data = iter->private;
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ int cpu = iter->cpu;
+ int ret;
+
+ if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
+ per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
+ return TRACE_TYPE_HANDLED;
+ }
+
+ /*
+ * If the last output failed, there's a possibility we need
+ * to print out the missing entry which would never go out.
+ */
+ if (data && data->failed) {
+ field = &data->ent;
+ iter->cpu = data->cpu;
+ ret = print_graph_entry(field, s, iter, flags);
+ if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
+ per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
+ ret = TRACE_TYPE_NO_CONSUME;
+ }
+ iter->cpu = cpu;
+ return ret;
+ }
+
+ switch (entry->type) {
+ case TRACE_GRAPH_ENT: {
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * sizeof(struct ftrace_graph_ent_entry) is very small,
+ * it can be safely saved at the stack.
+ */
+ struct ftrace_graph_ent_entry saved;
+ trace_assign_type(field, entry);
+ saved = *field;
+ return print_graph_entry(&saved, s, iter, flags);
+ }
+ case TRACE_GRAPH_RET: {
+ struct ftrace_graph_ret_entry *field;
+ trace_assign_type(field, entry);
+ return print_graph_return(&field->ret, s, entry, iter, flags);
+ }
+ case TRACE_STACK:
+ case TRACE_FN:
+ /* dont trace stack and functions as comments */
+ return TRACE_TYPE_UNHANDLED;
+
+ default:
+ return print_graph_comment(s, entry, iter, flags);
+ }
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+ return print_graph_function_flags(iter, tracer_flags.val);
+}
+
+static enum print_line_t
+print_graph_function_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return print_graph_function(iter);
+}
+
+static void print_lat_header(struct seq_file *s, u32 flags)
+{
+ static const char spaces[] = " " /* 16 spaces */
+ " " /* 4 spaces */
+ " "; /* 17 spaces */
+ int size = 0;
+
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ size += 16;
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ size += 4;
+ if (flags & TRACE_GRAPH_PRINT_PROC)
+ size += 17;
+
+ seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
+ seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
+ seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
+ seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
+ seq_printf(s, "#%.*s||| / \n", size, spaces);
+}
+
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+ int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
+
+ if (lat)
+ print_lat_header(s, flags);
+
+ /* 1st line */
+ seq_putc(s, '#');
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ seq_puts(s, " TIME ");
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ seq_puts(s, " CPU");
+ if (flags & TRACE_GRAPH_PRINT_PROC)
+ seq_puts(s, " TASK/PID ");
+ if (lat)
+ seq_puts(s, "||||");
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
+ seq_puts(s, " DURATION ");
+ seq_puts(s, " FUNCTION CALLS\n");
+
+ /* 2nd line */
+ seq_putc(s, '#');
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ seq_puts(s, " | ");
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ seq_puts(s, " | ");
+ if (flags & TRACE_GRAPH_PRINT_PROC)
+ seq_puts(s, " | | ");
+ if (lat)
+ seq_puts(s, "||||");
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
+ seq_puts(s, " | | ");
+ seq_puts(s, " | | | |\n");
+}
+
+static void print_graph_headers(struct seq_file *s)
+{
+ print_graph_headers_flags(s, tracer_flags.val);
+}
+
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+ struct trace_iterator *iter = s->private;
+
+ if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return;
+
+ if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+
+ print_trace_header(s, iter);
+ }
+
+ __print_graph_headers_flags(s, flags);
+}
+
+void graph_trace_open(struct trace_iterator *iter)
+{
+ /* pid and depth on the last trace processed */
+ struct fgraph_data *data;
+ gfp_t gfpflags;
+ int cpu;
+
+ iter->private = NULL;
+
+ /* We can be called in atomic context via ftrace_dump() */
+ gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
+
+ data = kzalloc(sizeof(*data), gfpflags);
+ if (!data)
+ goto out_err;
+
+ data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags);
+ if (!data->cpu_data)
+ goto out_err_free;
+
+ for_each_possible_cpu(cpu) {
+ pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
+ int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+ int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+ int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+ *pid = -1;
+ *depth = 0;
+ *ignore = 0;
+ *depth_irq = -1;
+ }
+
+ iter->private = data;
+
+ return;
+
+ out_err_free:
+ kfree(data);
+ out_err:
+ pr_warning("function graph tracer: not enough memory\n");
+}
+
+void graph_trace_close(struct trace_iterator *iter)
+{
+ struct fgraph_data *data = iter->private;
+
+ if (data) {
+ free_percpu(data->cpu_data);
+ kfree(data);
+ }
+}
+
+static int
+func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ if (bit == TRACE_GRAPH_PRINT_IRQS)
+ ftrace_graph_skip_irqs = !set;
+
+ return 0;
+}
+
+static struct trace_event_functions graph_functions = {
+ .trace = print_graph_function_event,
+};
+
+static struct trace_event graph_trace_entry_event = {
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &graph_functions,
+};
+
+static struct trace_event graph_trace_ret_event = {
+ .type = TRACE_GRAPH_RET,
+ .funcs = &graph_functions
+};
+
+static struct tracer graph_trace __tracer_data = {
+ .name = "function_graph",
+ .update_thresh = graph_trace_update_thresh,
+ .open = graph_trace_open,
+ .pipe_open = graph_trace_open,
+ .close = graph_trace_close,
+ .pipe_close = graph_trace_close,
+ .init = graph_trace_init,
+ .reset = graph_trace_reset,
+ .print_line = print_graph_function,
+ .print_header = print_graph_headers,
+ .flags = &tracer_flags,
+ .set_flag = func_graph_set_flag,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_function_graph,
+#endif
+};
+
+
+static ssize_t
+graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ max_depth = val;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static ssize_t
+graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
+ int n;
+
+ n = sprintf(buf, "%d\n", max_depth);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static const struct file_operations graph_depth_fops = {
+ .open = tracing_open_generic,
+ .write = graph_depth_write,
+ .read = graph_depth_read,
+ .llseek = generic_file_llseek,
+};
+
+static __init int init_graph_tracefs(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ trace_create_file("max_graph_depth", 0644, d_tracer,
+ NULL, &graph_depth_fops);
+
+ return 0;
+}
+fs_initcall(init_graph_tracefs);
+
+static __init int init_graph_trace(void)
+{
+ max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+
+ if (!register_ftrace_event(&graph_trace_entry_event)) {
+ pr_warning("Warning: could not register graph trace events\n");
+ return 1;
+ }
+
+ if (!register_ftrace_event(&graph_trace_ret_event)) {
+ pr_warning("Warning: could not register graph trace events\n");
+ return 1;
+ }
+
+ return register_tracer(&graph_trace);
+}
+
+core_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000..8523ea345
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,764 @@
+/*
+ * trace irqs off critical timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 Nadia Yvette Chambers
+ */
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array *irqsoff_trace __read_mostly;
+static int tracer_enabled __read_mostly;
+
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+static DEFINE_RAW_SPINLOCK(max_trace_lock);
+
+enum {
+ TRACER_IRQS_OFF = (1 << 1),
+ TRACER_PREEMPT_OFF = (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+static int save_flags;
+static bool function_enabled;
+
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
+static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int
+preempt_trace(void)
+{
+ return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int
+irq_trace(void)
+{
+ return ((trace_type & TRACER_IRQS_OFF) &&
+ irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
+#define TRACE_DISPLAY_GRAPH 1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* display latency trace as call graph */
+ { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ .val = 0,
+ .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp unsigned long max_sequence;
+
+#ifdef CONFIG_FUNCTION_TRACER
+/*
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ * incremented.
+ * 0 if the trace is to be ignored, and data->disabled
+ * is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ * inside the #ifdef of the function graph tracer below.
+ * This is OK, since the function graph tracer is
+ * dependent on the function tracer.
+ */
+static int func_prolog_dec(struct trace_array *tr,
+ struct trace_array_cpu **data,
+ unsigned long *flags)
+{
+ long disabled;
+ int cpu;
+
+ /*
+ * Does not matter if we preempt. We test the flags
+ * afterward, to see if irqs are disabled or not.
+ * If we preempt and get a false positive, the flags
+ * test will fail.
+ */
+ cpu = raw_smp_processor_id();
+ if (likely(!per_cpu(tracing_cpu, cpu)))
+ return 0;
+
+ local_save_flags(*flags);
+ /* slight chance to get a false positive on tracing_cpu */
+ if (!irqs_disabled_flags(*flags))
+ return 0;
+
+ *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ disabled = atomic_inc_return(&(*data)->disabled);
+
+ if (likely(disabled == 1))
+ return 1;
+
+ atomic_dec(&(*data)->disabled);
+
+ return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+
+ if (!func_prolog_dec(tr, &data, &flags))
+ return;
+
+ trace_function(tr, ip, parent_ip, flags, preempt_count());
+
+ atomic_dec(&data->disabled);
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int
+irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ int cpu;
+
+ if (!(bit & TRACE_DISPLAY_GRAPH))
+ return -EINVAL;
+
+ if (!(is_graph() ^ set))
+ return 0;
+
+ stop_irqsoff_tracer(irqsoff_trace, !set);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(tracing_cpu, cpu) = 0;
+
+ tr->max_latency = 0;
+ tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
+
+ return start_irqsoff_tracer(irqsoff_trace, set);
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int ret;
+ int pc;
+
+ if (!func_prolog_dec(tr, &data, &flags))
+ return 0;
+
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+
+ return ret;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_dec(tr, &data, &flags))
+ return;
+
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+}
+
+static void irqsoff_trace_open(struct trace_iterator *iter)
+{
+ if (is_graph())
+ graph_trace_open(iter);
+
+}
+
+static void irqsoff_trace_close(struct trace_iterator *iter)
+{
+ if (iter->private)
+ graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
+ TRACE_GRAPH_PRINT_PROC | \
+ TRACE_GRAPH_PRINT_ABS_TIME | \
+ TRACE_GRAPH_PRINT_DURATION)
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+ /*
+ * In graph mode call the graph tracer output function,
+ * otherwise go with the TRACE_FN event handler
+ */
+ if (is_graph())
+ return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_print_header(struct seq_file *s)
+{
+ if (is_graph())
+ print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+ else
+ trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ if (is_graph())
+ trace_graph_function(tr, ip, parent_ip, flags, pc);
+ else
+ trace_function(tr, ip, parent_ip, flags, pc);
+}
+
+#else
+#define __trace_function trace_function
+
+static int
+irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ return -EINVAL;
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
+static void irqsoff_trace_open(struct trace_iterator *iter) { }
+static void irqsoff_trace_close(struct trace_iterator *iter) { }
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void irqsoff_print_header(struct seq_file *s)
+{
+ trace_default_header(s);
+}
+#else
+static void irqsoff_print_header(struct seq_file *s)
+{
+ trace_latency_header(s);
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(struct trace_array *tr, cycle_t delta)
+{
+ if (tracing_thresh) {
+ if (delta < tracing_thresh)
+ return 0;
+ } else {
+ if (delta <= tr->max_latency)
+ return 0;
+ }
+ return 1;
+}
+
+static void
+check_critical_timing(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ unsigned long parent_ip,
+ int cpu)
+{
+ cycle_t T0, T1, delta;
+ unsigned long flags;
+ int pc;
+
+ T0 = data->preempt_timestamp;
+ T1 = ftrace_now(cpu);
+ delta = T1-T0;
+
+ local_save_flags(flags);
+
+ pc = preempt_count();
+
+ if (!report_latency(tr, delta))
+ goto out;
+
+ raw_spin_lock_irqsave(&max_trace_lock, flags);
+
+ /* check if we are still the max latency */
+ if (!report_latency(tr, delta))
+ goto out_unlock;
+
+ __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ /* Skip 5 functions to get to the irq/preempt enable function */
+ __trace_stack(tr, flags, 5, pc);
+
+ if (data->critical_sequence != max_sequence)
+ goto out_unlock;
+
+ data->critical_end = parent_ip;
+
+ if (likely(!is_tracing_stopped())) {
+ tr->max_latency = delta;
+ update_max_tr_single(tr, current, cpu);
+ }
+
+ max_sequence++;
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&max_trace_lock, flags);
+
+out:
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = ftrace_now(cpu);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+}
+
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu;
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+
+ if (!tracer_enabled || !tracing_is_enabled())
+ return;
+
+ cpu = raw_smp_processor_id();
+
+ if (per_cpu(tracing_cpu, cpu))
+ return;
+
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+
+ if (unlikely(!data) || atomic_read(&data->disabled))
+ return;
+
+ atomic_inc(&data->disabled);
+
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = ftrace_now(cpu);
+ data->critical_start = parent_ip ? : ip;
+
+ local_save_flags(flags);
+
+ __trace_function(tr, ip, parent_ip, flags, preempt_count());
+
+ per_cpu(tracing_cpu, cpu) = 1;
+
+ atomic_dec(&data->disabled);
+}
+
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+ int cpu;
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+
+ cpu = raw_smp_processor_id();
+ /* Always clear the tracing cpu on stopping the trace */
+ if (unlikely(per_cpu(tracing_cpu, cpu)))
+ per_cpu(tracing_cpu, cpu) = 0;
+ else
+ return;
+
+ if (!tracer_enabled || !tracing_is_enabled())
+ return;
+
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+
+ if (unlikely(!data) ||
+ !data->critical_start || atomic_read(&data->disabled))
+ return;
+
+ atomic_inc(&data->disabled);
+
+ local_save_flags(flags);
+ __trace_function(tr, ip, parent_ip, flags, preempt_count());
+ check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+ data->critical_start = 0;
+ atomic_dec(&data->disabled);
+}
+
+/* start and stop critical timings used to for stoppage (in idle) */
+void start_critical_timings(void)
+{
+ if (preempt_trace() || irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(start_critical_timings);
+
+void stop_critical_timings(void)
+{
+ if (preempt_trace() || irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(stop_critical_timings);
+
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+ if (!preempt_trace() && irq_trace())
+ stop_critical_timing(a0, a1);
+}
+
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+ if (!preempt_trace() && irq_trace())
+ start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void trace_hardirqs_on(void)
+{
+ if (!preempt_trace() && irq_trace())
+ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (!preempt_trace() && irq_trace())
+ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!preempt_trace() && irq_trace())
+ stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (!preempt_trace() && irq_trace())
+ start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ if (preempt_trace() && !irq_trace())
+ stop_critical_timing(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ if (preempt_trace() && !irq_trace())
+ start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
+{
+ int ret;
+
+ /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+ if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ return 0;
+
+ if (graph)
+ ret = register_ftrace_graph(&irqsoff_graph_return,
+ &irqsoff_graph_entry);
+ else
+ ret = register_ftrace_function(tr->ops);
+
+ if (!ret)
+ function_enabled = true;
+
+ return ret;
+}
+
+static void unregister_irqsoff_function(struct trace_array *tr, int graph)
+{
+ if (!function_enabled)
+ return;
+
+ if (graph)
+ unregister_ftrace_graph();
+ else
+ unregister_ftrace_function(tr->ops);
+
+ function_enabled = false;
+}
+
+static void irqsoff_function_set(struct trace_array *tr, int set)
+{
+ if (set)
+ register_irqsoff_function(tr, is_graph(), 1);
+ else
+ unregister_irqsoff_function(tr, is_graph());
+}
+
+static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
+{
+ struct tracer *tracer = tr->current_trace;
+
+ if (mask & TRACE_ITER_FUNCTION)
+ irqsoff_function_set(tr, set);
+
+ return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+{
+ int ret;
+
+ ret = register_irqsoff_function(tr, graph, 0);
+
+ if (!ret && tracing_is_enabled())
+ tracer_enabled = 1;
+ else
+ tracer_enabled = 0;
+
+ return ret;
+}
+
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
+{
+ tracer_enabled = 0;
+
+ unregister_irqsoff_function(tr, graph);
+}
+
+static bool irqsoff_busy;
+
+static int __irqsoff_tracer_init(struct trace_array *tr)
+{
+ if (irqsoff_busy)
+ return -EBUSY;
+
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+
+ tr->max_latency = 0;
+ irqsoff_trace = tr;
+ /* make sure that the tracer is visible */
+ smp_wmb();
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+ ftrace_init_array_ops(tr, irqsoff_tracer_call);
+
+ /* Only toplevel instance supports graph tracing */
+ if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
+ is_graph())))
+ printk(KERN_ERR "failed to start irqsoff tracer\n");
+
+ irqsoff_busy = true;
+ return 0;
+}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
+ stop_irqsoff_tracer(tr, is_graph());
+
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ ftrace_reset_array_ops(tr);
+
+ irqsoff_busy = false;
+}
+
+static void irqsoff_tracer_start(struct trace_array *tr)
+{
+ tracer_enabled = 1;
+}
+
+static void irqsoff_tracer_stop(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static int irqsoff_tracer_init(struct trace_array *tr)
+{
+ trace_type = TRACER_IRQS_OFF;
+
+ return __irqsoff_tracer_init(tr);
+}
+static struct tracer irqsoff_tracer __read_mostly =
+{
+ .name = "irqsoff",
+ .init = irqsoff_tracer_init,
+ .reset = irqsoff_tracer_reset,
+ .start = irqsoff_tracer_start,
+ .stop = irqsoff_tracer_stop,
+ .print_max = true,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_irqsoff,
+#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
+ .allow_instances = true,
+ .use_max_tr = true,
+};
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static int preemptoff_tracer_init(struct trace_array *tr)
+{
+ trace_type = TRACER_PREEMPT_OFF;
+
+ return __irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+ .name = "preemptoff",
+ .init = preemptoff_tracer_init,
+ .reset = irqsoff_tracer_reset,
+ .start = irqsoff_tracer_start,
+ .stop = irqsoff_tracer_stop,
+ .print_max = true,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_preemptoff,
+#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
+ .allow_instances = true,
+ .use_max_tr = true,
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+ defined(CONFIG_PREEMPT_TRACER)
+
+static int preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+ trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+ return __irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+ .name = "preemptirqsoff",
+ .init = preemptirqsoff_tracer_init,
+ .reset = irqsoff_tracer_reset,
+ .start = irqsoff_tracer_start,
+ .stop = irqsoff_tracer_stop,
+ .print_max = true,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_preemptirqsoff,
+#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
+ .allow_instances = true,
+ .use_max_tr = true,
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
+
+__init static int init_irqsoff_tracer(void)
+{
+ register_irqsoff(irqsoff_tracer);
+ register_preemptoff(preemptoff_tracer);
+ register_preemptirqsoff(preemptirqsoff_tracer);
+
+ return 0;
+}
+core_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000..3ccf5c2c1
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,140 @@
+/*
+ * kdb helper for dumping the ftrace buffer
+ *
+ * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
+ *
+ * ftrace_dump_buf based on ftrace_dump:
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ */
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+static void ftrace_dump_buf(int skip_lines, long cpu_file)
+{
+ /* use static because iter can be a bit big for the stack */
+ static struct trace_iterator iter;
+ static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
+ unsigned int old_userobj;
+ int cnt = 0, cpu;
+
+ trace_init_global_iter(&iter);
+ iter.buffer_iter = buffer_iter;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ }
+
+ old_userobj = trace_flags;
+
+ /* don't look at user memory in panic mode */
+ trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
+ kdb_printf("Dumping ftrace buffer:\n");
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter.iter_flags |= TRACE_FILE_LAT_FMT;
+ iter.pos = -1;
+
+ if (cpu_file == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ iter.buffer_iter[cpu] =
+ ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
+ ring_buffer_read_start(iter.buffer_iter[cpu]);
+ tracing_iter_reset(&iter, cpu);
+ }
+ } else {
+ iter.cpu_file = cpu_file;
+ iter.buffer_iter[cpu_file] =
+ ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
+ ring_buffer_read_start(iter.buffer_iter[cpu_file]);
+ tracing_iter_reset(&iter, cpu_file);
+ }
+
+ while (trace_find_next_entry_inc(&iter)) {
+ if (!cnt)
+ kdb_printf("---------------------------------\n");
+ cnt++;
+
+ if (!skip_lines) {
+ print_trace_line(&iter);
+ trace_printk_seq(&iter.seq);
+ } else {
+ skip_lines--;
+ }
+
+ if (KDB_FLAG(CMD_INTERRUPT))
+ goto out;
+ }
+
+ if (!cnt)
+ kdb_printf(" (ftrace buffer empty)\n");
+ else
+ kdb_printf("---------------------------------\n");
+
+out:
+ trace_flags = old_userobj;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ }
+
+ for_each_tracing_cpu(cpu) {
+ if (iter.buffer_iter[cpu]) {
+ ring_buffer_read_finish(iter.buffer_iter[cpu]);
+ iter.buffer_iter[cpu] = NULL;
+ }
+ }
+}
+
+/*
+ * kdb_ftdump - Dump the ftrace log buffer
+ */
+static int kdb_ftdump(int argc, const char **argv)
+{
+ int skip_lines = 0;
+ long cpu_file;
+ char *cp;
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ skip_lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ skip_lines = 0;
+ }
+
+ if (argc == 2) {
+ cpu_file = simple_strtol(argv[2], &cp, 0);
+ if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+ !cpu_online(cpu_file))
+ return KDB_BADINT;
+ } else {
+ cpu_file = RING_BUFFER_ALL_CPUS;
+ }
+
+ kdb_trap_printk++;
+ ftrace_dump_buf(skip_lines, cpu_file);
+ kdb_trap_printk--;
+
+ return 0;
+}
+
+static __init int kdb_ftrace_register(void)
+{
+ kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+ "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
+ return 0;
+}
+
+late_initcall(kdb_ftrace_register);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000..d0ce590f0
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1495 @@
+/*
+ * Kprobes-based tracing events
+ *
+ * Created by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#include "trace_probe.h"
+
+#define KPROBE_EVENT_SYSTEM "kprobes"
+
+/**
+ * Kprobe event core functions
+ */
+struct trace_kprobe {
+ struct list_head list;
+ struct kretprobe rp; /* Use rp.kp for kprobe use */
+ unsigned long nhit;
+ const char *symbol; /* symbol name */
+ struct trace_probe tp;
+};
+
+#define SIZEOF_TRACE_KPROBE(n) \
+ (offsetof(struct trace_kprobe, tp.args) + \
+ (sizeof(struct probe_arg) * (n)))
+
+
+static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
+{
+ return tk->rp.handler != NULL;
+}
+
+static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
+{
+ return tk->symbol ? tk->symbol : "unknown";
+}
+
+static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
+{
+ return tk->rp.kp.offset;
+}
+
+static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
+{
+ return !!(kprobe_gone(&tk->rp.kp));
+}
+
+static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
+ struct module *mod)
+{
+ int len = strlen(mod->name);
+ const char *name = trace_kprobe_symbol(tk);
+ return strncmp(mod->name, name, len) == 0 && name[len] == ':';
+}
+
+static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
+{
+ return !!strchr(trace_kprobe_symbol(tk), ':');
+}
+
+static int register_kprobe_event(struct trace_kprobe *tk);
+static int unregister_kprobe_event(struct trace_kprobe *tk);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+ char *symbol;
+ long offset;
+ unsigned long addr;
+};
+
+unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+ sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+
+ if (sc->addr)
+ sc->addr += sc->offset;
+
+ return sc->addr;
+}
+
+void free_symbol_cache(struct symbol_cache *sc)
+{
+ kfree(sc->symbol);
+ kfree(sc);
+}
+
+struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+ struct symbol_cache *sc;
+
+ if (!sym || strlen(sym) == 0)
+ return NULL;
+
+ sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+ if (!sc)
+ return NULL;
+
+ sc->symbol = kstrdup(sym, GFP_KERNEL);
+ if (!sc->symbol) {
+ kfree(sc);
+ return NULL;
+ }
+ sc->offset = offset;
+ update_symbol_cache(sc);
+
+ return sc;
+}
+
+/*
+ * Kprobes-specific fetch functions
+ */
+#define DEFINE_FETCH_stack(type) \
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
+ (unsigned int)((unsigned long)offset)); \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
+
+DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
+
+#define DEFINE_FETCH_memory(type) \
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
+ void *addr, void *dest) \
+{ \
+ type retval; \
+ if (probe_kernel_address(addr, retval)) \
+ *(type *)dest = 0; \
+ else \
+ *(type *)dest = retval; \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
+
+DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
+ * length and relative data location.
+ */
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ long ret;
+ int maxlen = get_rloc_len(*(u32 *)dest);
+ u8 *dst = get_rloc_data(dest);
+ u8 *src = addr;
+ mm_segment_t old_fs = get_fs();
+
+ if (!maxlen)
+ return;
+
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+
+ do
+ ret = __copy_from_user_inatomic(dst++, src++, 1);
+ while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
+
+ dst[-1] = '\0';
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) { /* Failed to fetch string */
+ ((u8 *)get_rloc_data(dest))[0] = '\0';
+ *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
+ } else {
+ *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
+ get_rloc_offs(*(u32 *)dest));
+ }
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
+
+/* Return the length of string -- including null terminal byte */
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ mm_segment_t old_fs;
+ int ret, len = 0;
+ u8 c;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+
+ do {
+ ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+ len++;
+ } while (c && ret == 0 && len < MAX_STRING_SIZE);
+
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret < 0) /* Failed to check the length */
+ *(u32 *)dest = 0;
+ else
+ *(u32 *)dest = len;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
+
+#define DEFINE_FETCH_symbol(type) \
+void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
+{ \
+ struct symbol_cache *sc = data; \
+ if (sc->addr) \
+ fetch_memory_##type(regs, (void *)sc->addr, dest); \
+ else \
+ *(type *)dest = 0; \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
+
+DEFINE_BASIC_FETCH_FUNCS(symbol)
+DEFINE_FETCH_symbol(string)
+DEFINE_FETCH_symbol(string_size)
+
+/* kprobes don't support file_offset fetch methods */
+#define fetch_file_offset_u8 NULL
+#define fetch_file_offset_u16 NULL
+#define fetch_file_offset_u32 NULL
+#define fetch_file_offset_u64 NULL
+#define fetch_file_offset_string NULL
+#define fetch_file_offset_string_size NULL
+
+/* Fetch type information table */
+static const struct fetch_type kprobes_fetch_type_table[] = {
+ /* Special types */
+ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+ sizeof(u32), 1, "__data_loc char[]"),
+ [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+ string_size, sizeof(u32), 0, "u32"),
+ /* Basic types */
+ ASSIGN_FETCH_TYPE(u8, u8, 0),
+ ASSIGN_FETCH_TYPE(u16, u16, 0),
+ ASSIGN_FETCH_TYPE(u32, u32, 0),
+ ASSIGN_FETCH_TYPE(u64, u64, 0),
+ ASSIGN_FETCH_TYPE(s8, u8, 1),
+ ASSIGN_FETCH_TYPE(s16, u16, 1),
+ ASSIGN_FETCH_TYPE(s32, u32, 1),
+ ASSIGN_FETCH_TYPE(s64, u64, 1),
+
+ ASSIGN_FETCH_TYPE_END
+};
+
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ */
+static struct trace_kprobe *alloc_trace_kprobe(const char *group,
+ const char *event,
+ void *addr,
+ const char *symbol,
+ unsigned long offs,
+ int nargs, bool is_return)
+{
+ struct trace_kprobe *tk;
+ int ret = -ENOMEM;
+
+ tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL);
+ if (!tk)
+ return ERR_PTR(ret);
+
+ if (symbol) {
+ tk->symbol = kstrdup(symbol, GFP_KERNEL);
+ if (!tk->symbol)
+ goto error;
+ tk->rp.kp.symbol_name = tk->symbol;
+ tk->rp.kp.offset = offs;
+ } else
+ tk->rp.kp.addr = addr;
+
+ if (is_return)
+ tk->rp.handler = kretprobe_dispatcher;
+ else
+ tk->rp.kp.pre_handler = kprobe_dispatcher;
+
+ if (!event || !is_good_name(event)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ tk->tp.call.class = &tk->tp.class;
+ tk->tp.call.name = kstrdup(event, GFP_KERNEL);
+ if (!tk->tp.call.name)
+ goto error;
+
+ if (!group || !is_good_name(group)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ tk->tp.class.system = kstrdup(group, GFP_KERNEL);
+ if (!tk->tp.class.system)
+ goto error;
+
+ INIT_LIST_HEAD(&tk->list);
+ INIT_LIST_HEAD(&tk->tp.files);
+ return tk;
+error:
+ kfree(tk->tp.call.name);
+ kfree(tk->symbol);
+ kfree(tk);
+ return ERR_PTR(ret);
+}
+
+static void free_trace_kprobe(struct trace_kprobe *tk)
+{
+ int i;
+
+ for (i = 0; i < tk->tp.nr_args; i++)
+ traceprobe_free_probe_arg(&tk->tp.args[i]);
+
+ kfree(tk->tp.call.class->system);
+ kfree(tk->tp.call.name);
+ kfree(tk->symbol);
+ kfree(tk);
+}
+
+static struct trace_kprobe *find_trace_kprobe(const char *event,
+ const char *group)
+{
+ struct trace_kprobe *tk;
+
+ list_for_each_entry(tk, &probe_list, list)
+ if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
+ strcmp(tk->tp.call.class->system, group) == 0)
+ return tk;
+ return NULL;
+}
+
+/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int
+enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+{
+ int ret = 0;
+
+ if (file) {
+ struct event_file_link *link;
+
+ link = kmalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ link->file = file;
+ list_add_tail_rcu(&link->list, &tk->tp.files);
+
+ tk->tp.flags |= TP_FLAG_TRACE;
+ } else
+ tk->tp.flags |= TP_FLAG_PROFILE;
+
+ if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
+ if (trace_kprobe_is_return(tk))
+ ret = enable_kretprobe(&tk->rp);
+ else
+ ret = enable_kprobe(&tk->rp.kp);
+ }
+ out:
+ return ret;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int
+disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+{
+ struct event_file_link *link = NULL;
+ int wait = 0;
+ int ret = 0;
+
+ if (file) {
+ link = find_event_file_link(&tk->tp, file);
+ if (!link) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ list_del_rcu(&link->list);
+ wait = 1;
+ if (!list_empty(&tk->tp.files))
+ goto out;
+
+ tk->tp.flags &= ~TP_FLAG_TRACE;
+ } else
+ tk->tp.flags &= ~TP_FLAG_PROFILE;
+
+ if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) {
+ if (trace_kprobe_is_return(tk))
+ disable_kretprobe(&tk->rp);
+ else
+ disable_kprobe(&tk->rp.kp);
+ wait = 1;
+ }
+ out:
+ if (wait) {
+ /*
+ * Synchronize with kprobe_trace_func/kretprobe_trace_func
+ * to ensure disabled (all running handlers are finished).
+ * This is not only for kfree(), but also the caller,
+ * trace_remove_event_call() supposes it for releasing
+ * event_call related objects, which will be accessed in
+ * the kprobe_trace_func/kretprobe_trace_func.
+ */
+ synchronize_sched();
+ kfree(link); /* Ignored if link == NULL */
+ }
+
+ return ret;
+}
+
+/* Internal register function - just handle k*probes and flags */
+static int __register_trace_kprobe(struct trace_kprobe *tk)
+{
+ int i, ret;
+
+ if (trace_probe_is_registered(&tk->tp))
+ return -EINVAL;
+
+ for (i = 0; i < tk->tp.nr_args; i++)
+ traceprobe_update_arg(&tk->tp.args[i]);
+
+ /* Set/clear disabled flag according to tp->flag */
+ if (trace_probe_is_enabled(&tk->tp))
+ tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
+ else
+ tk->rp.kp.flags |= KPROBE_FLAG_DISABLED;
+
+ if (trace_kprobe_is_return(tk))
+ ret = register_kretprobe(&tk->rp);
+ else
+ ret = register_kprobe(&tk->rp.kp);
+
+ if (ret == 0)
+ tk->tp.flags |= TP_FLAG_REGISTERED;
+ else {
+ pr_warning("Could not insert probe at %s+%lu: %d\n",
+ trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
+ if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
+ pr_warning("This probe might be able to register after"
+ "target module is loaded. Continue.\n");
+ ret = 0;
+ } else if (ret == -EILSEQ) {
+ pr_warning("Probing address(0x%p) is not an "
+ "instruction boundary.\n",
+ tk->rp.kp.addr);
+ ret = -EINVAL;
+ }
+ }
+
+ return ret;
+}
+
+/* Internal unregister function - just handle k*probes and flags */
+static void __unregister_trace_kprobe(struct trace_kprobe *tk)
+{
+ if (trace_probe_is_registered(&tk->tp)) {
+ if (trace_kprobe_is_return(tk))
+ unregister_kretprobe(&tk->rp);
+ else
+ unregister_kprobe(&tk->rp.kp);
+ tk->tp.flags &= ~TP_FLAG_REGISTERED;
+ /* Cleanup kprobe for reuse */
+ if (tk->rp.kp.symbol_name)
+ tk->rp.kp.addr = NULL;
+ }
+}
+
+/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+static int unregister_trace_kprobe(struct trace_kprobe *tk)
+{
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(&tk->tp))
+ return -EBUSY;
+
+ /* Will fail if probe is being used by ftrace or perf */
+ if (unregister_kprobe_event(tk))
+ return -EBUSY;
+
+ __unregister_trace_kprobe(tk);
+ list_del(&tk->list);
+
+ return 0;
+}
+
+/* Register a trace_probe and probe_event */
+static int register_trace_kprobe(struct trace_kprobe *tk)
+{
+ struct trace_kprobe *old_tk;
+ int ret;
+
+ mutex_lock(&probe_lock);
+
+ /* Delete old (same name) event if exist */
+ old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+ tk->tp.call.class->system);
+ if (old_tk) {
+ ret = unregister_trace_kprobe(old_tk);
+ if (ret < 0)
+ goto end;
+ free_trace_kprobe(old_tk);
+ }
+
+ /* Register new event */
+ ret = register_kprobe_event(tk);
+ if (ret) {
+ pr_warning("Failed to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ /* Register k*probe */
+ ret = __register_trace_kprobe(tk);
+ if (ret < 0)
+ unregister_kprobe_event(tk);
+ else
+ list_add_tail(&tk->list, &probe_list);
+
+end:
+ mutex_unlock(&probe_lock);
+ return ret;
+}
+
+/* Module notifier call back, checking event on the module */
+static int trace_kprobe_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ struct trace_kprobe *tk;
+ int ret;
+
+ if (val != MODULE_STATE_COMING)
+ return NOTIFY_DONE;
+
+ /* Update probes on coming module */
+ mutex_lock(&probe_lock);
+ list_for_each_entry(tk, &probe_list, list) {
+ if (trace_kprobe_within_module(tk, mod)) {
+ /* Don't need to check busy - this should have gone. */
+ __unregister_trace_kprobe(tk);
+ ret = __register_trace_kprobe(tk);
+ if (ret)
+ pr_warning("Failed to re-register probe %s on"
+ "%s: %d\n",
+ ftrace_event_name(&tk->tp.call),
+ mod->name, ret);
+ }
+ }
+ mutex_unlock(&probe_lock);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block trace_kprobe_module_nb = {
+ .notifier_call = trace_kprobe_module_callback,
+ .priority = 1 /* Invoked after kprobe module callback */
+};
+
+static int create_trace_kprobe(int argc, char **argv)
+{
+ /*
+ * Argument syntax:
+ * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
+ * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
+ * Fetch args:
+ * $retval : fetch return value
+ * $stack : fetch stack address
+ * $stackN : fetch Nth of stack (N:0-)
+ * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
+ * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+ * %REG : fetch register REG
+ * Dereferencing memory fetch:
+ * +|-offs(ARG) : fetch memory at ARG +|- offs address.
+ * Alias name of args:
+ * NAME=FETCHARG : set NAME as alias of FETCHARG.
+ * Type of args:
+ * FETCHARG:TYPE : use TYPE instead of unsigned long.
+ */
+ struct trace_kprobe *tk;
+ int i, ret = 0;
+ bool is_return = false, is_delete = false;
+ char *symbol = NULL, *event = NULL, *group = NULL;
+ char *arg;
+ unsigned long offset = 0;
+ void *addr = NULL;
+ char buf[MAX_EVENT_NAME_LEN];
+
+ /* argc must be >= 1 */
+ if (argv[0][0] == 'p')
+ is_return = false;
+ else if (argv[0][0] == 'r')
+ is_return = true;
+ else if (argv[0][0] == '-')
+ is_delete = true;
+ else {
+ pr_info("Probe definition must be started with 'p', 'r' or"
+ " '-'.\n");
+ return -EINVAL;
+ }
+
+ if (argv[0][1] == ':') {
+ event = &argv[0][2];
+ if (strchr(event, '/')) {
+ group = event;
+ event = strchr(group, '/') + 1;
+ event[-1] = '\0';
+ if (strlen(group) == 0) {
+ pr_info("Group name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (strlen(event) == 0) {
+ pr_info("Event name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (!group)
+ group = KPROBE_EVENT_SYSTEM;
+
+ if (is_delete) {
+ if (!event) {
+ pr_info("Delete command needs an event name.\n");
+ return -EINVAL;
+ }
+ mutex_lock(&probe_lock);
+ tk = find_trace_kprobe(event, group);
+ if (!tk) {
+ mutex_unlock(&probe_lock);
+ pr_info("Event %s/%s doesn't exist.\n", group, event);
+ return -ENOENT;
+ }
+ /* delete an event */
+ ret = unregister_trace_kprobe(tk);
+ if (ret == 0)
+ free_trace_kprobe(tk);
+ mutex_unlock(&probe_lock);
+ return ret;
+ }
+
+ if (argc < 2) {
+ pr_info("Probe point is not specified.\n");
+ return -EINVAL;
+ }
+ if (isdigit(argv[1][0])) {
+ if (is_return) {
+ pr_info("Return probe point must be a symbol.\n");
+ return -EINVAL;
+ }
+ /* an address specified */
+ ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
+ if (ret) {
+ pr_info("Failed to parse address.\n");
+ return ret;
+ }
+ } else {
+ /* a symbol specified */
+ symbol = argv[1];
+ /* TODO: support .init module functions */
+ ret = traceprobe_split_symbol_offset(symbol, &offset);
+ if (ret) {
+ pr_info("Failed to parse symbol.\n");
+ return ret;
+ }
+ if (offset && is_return) {
+ pr_info("Return probe must be used without offset.\n");
+ return -EINVAL;
+ }
+ }
+ argc -= 2; argv += 2;
+
+ /* setup a probe */
+ if (!event) {
+ /* Make a new event name */
+ if (symbol)
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
+ is_return ? 'r' : 'p', symbol, offset);
+ else
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
+ is_return ? 'r' : 'p', addr);
+ event = buf;
+ }
+ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
+ is_return);
+ if (IS_ERR(tk)) {
+ pr_info("Failed to allocate trace_probe.(%d)\n",
+ (int)PTR_ERR(tk));
+ return PTR_ERR(tk);
+ }
+
+ /* parse arguments */
+ ret = 0;
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ struct probe_arg *parg = &tk->tp.args[i];
+
+ /* Increment count for freeing args in error case */
+ tk->tp.nr_args++;
+
+ /* Parse argument name */
+ arg = strchr(argv[i], '=');
+ if (arg) {
+ *arg++ = '\0';
+ parg->name = kstrdup(argv[i], GFP_KERNEL);
+ } else {
+ arg = argv[i];
+ /* If argument name is omitted, set "argN" */
+ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
+ parg->name = kstrdup(buf, GFP_KERNEL);
+ }
+
+ if (!parg->name) {
+ pr_info("Failed to allocate argument[%d] name.\n", i);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (!is_good_name(parg->name)) {
+ pr_info("Invalid argument[%d] name: %s\n",
+ i, parg->name);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (traceprobe_conflict_field_name(parg->name,
+ tk->tp.args, i)) {
+ pr_info("Argument[%d] name '%s' conflicts with "
+ "another field.\n", i, argv[i]);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Parse fetch argument */
+ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg,
+ is_return, true,
+ kprobes_fetch_type_table);
+ if (ret) {
+ pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ goto error;
+ }
+ }
+
+ ret = register_trace_kprobe(tk);
+ if (ret)
+ goto error;
+ return 0;
+
+error:
+ free_trace_kprobe(tk);
+ return ret;
+}
+
+static int release_all_trace_kprobes(void)
+{
+ struct trace_kprobe *tk;
+ int ret = 0;
+
+ mutex_lock(&probe_lock);
+ /* Ensure no probe is in use. */
+ list_for_each_entry(tk, &probe_list, list)
+ if (trace_probe_is_enabled(&tk->tp)) {
+ ret = -EBUSY;
+ goto end;
+ }
+ /* TODO: Use batch unregistration */
+ while (!list_empty(&probe_list)) {
+ tk = list_entry(probe_list.next, struct trace_kprobe, list);
+ ret = unregister_trace_kprobe(tk);
+ if (ret)
+ goto end;
+ free_trace_kprobe(tk);
+ }
+
+end:
+ mutex_unlock(&probe_lock);
+
+ return ret;
+}
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&probe_lock);
+ return seq_list_start(&probe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &probe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&probe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_kprobe *tk = v;
+ int i;
+
+ seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
+ seq_printf(m, ":%s/%s", tk->tp.call.class->system,
+ ftrace_event_name(&tk->tp.call));
+
+ if (!tk->symbol)
+ seq_printf(m, " 0x%p", tk->rp.kp.addr);
+ else if (tk->rp.kp.offset)
+ seq_printf(m, " %s+%u", trace_kprobe_symbol(tk),
+ tk->rp.kp.offset);
+ else
+ seq_printf(m, " %s", trace_kprobe_symbol(tk));
+
+ for (i = 0; i < tk->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_seq_show
+};
+
+static int probes_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = release_all_trace_kprobes();
+ if (ret < 0)
+ return ret;
+ }
+
+ return seq_open(file, &probes_seq_op);
+}
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return traceprobe_probes_write(file, buffer, count, ppos,
+ create_trace_kprobe);
+}
+
+static const struct file_operations kprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+};
+
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_kprobe *tk = v;
+
+ seq_printf(m, " %-44s %15lu %15lu\n",
+ ftrace_event_name(&tk->tp.call), tk->nhit,
+ tk->rp.kp.nmissed);
+
+ return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations kprobe_profile_ops = {
+ .owner = THIS_MODULE,
+ .open = profile_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* Kprobe handler */
+static nokprobe_inline void
+__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
+ struct ftrace_event_file *ftrace_file)
+{
+ struct kprobe_trace_entry_head *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int size, dsize, pc;
+ unsigned long irq_flags;
+ struct ftrace_event_call *call = &tk->tp.call;
+
+ WARN_ON(call != ftrace_file->event_call);
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ dsize = __get_data_size(&tk->tp, regs);
+ size = sizeof(*entry) + tk->tp.size + dsize;
+
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type,
+ size, irq_flags, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = (unsigned long)tk->rp.kp.addr;
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+
+ event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ entry, irq_flags, pc, regs);
+}
+
+static void
+kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
+{
+ struct event_file_link *link;
+
+ list_for_each_entry_rcu(link, &tk->tp.files, list)
+ __kprobe_trace_func(tk, regs, link->file);
+}
+NOKPROBE_SYMBOL(kprobe_trace_func);
+
+/* Kretprobe handler */
+static nokprobe_inline void
+__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
+ struct pt_regs *regs,
+ struct ftrace_event_file *ftrace_file)
+{
+ struct kretprobe_trace_entry_head *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ int size, pc, dsize;
+ unsigned long irq_flags;
+ struct ftrace_event_call *call = &tk->tp.call;
+
+ WARN_ON(call != ftrace_file->event_call);
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ dsize = __get_data_size(&tk->tp, regs);
+ size = sizeof(*entry) + tk->tp.size + dsize;
+
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type,
+ size, irq_flags, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->func = (unsigned long)tk->rp.kp.addr;
+ entry->ret_ip = (unsigned long)ri->ret_addr;
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+
+ event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+ entry, irq_flags, pc, regs);
+}
+
+static void
+kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct event_file_link *link;
+
+ list_for_each_entry_rcu(link, &tk->tp.files, list)
+ __kretprobe_trace_func(tk, ri, regs, link->file);
+}
+NOKPROBE_SYMBOL(kretprobe_trace_func);
+
+/* Event entry printers */
+static enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct kprobe_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+ u8 *data;
+ int i;
+
+ field = (struct kprobe_trace_entry_head *)iter->ent;
+ tp = container_of(event, struct trace_probe, call.event);
+
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+
+ if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_putc(s, ')');
+
+ data = (u8 *)&field[1];
+ for (i = 0; i < tp->nr_args; i++)
+ if (!tp->args[i].type->print(s, tp->args[i].name,
+ data + tp->args[i].offset, field))
+ goto out;
+
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct kretprobe_trace_entry_head *field;
+ struct trace_seq *s = &iter->seq;
+ struct trace_probe *tp;
+ u8 *data;
+ int i;
+
+ field = (struct kretprobe_trace_entry_head *)iter->ent;
+ tp = container_of(event, struct trace_probe, call.event);
+
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+
+ if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_puts(s, " <- ");
+
+ if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+ goto out;
+
+ trace_seq_putc(s, ')');
+
+ data = (u8 *)&field[1];
+ for (i = 0; i < tp->nr_args; i++)
+ if (!tp->args[i].type->print(s, tp->args[i].name,
+ data + tp->args[i].offset, field))
+ goto out;
+
+ trace_seq_putc(s, '\n');
+
+ out:
+ return trace_handle_return(s);
+}
+
+
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kprobe_trace_entry_head field;
+ struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
+
+ DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+ /* Set argument names as fields */
+ for (i = 0; i < tk->tp.nr_args; i++) {
+ struct probe_arg *parg = &tk->tp.args[i];
+
+ ret = trace_define_field(event_call, parg->type->fmttype,
+ parg->name,
+ sizeof(field) + parg->offset,
+ parg->type->size,
+ parg->type->is_signed,
+ FILTER_OTHER);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i;
+ struct kretprobe_trace_entry_head field;
+ struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data;
+
+ DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+ /* Set argument names as fields */
+ for (i = 0; i < tk->tp.nr_args; i++) {
+ struct probe_arg *parg = &tk->tp.args[i];
+
+ ret = trace_define_field(event_call, parg->type->fmttype,
+ parg->name,
+ sizeof(field) + parg->offset,
+ parg->type->size,
+ parg->type->is_signed,
+ FILTER_OTHER);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_PERF_EVENTS
+
+/* Kprobe profile handler */
+static void
+kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
+{
+ struct ftrace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
+ struct kprobe_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ dsize = __get_data_size(&tk->tp, regs);
+ __size = sizeof(*entry) + tk->tp.size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+ if (!entry)
+ return;
+
+ entry->ip = (unsigned long)tk->rp.kp.addr;
+ memset(&entry[1], 0, dsize);
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+}
+NOKPROBE_SYMBOL(kprobe_perf_func);
+
+/* Kretprobe profile handler */
+static void
+kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct ftrace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
+ struct kretprobe_trace_entry_head *entry;
+ struct hlist_head *head;
+ int size, __size, dsize;
+ int rctx;
+
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ dsize = __get_data_size(&tk->tp, regs);
+ __size = sizeof(*entry) + tk->tp.size + dsize;
+ size = ALIGN(__size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+ if (!entry)
+ return;
+
+ entry->func = (unsigned long)tk->rp.kp.addr;
+ entry->ret_ip = (unsigned long)ri->ret_addr;
+ store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
+ perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+}
+NOKPROBE_SYMBOL(kretprobe_perf_func);
+#endif /* CONFIG_PERF_EVENTS */
+
+/*
+ * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
+ *
+ * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
+ * lockless, but we can't race with this __init function.
+ */
+static int kprobe_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
+ struct ftrace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return enable_trace_kprobe(tk, file);
+ case TRACE_REG_UNREGISTER:
+ return disable_trace_kprobe(tk, file);
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return enable_trace_kprobe(tk, NULL);
+ case TRACE_REG_PERF_UNREGISTER:
+ return disable_trace_kprobe(tk, NULL);
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+
+ tk->nhit++;
+
+ if (tk->tp.flags & TP_FLAG_TRACE)
+ kprobe_trace_func(tk, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (tk->tp.flags & TP_FLAG_PROFILE)
+ kprobe_perf_func(tk, regs);
+#endif
+ return 0; /* We don't tweek kernel, so just return 0 */
+}
+NOKPROBE_SYMBOL(kprobe_dispatcher);
+
+static int
+kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
+
+ tk->nhit++;
+
+ if (tk->tp.flags & TP_FLAG_TRACE)
+ kretprobe_trace_func(tk, ri, regs);
+#ifdef CONFIG_PERF_EVENTS
+ if (tk->tp.flags & TP_FLAG_PROFILE)
+ kretprobe_perf_func(tk, ri, regs);
+#endif
+ return 0; /* We don't tweek kernel, so just return 0 */
+}
+NOKPROBE_SYMBOL(kretprobe_dispatcher);
+
+static struct trace_event_functions kretprobe_funcs = {
+ .trace = print_kretprobe_event
+};
+
+static struct trace_event_functions kprobe_funcs = {
+ .trace = print_kprobe_event
+};
+
+static int register_kprobe_event(struct trace_kprobe *tk)
+{
+ struct ftrace_event_call *call = &tk->tp.call;
+ int ret;
+
+ /* Initialize ftrace_event_call */
+ INIT_LIST_HEAD(&call->class->fields);
+ if (trace_kprobe_is_return(tk)) {
+ call->event.funcs = &kretprobe_funcs;
+ call->class->define_fields = kretprobe_event_define_fields;
+ } else {
+ call->event.funcs = &kprobe_funcs;
+ call->class->define_fields = kprobe_event_define_fields;
+ }
+ if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
+ return -ENOMEM;
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
+ kfree(call->print_fmt);
+ return -ENODEV;
+ }
+ call->flags = TRACE_EVENT_FL_KPROBE;
+ call->class->reg = kprobe_register;
+ call->data = tk;
+ ret = trace_add_event_call(call);
+ if (ret) {
+ pr_info("Failed to register kprobe event: %s\n",
+ ftrace_event_name(call));
+ kfree(call->print_fmt);
+ unregister_ftrace_event(&call->event);
+ }
+ return ret;
+}
+
+static int unregister_kprobe_event(struct trace_kprobe *tk)
+{
+ int ret;
+
+ /* tp->event is unregistered in trace_remove_event_call() */
+ ret = trace_remove_event_call(&tk->tp.call);
+ if (!ret)
+ kfree(tk->tp.call.print_fmt);
+ return ret;
+}
+
+/* Make a tracefs interface for controlling probe points */
+static __init int init_kprobe_trace(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ if (register_module_notifier(&trace_kprobe_module_nb))
+ return -EINVAL;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
+ NULL, &kprobe_events_ops);
+
+ /* Event list interface */
+ if (!entry)
+ pr_warning("Could not create tracefs "
+ "'kprobe_events' entry\n");
+
+ /* Profile interface */
+ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
+ NULL, &kprobe_profile_ops);
+
+ if (!entry)
+ pr_warning("Could not create tracefs "
+ "'kprobe_profile' entry\n");
+ return 0;
+}
+fs_initcall(init_kprobe_trace);
+
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+/*
+ * The "__used" keeps gcc from removing the function symbol
+ * from the kallsyms table.
+ */
+static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
+ int a4, int a5, int a6)
+{
+ return a1 + a2 + a3 + a4 + a5 + a6;
+}
+
+static struct ftrace_event_file *
+find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ list_for_each_entry(file, &tr->events, list)
+ if (file->event_call == &tk->tp.call)
+ return file;
+
+ return NULL;
+}
+
+/*
+ * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this
+ * stage, we can do this lockless.
+ */
+static __init int kprobe_trace_self_tests_init(void)
+{
+ int ret, warn = 0;
+ int (*target)(int, int, int, int, int, int);
+ struct trace_kprobe *tk;
+ struct ftrace_event_file *file;
+
+ if (tracing_is_disabled())
+ return -ENODEV;
+
+ target = kprobe_trace_selftest_target;
+
+ pr_info("Testing kprobe tracing: ");
+
+ ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
+ "$stack $stack0 +0($stack)",
+ create_trace_kprobe);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error on probing function entry.\n");
+ warn++;
+ } else {
+ /* Enable trace point */
+ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting new probe.\n");
+ warn++;
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ enable_trace_kprobe(tk, file);
+ }
+ }
+
+ ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval", create_trace_kprobe);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error on probing function return.\n");
+ warn++;
+ } else {
+ /* Enable trace point */
+ tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting 2nd new probe.\n");
+ warn++;
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ enable_trace_kprobe(tk, file);
+ }
+ }
+
+ if (warn)
+ goto end;
+
+ ret = target(1, 2, 3, 4, 5, 6);
+
+ /* Disable trace points before removing it */
+ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting test probe.\n");
+ warn++;
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ disable_trace_kprobe(tk, file);
+ }
+
+ tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tk == NULL)) {
+ pr_warn("error on getting 2nd test probe.\n");
+ warn++;
+ } else {
+ file = find_trace_probe_file(tk, top_trace_array());
+ if (WARN_ON_ONCE(file == NULL)) {
+ pr_warn("error on getting probe file.\n");
+ warn++;
+ } else
+ disable_trace_kprobe(tk, file);
+ }
+
+ ret = traceprobe_command("-:testprobe", create_trace_kprobe);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error on deleting a probe.\n");
+ warn++;
+ }
+
+ ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
+ if (WARN_ON_ONCE(ret)) {
+ pr_warn("error on deleting a probe.\n");
+ warn++;
+ }
+
+end:
+ release_all_trace_kprobes();
+ if (warn)
+ pr_cont("NG: Some tests are failed. Please check them.\n");
+ else
+ pr_cont("OK\n");
+ return 0;
+}
+
+late_initcall(kprobe_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000..7a9ba62e9
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,364 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include <linux/atomic.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+struct header_iter {
+ struct pci_dev *dev;
+};
+
+static struct trace_array *mmio_trace_array;
+static bool overrun_detected;
+static unsigned long prev_overruns;
+static atomic_t dropped_count;
+
+static void mmio_reset_data(struct trace_array *tr)
+{
+ overrun_detected = false;
+ prev_overruns = 0;
+
+ tracing_reset_online_cpus(&tr->trace_buffer);
+}
+
+static int mmio_trace_init(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ mmio_trace_array = tr;
+
+ mmio_reset_data(tr);
+ enable_mmiotrace();
+ return 0;
+}
+
+static void mmio_trace_reset(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+
+ disable_mmiotrace();
+ mmio_reset_data(tr);
+ mmio_trace_array = NULL;
+}
+
+static void mmio_trace_start(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ mmio_reset_data(tr);
+}
+
+static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+ int i;
+ resource_size_t start, end;
+ const struct pci_driver *drv = pci_dev_driver(dev);
+
+ trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
+ /*
+ * XXX: is pci_resource_to_user() appropriate, since we are
+ * supposed to interpret the __ioremap() phys_addr argument based on
+ * these printed values?
+ */
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ trace_seq_printf(s, " %llx",
+ (unsigned long long)(start |
+ (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+ }
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ trace_seq_printf(s, " %llx",
+ dev->resource[i].start < dev->resource[i].end ?
+ (unsigned long long)(end - start) + 1 : 0);
+ }
+ if (drv)
+ trace_seq_printf(s, " %s\n", drv->name);
+ else
+ trace_seq_puts(s, " \n");
+}
+
+static void destroy_header_iter(struct header_iter *hiter)
+{
+ if (!hiter)
+ return;
+ pci_dev_put(hiter->dev);
+ kfree(hiter);
+}
+
+static void mmio_pipe_open(struct trace_iterator *iter)
+{
+ struct header_iter *hiter;
+ struct trace_seq *s = &iter->seq;
+
+ trace_seq_puts(s, "VERSION 20070824\n");
+
+ hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+ if (!hiter)
+ return;
+
+ hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+ iter->private = hiter;
+}
+
+/* XXX: This is not called when the pipe is closed! */
+static void mmio_close(struct trace_iterator *iter)
+{
+ struct header_iter *hiter = iter->private;
+ destroy_header_iter(hiter);
+ iter->private = NULL;
+}
+
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+ unsigned long cnt = atomic_xchg(&dropped_count, 0);
+ unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
+
+ if (over > prev_overruns)
+ cnt += over - prev_overruns;
+ prev_overruns = over;
+ return cnt;
+}
+
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+ char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ ssize_t ret;
+ struct header_iter *hiter = iter->private;
+ struct trace_seq *s = &iter->seq;
+ unsigned long n;
+
+ n = count_overruns(iter);
+ if (n) {
+ /* XXX: This is later than where events were lost. */
+ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+ if (!overrun_detected)
+ pr_warning("mmiotrace has lost events.\n");
+ overrun_detected = true;
+ goto print_out;
+ }
+
+ if (!hiter)
+ return 0;
+
+ mmio_print_pcidev(s, hiter->dev);
+ hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
+
+ if (!hiter->dev) {
+ destroy_header_iter(hiter);
+ iter->private = NULL;
+ }
+
+print_out:
+ ret = trace_seq_to_user(s, ubuf, cnt);
+ return (ret == -EBUSY) ? 0 : ret;
+}
+
+static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_mmiotrace_rw *field;
+ struct mmiotrace_rw *rw;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(iter->ts);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
+ unsigned secs = (unsigned long)t;
+
+ trace_assign_type(field, entry);
+ rw = &field->rw;
+
+ switch (rw->opcode) {
+ case MMIO_READ:
+ trace_seq_printf(s,
+ "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ rw->value, rw->pc, 0);
+ break;
+ case MMIO_WRITE:
+ trace_seq_printf(s,
+ "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ rw->value, rw->pc, 0);
+ break;
+ case MMIO_UNKNOWN_OP:
+ trace_seq_printf(s,
+ "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
+ "%02lx 0x%lx %d\n",
+ secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+ (rw->value >> 0) & 0xff, rw->pc, 0);
+ break;
+ default:
+ trace_seq_puts(s, "rw what?\n");
+ break;
+ }
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t mmio_print_map(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_mmiotrace_map *field;
+ struct mmiotrace_map *m;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(iter->ts);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
+ unsigned secs = (unsigned long)t;
+
+ trace_assign_type(field, entry);
+ m = &field->map;
+
+ switch (m->opcode) {
+ case MMIO_PROBE:
+ trace_seq_printf(s,
+ "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+ secs, usec_rem, m->map_id,
+ (unsigned long long)m->phys, m->virt, m->len,
+ 0UL, 0);
+ break;
+ case MMIO_UNPROBE:
+ trace_seq_printf(s,
+ "UNMAP %u.%06lu %d 0x%lx %d\n",
+ secs, usec_rem, m->map_id, 0UL, 0);
+ break;
+ default:
+ trace_seq_puts(s, "map what?\n");
+ break;
+ }
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct print_entry *print = (struct print_entry *)entry;
+ const char *msg = print->buf;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(iter->ts);
+ unsigned long usec_rem = do_div(t, USEC_PER_SEC);
+ unsigned secs = (unsigned long)t;
+
+ /* The trailing newline must be in the message. */
+ trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t mmio_print_line(struct trace_iterator *iter)
+{
+ switch (iter->ent->type) {
+ case TRACE_MMIO_RW:
+ return mmio_print_rw(iter);
+ case TRACE_MMIO_MAP:
+ return mmio_print_map(iter);
+ case TRACE_PRINT:
+ return mmio_print_mark(iter);
+ default:
+ return TRACE_TYPE_HANDLED; /* ignore unknown entries */
+ }
+}
+
+static struct tracer mmio_tracer __read_mostly =
+{
+ .name = "mmiotrace",
+ .init = mmio_trace_init,
+ .reset = mmio_trace_reset,
+ .start = mmio_trace_start,
+ .pipe_open = mmio_pipe_open,
+ .close = mmio_close,
+ .read = mmio_read,
+ .print_line = mmio_print_line,
+};
+
+__init static int init_mmio_trace(void)
+{
+ return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+static void __trace_mmiotrace_rw(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_rw *rw)
+{
+ struct ftrace_event_call *call = &event_mmiotrace_rw;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct trace_mmiotrace_rw *entry;
+ int pc = preempt_count();
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
+ sizeof(*entry), 0, pc);
+ if (!event) {
+ atomic_inc(&dropped_count);
+ return;
+ }
+ entry = ring_buffer_event_data(event);
+ entry->rw = *rw;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+
+void mmio_trace_rw(struct mmiotrace_rw *rw)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
+ __trace_mmiotrace_rw(tr, data, rw);
+}
+
+static void __trace_mmiotrace_map(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_map *map)
+{
+ struct ftrace_event_call *call = &event_mmiotrace_map;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct trace_mmiotrace_map *entry;
+ int pc = preempt_count();
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
+ sizeof(*entry), 0, pc);
+ if (!event) {
+ atomic_inc(&dropped_count);
+ return;
+ }
+ entry = ring_buffer_event_data(event);
+ entry->map = *map;
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data;
+
+ preempt_disable();
+ data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
+ __trace_mmiotrace_map(tr, data, map);
+ preempt_enable();
+}
+
+int mmio_trace_printk(const char *fmt, va_list args)
+{
+ return trace_vprintk(0, fmt, args);
+}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
new file mode 100644
index 000000000..8bb207147
--- /dev/null
+++ b/kernel/trace/trace_nop.c
@@ -0,0 +1,99 @@
+/*
+ * nop tracer
+ *
+ * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+/* Our two options */
+enum {
+ TRACE_NOP_OPT_ACCEPT = 0x1,
+ TRACE_NOP_OPT_REFUSE = 0x2
+};
+
+/* Options for the tracer (see trace_options file) */
+static struct tracer_opt nop_opts[] = {
+ /* Option that will be accepted by set_flag callback */
+ { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
+ /* Option that will be refused by set_flag callback */
+ { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
+ { } /* Always set a last empty entry */
+};
+
+static struct tracer_flags nop_flags = {
+ /* You can check your flags value here when you want. */
+ .val = 0, /* By default: all flags disabled */
+ .opts = nop_opts
+};
+
+static struct trace_array *ctx_trace;
+
+static void start_nop_trace(struct trace_array *tr)
+{
+ /* Nothing to do! */
+}
+
+static void stop_nop_trace(struct trace_array *tr)
+{
+ /* Nothing to do! */
+}
+
+static int nop_trace_init(struct trace_array *tr)
+{
+ ctx_trace = tr;
+ start_nop_trace(tr);
+ return 0;
+}
+
+static void nop_trace_reset(struct trace_array *tr)
+{
+ stop_nop_trace(tr);
+}
+
+/* It only serves as a signal handler and a callback to
+ * accept or refuse tthe setting of a flag.
+ * If you don't implement it, then the flag setting will be
+ * automatically accepted.
+ */
+static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ /*
+ * Note that you don't need to update nop_flags.val yourself.
+ * The tracing Api will do it automatically if you return 0
+ */
+ if (bit == TRACE_NOP_OPT_ACCEPT) {
+ printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
+ " Now cat trace_options to see the result\n",
+ set);
+ return 0;
+ }
+
+ if (bit == TRACE_NOP_OPT_REFUSE) {
+ printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
+ "Now cat trace_options to see the result\n",
+ set);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+struct tracer nop_trace __read_mostly =
+{
+ .name = "nop",
+ .init = nop_trace_init,
+ .reset = nop_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_nop,
+#endif
+ .flags = &nop_flags,
+ .set_flag = nop_set_flag,
+ .allow_instances = true,
+};
+
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 000000000..25a086bcb
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,1256 @@
+/*
+ * trace_output.c
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_output.h"
+
+/* must be a power of 2 */
+#define EVENT_HASHSIZE 128
+
+DECLARE_RWSEM(trace_event_sem);
+
+static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+
+static int next_event_type = __TRACE_LAST_TYPE + 1;
+
+enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct bputs_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_puts(s, field->str);
+
+ return trace_handle_return(s);
+}
+
+enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct bprint_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_bprintf(s, field->fmt, field->buf);
+
+ return trace_handle_return(s);
+}
+
+enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct print_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_puts(s, field->buf);
+
+ return trace_handle_return(s);
+}
+
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+ unsigned long flags,
+ const struct trace_print_flags *flag_array)
+{
+ unsigned long mask;
+ const char *str;
+ const char *ret = trace_seq_buffer_ptr(p);
+ int i, first = 1;
+
+ for (i = 0; flag_array[i].name && flags; i++) {
+
+ mask = flag_array[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ str = flag_array[i].name;
+ flags &= ~mask;
+ if (!first && delim)
+ trace_seq_puts(p, delim);
+ else
+ first = 0;
+ trace_seq_puts(p, str);
+ }
+
+ /* check for left over flags */
+ if (flags) {
+ if (!first && delim)
+ trace_seq_puts(p, delim);
+ trace_seq_printf(p, "0x%lx", flags);
+ }
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_flags_seq);
+
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+ const struct trace_print_flags *symbol_array)
+{
+ int i;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ for (i = 0; symbol_array[i].name; i++) {
+
+ if (val != symbol_array[i].mask)
+ continue;
+
+ trace_seq_puts(p, symbol_array[i].name);
+ break;
+ }
+
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
+ trace_seq_printf(p, "0x%lx", val);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
+
+#if BITS_PER_LONG == 32
+const char *
+ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+ const struct trace_print_flags_u64 *symbol_array)
+{
+ int i;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ for (i = 0; symbol_array[i].name; i++) {
+
+ if (val != symbol_array[i].mask)
+ continue;
+
+ trace_seq_puts(p, symbol_array[i].name);
+ break;
+ }
+
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
+ trace_seq_printf(p, "0x%llx", val);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+#endif
+
+const char *
+ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+ unsigned int bitmask_size)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
+
+const char *
+ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+{
+ int i;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ for (i = 0; i < buf_len; i++)
+ trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_hex_seq);
+
+const char *
+ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
+ size_t el_size)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ const char *prefix = "";
+ void *ptr = (void *)buf;
+ size_t buf_len = count * el_size;
+
+ trace_seq_putc(p, '{');
+
+ while (ptr < buf + buf_len) {
+ switch (el_size) {
+ case 1:
+ trace_seq_printf(p, "%s0x%x", prefix,
+ *(u8 *)ptr);
+ break;
+ case 2:
+ trace_seq_printf(p, "%s0x%x", prefix,
+ *(u16 *)ptr);
+ break;
+ case 4:
+ trace_seq_printf(p, "%s0x%x", prefix,
+ *(u32 *)ptr);
+ break;
+ case 8:
+ trace_seq_printf(p, "%s0x%llx", prefix,
+ *(u64 *)ptr);
+ break;
+ default:
+ trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
+ *(u8 *)ptr);
+ el_size = 1;
+ }
+ prefix = ",";
+ ptr += el_size;
+ }
+
+ trace_seq_putc(p, '}');
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_array_seq);
+
+int ftrace_raw_output_prep(struct trace_iterator *iter,
+ struct trace_event *trace_event)
+{
+ struct ftrace_event_call *event;
+ struct trace_seq *s = &iter->seq;
+ struct trace_seq *p = &iter->tmp_seq;
+ struct trace_entry *entry;
+
+ event = container_of(trace_event, struct ftrace_event_call, event);
+ entry = iter->ent;
+
+ if (entry->type != event->event.type) {
+ WARN_ON_ONCE(1);
+ return TRACE_TYPE_UNHANDLED;
+ }
+
+ trace_seq_init(p);
+ trace_seq_printf(s, "%s: ", ftrace_event_name(event));
+
+ return trace_handle_return(s);
+}
+EXPORT_SYMBOL(ftrace_raw_output_prep);
+
+static int ftrace_output_raw(struct trace_iterator *iter, char *name,
+ char *fmt, va_list ap)
+{
+ struct trace_seq *s = &iter->seq;
+
+ trace_seq_printf(s, "%s: ", name);
+ trace_seq_vprintf(s, fmt, ap);
+
+ return trace_handle_return(s);
+}
+
+int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = ftrace_output_raw(iter, name, fmt, ap);
+ va_end(ap);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ftrace_output_call);
+
+#ifdef CONFIG_KRETPROBES
+static inline const char *kretprobed(const char *name)
+{
+ static const char tramp_name[] = "kretprobe_trampoline";
+ int size = sizeof(tramp_name);
+
+ if (strncmp(tramp_name, name, size) == 0)
+ return "[unknown/kretprobe'd]";
+ return name;
+}
+#else
+static inline const char *kretprobed(const char *name)
+{
+ return name;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static void
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+ name = kretprobed(str);
+
+ trace_seq_printf(s, fmt, name);
+#endif
+}
+
+static void
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+ unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+ char str[KSYM_SYMBOL_LEN];
+ const char *name;
+
+ sprint_symbol(str, address);
+ name = kretprobed(str);
+
+ trace_seq_printf(s, fmt, name);
+#endif
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags)
+{
+ struct file *file = NULL;
+ unsigned long vmstart = 0;
+ int ret = 1;
+
+ if (s->full)
+ return 0;
+
+ if (mm) {
+ const struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, ip);
+ if (vma) {
+ file = vma->vm_file;
+ vmstart = vma->vm_start;
+ }
+ if (file) {
+ ret = trace_seq_path(s, &file->f_path);
+ if (ret)
+ trace_seq_printf(s, "[+0x%lx]",
+ ip - vmstart);
+ }
+ up_read(&mm->mmap_sem);
+ }
+ if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return !trace_seq_has_overflowed(s);
+}
+
+int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+ unsigned long sym_flags)
+{
+ struct mm_struct *mm = NULL;
+ unsigned int i;
+
+ if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ struct task_struct *task;
+ /*
+ * we do the lookup on the thread group leader,
+ * since individual threads might have already quit!
+ */
+ rcu_read_lock();
+ task = find_task_by_vpid(entry->tgid);
+ if (task)
+ mm = get_task_mm(task);
+ rcu_read_unlock();
+ }
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ unsigned long ip = entry->caller[i];
+
+ if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
+ break;
+
+ trace_seq_puts(s, " => ");
+
+ if (!ip) {
+ trace_seq_puts(s, "??");
+ trace_seq_putc(s, '\n');
+ continue;
+ }
+
+ seq_print_user_ip(s, mm, ip, sym_flags);
+ trace_seq_putc(s, '\n');
+ }
+
+ if (mm)
+ mmput(mm);
+
+ return !trace_seq_has_overflowed(s);
+}
+
+int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+ if (!ip) {
+ trace_seq_putc(s, '0');
+ goto out;
+ }
+
+ if (sym_flags & TRACE_ITER_SYM_OFFSET)
+ seq_print_sym_offset(s, "%s", ip);
+ else
+ seq_print_sym_short(s, "%s", ip);
+
+ if (sym_flags & TRACE_ITER_SYM_ADDR)
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+
+ out:
+ return !trace_seq_has_overflowed(s);
+}
+
+/**
+ * trace_print_lat_fmt - print the irq, preempt and lockdep fields
+ * @s: trace seq struct to write to
+ * @entry: The trace entry field from the ring buffer
+ *
+ * Prints the generic fields of irqs off, in hard or softirq, preempt
+ * count.
+ */
+int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+{
+ char hardsoft_irq;
+ char need_resched;
+ char irqs_off;
+ int hardirq;
+ int softirq;
+
+ hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+ softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+
+ irqs_off =
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
+ '.';
+
+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
+ TRACE_FLAG_PREEMPT_RESCHED)) {
+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'N';
+ break;
+ case TRACE_FLAG_NEED_RESCHED:
+ need_resched = 'n';
+ break;
+ case TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'p';
+ break;
+ default:
+ need_resched = '.';
+ break;
+ }
+
+ hardsoft_irq =
+ (hardirq && softirq) ? 'H' :
+ hardirq ? 'h' :
+ softirq ? 's' :
+ '.';
+
+ trace_seq_printf(s, "%c%c%c",
+ irqs_off, need_resched, hardsoft_irq);
+
+ if (entry->preempt_count)
+ trace_seq_printf(s, "%x", entry->preempt_count);
+ else
+ trace_seq_putc(s, '.');
+
+ return !trace_seq_has_overflowed(s);
+}
+
+static int
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+ char comm[TASK_COMM_LEN];
+
+ trace_find_cmdline(entry->pid, comm);
+
+ trace_seq_printf(s, "%8.8s-%-5d %3d",
+ comm, entry->pid, cpu);
+
+ return trace_print_lat_fmt(s, entry);
+}
+
+#undef MARK
+#define MARK(v, s) {.val = v, .sym = s}
+/* trace overhead mark */
+static const struct trace_mark {
+ unsigned long long val; /* unit: nsec */
+ char sym;
+} mark[] = {
+ MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(1000000ULL , '#'), /* 1000 usecs */
+ MARK(100000ULL , '!'), /* 100 usecs */
+ MARK(10000ULL , '+'), /* 10 usecs */
+};
+#undef MARK
+
+char trace_find_mark(unsigned long long d)
+{
+ int i;
+ int size = ARRAY_SIZE(mark);
+
+ for (i = 0; i < size; i++) {
+ if (d >= mark[i].val)
+ break;
+ }
+
+ return (i == size) ? ' ' : mark[i].sym;
+}
+
+static int
+lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
+{
+ unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
+ unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
+ unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
+ unsigned long long rel_ts = next_ts - iter->ts;
+ struct trace_seq *s = &iter->seq;
+
+ if (in_ns) {
+ abs_ts = ns2usecs(abs_ts);
+ rel_ts = ns2usecs(rel_ts);
+ }
+
+ if (verbose && in_ns) {
+ unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
+ unsigned long abs_msec = (unsigned long)abs_ts;
+ unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
+ unsigned long rel_msec = (unsigned long)rel_ts;
+
+ trace_seq_printf(
+ s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
+ ns2usecs(iter->ts),
+ abs_msec, abs_usec,
+ rel_msec, rel_usec);
+
+ } else if (verbose && !in_ns) {
+ trace_seq_printf(
+ s, "[%016llx] %lld (+%lld): ",
+ iter->ts, abs_ts, rel_ts);
+
+ } else if (!verbose && in_ns) {
+ trace_seq_printf(
+ s, " %4lldus%c: ",
+ abs_ts,
+ trace_find_mark(rel_ts * NSEC_PER_USEC));
+
+ } else { /* !verbose && !in_ns */
+ trace_seq_printf(s, " %4lld: ", abs_ts);
+ }
+
+ return !trace_seq_has_overflowed(s);
+}
+
+int trace_print_context(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ unsigned long long t;
+ unsigned long secs, usec_rem;
+ char comm[TASK_COMM_LEN];
+
+ trace_find_cmdline(entry->pid, comm);
+
+ trace_seq_printf(s, "%16s-%-5d [%03d] ",
+ comm, entry->pid, iter->cpu);
+
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ trace_print_lat_fmt(s, entry);
+
+ if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
+ t = ns2usecs(iter->ts);
+ usec_rem = do_div(t, USEC_PER_SEC);
+ secs = (unsigned long)t;
+ trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
+ } else
+ trace_seq_printf(s, " %12llu: ", iter->ts);
+
+ return !trace_seq_has_overflowed(s);
+}
+
+int trace_print_lat_context(struct trace_iterator *iter)
+{
+ u64 next_ts;
+ /* trace_find_next_entry will reset ent_size */
+ int ent_size = iter->ent_size;
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent,
+ *next_entry = trace_find_next_entry(iter, NULL,
+ &next_ts);
+ unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+
+ /* Restore the original ent_size */
+ iter->ent_size = ent_size;
+
+ if (!next_entry)
+ next_ts = iter->ts;
+
+ if (verbose) {
+ char comm[TASK_COMM_LEN];
+
+ trace_find_cmdline(entry->pid, comm);
+
+ trace_seq_printf(
+ s, "%16s %5d %3d %d %08x %08lx ",
+ comm, entry->pid, iter->cpu, entry->flags,
+ entry->preempt_count, iter->idx);
+ } else {
+ lat_print_generic(s, entry, iter->cpu);
+ }
+
+ lat_print_timestamp(iter, next_ts);
+
+ return !trace_seq_has_overflowed(s);
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static int task_state_char(unsigned long state)
+{
+ int bit = state ? __ffs(state) + 1 : 0;
+
+ return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
+/**
+ * ftrace_find_event - find a registered event
+ * @type: the type of event to look for
+ *
+ * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
+ */
+struct trace_event *ftrace_find_event(int type)
+{
+ struct trace_event *event;
+ unsigned key;
+
+ key = type & (EVENT_HASHSIZE - 1);
+
+ hlist_for_each_entry(event, &event_hash[key], node) {
+ if (event->type == type)
+ return event;
+ }
+
+ return NULL;
+}
+
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+ struct trace_event *e;
+ int last = __TRACE_LAST_TYPE;
+
+ if (list_empty(&ftrace_event_list)) {
+ *list = &ftrace_event_list;
+ return last + 1;
+ }
+
+ /*
+ * We used up all possible max events,
+ * lets see if somebody freed one.
+ */
+ list_for_each_entry(e, &ftrace_event_list, list) {
+ if (e->type != last + 1)
+ break;
+ last++;
+ }
+
+ /* Did we used up all 65 thousand events??? */
+ if ((last + 1) > FTRACE_MAX_EVENT)
+ return 0;
+
+ *list = &e->list;
+ return last + 1;
+}
+
+void trace_event_read_lock(void)
+{
+ down_read(&trace_event_sem);
+}
+
+void trace_event_read_unlock(void)
+{
+ up_read(&trace_event_sem);
+}
+
+/**
+ * register_ftrace_event - register output for an event type
+ * @event: the event type to register
+ *
+ * Event types are stored in a hash and this hash is used to
+ * find a way to print an event. If the @event->type is set
+ * then it will use that type, otherwise it will assign a
+ * type to use.
+ *
+ * If you assign your own type, please make sure it is added
+ * to the trace_type enum in trace.h, to avoid collisions
+ * with the dynamic types.
+ *
+ * Returns the event type number or zero on error.
+ */
+int register_ftrace_event(struct trace_event *event)
+{
+ unsigned key;
+ int ret = 0;
+
+ down_write(&trace_event_sem);
+
+ if (WARN_ON(!event))
+ goto out;
+
+ if (WARN_ON(!event->funcs))
+ goto out;
+
+ INIT_LIST_HEAD(&event->list);
+
+ if (!event->type) {
+ struct list_head *list = NULL;
+
+ if (next_event_type > FTRACE_MAX_EVENT) {
+
+ event->type = trace_search_list(&list);
+ if (!event->type)
+ goto out;
+
+ } else {
+
+ event->type = next_event_type++;
+ list = &ftrace_event_list;
+ }
+
+ if (WARN_ON(ftrace_find_event(event->type)))
+ goto out;
+
+ list_add_tail(&event->list, list);
+
+ } else if (event->type > __TRACE_LAST_TYPE) {
+ printk(KERN_WARNING "Need to add type to trace.h\n");
+ WARN_ON(1);
+ goto out;
+ } else {
+ /* Is this event already used */
+ if (ftrace_find_event(event->type))
+ goto out;
+ }
+
+ if (event->funcs->trace == NULL)
+ event->funcs->trace = trace_nop_print;
+ if (event->funcs->raw == NULL)
+ event->funcs->raw = trace_nop_print;
+ if (event->funcs->hex == NULL)
+ event->funcs->hex = trace_nop_print;
+ if (event->funcs->binary == NULL)
+ event->funcs->binary = trace_nop_print;
+
+ key = event->type & (EVENT_HASHSIZE - 1);
+
+ hlist_add_head(&event->node, &event_hash[key]);
+
+ ret = event->type;
+ out:
+ up_write(&trace_event_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_event);
+
+/*
+ * Used by module code with the trace_event_sem held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+ hlist_del(&event->node);
+ list_del(&event->list);
+ return 0;
+}
+
+/**
+ * unregister_ftrace_event - remove a no longer used event
+ * @event: the event to remove
+ */
+int unregister_ftrace_event(struct trace_event *event)
+{
+ down_write(&trace_event_sem);
+ __unregister_ftrace_event(event);
+ up_write(&trace_event_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
+
+/*
+ * Standard events
+ */
+
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
+
+ return trace_handle_return(&iter->seq);
+}
+
+/* TRACE_FN */
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ seq_print_ip_sym(s, field->ip, flags);
+
+ if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, field->parent_ip, flags);
+ }
+
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %lx\n",
+ field->ip,
+ field->parent_ip);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->ip);
+ SEQ_PUT_HEX_FIELD(s, field->parent_ip);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->ip);
+ SEQ_PUT_FIELD(s, field->parent_ip);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_fn_funcs = {
+ .trace = trace_fn_trace,
+ .raw = trace_fn_raw,
+ .hex = trace_fn_hex,
+ .binary = trace_fn_bin,
+};
+
+static struct trace_event trace_fn_event = {
+ .type = TRACE_FN,
+ .funcs = &trace_fn_funcs,
+};
+
+/* TRACE_CTX an TRACE_WAKE */
+static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
+ char *delim)
+{
+ struct ctx_switch_entry *field;
+ char comm[TASK_COMM_LEN];
+ int S, T;
+
+
+ trace_assign_type(field, iter->ent);
+
+ T = task_state_char(field->next_state);
+ S = task_state_char(field->prev_state);
+ trace_find_cmdline(field->next_pid, comm);
+ trace_seq_printf(&iter->seq,
+ " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, delim,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return trace_ctxwake_print(iter, "==>");
+}
+
+static enum print_line_t trace_wake_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ return trace_ctxwake_print(iter, " +");
+}
+
+static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
+{
+ struct ctx_switch_entry *field;
+ int T;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!S)
+ S = task_state_char(field->prev_state);
+ T = task_state_char(field->next_state);
+ trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return trace_ctxwake_raw(iter, 0);
+}
+
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return trace_ctxwake_raw(iter, '+');
+}
+
+
+static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
+{
+ struct ctx_switch_entry *field;
+ struct trace_seq *s = &iter->seq;
+ int T;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!S)
+ S = task_state_char(field->prev_state);
+ T = task_state_char(field->next_state);
+
+ SEQ_PUT_HEX_FIELD(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD(s, S);
+ SEQ_PUT_HEX_FIELD(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD(s, T);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return trace_ctxwake_hex(iter, 0);
+}
+
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return trace_ctxwake_hex(iter, '+');
+}
+
+static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ struct ctx_switch_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->prev_pid);
+ SEQ_PUT_FIELD(s, field->prev_prio);
+ SEQ_PUT_FIELD(s, field->prev_state);
+ SEQ_PUT_FIELD(s, field->next_cpu);
+ SEQ_PUT_FIELD(s, field->next_pid);
+ SEQ_PUT_FIELD(s, field->next_prio);
+ SEQ_PUT_FIELD(s, field->next_state);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_ctx_funcs = {
+ .trace = trace_ctx_print,
+ .raw = trace_ctx_raw,
+ .hex = trace_ctx_hex,
+ .binary = trace_ctxwake_bin,
+};
+
+static struct trace_event trace_ctx_event = {
+ .type = TRACE_CTX,
+ .funcs = &trace_ctx_funcs,
+};
+
+static struct trace_event_functions trace_wake_funcs = {
+ .trace = trace_wake_print,
+ .raw = trace_wake_raw,
+ .hex = trace_wake_hex,
+ .binary = trace_ctxwake_bin,
+};
+
+static struct trace_event trace_wake_event = {
+ .type = TRACE_WAKE,
+ .funcs = &trace_wake_funcs,
+};
+
+/* TRACE_STACK */
+
+static enum print_line_t trace_stack_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ struct stack_entry *field;
+ struct trace_seq *s = &iter->seq;
+ unsigned long *p;
+ unsigned long *end;
+
+ trace_assign_type(field, iter->ent);
+ end = (unsigned long *)((long)iter->ent + iter->ent_size);
+
+ trace_seq_puts(s, "<stack trace>\n");
+
+ for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
+
+ if (trace_seq_has_overflowed(s))
+ break;
+
+ trace_seq_puts(s, " => ");
+ seq_print_ip_sym(s, *p, flags);
+ trace_seq_putc(s, '\n');
+ }
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_stack_funcs = {
+ .trace = trace_stack_print,
+};
+
+static struct trace_event trace_stack_event = {
+ .type = TRACE_STACK,
+ .funcs = &trace_stack_funcs,
+};
+
+/* TRACE_USER_STACK */
+static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ struct userstack_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_puts(s, "<user stack trace>\n");
+ seq_print_userip_objs(field, s, flags);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_user_stack_funcs = {
+ .trace = trace_user_stack_print,
+};
+
+static struct trace_event trace_user_stack_event = {
+ .type = TRACE_USER_STACK,
+ .funcs = &trace_user_stack_funcs,
+};
+
+/* TRACE_BPUTS */
+static enum print_line_t
+trace_bputs_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct bputs_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_puts(s, field->str);
+
+ return trace_handle_return(s);
+}
+
+
+static enum print_line_t
+trace_bputs_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct bputs_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_puts(s, field->str);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_bputs_funcs = {
+ .trace = trace_bputs_print,
+ .raw = trace_bputs_raw,
+};
+
+static struct trace_event trace_bputs_event = {
+ .type = TRACE_BPUTS,
+ .funcs = &trace_bputs_funcs,
+};
+
+/* TRACE_BPRINT */
+static enum print_line_t
+trace_bprint_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct bprint_entry *field;
+
+ trace_assign_type(field, entry);
+
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_bprintf(s, field->fmt, field->buf);
+
+ return trace_handle_return(s);
+}
+
+
+static enum print_line_t
+trace_bprint_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct bprint_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_bprintf(s, field->fmt, field->buf);
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_bprint_funcs = {
+ .trace = trace_bprint_print,
+ .raw = trace_bprint_raw,
+};
+
+static struct trace_event trace_bprint_event = {
+ .type = TRACE_BPRINT,
+ .funcs = &trace_bprint_funcs,
+};
+
+/* TRACE_PRINT */
+static enum print_line_t trace_print_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event)
+{
+ struct print_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_printf(s, ": %s", field->buf);
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct print_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions trace_print_funcs = {
+ .trace = trace_print_print,
+ .raw = trace_print_raw,
+};
+
+static struct trace_event trace_print_event = {
+ .type = TRACE_PRINT,
+ .funcs = &trace_print_funcs,
+};
+
+
+static struct trace_event *events[] __initdata = {
+ &trace_fn_event,
+ &trace_ctx_event,
+ &trace_wake_event,
+ &trace_stack_event,
+ &trace_user_stack_event,
+ &trace_bputs_event,
+ &trace_bprint_event,
+ &trace_print_event,
+ NULL
+};
+
+__init static int init_events(void)
+{
+ struct trace_event *event;
+ int i, ret;
+
+ for (i = 0; events[i]; i++) {
+ event = events[i];
+
+ ret = register_ftrace_event(event);
+ if (!ret) {
+ printk(KERN_WARNING "event %d failed to register\n",
+ event->type);
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ return 0;
+}
+early_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 000000000..8ef2c40ef
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,45 @@
+#ifndef __TRACE_EVENTS_H
+#define __TRACE_EVENTS_H
+
+#include <linux/trace_seq.h>
+#include "trace.h"
+
+extern enum print_line_t
+trace_print_bputs_msg_only(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_bprintk_msg_only(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_printk_msg_only(struct trace_iterator *iter);
+
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+ unsigned long sym_flags);
+extern int seq_print_userip_objs(const struct userstack_entry *entry,
+ struct trace_seq *s, unsigned long sym_flags);
+extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags);
+
+extern int trace_print_context(struct trace_iterator *iter);
+extern int trace_print_lat_context(struct trace_iterator *iter);
+
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
+extern struct trace_event *ftrace_find_event(int type);
+
+extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
+ int flags, struct trace_event *event);
+extern int
+trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
+
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_sem;
+
+#define SEQ_PUT_FIELD(s, x) \
+ trace_seq_putmem(s, &(x), sizeof(x))
+
+#define SEQ_PUT_HEX_FIELD(s, x) \
+ trace_seq_putmem_hex(s, &(x), sizeof(x))
+
+#endif
+
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
new file mode 100644
index 000000000..36c1455b7
--- /dev/null
+++ b/kernel/trace/trace_printk.c
@@ -0,0 +1,366 @@
+/*
+ * trace binary printk
+ *
+ * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ */
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include "trace.h"
+
+#ifdef CONFIG_MODULES
+
+/*
+ * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
+ * which are queued on trace_bprintk_fmt_list.
+ */
+static LIST_HEAD(trace_bprintk_fmt_list);
+
+/* serialize accesses to trace_bprintk_fmt_list */
+static DEFINE_MUTEX(btrace_mutex);
+
+struct trace_bprintk_fmt {
+ struct list_head list;
+ const char *fmt;
+};
+
+static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
+{
+ struct trace_bprintk_fmt *pos;
+ list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
+ if (!strcmp(pos->fmt, fmt))
+ return pos;
+ }
+ return NULL;
+}
+
+static
+void hold_module_trace_bprintk_format(const char **start, const char **end)
+{
+ const char **iter;
+ char *fmt;
+
+ /* allocate the trace_printk per cpu buffers */
+ if (start != end)
+ trace_printk_init_buffers();
+
+ mutex_lock(&btrace_mutex);
+ for (iter = start; iter < end; iter++) {
+ struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
+ if (tb_fmt) {
+ *iter = tb_fmt->fmt;
+ continue;
+ }
+
+ fmt = NULL;
+ tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
+ if (tb_fmt) {
+ fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
+ if (fmt) {
+ list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+ strcpy(fmt, *iter);
+ tb_fmt->fmt = fmt;
+ } else
+ kfree(tb_fmt);
+ }
+ *iter = fmt;
+
+ }
+ mutex_unlock(&btrace_mutex);
+}
+
+static int module_trace_bprintk_format_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ if (mod->num_trace_bprintk_fmt) {
+ const char **start = mod->trace_bprintk_fmt_start;
+ const char **end = start + mod->num_trace_bprintk_fmt;
+
+ if (val == MODULE_STATE_COMING)
+ hold_module_trace_bprintk_format(start, end);
+ }
+ return 0;
+}
+
+/*
+ * The debugfs/tracing/printk_formats file maps the addresses with
+ * the ASCII formats that are used in the bprintk events in the
+ * buffer. For userspace tools to be able to decode the events from
+ * the buffer, they need to be able to map the address with the format.
+ *
+ * The addresses of the bprintk formats are in their own section
+ * __trace_printk_fmt. But for modules we copy them into a link list.
+ * The code to print the formats and their addresses passes around the
+ * address of the fmt string. If the fmt address passed into the seq
+ * functions is within the kernel core __trace_printk_fmt section, then
+ * it simply uses the next pointer in the list.
+ *
+ * When the fmt pointer is outside the kernel core __trace_printk_fmt
+ * section, then we need to read the link list pointers. The trick is
+ * we pass the address of the string to the seq function just like
+ * we do for the kernel core formats. To get back the structure that
+ * holds the format, we simply use containerof() and then go to the
+ * next format in the list.
+ */
+static const char **
+find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
+{
+ struct trace_bprintk_fmt *mod_fmt;
+
+ if (list_empty(&trace_bprintk_fmt_list))
+ return NULL;
+
+ /*
+ * v will point to the address of the fmt record from t_next
+ * v will be NULL from t_start.
+ * If this is the first pointer or called from start
+ * then we need to walk the list.
+ */
+ if (!v || start_index == *pos) {
+ struct trace_bprintk_fmt *p;
+
+ /* search the module list */
+ list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
+ if (start_index == *pos)
+ return &p->fmt;
+ start_index++;
+ }
+ /* pos > index */
+ return NULL;
+ }
+
+ /*
+ * v points to the address of the fmt field in the mod list
+ * structure that holds the module print format.
+ */
+ mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
+ if (mod_fmt->list.next == &trace_bprintk_fmt_list)
+ return NULL;
+
+ mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
+
+ return &mod_fmt->fmt;
+}
+
+static void format_mod_start(void)
+{
+ mutex_lock(&btrace_mutex);
+}
+
+static void format_mod_stop(void)
+{
+ mutex_unlock(&btrace_mutex);
+}
+
+#else /* !CONFIG_MODULES */
+__init static int
+module_trace_bprintk_format_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+static inline const char **
+find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
+{
+ return NULL;
+}
+static inline void format_mod_start(void) { }
+static inline void format_mod_stop(void) { }
+#endif /* CONFIG_MODULES */
+
+
+__initdata_or_module static
+struct notifier_block module_trace_bprintk_format_nb = {
+ .notifier_call = module_trace_bprintk_format_notify,
+};
+
+int __trace_bprintk(unsigned long ip, const char *fmt, ...)
+ {
+ int ret;
+ va_list ap;
+
+ if (unlikely(!fmt))
+ return 0;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_vbprintk(ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_bprintk);
+
+int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
+ {
+ if (unlikely(!fmt))
+ return 0;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ return trace_vbprintk(ip, fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
+
+int __trace_printk(unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_vprintk(ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_printk);
+
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+{
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ return trace_vprintk(ip, fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+
+static const char **find_next(void *v, loff_t *pos)
+{
+ const char **fmt = v;
+ int start_index;
+ int last_index;
+
+ start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
+
+ if (*pos < start_index)
+ return __start___trace_bprintk_fmt + *pos;
+
+ /*
+ * The __tracepoint_str section is treated the same as the
+ * __trace_printk_fmt section. The difference is that the
+ * __trace_printk_fmt section should only be used by trace_printk()
+ * in a debugging environment, as if anything exists in that section
+ * the trace_prink() helper buffers are allocated, which would just
+ * waste space in a production environment.
+ *
+ * The __tracepoint_str sections on the other hand are used by
+ * tracepoints which need to map pointers to their strings to
+ * the ASCII text for userspace.
+ */
+ last_index = start_index;
+ start_index = __stop___tracepoint_str - __start___tracepoint_str;
+
+ if (*pos < last_index + start_index)
+ return __start___tracepoint_str + (*pos - last_index);
+
+ return find_next_mod_format(start_index, v, fmt, pos);
+}
+
+static void *
+t_start(struct seq_file *m, loff_t *pos)
+{
+ format_mod_start();
+ return find_next(NULL, pos);
+}
+
+static void *t_next(struct seq_file *m, void * v, loff_t *pos)
+{
+ (*pos)++;
+ return find_next(v, pos);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ const char **fmt = v;
+ const char *str = *fmt;
+ int i;
+
+ seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
+
+ /*
+ * Tabs and new lines need to be converted.
+ */
+ for (i = 0; str[i]; i++) {
+ switch (str[i]) {
+ case '\n':
+ seq_puts(m, "\\n");
+ break;
+ case '\t':
+ seq_puts(m, "\\t");
+ break;
+ case '\\':
+ seq_putc(m, '\\');
+ break;
+ case '"':
+ seq_puts(m, "\\\"");
+ break;
+ default:
+ seq_putc(m, str[i]);
+ }
+ }
+ seq_puts(m, "\"\n");
+
+ return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ format_mod_stop();
+}
+
+static const struct seq_operations show_format_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .show = t_show,
+ .stop = t_stop,
+};
+
+static int
+ftrace_formats_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &show_format_seq_ops);
+}
+
+static const struct file_operations ftrace_formats_fops = {
+ .open = ftrace_formats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static __init int init_trace_printk_function_export(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ trace_create_file("printk_formats", 0444, d_tracer,
+ NULL, &ftrace_formats_fops);
+
+ return 0;
+}
+
+fs_initcall(init_trace_printk_function_export);
+
+static __init int init_trace_printk(void)
+{
+ return register_module_notifier(&module_trace_bprintk_format_nb);
+}
+
+early_initcall(init_trace_printk);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
new file mode 100644
index 000000000..1769a81da
--- /dev/null
+++ b/kernel/trace/trace_probe.c
@@ -0,0 +1,723 @@
+/*
+ * Common code for probe-based Dynamic events.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * This code was copied from kernel/trace/trace_kprobe.c written by
+ * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Updates to make this generic:
+ * Copyright (C) IBM Corporation, 2010-2011
+ * Author: Srikar Dronamraju
+ */
+
+#include "trace_probe.h"
+
+const char *reserved_field_names[] = {
+ "common_type",
+ "common_flags",
+ "common_preempt_count",
+ "common_pid",
+ "common_tgid",
+ FIELD_STRING_IP,
+ FIELD_STRING_RETIP,
+ FIELD_STRING_FUNC,
+};
+
+/* Printing in basic type function template */
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent) \
+{ \
+ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ return !trace_seq_has_overflowed(s); \
+} \
+const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x")
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
+
+/* Print type function for string type */
+int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
+ void *data, void *ent)
+{
+ int len = *(u32 *)data >> 16;
+
+ if (!len)
+ trace_seq_printf(s, " %s=(fault)", name);
+ else
+ trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+ return !trace_seq_has_overflowed(s);
+}
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
+
+const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
+
+#define CHECK_FETCH_FUNCS(method, fn) \
+ (((FETCH_FUNC_NAME(method, u8) == fn) || \
+ (FETCH_FUNC_NAME(method, u16) == fn) || \
+ (FETCH_FUNC_NAME(method, u32) == fn) || \
+ (FETCH_FUNC_NAME(method, u64) == fn) || \
+ (FETCH_FUNC_NAME(method, string) == fn) || \
+ (FETCH_FUNC_NAME(method, string_size) == fn)) \
+ && (fn != NULL))
+
+/* Data fetch function templates */
+#define DEFINE_FETCH_reg(type) \
+void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)regs_get_register(regs, \
+ (unsigned int)((unsigned long)offset)); \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
+DEFINE_BASIC_FETCH_FUNCS(reg)
+/* No string on the register */
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
+
+#define DEFINE_FETCH_retval(type) \
+void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
+ void *dummy, void *dest) \
+{ \
+ *(type *)dest = (type)regs_return_value(regs); \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
+DEFINE_BASIC_FETCH_FUNCS(retval)
+/* No string on the retval */
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
+
+/* Dereference memory access function */
+struct deref_fetch_param {
+ struct fetch_param orig;
+ long offset;
+ fetch_func_t fetch;
+ fetch_func_t fetch_size;
+};
+
+#define DEFINE_FETCH_deref(type) \
+void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
+{ \
+ struct deref_fetch_param *dprm = data; \
+ unsigned long addr; \
+ call_fetch(&dprm->orig, regs, &addr); \
+ if (addr) { \
+ addr += dprm->offset; \
+ dprm->fetch(regs, (void *)addr, dest); \
+ } else \
+ *(type *)dest = 0; \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
+DEFINE_BASIC_FETCH_FUNCS(deref)
+DEFINE_FETCH_deref(string)
+
+void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
+{
+ struct deref_fetch_param *dprm = data;
+ unsigned long addr;
+
+ call_fetch(&dprm->orig, regs, &addr);
+ if (addr && dprm->fetch_size) {
+ addr += dprm->offset;
+ dprm->fetch_size(regs, (void *)addr, dest);
+ } else
+ *(string_size *)dest = 0;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
+
+static void update_deref_fetch_param(struct deref_fetch_param *data)
+{
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ update_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ update_symbol_cache(data->orig.data);
+}
+NOKPROBE_SYMBOL(update_deref_fetch_param);
+
+static void free_deref_fetch_param(struct deref_fetch_param *data)
+{
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
+NOKPROBE_SYMBOL(free_deref_fetch_param);
+
+/* Bitfield fetch function */
+struct bitfield_fetch_param {
+ struct fetch_param orig;
+ unsigned char hi_shift;
+ unsigned char low_shift;
+};
+
+#define DEFINE_FETCH_bitfield(type) \
+void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
+{ \
+ struct bitfield_fetch_param *bprm = data; \
+ type buf = 0; \
+ call_fetch(&bprm->orig, regs, &buf); \
+ if (buf) { \
+ buf <<= bprm->hi_shift; \
+ buf >>= bprm->low_shift; \
+ } \
+ *(type *)dest = buf; \
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
+DEFINE_BASIC_FETCH_FUNCS(bitfield)
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+static void
+update_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ update_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ update_symbol_cache(data->orig.data);
+}
+
+static void
+free_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+
+ kfree(data);
+}
+
+static const struct fetch_type *find_fetch_type(const char *type,
+ const struct fetch_type *ftbl)
+{
+ int i;
+
+ if (!type)
+ type = DEFAULT_FETCH_TYPE_STR;
+
+ /* Special case: bitfield */
+ if (*type == 'b') {
+ unsigned long bs;
+
+ type = strchr(type, '/');
+ if (!type)
+ goto fail;
+
+ type++;
+ if (kstrtoul(type, 0, &bs))
+ goto fail;
+
+ switch (bs) {
+ case 8:
+ return find_fetch_type("u8", ftbl);
+ case 16:
+ return find_fetch_type("u16", ftbl);
+ case 32:
+ return find_fetch_type("u32", ftbl);
+ case 64:
+ return find_fetch_type("u64", ftbl);
+ default:
+ goto fail;
+ }
+ }
+
+ for (i = 0; ftbl[i].name; i++) {
+ if (strcmp(type, ftbl[i].name) == 0)
+ return &ftbl[i];
+ }
+
+fail:
+ return NULL;
+}
+
+/* Special function : only accept unsigned long */
+static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
+{
+ *(unsigned long *)dest = kernel_stack_pointer(regs);
+}
+NOKPROBE_SYMBOL(fetch_kernel_stack_address);
+
+static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
+{
+ *(unsigned long *)dest = user_stack_pointer(regs);
+}
+NOKPROBE_SYMBOL(fetch_user_stack_address);
+
+static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
+ fetch_func_t orig_fn,
+ const struct fetch_type *ftbl)
+{
+ int i;
+
+ if (type != &ftbl[FETCH_TYPE_STRING])
+ return NULL; /* Only string type needs size function */
+
+ for (i = 0; i < FETCH_MTD_END; i++)
+ if (type->fetch[i] == orig_fn)
+ return ftbl[FETCH_TYPE_STRSIZE].fetch[i];
+
+ WARN_ON(1); /* This should not happen */
+
+ return NULL;
+}
+
+/* Split symbol and offset. */
+int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
+{
+ char *tmp;
+ int ret;
+
+ if (!offset)
+ return -EINVAL;
+
+ tmp = strchr(symbol, '+');
+ if (tmp) {
+ /* skip sign because kstrtoul doesn't accept '+' */
+ ret = kstrtoul(tmp + 1, 0, offset);
+ if (ret)
+ return ret;
+
+ *tmp = '\0';
+ } else
+ *offset = 0;
+
+ return 0;
+}
+
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return,
+ bool is_kprobe)
+{
+ int ret = 0;
+ unsigned long param;
+
+ if (strcmp(arg, "retval") == 0) {
+ if (is_return)
+ f->fn = t->fetch[FETCH_MTD_retval];
+ else
+ ret = -EINVAL;
+ } else if (strncmp(arg, "stack", 5) == 0) {
+ if (arg[5] == '\0') {
+ if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR))
+ return -EINVAL;
+
+ if (is_kprobe)
+ f->fn = fetch_kernel_stack_address;
+ else
+ f->fn = fetch_user_stack_address;
+ } else if (isdigit(arg[5])) {
+ ret = kstrtoul(arg + 5, 10, &param);
+ if (ret || (is_kprobe && param > PARAM_MAX_STACK))
+ ret = -EINVAL;
+ else {
+ f->fn = t->fetch[FETCH_MTD_stack];
+ f->data = (void *)param;
+ }
+ } else
+ ret = -EINVAL;
+ } else
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/* Recursive argument parser */
+static int parse_probe_arg(char *arg, const struct fetch_type *t,
+ struct fetch_param *f, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
+{
+ unsigned long param;
+ long offset;
+ char *tmp;
+ int ret = 0;
+
+ switch (arg[0]) {
+ case '$':
+ ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe);
+ break;
+
+ case '%': /* named register */
+ ret = regs_query_register_offset(arg + 1);
+ if (ret >= 0) {
+ f->fn = t->fetch[FETCH_MTD_reg];
+ f->data = (void *)(unsigned long)ret;
+ ret = 0;
+ }
+ break;
+
+ case '@': /* memory, file-offset or symbol */
+ if (isdigit(arg[1])) {
+ ret = kstrtoul(arg + 1, 0, &param);
+ if (ret)
+ break;
+
+ f->fn = t->fetch[FETCH_MTD_memory];
+ f->data = (void *)param;
+ } else if (arg[1] == '+') {
+ /* kprobes don't support file offsets */
+ if (is_kprobe)
+ return -EINVAL;
+
+ ret = kstrtol(arg + 2, 0, &offset);
+ if (ret)
+ break;
+
+ f->fn = t->fetch[FETCH_MTD_file_offset];
+ f->data = (void *)offset;
+ } else {
+ /* uprobes don't support symbols */
+ if (!is_kprobe)
+ return -EINVAL;
+
+ ret = traceprobe_split_symbol_offset(arg + 1, &offset);
+ if (ret)
+ break;
+
+ f->data = alloc_symbol_cache(arg + 1, offset);
+ if (f->data)
+ f->fn = t->fetch[FETCH_MTD_symbol];
+ }
+ break;
+
+ case '+': /* deref memory */
+ arg++; /* Skip '+', because kstrtol() rejects it. */
+ case '-':
+ tmp = strchr(arg, '(');
+ if (!tmp)
+ break;
+
+ *tmp = '\0';
+ ret = kstrtol(arg, 0, &offset);
+
+ if (ret)
+ break;
+
+ arg = tmp + 1;
+ tmp = strrchr(arg, ')');
+
+ if (tmp) {
+ struct deref_fetch_param *dprm;
+ const struct fetch_type *t2;
+
+ t2 = find_fetch_type(NULL, ftbl);
+ *tmp = '\0';
+ dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL);
+
+ if (!dprm)
+ return -ENOMEM;
+
+ dprm->offset = offset;
+ dprm->fetch = t->fetch[FETCH_MTD_memory];
+ dprm->fetch_size = get_fetch_size_function(t,
+ dprm->fetch, ftbl);
+ ret = parse_probe_arg(arg, t2, &dprm->orig, is_return,
+ is_kprobe, ftbl);
+ if (ret)
+ kfree(dprm);
+ else {
+ f->fn = t->fetch[FETCH_MTD_deref];
+ f->data = (void *)dprm;
+ }
+ }
+ break;
+ }
+ if (!ret && !f->fn) { /* Parsed, but do not find fetch method */
+ pr_info("%s type has no corresponding fetch method.\n", t->name);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
+
+/* Bitfield type needs to be parsed into a fetch function */
+static int __parse_bitfield_probe_arg(const char *bf,
+ const struct fetch_type *t,
+ struct fetch_param *f)
+{
+ struct bitfield_fetch_param *bprm;
+ unsigned long bw, bo;
+ char *tail;
+
+ if (*bf != 'b')
+ return 0;
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm)
+ return -ENOMEM;
+
+ bprm->orig = *f;
+ f->fn = t->fetch[FETCH_MTD_bitfield];
+ f->data = (void *)bprm;
+ bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
+
+ if (bw == 0 || *tail != '@')
+ return -EINVAL;
+
+ bf = tail + 1;
+ bo = simple_strtoul(bf, &tail, 0);
+
+ if (tail == bf || *tail != '/')
+ return -EINVAL;
+
+ bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
+ bprm->low_shift = bprm->hi_shift + bo;
+
+ return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
+}
+
+/* String length checking wrapper */
+int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl)
+{
+ const char *t;
+ int ret;
+
+ if (strlen(arg) > MAX_ARGSTR_LEN) {
+ pr_info("Argument is too long.: %s\n", arg);
+ return -ENOSPC;
+ }
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm) {
+ pr_info("Failed to allocate memory for command '%s'.\n", arg);
+ return -ENOMEM;
+ }
+ t = strchr(parg->comm, ':');
+ if (t) {
+ arg[t - parg->comm] = '\0';
+ t++;
+ }
+ parg->type = find_fetch_type(t, ftbl);
+ if (!parg->type) {
+ pr_info("Unsupported type: %s\n", t);
+ return -EINVAL;
+ }
+ parg->offset = *size;
+ *size += parg->type->size;
+ ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return,
+ is_kprobe, ftbl);
+
+ if (ret >= 0 && t != NULL)
+ ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
+
+ if (ret >= 0) {
+ parg->fetch_size.fn = get_fetch_size_function(parg->type,
+ parg->fetch.fn,
+ ftbl);
+ parg->fetch_size.data = parg->fetch.data;
+ }
+
+ return ret;
+}
+
+/* Return 1 if name is reserved or already used by another argument */
+int traceprobe_conflict_field_name(const char *name,
+ struct probe_arg *args, int narg)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+ if (strcmp(reserved_field_names[i], name) == 0)
+ return 1;
+
+ for (i = 0; i < narg; i++)
+ if (strcmp(args[i].name, name) == 0)
+ return 1;
+
+ return 0;
+}
+
+void traceprobe_update_arg(struct probe_arg *arg)
+{
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ update_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ update_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+ update_symbol_cache(arg->fetch.data);
+}
+
+void traceprobe_free_probe_arg(struct probe_arg *arg)
+{
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ free_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ free_deref_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+ free_symbol_cache(arg->fetch.data);
+
+ kfree(arg->name);
+ kfree(arg->comm);
+}
+
+int traceprobe_command(const char *buf, int (*createfn)(int, char **))
+{
+ char **argv;
+ int argc, ret;
+
+ argc = 0;
+ ret = 0;
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, argv);
+
+ argv_free(argv);
+
+ return ret;
+}
+
+#define WRITE_BUFSIZE 4096
+
+ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos,
+ int (*createfn)(int, char **))
+{
+ char *kbuf, *tmp;
+ int ret = 0;
+ size_t done = 0;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ while (done < count) {
+ size = count - done;
+
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ tmp = strchr(kbuf, '\n');
+
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - kbuf + 1;
+ } else if (done + size < count) {
+ pr_warning("Line length is too long: "
+ "Should be less than %d.", WRITE_BUFSIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+ done += size;
+ /* Remove comments */
+ tmp = strchr(kbuf, '#');
+
+ if (tmp)
+ *tmp = '\0';
+
+ ret = traceprobe_command(kbuf, createfn);
+ if (ret)
+ goto out;
+ }
+ ret = done;
+
+out:
+ kfree(kbuf);
+
+ return ret;
+}
+
+static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
+ bool is_return)
+{
+ int i;
+ int pos = 0;
+
+ const char *fmt, *arg;
+
+ if (!is_return) {
+ fmt = "(%lx)";
+ arg = "REC->" FIELD_STRING_IP;
+ } else {
+ fmt = "(%lx <- %lx)";
+ arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ }
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
+
+ for (i = 0; i < tp->nr_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
+ tp->args[i].name, tp->args[i].type->fmt);
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+
+ for (i = 0; i < tp->nr_args; i++) {
+ if (strcmp(tp->args[i].type->name, "string") == 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", __get_str(%s)",
+ tp->args[i].name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+ tp->args[i].name);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+int set_print_fmt(struct trace_probe *tp, bool is_return)
+{
+ int len;
+ char *print_fmt;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_print_fmt(tp, NULL, 0, is_return);
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_print_fmt(tp, print_fmt, len + 1, is_return);
+ tp->call.print_fmt = print_fmt;
+
+ return 0;
+}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
new file mode 100644
index 000000000..ab283e146
--- /dev/null
+++ b/kernel/trace/trace_probe.h
@@ -0,0 +1,394 @@
+/*
+ * Common header file for probe-based Dynamic events.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * This code was copied from kernel/trace/trace_kprobe.h written by
+ * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+ *
+ * Updates to make this generic:
+ * Copyright (C) IBM Corporation, 2010-2011
+ * Author: Srikar Dronamraju
+ */
+
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+#include <linux/kprobes.h>
+#include <linux/stringify.h>
+#include <linux/limits.h>
+#include <linux/uaccess.h>
+#include <asm/bitsperlong.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define MAX_STRING_SIZE PATH_MAX
+
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed) \
+ do { \
+ ret = trace_define_field(event_call, #type, name, \
+ offsetof(typeof(field), item), \
+ sizeof(field.item), is_signed, \
+ FILTER_OTHER); \
+ if (ret) \
+ return ret; \
+ } while (0)
+
+
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE 1
+#define TP_FLAG_PROFILE 2
+#define TP_FLAG_REGISTERED 4
+
+
+/* data_rloc: data relative location, compatible with u32 */
+#define make_data_rloc(len, roffs) \
+ (((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
+#define get_rloc_len(dl) ((u32)(dl) >> 16)
+#define get_rloc_offs(dl) ((u32)(dl) & 0xffff)
+
+/*
+ * Convert data_rloc to data_loc:
+ * data_rloc stores the offset from data_rloc itself, but data_loc
+ * stores the offset from event entry.
+ */
+#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
+
+static nokprobe_inline void *get_rloc_data(u32 *dl)
+{
+ return (u8 *)dl + get_rloc_offs(*dl);
+}
+
+/* For data_loc conversion */
+static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
+{
+ return (u8 *)ent + get_rloc_offs(*dl);
+}
+
+/* Data fetch function type */
+typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
+/* Printing function type */
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, void *);
+
+/* Fetch types */
+enum {
+ FETCH_MTD_reg = 0,
+ FETCH_MTD_stack,
+ FETCH_MTD_retval,
+ FETCH_MTD_memory,
+ FETCH_MTD_symbol,
+ FETCH_MTD_deref,
+ FETCH_MTD_bitfield,
+ FETCH_MTD_file_offset,
+ FETCH_MTD_END,
+};
+
+/* Fetch type information table */
+struct fetch_type {
+ const char *name; /* Name of type */
+ size_t size; /* Byte size of type */
+ int is_signed; /* Signed flag */
+ print_type_func_t print; /* Print functions */
+ const char *fmt; /* Fromat string */
+ const char *fmttype; /* Name in format file */
+ /* Fetch functions */
+ fetch_func_t fetch[FETCH_MTD_END];
+};
+
+struct fetch_param {
+ fetch_func_t fn;
+ void *data;
+};
+
+/* For defining macros, define string/string_size types */
+typedef u32 string;
+typedef u32 string_size;
+
+#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
+#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
+
+/* Printing in basic type function template */
+#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent); \
+extern const char PRINT_TYPE_FMT_NAME(type)[]
+
+DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
+DECLARE_BASIC_PRINT_TYPE_FUNC(u16);
+DECLARE_BASIC_PRINT_TYPE_FUNC(u32);
+DECLARE_BASIC_PRINT_TYPE_FUNC(u64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s8);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s16);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s32);
+DECLARE_BASIC_PRINT_TYPE_FUNC(s64);
+DECLARE_BASIC_PRINT_TYPE_FUNC(string);
+
+#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type
+
+/* Declare macro for basic types */
+#define DECLARE_FETCH_FUNC(method, type) \
+extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \
+ void *data, void *dest)
+
+#define DECLARE_BASIC_FETCH_FUNCS(method) \
+DECLARE_FETCH_FUNC(method, u8); \
+DECLARE_FETCH_FUNC(method, u16); \
+DECLARE_FETCH_FUNC(method, u32); \
+DECLARE_FETCH_FUNC(method, u64)
+
+DECLARE_BASIC_FETCH_FUNCS(reg);
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
+
+DECLARE_BASIC_FETCH_FUNCS(retval);
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
+
+DECLARE_BASIC_FETCH_FUNCS(symbol);
+DECLARE_FETCH_FUNC(symbol, string);
+DECLARE_FETCH_FUNC(symbol, string_size);
+
+DECLARE_BASIC_FETCH_FUNCS(deref);
+DECLARE_FETCH_FUNC(deref, string);
+DECLARE_FETCH_FUNC(deref, string_size);
+
+DECLARE_BASIC_FETCH_FUNCS(bitfield);
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+/*
+ * Define macro for basic types - we don't need to define s* types, because
+ * we have to care only about bitwidth at recording time.
+ */
+#define DEFINE_BASIC_FETCH_FUNCS(method) \
+DEFINE_FETCH_##method(u8) \
+DEFINE_FETCH_##method(u16) \
+DEFINE_FETCH_##method(u32) \
+DEFINE_FETCH_##method(u64)
+
+/* Default (unsigned long) fetch type */
+#define __DEFAULT_FETCH_TYPE(t) u##t
+#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
+#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
+#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
+
+#define ASSIGN_FETCH_FUNC(method, type) \
+ [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \
+ {.name = _name, \
+ .size = _size, \
+ .is_signed = sign, \
+ .print = PRINT_TYPE_FUNC_NAME(ptype), \
+ .fmt = PRINT_TYPE_FMT_NAME(ptype), \
+ .fmttype = _fmttype, \
+ .fetch = { \
+ASSIGN_FETCH_FUNC(reg, ftype), \
+ASSIGN_FETCH_FUNC(stack, ftype), \
+ASSIGN_FETCH_FUNC(retval, ftype), \
+ASSIGN_FETCH_FUNC(memory, ftype), \
+ASSIGN_FETCH_FUNC(symbol, ftype), \
+ASSIGN_FETCH_FUNC(deref, ftype), \
+ASSIGN_FETCH_FUNC(bitfield, ftype), \
+ASSIGN_FETCH_FUNC(file_offset, ftype), \
+ } \
+ }
+
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
+ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+
+#define ASSIGN_FETCH_TYPE_END {}
+
+#define FETCH_TYPE_STRING 0
+#define FETCH_TYPE_STRSIZE 1
+
+#ifdef CONFIG_KPROBE_EVENT
+struct symbol_cache;
+unsigned long update_symbol_cache(struct symbol_cache *sc);
+void free_symbol_cache(struct symbol_cache *sc);
+struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+#else
+/* uprobes do not support symbol fetch methods */
+#define fetch_symbol_u8 NULL
+#define fetch_symbol_u16 NULL
+#define fetch_symbol_u32 NULL
+#define fetch_symbol_u64 NULL
+#define fetch_symbol_string NULL
+#define fetch_symbol_string_size NULL
+
+struct symbol_cache {
+};
+static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc)
+{
+ return 0;
+}
+
+static inline void __used free_symbol_cache(struct symbol_cache *sc)
+{
+}
+
+static inline struct symbol_cache * __used
+alloc_symbol_cache(const char *sym, long offset)
+{
+ return NULL;
+}
+#endif /* CONFIG_KPROBE_EVENT */
+
+struct probe_arg {
+ struct fetch_param fetch;
+ struct fetch_param fetch_size;
+ unsigned int offset; /* Offset from argument entry */
+ const char *name; /* Name of this argument */
+ const char *comm; /* Command of this argument */
+ const struct fetch_type *type; /* Type of this argument */
+};
+
+struct trace_probe {
+ unsigned int flags; /* For TP_FLAG_* */
+ struct ftrace_event_class class;
+ struct ftrace_event_call call;
+ struct list_head files;
+ ssize_t size; /* trace entry size */
+ unsigned int nr_args;
+ struct probe_arg args[];
+};
+
+struct event_file_link {
+ struct ftrace_event_file *file;
+ struct list_head list;
+};
+
+static inline bool trace_probe_is_enabled(struct trace_probe *tp)
+{
+ return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
+}
+
+static inline bool trace_probe_is_registered(struct trace_probe *tp)
+{
+ return !!(tp->flags & TP_FLAG_REGISTERED);
+}
+
+static nokprobe_inline void call_fetch(struct fetch_param *fprm,
+ struct pt_regs *regs, void *dest)
+{
+ return fprm->fn(regs, fprm->data, dest);
+}
+
+/* Check the name is good for event/group/fields */
+static inline int is_good_name(const char *name)
+{
+ if (!isalpha(*name) && *name != '_')
+ return 0;
+ while (*++name != '\0') {
+ if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+ return 0;
+ }
+ return 1;
+}
+
+static inline struct event_file_link *
+find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
+{
+ struct event_file_link *link;
+
+ list_for_each_entry(link, &tp->files, list)
+ if (link->file == file)
+ return link;
+
+ return NULL;
+}
+
+extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
+ struct probe_arg *parg, bool is_return, bool is_kprobe,
+ const struct fetch_type *ftbl);
+
+extern int traceprobe_conflict_field_name(const char *name,
+ struct probe_arg *args, int narg);
+
+extern void traceprobe_update_arg(struct probe_arg *arg);
+extern void traceprobe_free_probe_arg(struct probe_arg *arg);
+
+extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
+
+extern ssize_t traceprobe_probes_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos,
+ int (*createfn)(int, char**));
+
+extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
+
+/* Sum up total data length for dynamic arraies (strings) */
+static nokprobe_inline int
+__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
+{
+ int i, ret = 0;
+ u32 len;
+
+ for (i = 0; i < tp->nr_args; i++)
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ call_fetch(&tp->args[i].fetch_size, regs, &len);
+ ret += len;
+ }
+
+ return ret;
+}
+
+/* Store the value of each argument */
+static nokprobe_inline void
+store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
+ u8 *data, int maxlen)
+{
+ int i;
+ u32 end = tp->size;
+ u32 *dl; /* Data (relative) location */
+
+ for (i = 0; i < tp->nr_args; i++) {
+ if (unlikely(tp->args[i].fetch_size.fn)) {
+ /*
+ * First, we set the relative location and
+ * maximum data length to *dl
+ */
+ dl = (u32 *)(data + tp->args[i].offset);
+ *dl = make_data_rloc(maxlen, end - tp->args[i].offset);
+ /* Then try to fetch string or dynamic array data */
+ call_fetch(&tp->args[i].fetch, regs, dl);
+ /* Reduce maximum length */
+ end += get_rloc_len(*dl);
+ maxlen -= get_rloc_len(*dl);
+ /* Trick here, convert data_rloc to data_loc */
+ *dl = convert_rloc_to_loc(*dl,
+ ent_size + tp->args[i].offset);
+ } else
+ /* Just fetching data normally */
+ call_fetch(&tp->args[i].fetch, regs,
+ data + tp->args[i].offset);
+ }
+}
+
+extern int set_print_fmt(struct trace_probe *tp, bool is_return);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 000000000..419ca37e7
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,101 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <trace/events/sched.h>
+
+#include "trace.h"
+
+static int sched_ref;
+static DEFINE_MUTEX(sched_register_mutex);
+
+static void
+probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
+{
+ if (unlikely(!sched_ref))
+ return;
+
+ tracing_record_cmdline(prev);
+ tracing_record_cmdline(next);
+}
+
+static void
+probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
+{
+ if (unlikely(!sched_ref))
+ return;
+
+ tracing_record_cmdline(current);
+}
+
+static int tracing_sched_register(void)
+{
+ int ret;
+
+ ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup\n");
+ return ret;
+ }
+
+ ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup_new\n");
+ goto fail_deprobe;
+ }
+
+ ret = register_trace_sched_switch(probe_sched_switch, NULL);
+ if (ret) {
+ pr_info("sched trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_switch\n");
+ goto fail_deprobe_wake_new;
+ }
+
+ return ret;
+fail_deprobe_wake_new:
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+fail_deprobe:
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
+ return ret;
+}
+
+static void tracing_sched_unregister(void)
+{
+ unregister_trace_sched_switch(probe_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
+}
+
+static void tracing_start_sched_switch(void)
+{
+ mutex_lock(&sched_register_mutex);
+ if (!(sched_ref++))
+ tracing_sched_register();
+ mutex_unlock(&sched_register_mutex);
+}
+
+static void tracing_stop_sched_switch(void)
+{
+ mutex_lock(&sched_register_mutex);
+ if (!(--sched_ref))
+ tracing_sched_unregister();
+ mutex_unlock(&sched_register_mutex);
+}
+
+void tracing_start_cmdline_record(void)
+{
+ tracing_start_sched_switch();
+}
+
+void tracing_stop_cmdline_record(void)
+{
+ tracing_stop_sched_switch();
+}
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 000000000..d6e100372
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,816 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ * Copyright (C) 2004-2006 Ingo Molnar
+ * Copyright (C) 2004 Nadia Yvette Chambers
+ */
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/deadline.h>
+#include <trace/events/sched.h>
+#include "trace.h"
+
+static struct trace_array *wakeup_trace;
+static int __read_mostly tracer_enabled;
+
+static struct task_struct *wakeup_task;
+static int wakeup_cpu;
+static int wakeup_current_cpu;
+static unsigned wakeup_prio = -1;
+static int wakeup_rt;
+static int wakeup_dl;
+static int tracing_dl = 0;
+
+static arch_spinlock_t wakeup_lock =
+ (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+static void wakeup_reset(struct trace_array *tr);
+static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
+
+static int save_flags;
+static bool function_enabled;
+
+#define TRACE_DISPLAY_GRAPH 1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* display latency trace as call graph */
+ { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ .val = 0,
+ .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+/*
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ * is disabled and data->disabled is incremented.
+ * 0 if the trace is to be ignored, and preemption
+ * is not disabled and data->disabled is
+ * kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ * inside the #ifdef of the function graph tracer below.
+ * This is OK, since the function graph tracer is
+ * dependent on the function tracer.
+ */
+static int
+func_prolog_preempt_disable(struct trace_array *tr,
+ struct trace_array_cpu **data,
+ int *pc)
+{
+ long disabled;
+ int cpu;
+
+ if (likely(!wakeup_task))
+ return 0;
+
+ *pc = preempt_count();
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+ if (cpu != wakeup_current_cpu)
+ goto out_enable;
+
+ *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ disabled = atomic_inc_return(&(*data)->disabled);
+ if (unlikely(disabled != 1))
+ goto out;
+
+ return 1;
+
+out:
+ atomic_dec(&(*data)->disabled);
+
+out_enable:
+ preempt_enable_notrace();
+ return 0;
+}
+
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return;
+
+ local_irq_save(flags);
+ trace_function(tr, ip, parent_ip, flags, pc);
+ local_irq_restore(flags);
+
+ atomic_dec(&data->disabled);
+ preempt_enable_notrace();
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
+{
+ int ret;
+
+ /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+ if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ return 0;
+
+ if (graph)
+ ret = register_ftrace_graph(&wakeup_graph_return,
+ &wakeup_graph_entry);
+ else
+ ret = register_ftrace_function(tr->ops);
+
+ if (!ret)
+ function_enabled = true;
+
+ return ret;
+}
+
+static void unregister_wakeup_function(struct trace_array *tr, int graph)
+{
+ if (!function_enabled)
+ return;
+
+ if (graph)
+ unregister_ftrace_graph();
+ else
+ unregister_ftrace_function(tr->ops);
+
+ function_enabled = false;
+}
+
+static void wakeup_function_set(struct trace_array *tr, int set)
+{
+ if (set)
+ register_wakeup_function(tr, is_graph(), 1);
+ else
+ unregister_wakeup_function(tr, is_graph());
+}
+
+static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
+{
+ struct tracer *tracer = tr->current_trace;
+
+ if (mask & TRACE_ITER_FUNCTION)
+ wakeup_function_set(tr, set);
+
+ return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_func_tracer(struct trace_array *tr, int graph)
+{
+ int ret;
+
+ ret = register_wakeup_function(tr, graph, 0);
+
+ if (!ret && tracing_is_enabled())
+ tracer_enabled = 1;
+ else
+ tracer_enabled = 0;
+
+ return ret;
+}
+
+static void stop_func_tracer(struct trace_array *tr, int graph)
+{
+ tracer_enabled = 0;
+
+ unregister_wakeup_function(tr, graph);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int
+wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+
+ if (!(bit & TRACE_DISPLAY_GRAPH))
+ return -EINVAL;
+
+ if (!(is_graph() ^ set))
+ return 0;
+
+ stop_func_tracer(tr, !set);
+
+ wakeup_reset(wakeup_trace);
+ tr->max_latency = 0;
+
+ return start_func_tracer(tr, set);
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc, ret = 0;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return 0;
+
+ local_save_flags(flags);
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+ preempt_enable_notrace();
+
+ return ret;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ int pc;
+
+ if (!func_prolog_preempt_disable(tr, &data, &pc))
+ return;
+
+ local_save_flags(flags);
+ __trace_graph_return(tr, trace, flags, pc);
+ atomic_dec(&data->disabled);
+
+ preempt_enable_notrace();
+ return;
+}
+
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+ if (is_graph())
+ graph_trace_open(iter);
+}
+
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+ if (iter->private)
+ graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
+ TRACE_GRAPH_PRINT_ABS_TIME | \
+ TRACE_GRAPH_PRINT_DURATION)
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+ /*
+ * In graph mode call the graph tracer output function,
+ * otherwise go with the TRACE_FN event handler
+ */
+ if (is_graph())
+ return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_print_header(struct seq_file *s)
+{
+ if (is_graph())
+ print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+ else
+ trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ if (is_graph())
+ trace_graph_function(tr, ip, parent_ip, flags, pc);
+ else
+ trace_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+
+static int
+wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+{
+ return -EINVAL;
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void wakeup_print_header(struct seq_file *s)
+{
+ trace_default_header(s);
+}
+#else
+static void wakeup_print_header(struct seq_file *s)
+{
+ trace_latency_header(s);
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(struct trace_array *tr, cycle_t delta)
+{
+ if (tracing_thresh) {
+ if (delta < tracing_thresh)
+ return 0;
+ } else {
+ if (delta <= tr->max_latency)
+ return 0;
+ }
+ return 1;
+}
+
+static void
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
+{
+ if (task != wakeup_task)
+ return;
+
+ wakeup_current_cpu = cpu;
+}
+
+static void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
+static void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
+static void notrace
+probe_wakeup_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
+{
+ struct trace_array_cpu *data;
+ cycle_t T0, T1, delta;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ tracing_record_cmdline(prev);
+
+ if (unlikely(!tracer_enabled))
+ return;
+
+ /*
+ * When we start a new trace, we set wakeup_task to NULL
+ * and then set tracer_enabled = 1. We want to make sure
+ * that another CPU does not see the tracer_enabled = 1
+ * and the wakeup_task with an older task, that might
+ * actually be the same as next.
+ */
+ smp_rmb();
+
+ if (next != wakeup_task)
+ return;
+
+ pc = preempt_count();
+
+ /* disable local data, not wakeup_cpu data */
+ cpu = raw_smp_processor_id();
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+ if (likely(disabled != 1))
+ goto out;
+
+ local_irq_save(flags);
+ arch_spin_lock(&wakeup_lock);
+
+ /* We could race with grabbing wakeup_lock */
+ if (unlikely(!tracer_enabled || next != wakeup_task))
+ goto out_unlock;
+
+ /* The task we are waiting for is waking up */
+ data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
+
+ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+ tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
+
+ T0 = data->preempt_timestamp;
+ T1 = ftrace_now(cpu);
+ delta = T1-T0;
+
+ if (!report_latency(wakeup_trace, delta))
+ goto out_unlock;
+
+ if (likely(!is_tracing_stopped())) {
+ wakeup_trace->max_latency = delta;
+ update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+ }
+
+out_unlock:
+ __wakeup_reset(wakeup_trace);
+ arch_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
+out:
+ atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+}
+
+static void __wakeup_reset(struct trace_array *tr)
+{
+ wakeup_cpu = -1;
+ wakeup_prio = -1;
+ tracing_dl = 0;
+
+ if (wakeup_task)
+ put_task_struct(wakeup_task);
+
+ wakeup_task = NULL;
+}
+
+static void wakeup_reset(struct trace_array *tr)
+{
+ unsigned long flags;
+
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+ local_irq_save(flags);
+ arch_spin_lock(&wakeup_lock);
+ __wakeup_reset(tr);
+ arch_spin_unlock(&wakeup_lock);
+ local_irq_restore(flags);
+}
+
+static void
+probe_wakeup(void *ignore, struct task_struct *p, int success)
+{
+ struct trace_array_cpu *data;
+ int cpu = smp_processor_id();
+ unsigned long flags;
+ long disabled;
+ int pc;
+
+ if (likely(!tracer_enabled))
+ return;
+
+ tracing_record_cmdline(p);
+ tracing_record_cmdline(current);
+
+ /*
+ * Semantic is like this:
+ * - wakeup tracer handles all tasks in the system, independently
+ * from their scheduling class;
+ * - wakeup_rt tracer handles tasks belonging to sched_dl and
+ * sched_rt class;
+ * - wakeup_dl handles tasks belonging to sched_dl class only.
+ */
+ if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
+ (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
+ (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
+ return;
+
+ pc = preempt_count();
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+ if (unlikely(disabled != 1))
+ goto out;
+
+ /* interrupts should be off from try_to_wake_up */
+ arch_spin_lock(&wakeup_lock);
+
+ /* check for races. */
+ if (!tracer_enabled || tracing_dl ||
+ (!dl_task(p) && p->prio >= wakeup_prio))
+ goto out_locked;
+
+ /* reset the trace */
+ __wakeup_reset(wakeup_trace);
+
+ wakeup_cpu = task_cpu(p);
+ wakeup_current_cpu = wakeup_cpu;
+ wakeup_prio = p->prio;
+
+ /*
+ * Once you start tracing a -deadline task, don't bother tracing
+ * another task until the first one wakes up.
+ */
+ if (dl_task(p))
+ tracing_dl = 1;
+ else
+ tracing_dl = 0;
+
+ wakeup_task = p;
+ get_task_struct(wakeup_task);
+
+ local_save_flags(flags);
+
+ data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
+ data->preempt_timestamp = ftrace_now(cpu);
+ tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+
+ /*
+ * We must be careful in using CALLER_ADDR2. But since wake_up
+ * is not called by an assembly function (where as schedule is)
+ * it should be safe to use it here.
+ */
+ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+
+out_locked:
+ arch_spin_unlock(&wakeup_lock);
+out:
+ atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
+}
+
+static void start_wakeup_tracer(struct trace_array *tr)
+{
+ int ret;
+
+ ret = register_trace_sched_wakeup(probe_wakeup, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup\n");
+ return;
+ }
+
+ ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_wakeup_new\n");
+ goto fail_deprobe;
+ }
+
+ ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
+ if (ret) {
+ pr_info("sched trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_switch\n");
+ goto fail_deprobe_wake_new;
+ }
+
+ ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
+ if (ret) {
+ pr_info("wakeup trace: Couldn't activate tracepoint"
+ " probe to kernel_sched_migrate_task\n");
+ return;
+ }
+
+ wakeup_reset(tr);
+
+ /*
+ * Don't let the tracer_enabled = 1 show up before
+ * the wakeup_task is reset. This may be overkill since
+ * wakeup_reset does a spin_unlock after setting the
+ * wakeup_task to NULL, but I want to be safe.
+ * This is a slow path anyway.
+ */
+ smp_wmb();
+
+ if (start_func_tracer(tr, is_graph()))
+ printk(KERN_ERR "failed to start wakeup tracer\n");
+
+ return;
+fail_deprobe_wake_new:
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
+fail_deprobe:
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
+}
+
+static void stop_wakeup_tracer(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+ stop_func_tracer(tr, is_graph());
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
+ unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
+}
+
+static bool wakeup_busy;
+
+static int __wakeup_tracer_init(struct trace_array *tr)
+{
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
+
+ tr->max_latency = 0;
+ wakeup_trace = tr;
+ ftrace_init_array_ops(tr, wakeup_tracer_call);
+ start_wakeup_tracer(tr);
+
+ wakeup_busy = true;
+ return 0;
+}
+
+static int wakeup_tracer_init(struct trace_array *tr)
+{
+ if (wakeup_busy)
+ return -EBUSY;
+
+ wakeup_dl = 0;
+ wakeup_rt = 0;
+ return __wakeup_tracer_init(tr);
+}
+
+static int wakeup_rt_tracer_init(struct trace_array *tr)
+{
+ if (wakeup_busy)
+ return -EBUSY;
+
+ wakeup_dl = 0;
+ wakeup_rt = 1;
+ return __wakeup_tracer_init(tr);
+}
+
+static int wakeup_dl_tracer_init(struct trace_array *tr)
+{
+ if (wakeup_busy)
+ return -EBUSY;
+
+ wakeup_dl = 1;
+ wakeup_rt = 0;
+ return __wakeup_tracer_init(tr);
+}
+
+static void wakeup_tracer_reset(struct trace_array *tr)
+{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
+ stop_wakeup_tracer(tr);
+ /* make sure we put back any tasks we are tracing */
+ wakeup_reset(tr);
+
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
+ ftrace_reset_array_ops(tr);
+ wakeup_busy = false;
+}
+
+static void wakeup_tracer_start(struct trace_array *tr)
+{
+ wakeup_reset(tr);
+ tracer_enabled = 1;
+}
+
+static void wakeup_tracer_stop(struct trace_array *tr)
+{
+ tracer_enabled = 0;
+}
+
+static struct tracer wakeup_tracer __read_mostly =
+{
+ .name = "wakeup",
+ .init = wakeup_tracer_init,
+ .reset = wakeup_tracer_reset,
+ .start = wakeup_tracer_start,
+ .stop = wakeup_tracer_stop,
+ .print_max = true,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_wakeup,
+#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
+ .allow_instances = true,
+ .use_max_tr = true,
+};
+
+static struct tracer wakeup_rt_tracer __read_mostly =
+{
+ .name = "wakeup_rt",
+ .init = wakeup_rt_tracer_init,
+ .reset = wakeup_tracer_reset,
+ .start = wakeup_tracer_start,
+ .stop = wakeup_tracer_stop,
+ .print_max = true,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_wakeup,
+#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
+ .allow_instances = true,
+ .use_max_tr = true,
+};
+
+static struct tracer wakeup_dl_tracer __read_mostly =
+{
+ .name = "wakeup_dl",
+ .init = wakeup_dl_tracer_init,
+ .reset = wakeup_tracer_reset,
+ .start = wakeup_tracer_start,
+ .stop = wakeup_tracer_stop,
+ .print_max = true,
+ .print_header = wakeup_print_header,
+ .print_line = wakeup_print_line,
+ .flags = &tracer_flags,
+ .set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
+#ifdef CONFIG_FTRACE_SELFTEST
+ .selftest = trace_selftest_startup_wakeup,
+#endif
+ .open = wakeup_trace_open,
+ .close = wakeup_trace_close,
+ .use_max_tr = true,
+};
+
+__init static int init_wakeup_tracer(void)
+{
+ int ret;
+
+ ret = register_tracer(&wakeup_tracer);
+ if (ret)
+ return ret;
+
+ ret = register_tracer(&wakeup_rt_tracer);
+ if (ret)
+ return ret;
+
+ ret = register_tracer(&wakeup_dl_tracer);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+core_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 000000000..287cf721c
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,1220 @@
+/* Include in trace.c */
+
+#include <linux/stringify.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+ switch (entry->type) {
+ case TRACE_FN:
+ case TRACE_CTX:
+ case TRACE_WAKE:
+ case TRACE_STACK:
+ case TRACE_PRINT:
+ case TRACE_BRANCH:
+ case TRACE_GRAPH_ENT:
+ case TRACE_GRAPH_RET:
+ return 1;
+ }
+ return 0;
+}
+
+static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
+{
+ struct ring_buffer_event *event;
+ struct trace_entry *entry;
+ unsigned int loops = 0;
+
+ while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
+ entry = ring_buffer_event_data(event);
+
+ /*
+ * The ring buffer is a size of trace_buf_size, if
+ * we loop more than the size, there's something wrong
+ * with the ring buffer.
+ */
+ if (loops++ > trace_buf_size) {
+ printk(KERN_CONT ".. bad ring buffer ");
+ goto failed;
+ }
+ if (!trace_valid_entry(entry)) {
+ printk(KERN_CONT ".. invalid entry %d ",
+ entry->type);
+ goto failed;
+ }
+ }
+ return 0;
+
+ failed:
+ /* disable tracing */
+ tracing_disabled = 1;
+ printk(KERN_CONT ".. corrupted trace buffer .. ");
+ return -1;
+}
+
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ */
+static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+{
+ unsigned long flags, cnt = 0;
+ int cpu, ret = 0;
+
+ /* Don't allow flipping of max traces now */
+ local_irq_save(flags);
+ arch_spin_lock(&buf->tr->max_lock);
+
+ cnt = ring_buffer_entries(buf->buffer);
+
+ /*
+ * The trace_test_buffer_cpu runs a while loop to consume all data.
+ * If the calling tracer is broken, and is constantly filling
+ * the buffer, this will run forever, and hard lock the box.
+ * We disable the ring buffer while we do this test to prevent
+ * a hard lock up.
+ */
+ tracing_off();
+ for_each_possible_cpu(cpu) {
+ ret = trace_test_buffer_cpu(buf, cpu);
+ if (ret)
+ break;
+ }
+ tracing_on();
+ arch_spin_unlock(&buf->tr->max_lock);
+ local_irq_restore(flags);
+
+ if (count)
+ *count = cnt;
+
+ return ret;
+}
+
+static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
+{
+ printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
+ trace->name, init_ret);
+}
+#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static int trace_selftest_test_probe1_cnt;
+static void trace_selftest_test_probe1_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ trace_selftest_test_probe1_cnt++;
+}
+
+static int trace_selftest_test_probe2_cnt;
+static void trace_selftest_test_probe2_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ trace_selftest_test_probe2_cnt++;
+}
+
+static int trace_selftest_test_probe3_cnt;
+static void trace_selftest_test_probe3_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ trace_selftest_test_probe3_cnt++;
+}
+
+static int trace_selftest_test_global_cnt;
+static void trace_selftest_test_global_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ trace_selftest_test_global_cnt++;
+}
+
+static int trace_selftest_test_dyn_cnt;
+static void trace_selftest_test_dyn_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ trace_selftest_test_dyn_cnt++;
+}
+
+static struct ftrace_ops test_probe1 = {
+ .func = trace_selftest_test_probe1_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static struct ftrace_ops test_probe2 = {
+ .func = trace_selftest_test_probe2_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static struct ftrace_ops test_probe3 = {
+ .func = trace_selftest_test_probe3_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static void print_counts(void)
+{
+ printk("(%d %d %d %d %d) ",
+ trace_selftest_test_probe1_cnt,
+ trace_selftest_test_probe2_cnt,
+ trace_selftest_test_probe3_cnt,
+ trace_selftest_test_global_cnt,
+ trace_selftest_test_dyn_cnt);
+}
+
+static void reset_counts(void)
+{
+ trace_selftest_test_probe1_cnt = 0;
+ trace_selftest_test_probe2_cnt = 0;
+ trace_selftest_test_probe3_cnt = 0;
+ trace_selftest_test_global_cnt = 0;
+ trace_selftest_test_dyn_cnt = 0;
+}
+
+static int trace_selftest_ops(struct trace_array *tr, int cnt)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ struct ftrace_ops *dyn_ops;
+ char *func1_name;
+ char *func2_name;
+ int len1;
+ int len2;
+ int ret = -1;
+
+ printk(KERN_CONT "PASSED\n");
+ pr_info("Testing dynamic ftrace ops #%d: ", cnt);
+
+ ftrace_enabled = 1;
+ reset_counts();
+
+ /* Handle PPC64 '.' name */
+ func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
+ len1 = strlen(func1_name);
+ len2 = strlen(func2_name);
+
+ /*
+ * Probe 1 will trace function 1.
+ * Probe 2 will trace function 2.
+ * Probe 3 will trace functions 1 and 2.
+ */
+ ftrace_set_filter(&test_probe1, func1_name, len1, 1);
+ ftrace_set_filter(&test_probe2, func2_name, len2, 1);
+ ftrace_set_filter(&test_probe3, func1_name, len1, 1);
+ ftrace_set_filter(&test_probe3, func2_name, len2, 0);
+
+ register_ftrace_function(&test_probe1);
+ register_ftrace_function(&test_probe2);
+ register_ftrace_function(&test_probe3);
+ /* First time we are running with main function */
+ if (cnt > 1) {
+ ftrace_init_array_ops(tr, trace_selftest_test_global_func);
+ register_ftrace_function(tr->ops);
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 1)
+ goto out;
+ if (trace_selftest_test_probe2_cnt != 0)
+ goto out;
+ if (trace_selftest_test_probe3_cnt != 1)
+ goto out;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME2();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 1)
+ goto out;
+ if (trace_selftest_test_probe2_cnt != 1)
+ goto out;
+ if (trace_selftest_test_probe3_cnt != 2)
+ goto out;
+
+ /* Add a dynamic probe */
+ dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
+ if (!dyn_ops) {
+ printk("MEMORY ERROR ");
+ goto out;
+ }
+
+ dyn_ops->func = trace_selftest_test_dyn_func;
+
+ register_ftrace_function(dyn_ops);
+
+ trace_selftest_test_global_cnt = 0;
+
+ DYN_FTRACE_TEST_NAME();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 1)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 3)
+ goto out_free;
+ if (cnt > 1) {
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+ }
+ if (trace_selftest_test_dyn_cnt == 0)
+ goto out_free;
+
+ DYN_FTRACE_TEST_NAME2();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 4)
+ goto out_free;
+
+ ret = 0;
+ out_free:
+ unregister_ftrace_function(dyn_ops);
+ kfree(dyn_ops);
+
+ out:
+ /* Purposely unregister in the same order */
+ unregister_ftrace_function(&test_probe1);
+ unregister_ftrace_function(&test_probe2);
+ unregister_ftrace_function(&test_probe3);
+ if (cnt > 1)
+ unregister_ftrace_function(tr->ops);
+ ftrace_reset_array_ops(tr);
+
+ /* Make sure everything is off */
+ reset_counts();
+ DYN_FTRACE_TEST_NAME();
+ DYN_FTRACE_TEST_NAME();
+
+ if (trace_selftest_test_probe1_cnt ||
+ trace_selftest_test_probe2_cnt ||
+ trace_selftest_test_probe3_cnt ||
+ trace_selftest_test_global_cnt ||
+ trace_selftest_test_dyn_cnt)
+ ret = -1;
+
+ ftrace_enabled = save_ftrace_enabled;
+
+ return ret;
+}
+
+/* Test dynamic code modification and ftrace filters */
+static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+ struct trace_array *tr,
+ int (*func)(void))
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ unsigned long count;
+ char *func_name;
+ int ret;
+
+ /* The ftrace test PASSED */
+ printk(KERN_CONT "PASSED\n");
+ pr_info("Testing dynamic ftrace: ");
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+
+ /* passed in by parameter to fool gcc from optimizing */
+ func();
+
+ /*
+ * Some archs *cough*PowerPC*cough* add characters to the
+ * start of the function names. We simply put a '*' to
+ * accommodate them.
+ */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+
+ /* filter only on our function */
+ ftrace_set_global_filter(func_name, strlen(func_name), 1);
+
+ /* enable tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+
+ /* we should have nothing in the buffer */
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
+ if (ret)
+ goto out;
+
+ if (count) {
+ ret = -1;
+ printk(KERN_CONT ".. filter did not filter .. ");
+ goto out;
+ }
+
+ /* call our function again */
+ func();
+
+ /* sleep again */
+ msleep(100);
+
+ /* stop the tracing. */
+ tracing_stop();
+ ftrace_enabled = 0;
+
+ /* check the trace buffer */
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
+ tracing_start();
+
+ /* we should only have one item */
+ if (!ret && count != 1) {
+ trace->reset(tr);
+ printk(KERN_CONT ".. filter failed count=%ld ..", count);
+ ret = -1;
+ goto out;
+ }
+
+ /* Test the ops with global tracing running */
+ ret = trace_selftest_ops(tr, 1);
+ trace->reset(tr);
+
+ out:
+ ftrace_enabled = save_ftrace_enabled;
+
+ /* Enable tracing on all functions again */
+ ftrace_set_global_filter(NULL, 0, 1);
+
+ /* Test the ops with global tracing off */
+ if (!ret)
+ ret = trace_selftest_ops(tr, 2);
+
+ return ret;
+}
+
+static int trace_selftest_recursion_cnt;
+static void trace_selftest_test_recursion_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ /*
+ * This function is registered without the recursion safe flag.
+ * The ftrace infrastructure should provide the recursion
+ * protection. If not, this will crash the kernel!
+ */
+ if (trace_selftest_recursion_cnt++ > 10)
+ return;
+ DYN_FTRACE_TEST_NAME();
+}
+
+static void trace_selftest_test_recursion_safe_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ /*
+ * We said we would provide our own recursion. By calling
+ * this function again, we should recurse back into this function
+ * and count again. But this only happens if the arch supports
+ * all of ftrace features and nothing else is using the function
+ * tracing utility.
+ */
+ if (trace_selftest_recursion_cnt++)
+ return;
+ DYN_FTRACE_TEST_NAME();
+}
+
+static struct ftrace_ops test_rec_probe = {
+ .func = trace_selftest_test_recursion_func,
+};
+
+static struct ftrace_ops test_recsafe_probe = {
+ .func = trace_selftest_test_recursion_safe_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static int
+trace_selftest_function_recursion(void)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ char *func_name;
+ int len;
+ int ret;
+
+ /* The previous test PASSED */
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace recursion: ");
+
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+
+ /* Handle PPC64 '.' name */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1);
+ if (ret) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_rec_probe);
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_rec_probe);
+
+ ret = -1;
+ if (trace_selftest_recursion_cnt != 1) {
+ pr_cont("*callback not called once (%d)* ",
+ trace_selftest_recursion_cnt);
+ goto out;
+ }
+
+ trace_selftest_recursion_cnt = 1;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace recursion safe: ");
+
+ ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1);
+ if (ret) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_recsafe_probe);
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_recsafe_probe);
+
+ ret = -1;
+ if (trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called expected 2 times (%d)* ",
+ trace_selftest_recursion_cnt);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ftrace_enabled = save_ftrace_enabled;
+
+ return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+# define trace_selftest_function_recursion() ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+static enum {
+ TRACE_SELFTEST_REGS_START,
+ TRACE_SELFTEST_REGS_FOUND,
+ TRACE_SELFTEST_REGS_NOT_FOUND,
+} trace_selftest_regs_stat;
+
+static void trace_selftest_test_regs_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ if (pt_regs)
+ trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
+ else
+ trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
+}
+
+static struct ftrace_ops test_regs_probe = {
+ .func = trace_selftest_test_regs_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
+};
+
+static int
+trace_selftest_function_regs(void)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ char *func_name;
+ int len;
+ int ret;
+ int supported = 0;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+ supported = 1;
+#endif
+
+ /* The previous test PASSED */
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace regs%s: ",
+ !supported ? "(no arch support)" : "");
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+
+ /* Handle PPC64 '.' name */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1);
+ /*
+ * If DYNAMIC_FTRACE is not set, then we just trace all functions.
+ * This test really doesn't care.
+ */
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_regs_probe);
+ /*
+ * Now if the arch does not support passing regs, then this should
+ * have failed.
+ */
+ if (!supported) {
+ if (!ret) {
+ pr_cont("*registered save-regs without arch support* ");
+ goto out;
+ }
+ test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
+ ret = register_ftrace_function(&test_regs_probe);
+ }
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_regs_probe);
+
+ ret = -1;
+
+ switch (trace_selftest_regs_stat) {
+ case TRACE_SELFTEST_REGS_START:
+ pr_cont("*callback never called* ");
+ goto out;
+
+ case TRACE_SELFTEST_REGS_FOUND:
+ if (supported)
+ break;
+ pr_cont("*callback received regs without arch support* ");
+ goto out;
+
+ case TRACE_SELFTEST_REGS_NOT_FOUND:
+ if (!supported)
+ break;
+ pr_cont("*callback received NULL regs* ");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ftrace_enabled = save_ftrace_enabled;
+
+ return ret;
+}
+
+/*
+ * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ */
+__init int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ unsigned long count;
+ int ret;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (ftrace_filter_param) {
+ printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
+ return 0;
+ }
+#endif
+
+ /* make sure msleep has been recorded */
+ msleep(1);
+
+ /* start the tracing */
+ ftrace_enabled = 1;
+
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tracing_stop();
+ ftrace_enabled = 0;
+
+ /* check the trace buffer */
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+ ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+ DYN_FTRACE_TEST_NAME);
+ if (ret)
+ goto out;
+
+ ret = trace_selftest_function_recursion();
+ if (ret)
+ goto out;
+
+ ret = trace_selftest_function_regs();
+ out:
+ ftrace_enabled = save_ftrace_enabled;
+
+ /* kill ftrace totally if we failed */
+ if (ret)
+ ftrace_kill();
+
+ return ret;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/* Maximum number of functions to trace before diagnosing a hang */
+#define GRAPH_MAX_FUNC_TEST 100000000
+
+static unsigned int graph_hang_thresh;
+
+/* Wrap the real function entry probe to avoid possible hanging */
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+{
+ /* This is harmlessly racy, we want to approximately detect a hang */
+ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
+ ftrace_graph_stop();
+ printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
+ if (ftrace_dump_on_oops) {
+ ftrace_dump(DUMP_ALL);
+ /* ftrace_dump() disables tracing */
+ tracing_on();
+ }
+ return 0;
+ }
+
+ return trace_graph_entry(trace);
+}
+
+/*
+ * Pretty much the same than for the function tracer from which the selftest
+ * has been borrowed.
+ */
+__init int
+trace_selftest_startup_function_graph(struct tracer *trace,
+ struct trace_array *tr)
+{
+ int ret;
+ unsigned long count;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (ftrace_filter_param) {
+ printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
+ return 0;
+ }
+#endif
+
+ /*
+ * Simulate the init() callback but we attach a watchdog callback
+ * to detect and recover from possible hangs
+ */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+ set_graph_array(tr);
+ ret = register_ftrace_graph(&trace_graph_return,
+ &trace_graph_entry_watchdog);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out;
+ }
+ tracing_start_cmdline_record();
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+
+ /* Have we just recovered from a hang? */
+ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
+ tracing_selftest_disabled = true;
+ ret = -1;
+ goto out;
+ }
+
+ tracing_stop();
+
+ /* check the trace buffer */
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+ /* Don't test dynamic tracing, the function tracer already did */
+
+out:
+ /* Stop it if we failed */
+ if (ret)
+ ftrace_graph_stop();
+
+ return ret;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+
+#ifdef CONFIG_IRQSOFF_TRACER
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tr->max_latency;
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /* reset the max latency */
+ tr->max_latency = 0;
+ /* disable interrupts for a bit */
+ local_irq_disable();
+ udelay(100);
+ local_irq_enable();
+
+ /*
+ * Stop the tracer to avoid a warning subsequent
+ * to buffer flipping failure because tracing_stop()
+ * disables the tr and max buffers, making flipping impossible
+ * in case of parallels max irqs off latencies.
+ */
+ trace->stop(tr);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check both trace buffers */
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ if (!ret)
+ ret = trace_test_buffer(&tr->max_buffer, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ tr->max_latency = save_max;
+
+ return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tr->max_latency;
+ unsigned long count;
+ int ret;
+
+ /*
+ * Now that the big kernel lock is no longer preemptable,
+ * and this is called with the BKL held, it will always
+ * fail. If preemption is already disabled, simply
+ * pass the test. When the BKL is removed, or becomes
+ * preemptible again, we will once again test this,
+ * so keep it in.
+ */
+ if (preempt_count()) {
+ printk(KERN_CONT "can not test ... force ");
+ return 0;
+ }
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /* reset the max latency */
+ tr->max_latency = 0;
+ /* disable preemption for a bit */
+ preempt_disable();
+ udelay(100);
+ preempt_enable();
+
+ /*
+ * Stop the tracer to avoid a warning subsequent
+ * to buffer flipping failure because tracing_stop()
+ * disables the tr and max buffers, making flipping impossible
+ * in case of parallels max preempt off latencies.
+ */
+ trace->stop(tr);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check both trace buffers */
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ if (!ret)
+ ret = trace_test_buffer(&tr->max_buffer, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ tr->max_latency = save_max;
+
+ return ret;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tr->max_latency;
+ unsigned long count;
+ int ret;
+
+ /*
+ * Now that the big kernel lock is no longer preemptable,
+ * and this is called with the BKL held, it will always
+ * fail. If preemption is already disabled, simply
+ * pass the test. When the BKL is removed, or becomes
+ * preemptible again, we will once again test this,
+ * so keep it in.
+ */
+ if (preempt_count()) {
+ printk(KERN_CONT "can not test ... force ");
+ return 0;
+ }
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ goto out_no_start;
+ }
+
+ /* reset the max latency */
+ tr->max_latency = 0;
+
+ /* disable preemption and interrupts for a bit */
+ preempt_disable();
+ local_irq_disable();
+ udelay(100);
+ preempt_enable();
+ /* reverse the order of preempt vs irqs */
+ local_irq_enable();
+
+ /*
+ * Stop the tracer to avoid a warning subsequent
+ * to buffer flipping failure because tracing_stop()
+ * disables the tr and max buffers, making flipping impossible
+ * in case of parallels max irqs/preempt off latencies.
+ */
+ trace->stop(tr);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check both trace buffers */
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ if (ret)
+ goto out;
+
+ ret = trace_test_buffer(&tr->max_buffer, &count);
+ if (ret)
+ goto out;
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+ /* do the test by disabling interrupts first this time */
+ tr->max_latency = 0;
+ tracing_start();
+ trace->start(tr);
+
+ preempt_disable();
+ local_irq_disable();
+ udelay(100);
+ preempt_enable();
+ /* reverse the order of preempt vs irqs */
+ local_irq_enable();
+
+ trace->stop(tr);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check both trace buffers */
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ if (ret)
+ goto out;
+
+ ret = trace_test_buffer(&tr->max_buffer, &count);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ goto out;
+ }
+
+out:
+ tracing_start();
+out_no_start:
+ trace->reset(tr);
+ tr->max_latency = save_max;
+
+ return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+
+#ifdef CONFIG_NOP_TRACER
+int
+trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
+{
+ /* What could possibly go wrong? */
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_SCHED_TRACER
+
+struct wakeup_test_data {
+ struct completion is_ready;
+ int go;
+};
+
+static int trace_wakeup_test_thread(void *data)
+{
+ /* Make this a -deadline thread */
+ static const struct sched_attr attr = {
+#ifdef CONFIG_SCHED_BFS
+ /* No deadline on BFS, use RR */
+ .sched_policy = SCHED_RR,
+#else
+ .sched_policy = SCHED_DEADLINE,
+ .sched_runtime = 100000ULL,
+ .sched_deadline = 10000000ULL,
+ .sched_period = 10000000ULL
+#endif
+ };
+ struct wakeup_test_data *x = data;
+
+ sched_setattr(current, &attr);
+
+ /* Make it know we have a new prio */
+ complete(&x->is_ready);
+
+ /* now go to sleep and let the test wake us up */
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!x->go) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ complete(&x->is_ready);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* we are awake, now wait to disappear */
+ while (!kthread_should_stop()) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long save_max = tr->max_latency;
+ struct task_struct *p;
+ struct wakeup_test_data data;
+ unsigned long count;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+
+ init_completion(&data.is_ready);
+
+ /* create a -deadline thread */
+ p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test");
+ if (IS_ERR(p)) {
+ printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+ return -1;
+ }
+
+ /* make sure the thread is running at -deadline policy */
+ wait_for_completion(&data.is_ready);
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /* reset the max latency */
+ tr->max_latency = 0;
+
+ while (p->on_rq) {
+ /*
+ * Sleep to make sure the -deadline thread is asleep too.
+ * On virtual machines we can't rely on timings,
+ * but we want to make sure this test still works.
+ */
+ msleep(100);
+ }
+
+ init_completion(&data.is_ready);
+
+ data.go = 1;
+ /* memory barrier is in the wake_up_process() */
+
+ wake_up_process(p);
+
+ /* Wait for the task to wake up */
+ wait_for_completion(&data.is_ready);
+
+ /* stop the tracing. */
+ tracing_stop();
+ /* check both trace buffers */
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
+ if (!ret)
+ ret = trace_test_buffer(&tr->max_buffer, &count);
+
+
+ trace->reset(tr);
+ tracing_start();
+
+ tr->max_latency = save_max;
+
+ /* kill the thread */
+ kthread_stop(p);
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check the trace buffer */
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_BRANCH_TRACER
+int
+trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
+{
+ unsigned long count;
+ int ret;
+
+ /* start the tracing */
+ ret = tracer_init(trace, tr);
+ if (ret) {
+ warn_failed_init_tracer(trace, ret);
+ return ret;
+ }
+
+ /* Sleep for a 1/10 of a second */
+ msleep(100);
+ /* stop the tracing. */
+ tracing_stop();
+ /* check the trace buffer */
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
+ trace->reset(tr);
+ tracing_start();
+
+ if (!ret && !count) {
+ printk(KERN_CONT ".. no entries found ..");
+ ret = -1;
+ }
+
+ return ret;
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 000000000..b4c475a0a
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,13 @@
+#include "trace.h"
+
+int DYN_FTRACE_TEST_NAME(void)
+{
+ /* used to call mcount */
+ return 0;
+}
+
+int DYN_FTRACE_TEST_NAME2(void)
+{
+ /* used to call mcount */
+ return 0;
+}
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000..e694c9f9e
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,377 @@
+/*
+ * trace_seq.c
+ *
+ * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * The trace_seq is a handy tool that allows you to pass a descriptor around
+ * to a buffer that other functions can write to. It is similar to the
+ * seq_file functionality but has some differences.
+ *
+ * To use it, the trace_seq must be initialized with trace_seq_init().
+ * This will set up the counters within the descriptor. You can call
+ * trace_seq_init() more than once to reset the trace_seq to start
+ * from scratch.
+ *
+ * The buffer size is currently PAGE_SIZE, although it may become dynamic
+ * in the future.
+ *
+ * A write to the buffer will either succed or fail. That is, unlike
+ * sprintf() there will not be a partial write (well it may write into
+ * the buffer but it wont update the pointers). This allows users to
+ * try to write something into the trace_seq buffer and if it fails
+ * they can flush it and try again.
+ *
+ */
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/trace_seq.h>
+
+/* How much buffer is left on the trace_seq? */
+#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
+
+/* How much buffer is written? */
+#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
+
+/*
+ * trace_seq should work with being initialized with 0s.
+ */
+static inline void __trace_seq_init(struct trace_seq *s)
+{
+ if (unlikely(!s->seq.size))
+ trace_seq_init(s);
+}
+
+/**
+ * trace_print_seq - move the contents of trace_seq into a seq_file
+ * @m: the seq_file descriptor that is the destination
+ * @s: the trace_seq descriptor that is the source.
+ *
+ * Returns 0 on success and non zero on error. If it succeeds to
+ * write to the seq_file it will reset the trace_seq, otherwise
+ * it does not modify the trace_seq to let the caller try again.
+ */
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ int ret;
+
+ __trace_seq_init(s);
+
+ ret = seq_buf_print_seq(m, &s->seq);
+
+ /*
+ * Only reset this buffer if we successfully wrote to the
+ * seq_file buffer. This lets the caller try again or
+ * do something else with the contents.
+ */
+ if (!ret)
+ trace_seq_init(s);
+
+ return ret;
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf() is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ unsigned int save_len = s->seq.len;
+ va_list ap;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ va_start(ap, fmt);
+ seq_buf_vprintf(&s->seq, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_bitmask - write a bitmask array in its ASCII representation
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * Writes a ASCII representation of a bitmask string into @s.
+ */
+void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_vprintf(&s->seq, fmt, args);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
+/**
+ * trace_seq_bprintf - Write the printf string from binary arguments
+ * @s: trace sequence descriptor
+ * @fmt: The format string for the @binary arguments
+ * @binary: The binary arguments for @fmt.
+ *
+ * When recording in a fast path, a printf may be recorded with just
+ * saving the format and the arguments as they were passed to the
+ * function, instead of wasting cycles converting the arguments into
+ * ASCII characters. Instead, the arguments are saved in a 32 bit
+ * word array that is defined by the format string constraints.
+ *
+ * This function will take the format and the binary array and finish
+ * the conversion into the ASCII string within the buffer.
+ */
+void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ seq_buf_bprintf(&s->seq, fmt, binary);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_bprintf);
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+void trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ unsigned int len = strlen(str);
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ seq_buf_putmem(&s->seq, str, len);
+}
+EXPORT_SYMBOL_GPL(trace_seq_puts);
+
+/**
+ * trace_seq_putc - trace sequence printing of simple character
+ * @s: trace sequence descriptor
+ * @c: simple character to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple charater
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+void trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return;
+ }
+
+ seq_buf_putc(&s->seq, c);
+}
+EXPORT_SYMBOL_GPL(trace_seq_putc);
+
+/**
+ * trace_seq_putmem - write raw data into the trace_seq buffer
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to copy into the buffer
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * There may be cases where raw memory needs to be written into the
+ * buffer and a strcpy() would not work. Using this function allows
+ * for such cases.
+ */
+void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+{
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ seq_buf_putmem(&s->seq, mem, len);
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem);
+
+/**
+ * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to write its hex ASCII representation of
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * This is similar to trace_seq_putmem() except instead of just copying the
+ * raw memory into the buffer it writes its ASCII representation of it
+ * in hex characters.
+ */
+void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+ unsigned int len)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
+
+ /* Each byte is represented by two chars */
+ if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ /* The added spaces can still cause an overflow */
+ seq_buf_putmem_hex(&s->seq, mem, len);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
+
+/**
+ * trace_seq_path - copy a path into the sequence buffer
+ * @s: trace sequence descriptor
+ * @path: path to write into the sequence buffer.
+ *
+ * Write a path name into the sequence buffer.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_path(struct trace_seq *s, const struct path *path)
+{
+ unsigned int save_len = s->seq.len;
+
+ if (s->full)
+ return 0;
+
+ __trace_seq_init(s);
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ seq_buf_path(&s->seq, path, "\n");
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_path);
+
+/**
+ * trace_seq_to_user - copy the squence buffer to user space
+ * @s: trace sequence descriptor
+ * @ubuf: The userspace memory location to copy to
+ * @cnt: The amount to copy
+ *
+ * Copies the sequence buffer into the userspace memory pointed to
+ * by @ubuf. It starts from the last read position (@s->readpos)
+ * and writes up to @cnt characters or till it reaches the end of
+ * the content in the buffer (@s->len), which ever comes first.
+ *
+ * On success, it returns a positive number of the number of bytes
+ * it copied.
+ *
+ * On failure it returns -EBUSY if all of the content in the
+ * sequence has been already read, which includes nothing in the
+ * sequenc (@s->len == @s->readpos).
+ *
+ * Returns -EFAULT if the copy to userspace fails.
+ */
+int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
+{
+ __trace_seq_init(s);
+ return seq_buf_to_user(&s->seq, ubuf, cnt);
+}
+EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
new file mode 100644
index 000000000..3f3449624
--- /dev/null
+++ b/kernel/trace/trace_stack.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/stacktrace.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+
+#include "trace.h"
+
+#define STACK_TRACE_ENTRIES 500
+
+#ifdef CC_USING_FENTRY
+# define fentry 1
+#else
+# define fentry 0
+#endif
+
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
+ { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
+static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+
+/*
+ * Reserve one entry for the passed in ip. This will allow
+ * us to remove most or all of the stack size overhead
+ * added by the stack tracer itself.
+ */
+static struct stack_trace max_stack_trace = {
+ .max_entries = STACK_TRACE_ENTRIES - 1,
+ .entries = &stack_dump_trace[1],
+};
+
+static unsigned long max_stack_size;
+static arch_spinlock_t max_stack_lock =
+ (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+static DEFINE_PER_CPU(int, trace_active);
+static DEFINE_MUTEX(stack_sysctl_mutex);
+
+int stack_tracer_enabled;
+static int last_stack_tracer_enabled;
+
+static inline void print_max_stack(void)
+{
+ long i;
+ int size;
+
+ pr_emerg(" Depth Size Location (%d entries)\n"
+ " ----- ---- --------\n",
+ max_stack_trace.nr_entries - 1);
+
+ for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ if (stack_dump_trace[i] == ULONG_MAX)
+ break;
+ if (i+1 == max_stack_trace.nr_entries ||
+ stack_dump_trace[i+1] == ULONG_MAX)
+ size = stack_dump_index[i];
+ else
+ size = stack_dump_index[i] - stack_dump_index[i+1];
+
+ pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
+ size, (void *)stack_dump_trace[i]);
+ }
+}
+
+static inline void
+check_stack(unsigned long ip, unsigned long *stack)
+{
+ unsigned long this_size, flags; unsigned long *p, *top, *start;
+ static int tracer_frame;
+ int frame_size = ACCESS_ONCE(tracer_frame);
+ int i;
+
+ this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
+ this_size = THREAD_SIZE - this_size;
+ /* Remove the frame of the tracer */
+ this_size -= frame_size;
+
+ if (this_size <= max_stack_size)
+ return;
+
+ /* we do not handle interrupt stacks yet */
+ if (!object_is_on_stack(stack))
+ return;
+
+ local_irq_save(flags);
+ arch_spin_lock(&max_stack_lock);
+
+ /* In case another CPU set the tracer_frame on us */
+ if (unlikely(!frame_size))
+ this_size -= tracer_frame;
+
+ /* a race could have already updated it */
+ if (this_size <= max_stack_size)
+ goto out;
+
+ max_stack_size = this_size;
+
+ max_stack_trace.nr_entries = 0;
+
+ if (using_ftrace_ops_list_func())
+ max_stack_trace.skip = 4;
+ else
+ max_stack_trace.skip = 3;
+
+ save_stack_trace(&max_stack_trace);
+
+ /*
+ * Add the passed in ip from the function tracer.
+ * Searching for this on the stack will skip over
+ * most of the overhead from the stack tracer itself.
+ */
+ stack_dump_trace[0] = ip;
+ max_stack_trace.nr_entries++;
+
+ /*
+ * Now find where in the stack these are.
+ */
+ i = 0;
+ start = stack;
+ top = (unsigned long *)
+ (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
+
+ /*
+ * Loop through all the entries. One of the entries may
+ * for some reason be missed on the stack, so we may
+ * have to account for them. If they are all there, this
+ * loop will only happen once. This code only takes place
+ * on a new max, so it is far from a fast path.
+ */
+ while (i < max_stack_trace.nr_entries) {
+ int found = 0;
+
+ stack_dump_index[i] = this_size;
+ p = start;
+
+ for (; p < top && i < max_stack_trace.nr_entries; p++) {
+ if (*p == stack_dump_trace[i]) {
+ this_size = stack_dump_index[i++] =
+ (top - p) * sizeof(unsigned long);
+ found = 1;
+ /* Start the search from here */
+ start = p + 1;
+ /*
+ * We do not want to show the overhead
+ * of the stack tracer stack in the
+ * max stack. If we haven't figured
+ * out what that is, then figure it out
+ * now.
+ */
+ if (unlikely(!tracer_frame) && i == 1) {
+ tracer_frame = (p - stack) *
+ sizeof(unsigned long);
+ max_stack_size -= tracer_frame;
+ }
+ }
+ }
+
+ if (!found)
+ i++;
+ }
+
+ if (task_stack_end_corrupted(current)) {
+ print_max_stack();
+ BUG();
+ }
+
+ out:
+ arch_spin_unlock(&max_stack_lock);
+ local_irq_restore(flags);
+}
+
+static void
+stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+{
+ unsigned long stack;
+ int cpu;
+
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+ /* no atomic needed, we only modify this variable by this cpu */
+ if (per_cpu(trace_active, cpu)++ != 0)
+ goto out;
+
+ /*
+ * When fentry is used, the traced function does not get
+ * its stack frame set up, and we lose the parent.
+ * The ip is pretty useless because the function tracer
+ * was called before that function set up its stack frame.
+ * In this case, we use the parent ip.
+ *
+ * By adding the return address of either the parent ip
+ * or the current ip we can disregard most of the stack usage
+ * caused by the stack tracer itself.
+ *
+ * The function tracer always reports the address of where the
+ * mcount call was, but the stack will hold the return address.
+ */
+ if (fentry)
+ ip = parent_ip;
+ else
+ ip += MCOUNT_INSN_SIZE;
+
+ check_stack(ip, &stack);
+
+ out:
+ per_cpu(trace_active, cpu)--;
+ /* prevent recursion in schedule */
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = stack_trace_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static ssize_t
+stack_max_size_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ unsigned long *ptr = filp->private_data;
+ char buf[64];
+ int r;
+
+ r = snprintf(buf, sizeof(buf), "%ld\n", *ptr);
+ if (r > sizeof(buf))
+ r = sizeof(buf);
+ return simple_read_from_buffer(ubuf, count, ppos, buf, r);
+}
+
+static ssize_t
+stack_max_size_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ long *ptr = filp->private_data;
+ unsigned long val, flags;
+ int ret;
+ int cpu;
+
+ ret = kstrtoul_from_user(ubuf, count, 10, &val);
+ if (ret)
+ return ret;
+
+ local_irq_save(flags);
+
+ /*
+ * In case we trace inside arch_spin_lock() or after (NMI),
+ * we will cause circular lock, so we also need to increase
+ * the percpu trace_active here.
+ */
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)++;
+
+ arch_spin_lock(&max_stack_lock);
+ *ptr = val;
+ arch_spin_unlock(&max_stack_lock);
+
+ per_cpu(trace_active, cpu)--;
+ local_irq_restore(flags);
+
+ return count;
+}
+
+static const struct file_operations stack_max_size_fops = {
+ .open = tracing_open_generic,
+ .read = stack_max_size_read,
+ .write = stack_max_size_write,
+ .llseek = default_llseek,
+};
+
+static void *
+__next(struct seq_file *m, loff_t *pos)
+{
+ long n = *pos - 1;
+
+ if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ return NULL;
+
+ m->private = (void *)n;
+ return &m->private;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return __next(m, pos);
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+ int cpu;
+
+ local_irq_disable();
+
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)++;
+
+ arch_spin_lock(&max_stack_lock);
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ return __next(m, pos);
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+ int cpu;
+
+ arch_spin_unlock(&max_stack_lock);
+
+ cpu = smp_processor_id();
+ per_cpu(trace_active, cpu)--;
+
+ local_irq_enable();
+}
+
+static void trace_lookup_stack(struct seq_file *m, long i)
+{
+ unsigned long addr = stack_dump_trace[i];
+
+ seq_printf(m, "%pS\n", (void *)addr);
+}
+
+static void print_disabled(struct seq_file *m)
+{
+ seq_puts(m, "#\n"
+ "# Stack tracer disabled\n"
+ "#\n"
+ "# To enable the stack tracer, either add 'stacktrace' to the\n"
+ "# kernel command line\n"
+ "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
+ "#\n");
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+ long i;
+ int size;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(m, " Depth Size Location"
+ " (%d entries)\n"
+ " ----- ---- --------\n",
+ max_stack_trace.nr_entries - 1);
+
+ if (!stack_tracer_enabled && !max_stack_size)
+ print_disabled(m);
+
+ return 0;
+ }
+
+ i = *(long *)v;
+
+ if (i >= max_stack_trace.nr_entries ||
+ stack_dump_trace[i] == ULONG_MAX)
+ return 0;
+
+ if (i+1 == max_stack_trace.nr_entries ||
+ stack_dump_trace[i+1] == ULONG_MAX)
+ size = stack_dump_index[i];
+ else
+ size = stack_dump_index[i] - stack_dump_index[i+1];
+
+ seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
+
+ trace_lookup_stack(m, i);
+
+ return 0;
+}
+
+static const struct seq_operations stack_trace_seq_ops = {
+ .start = t_start,
+ .next = t_next,
+ .stop = t_stop,
+ .show = t_show,
+};
+
+static int stack_trace_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &stack_trace_seq_ops);
+}
+
+static const struct file_operations stack_trace_fops = {
+ .open = stack_trace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int
+stack_trace_filter_open(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
+ inode, file);
+}
+
+static const struct file_operations stack_trace_filter_fops = {
+ .open = stack_trace_filter_open,
+ .read = seq_read,
+ .write = ftrace_filter_write,
+ .llseek = tracing_lseek,
+ .release = ftrace_regex_release,
+};
+
+int
+stack_trace_sysctl(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&stack_sysctl_mutex);
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write ||
+ (last_stack_tracer_enabled == !!stack_tracer_enabled))
+ goto out;
+
+ last_stack_tracer_enabled = !!stack_tracer_enabled;
+
+ if (stack_tracer_enabled)
+ register_ftrace_function(&trace_ops);
+ else
+ unregister_ftrace_function(&trace_ops);
+
+ out:
+ mutex_unlock(&stack_sysctl_mutex);
+ return ret;
+}
+
+static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
+
+static __init int enable_stacktrace(char *str)
+{
+ if (strncmp(str, "_filter=", 8) == 0)
+ strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+
+ stack_tracer_enabled = 1;
+ last_stack_tracer_enabled = 1;
+ return 1;
+}
+__setup("stacktrace", enable_stacktrace);
+
+static __init int stack_trace_init(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ trace_create_file("stack_max_size", 0644, d_tracer,
+ &max_stack_size, &stack_max_size_fops);
+
+ trace_create_file("stack_trace", 0444, d_tracer,
+ NULL, &stack_trace_fops);
+
+ trace_create_file("stack_trace_filter", 0444, d_tracer,
+ NULL, &stack_trace_filter_fops);
+
+ if (stack_trace_filter_buf[0])
+ ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
+
+ if (stack_tracer_enabled)
+ register_ftrace_function(&trace_ops);
+
+ return 0;
+}
+
+device_initcall(stack_trace_init);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 000000000..6cf935316
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,359 @@
+/*
+ * Infrastructure for statistic tracing (histogram output).
+ *
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Based on the code from trace_branch.c which is
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/tracefs.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/*
+ * List of stat red-black nodes from a tracer
+ * We use a such tree to sort quickly the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+ struct rb_node node;
+ void *stat;
+};
+
+/* A stat session is the stats output in one file */
+struct stat_session {
+ struct list_head session_list;
+ struct tracer_stat *ts;
+ struct rb_root stat_root;
+ struct mutex stat_mutex;
+ struct dentry *file;
+};
+
+/* All of the sessions currently in use. Each stat file embed one session */
+static LIST_HEAD(all_stat_sessions);
+static DEFINE_MUTEX(all_stat_sessions_mutex);
+
+/* The root directory for all stat files */
+static struct dentry *stat_dir;
+
+static void __reset_stat_session(struct stat_session *session)
+{
+ struct stat_node *snode, *n;
+
+ rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) {
+ if (session->ts->stat_release)
+ session->ts->stat_release(snode->stat);
+ kfree(snode);
+ }
+
+ session->stat_root = RB_ROOT;
+}
+
+static void reset_stat_session(struct stat_session *session)
+{
+ mutex_lock(&session->stat_mutex);
+ __reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+}
+
+static void destroy_session(struct stat_session *session)
+{
+ tracefs_remove(session->file);
+ __reset_stat_session(session);
+ mutex_destroy(&session->stat_mutex);
+ kfree(session);
+}
+
+typedef int (*cmp_stat_t)(void *, void *);
+
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct stat_node *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->stat = stat;
+
+ /*
+ * Figure out where to put new node
+ * This is a descendent sorting
+ */
+ while (*new) {
+ struct stat_node *this;
+ int result;
+
+ this = container_of(*new, struct stat_node, node);
+ result = cmp(data->stat, this->stat);
+
+ parent = *new;
+ if (result >= 0)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+ return 0;
+}
+
+/*
+ * For tracers that don't provide a stat_cmp callback.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
+ */
+static int dummy_cmp(void *p1, void *p2)
+{
+ return -1;
+}
+
+/*
+ * Initialize the stat rbtree at each trace_stat file opening.
+ * All of these copies and sorting are required on all opening
+ * since the stats could have changed between two file sessions.
+ */
+static int stat_seq_init(struct stat_session *session)
+{
+ struct tracer_stat *ts = session->ts;
+ struct rb_root *root = &session->stat_root;
+ void *stat;
+ int ret = 0;
+ int i;
+
+ mutex_lock(&session->stat_mutex);
+ __reset_stat_session(session);
+
+ if (!ts->stat_cmp)
+ ts->stat_cmp = dummy_cmp;
+
+ stat = ts->stat_start(ts);
+ if (!stat)
+ goto exit;
+
+ ret = insert_stat(root, stat, ts->stat_cmp);
+ if (ret)
+ goto exit;
+
+ /*
+ * Iterate over the tracer stat entries and store them in an rbtree.
+ */
+ for (i = 1; ; i++) {
+ stat = ts->stat_next(stat, i);
+
+ /* End of insertion */
+ if (!stat)
+ break;
+
+ ret = insert_stat(root, stat, ts->stat_cmp);
+ if (ret)
+ goto exit_free_rbtree;
+ }
+
+exit:
+ mutex_unlock(&session->stat_mutex);
+ return ret;
+
+exit_free_rbtree:
+ __reset_stat_session(session);
+ mutex_unlock(&session->stat_mutex);
+ return ret;
+}
+
+
+static void *stat_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct stat_session *session = s->private;
+ struct rb_node *node;
+ int n = *pos;
+ int i;
+
+ /* Prevent from tracer switch or rbtree modification */
+ mutex_lock(&session->stat_mutex);
+
+ /* If we are in the beginning of the file, print the headers */
+ if (session->ts->stat_headers) {
+ if (n == 0)
+ return SEQ_START_TOKEN;
+ n--;
+ }
+
+ node = rb_first(&session->stat_root);
+ for (i = 0; node && i < n; i++)
+ node = rb_next(node);
+
+ return node;
+}
+
+static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
+{
+ struct stat_session *session = s->private;
+ struct rb_node *node = p;
+
+ (*pos)++;
+
+ if (p == SEQ_START_TOKEN)
+ return rb_first(&session->stat_root);
+
+ return rb_next(node);
+}
+
+static void stat_seq_stop(struct seq_file *s, void *p)
+{
+ struct stat_session *session = s->private;
+ mutex_unlock(&session->stat_mutex);
+}
+
+static int stat_seq_show(struct seq_file *s, void *v)
+{
+ struct stat_session *session = s->private;
+ struct stat_node *l = container_of(v, struct stat_node, node);
+
+ if (v == SEQ_START_TOKEN)
+ return session->ts->stat_headers(s);
+
+ return session->ts->stat_show(s, l->stat);
+}
+
+static const struct seq_operations trace_stat_seq_ops = {
+ .start = stat_seq_start,
+ .next = stat_seq_next,
+ .stop = stat_seq_stop,
+ .show = stat_seq_show
+};
+
+/* The session stat is refilled and resorted at each stat file opening */
+static int tracing_stat_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct seq_file *m;
+ struct stat_session *session = inode->i_private;
+
+ ret = stat_seq_init(session);
+ if (ret)
+ return ret;
+
+ ret = seq_open(file, &trace_stat_seq_ops);
+ if (ret) {
+ reset_stat_session(session);
+ return ret;
+ }
+
+ m = file->private_data;
+ m->private = session;
+ return ret;
+}
+
+/*
+ * Avoid consuming memory with our now useless rbtree.
+ */
+static int tracing_stat_release(struct inode *i, struct file *f)
+{
+ struct stat_session *session = i->i_private;
+
+ reset_stat_session(session);
+
+ return seq_release(i, f);
+}
+
+static const struct file_operations tracing_stat_fops = {
+ .open = tracing_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_stat_release
+};
+
+static int tracing_stat_init(void)
+{
+ struct dentry *d_tracing;
+
+ d_tracing = tracing_init_dentry();
+ if (IS_ERR(d_tracing))
+ return 0;
+
+ stat_dir = tracefs_create_dir("trace_stat", d_tracing);
+ if (!stat_dir)
+ pr_warning("Could not create tracefs "
+ "'trace_stat' entry\n");
+ return 0;
+}
+
+static int init_stat_file(struct stat_session *session)
+{
+ if (!stat_dir && tracing_stat_init())
+ return -ENODEV;
+
+ session->file = tracefs_create_file(session->ts->name, 0644,
+ stat_dir,
+ session, &tracing_stat_fops);
+ if (!session->file)
+ return -ENOMEM;
+ return 0;
+}
+
+int register_stat_tracer(struct tracer_stat *trace)
+{
+ struct stat_session *session, *node;
+ int ret;
+
+ if (!trace)
+ return -EINVAL;
+
+ if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
+ return -EINVAL;
+
+ /* Already registered? */
+ mutex_lock(&all_stat_sessions_mutex);
+ list_for_each_entry(node, &all_stat_sessions, session_list) {
+ if (node->ts == trace) {
+ mutex_unlock(&all_stat_sessions_mutex);
+ return -EINVAL;
+ }
+ }
+ mutex_unlock(&all_stat_sessions_mutex);
+
+ /* Init the session */
+ session = kzalloc(sizeof(*session), GFP_KERNEL);
+ if (!session)
+ return -ENOMEM;
+
+ session->ts = trace;
+ INIT_LIST_HEAD(&session->session_list);
+ mutex_init(&session->stat_mutex);
+
+ ret = init_stat_file(session);
+ if (ret) {
+ destroy_session(session);
+ return ret;
+ }
+
+ /* Register */
+ mutex_lock(&all_stat_sessions_mutex);
+ list_add_tail(&session->session_list, &all_stat_sessions);
+ mutex_unlock(&all_stat_sessions_mutex);
+
+ return 0;
+}
+
+void unregister_stat_tracer(struct tracer_stat *trace)
+{
+ struct stat_session *node, *tmp;
+
+ mutex_lock(&all_stat_sessions_mutex);
+ list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+ if (node->ts == trace) {
+ list_del(&node->session_list);
+ destroy_session(node);
+ break;
+ }
+ }
+ mutex_unlock(&all_stat_sessions_mutex);
+}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 000000000..8f03914b9
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,33 @@
+#ifndef __TRACE_STAT_H
+#define __TRACE_STAT_H
+
+#include <linux/seq_file.h>
+
+/*
+ * If you want to provide a stat file (one-shot statistics), fill
+ * an iterator with stat_start/stat_next and a stat_show callbacks.
+ * The others callbacks are optional.
+ */
+struct tracer_stat {
+ /* The name of your stat file */
+ const char *name;
+ /* Iteration over statistic entries */
+ void *(*stat_start)(struct tracer_stat *trace);
+ void *(*stat_next)(void *prev, int idx);
+ /* Compare two entries for stats sorting */
+ int (*stat_cmp)(void *p1, void *p2);
+ /* Print a stat entry */
+ int (*stat_show)(struct seq_file *s, void *p);
+ /* Release an entry */
+ void (*stat_release)(void *stat);
+ /* Print the headers of your stat entries */
+ int (*stat_headers)(struct seq_file *s);
+};
+
+/*
+ * Destroy or create a stat file
+ */
+extern int register_stat_tracer(struct tracer_stat *trace);
+extern void unregister_stat_tracer(struct tracer_stat *trace);
+
+#endif /* __TRACE_STAT_H */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
new file mode 100644
index 000000000..f97f6e3a6
--- /dev/null
+++ b/kernel/trace/trace_syscalls.c
@@ -0,0 +1,750 @@
+#include <trace/syscall.h>
+#include <trace/events/syscalls.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
+#include <linux/ftrace.h>
+#include <linux/perf_event.h>
+#include <asm/syscall.h>
+
+#include "trace_output.h"
+#include "trace.h"
+
+static DEFINE_MUTEX(syscall_trace_lock);
+
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data);
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data);
+
+static struct list_head *
+syscall_get_enter_fields(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ return &entry->enter_fields;
+}
+
+extern struct syscall_metadata *__start_syscalls_metadata[];
+extern struct syscall_metadata *__stop_syscalls_metadata[];
+
+static struct syscall_metadata **syscalls_metadata;
+
+#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+ /*
+ * Only compare after the "sys" prefix. Archs that use
+ * syscall wrappers may have syscalls symbols aliases prefixed
+ * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
+ * mismatch.
+ */
+ return !strcmp(sym + 3, name + 3);
+}
+#endif
+
+#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
+/*
+ * Some architectures that allow for 32bit applications
+ * to run on a 64bit kernel, do not map the syscalls for
+ * the 32bit tasks the same as they do for 64bit tasks.
+ *
+ * *cough*x86*cough*
+ *
+ * In such a case, instead of reporting the wrong syscalls,
+ * simply ignore them.
+ *
+ * For an arch to ignore the compat syscalls it needs to
+ * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
+ * define the function arch_trace_is_compat_syscall() to let
+ * the tracing system know that it should ignore it.
+ */
+static int
+trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
+{
+ if (unlikely(arch_trace_is_compat_syscall(regs)))
+ return -1;
+
+ return syscall_get_nr(task, regs);
+}
+#else
+static inline int
+trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
+{
+ return syscall_get_nr(task, regs);
+}
+#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
+
+static __init struct syscall_metadata *
+find_syscall_meta(unsigned long syscall)
+{
+ struct syscall_metadata **start;
+ struct syscall_metadata **stop;
+ char str[KSYM_SYMBOL_LEN];
+
+
+ start = __start_syscalls_metadata;
+ stop = __stop_syscalls_metadata;
+ kallsyms_lookup(syscall, NULL, NULL, NULL, str);
+
+ if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
+ return NULL;
+
+ for ( ; start < stop; start++) {
+ if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
+ return *start;
+ }
+ return NULL;
+}
+
+static struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+ if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
+ return NULL;
+
+ return syscalls_metadata[nr];
+}
+
+static enum print_line_t
+print_syscall_enter(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
+ struct syscall_trace_enter *trace;
+ struct syscall_metadata *entry;
+ int i, syscall;
+
+ trace = (typeof(trace))ent;
+ syscall = trace->nr;
+ entry = syscall_nr_to_meta(syscall);
+
+ if (!entry)
+ goto end;
+
+ if (entry->enter_event->event.type != ent->type) {
+ WARN_ON_ONCE(1);
+ goto end;
+ }
+
+ trace_seq_printf(s, "%s(", entry->name);
+
+ for (i = 0; i < entry->nb_args; i++) {
+
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
+ /* parameter types */
+ if (trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", entry->types[i]);
+
+ /* parameter values */
+ trace_seq_printf(s, "%s: %lx%s", entry->args[i],
+ trace->args[i],
+ i == entry->nb_args - 1 ? "" : ", ");
+ }
+
+ trace_seq_putc(s, ')');
+end:
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static enum print_line_t
+print_syscall_exit(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
+ struct syscall_trace_exit *trace;
+ int syscall;
+ struct syscall_metadata *entry;
+
+ trace = (typeof(trace))ent;
+ syscall = trace->nr;
+ entry = syscall_nr_to_meta(syscall);
+
+ if (!entry) {
+ trace_seq_putc(s, '\n');
+ goto out;
+ }
+
+ if (entry->exit_event->event.type != ent->type) {
+ WARN_ON_ONCE(1);
+ return TRACE_TYPE_UNHANDLED;
+ }
+
+ trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+ trace->ret);
+
+ out:
+ return trace_handle_return(s);
+}
+
+extern char *__bad_type_size(void);
+
+#define SYSCALL_FIELD(type, name) \
+ sizeof(type) != sizeof(trace.name) ? \
+ __bad_type_size() : \
+ #type, #name, offsetof(typeof(trace), name), \
+ sizeof(trace.name), is_signed_type(type)
+
+static int __init
+__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+{
+ int i;
+ int pos = 0;
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+ for (i = 0; i < entry->nb_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
+ entry->args[i], sizeof(unsigned long),
+ i == entry->nb_args - 1 ? "" : ", ");
+ }
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ for (i = 0; i < entry->nb_args; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", ((unsigned long)(REC->%s))", entry->args[i]);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
+{
+ char *print_fmt;
+ int len;
+ struct syscall_metadata *entry = call->data;
+
+ if (entry->enter_event != call) {
+ call->print_fmt = "\"0x%lx\", REC->ret";
+ return 0;
+ }
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_enter_print_fmt(entry, NULL, 0);
+
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_enter_print_fmt(entry, print_fmt, len + 1);
+ call->print_fmt = print_fmt;
+
+ return 0;
+}
+
+static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ if (entry->enter_event == call)
+ kfree(call->print_fmt);
+}
+
+static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
+{
+ struct syscall_trace_enter trace;
+ struct syscall_metadata *meta = call->data;
+ int ret;
+ int i;
+ int offset = offsetof(typeof(trace), args);
+
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < meta->nb_args; i++) {
+ ret = trace_define_field(call, meta->types[i],
+ meta->args[i], offset,
+ sizeof(unsigned long), 0,
+ FILTER_OTHER);
+ offset += sizeof(unsigned long);
+ }
+
+ return ret;
+}
+
+static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
+{
+ struct syscall_trace_exit trace;
+ int ret;
+
+ ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+ if (ret)
+ return ret;
+
+ ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
+ FILTER_OTHER);
+
+ return ret;
+}
+
+static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct trace_array *tr = data;
+ struct ftrace_event_file *ftrace_file;
+ struct syscall_trace_enter *entry;
+ struct syscall_metadata *sys_data;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ unsigned long irq_flags;
+ int pc;
+ int syscall_nr;
+ int size;
+
+ syscall_nr = trace_get_syscall_nr(current, regs);
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+ return;
+
+ /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
+ ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+ if (!ftrace_file)
+ return;
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer,
+ sys_data->enter_event->event.type, size, irq_flags, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->nr = syscall_nr;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+
+ event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ irq_flags, pc);
+}
+
+static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
+{
+ struct trace_array *tr = data;
+ struct ftrace_event_file *ftrace_file;
+ struct syscall_trace_exit *entry;
+ struct syscall_metadata *sys_data;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ unsigned long irq_flags;
+ int pc;
+ int syscall_nr;
+
+ syscall_nr = trace_get_syscall_nr(current, regs);
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+ return;
+
+ /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
+ ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+ if (!ftrace_file)
+ return;
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer,
+ sys_data->exit_event->event.type, sizeof(*entry),
+ irq_flags, pc);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ entry->nr = syscall_nr;
+ entry->ret = syscall_get_return_value(current, regs);
+
+ event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+ irq_flags, pc);
+}
+
+static int reg_event_syscall_enter(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
+{
+ struct trace_array *tr = file->tr;
+ int ret = 0;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!tr->sys_refcount_enter)
+ ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
+ if (!ret) {
+ rcu_assign_pointer(tr->enter_syscall_files[num], file);
+ tr->sys_refcount_enter++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+static void unreg_event_syscall_enter(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
+{
+ struct trace_array *tr = file->tr;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+ return;
+ mutex_lock(&syscall_trace_lock);
+ tr->sys_refcount_enter--;
+ RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
+ if (!tr->sys_refcount_enter)
+ unregister_trace_sys_enter(ftrace_syscall_enter, tr);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+static int reg_event_syscall_exit(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
+{
+ struct trace_array *tr = file->tr;
+ int ret = 0;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+ return -ENOSYS;
+ mutex_lock(&syscall_trace_lock);
+ if (!tr->sys_refcount_exit)
+ ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
+ if (!ret) {
+ rcu_assign_pointer(tr->exit_syscall_files[num], file);
+ tr->sys_refcount_exit++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+static void unreg_event_syscall_exit(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
+{
+ struct trace_array *tr = file->tr;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+ return;
+ mutex_lock(&syscall_trace_lock);
+ tr->sys_refcount_exit--;
+ RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
+ if (!tr->sys_refcount_exit)
+ unregister_trace_sys_exit(ftrace_syscall_exit, tr);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+static int __init init_syscall_trace(struct ftrace_event_call *call)
+{
+ int id;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls) {
+ pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
+ ((struct syscall_metadata *)call->data)->name);
+ return -ENOSYS;
+ }
+
+ if (set_syscall_print_fmt(call) < 0)
+ return -ENOMEM;
+
+ id = trace_event_raw_init(call);
+
+ if (id < 0) {
+ free_syscall_print_fmt(call);
+ return id;
+ }
+
+ return id;
+}
+
+struct trace_event_functions enter_syscall_print_funcs = {
+ .trace = print_syscall_enter,
+};
+
+struct trace_event_functions exit_syscall_print_funcs = {
+ .trace = print_syscall_exit,
+};
+
+struct ftrace_event_class __refdata event_class_syscall_enter = {
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
+};
+
+struct ftrace_event_class __refdata event_class_syscall_exit = {
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
+ .raw_init = init_syscall_trace,
+};
+
+unsigned long __init __weak arch_syscall_addr(int nr)
+{
+ return (unsigned long)sys_call_table[nr];
+}
+
+void __init init_ftrace_syscalls(void)
+{
+ struct syscall_metadata *meta;
+ unsigned long addr;
+ int i;
+
+ syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
+ GFP_KERNEL);
+ if (!syscalls_metadata) {
+ WARN_ON(1);
+ return;
+ }
+
+ for (i = 0; i < NR_syscalls; i++) {
+ addr = arch_syscall_addr(i);
+ meta = find_syscall_meta(addr);
+ if (!meta)
+ continue;
+
+ meta->syscall_nr = i;
+ syscalls_metadata[i] = meta;
+ }
+}
+
+#ifdef CONFIG_PERF_EVENTS
+
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
+static int sys_perf_refcount_enter;
+static int sys_perf_refcount_exit;
+
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+{
+ struct syscall_metadata *sys_data;
+ struct syscall_trace_enter *rec;
+ struct hlist_head *head;
+ int syscall_nr;
+ int rctx;
+ int size;
+
+ syscall_nr = trace_get_syscall_nr(current, regs);
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+ return;
+ if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ head = this_cpu_ptr(sys_data->enter_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ /* get the size after alignment with the u32 buffer size field */
+ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+ size = ALIGN(size + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
+ sys_data->enter_event->event.type, NULL, &rctx);
+ if (!rec)
+ return;
+
+ rec->nr = syscall_nr;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+ (unsigned long *)&rec->args);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+}
+
+static int perf_sysenter_enable(struct ftrace_event_call *call)
+{
+ int ret = 0;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_perf_refcount_enter)
+ ret = register_trace_sys_enter(perf_syscall_enter, NULL);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall entry trace point");
+ } else {
+ set_bit(num, enabled_perf_enter_syscalls);
+ sys_perf_refcount_enter++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+static void perf_sysenter_disable(struct ftrace_event_call *call)
+{
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+ mutex_lock(&syscall_trace_lock);
+ sys_perf_refcount_enter--;
+ clear_bit(num, enabled_perf_enter_syscalls);
+ if (!sys_perf_refcount_enter)
+ unregister_trace_sys_enter(perf_syscall_enter, NULL);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+{
+ struct syscall_metadata *sys_data;
+ struct syscall_trace_exit *rec;
+ struct hlist_head *head;
+ int syscall_nr;
+ int rctx;
+ int size;
+
+ syscall_nr = trace_get_syscall_nr(current, regs);
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+ return;
+ if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
+ return;
+
+ sys_data = syscall_nr_to_meta(syscall_nr);
+ if (!sys_data)
+ return;
+
+ head = this_cpu_ptr(sys_data->exit_event->perf_events);
+ if (hlist_empty(head))
+ return;
+
+ /* We can probably do that at build time */
+ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
+
+ rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
+ sys_data->exit_event->event.type, NULL, &rctx);
+ if (!rec)
+ return;
+
+ rec->nr = syscall_nr;
+ rec->ret = syscall_get_return_value(current, regs);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+}
+
+static int perf_sysexit_enable(struct ftrace_event_call *call)
+{
+ int ret = 0;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+ mutex_lock(&syscall_trace_lock);
+ if (!sys_perf_refcount_exit)
+ ret = register_trace_sys_exit(perf_syscall_exit, NULL);
+ if (ret) {
+ pr_info("event trace: Could not activate"
+ "syscall exit trace point");
+ } else {
+ set_bit(num, enabled_perf_exit_syscalls);
+ sys_perf_refcount_exit++;
+ }
+ mutex_unlock(&syscall_trace_lock);
+ return ret;
+}
+
+static void perf_sysexit_disable(struct ftrace_event_call *call)
+{
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+ mutex_lock(&syscall_trace_lock);
+ sys_perf_refcount_exit--;
+ clear_bit(num, enabled_perf_exit_syscalls);
+ if (!sys_perf_refcount_exit)
+ unregister_trace_sys_exit(perf_syscall_exit, NULL);
+ mutex_unlock(&syscall_trace_lock);
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct ftrace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_enter(file, event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_enter(file, event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysenter_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysenter_disable(event);
+ return 0;
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data)
+{
+ struct ftrace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_exit(file, event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_exit(file, event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysexit_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysexit_disable(event);
+ return 0;
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ return 0;
+#endif
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
new file mode 100644
index 000000000..6dd022c7b
--- /dev/null
+++ b/kernel/trace/trace_uprobe.c
@@ -0,0 +1,1336 @@
+/*
+ * uprobes-based tracing events
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (C) IBM Corporation, 2010-2012
+ * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/uprobes.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+
+#include "trace_probe.h"
+
+#define UPROBE_EVENT_SYSTEM "uprobes"
+
+struct uprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long vaddr[];
+};
+
+#define SIZEOF_TRACE_ENTRY(is_return) \
+ (sizeof(struct uprobe_trace_entry_head) + \
+ sizeof(unsigned long) * (is_return ? 2 : 1))
+
+#define DATAOF_TRACE_ENTRY(entry, is_return) \
+ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
+
+struct trace_uprobe_filter {
+ rwlock_t rwlock;
+ int nr_systemwide;
+ struct list_head perf_events;
+};
+
+/*
+ * uprobe event core functions
+ */
+struct trace_uprobe {
+ struct list_head list;
+ struct trace_uprobe_filter filter;
+ struct uprobe_consumer consumer;
+ struct inode *inode;
+ char *filename;
+ unsigned long offset;
+ unsigned long nhit;
+ struct trace_probe tp;
+};
+
+#define SIZEOF_TRACE_UPROBE(n) \
+ (offsetof(struct trace_uprobe, tp.args) + \
+ (sizeof(struct probe_arg) * (n)))
+
+static int register_uprobe_event(struct trace_uprobe *tu);
+static int unregister_uprobe_event(struct trace_uprobe *tu);
+
+static DEFINE_MUTEX(uprobe_lock);
+static LIST_HEAD(uprobe_list);
+
+struct uprobe_dispatch_data {
+ struct trace_uprobe *tu;
+ unsigned long bp_addr;
+};
+
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+ unsigned long func, struct pt_regs *regs);
+
+#ifdef CONFIG_STACK_GROWSUP
+static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
+{
+ return addr - (n * sizeof(long));
+}
+#else
+static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
+{
+ return addr + (n * sizeof(long));
+}
+#endif
+
+static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long ret;
+ unsigned long addr = user_stack_pointer(regs);
+
+ addr = adjust_stack_addr(addr, n);
+
+ if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret)))
+ return 0;
+
+ return ret;
+}
+
+/*
+ * Uprobes-specific fetch functions
+ */
+#define DEFINE_FETCH_stack(type) \
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
+{ \
+ *(type *)dest = (type)get_user_stack_nth(regs, \
+ ((unsigned long)offset)); \
+}
+DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
+
+#define DEFINE_FETCH_memory(type) \
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
+ void *addr, void *dest) \
+{ \
+ type retval; \
+ void __user *vaddr = (void __force __user *) addr; \
+ \
+ if (copy_from_user(&retval, vaddr, sizeof(type))) \
+ *(type *)dest = 0; \
+ else \
+ *(type *) dest = retval; \
+}
+DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
+ * length and relative data location.
+ */
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ long ret;
+ u32 rloc = *(u32 *)dest;
+ int maxlen = get_rloc_len(rloc);
+ u8 *dst = get_rloc_data(dest);
+ void __user *src = (void __force __user *) addr;
+
+ if (!maxlen)
+ return;
+
+ ret = strncpy_from_user(dst, src, maxlen);
+
+ if (ret < 0) { /* Failed to fetch string */
+ ((u8 *)get_rloc_data(dest))[0] = '\0';
+ *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc));
+ } else {
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc));
+ }
+}
+
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
+{
+ int len;
+ void __user *vaddr = (void __force __user *) addr;
+
+ len = strnlen_user(vaddr, MAX_STRING_SIZE);
+
+ if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */
+ *(u32 *)dest = 0;
+ else
+ *(u32 *)dest = len;
+}
+
+static unsigned long translate_user_vaddr(void *file_offset)
+{
+ unsigned long base_addr;
+ struct uprobe_dispatch_data *udd;
+
+ udd = (void *) current->utask->vaddr;
+
+ base_addr = udd->bp_addr - udd->tu->offset;
+ return base_addr + (unsigned long)file_offset;
+}
+
+#define DEFINE_FETCH_file_offset(type) \
+static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
+ void *offset, void *dest)\
+{ \
+ void *vaddr = (void *)translate_user_vaddr(offset); \
+ \
+ FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \
+}
+DEFINE_BASIC_FETCH_FUNCS(file_offset)
+DEFINE_FETCH_file_offset(string)
+DEFINE_FETCH_file_offset(string_size)
+
+/* Fetch type information table */
+static const struct fetch_type uprobes_fetch_type_table[] = {
+ /* Special types */
+ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+ sizeof(u32), 1, "__data_loc char[]"),
+ [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+ string_size, sizeof(u32), 0, "u32"),
+ /* Basic types */
+ ASSIGN_FETCH_TYPE(u8, u8, 0),
+ ASSIGN_FETCH_TYPE(u16, u16, 0),
+ ASSIGN_FETCH_TYPE(u32, u32, 0),
+ ASSIGN_FETCH_TYPE(u64, u64, 0),
+ ASSIGN_FETCH_TYPE(s8, u8, 1),
+ ASSIGN_FETCH_TYPE(s16, u16, 1),
+ ASSIGN_FETCH_TYPE(s32, u32, 1),
+ ASSIGN_FETCH_TYPE(s64, u64, 1),
+
+ ASSIGN_FETCH_TYPE_END
+};
+
+static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
+{
+ rwlock_init(&filter->rwlock);
+ filter->nr_systemwide = 0;
+ INIT_LIST_HEAD(&filter->perf_events);
+}
+
+static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
+{
+ return !filter->nr_systemwide && list_empty(&filter->perf_events);
+}
+
+static inline bool is_ret_probe(struct trace_uprobe *tu)
+{
+ return tu->consumer.ret_handler != NULL;
+}
+
+/*
+ * Allocate new trace_uprobe and initialize it (including uprobes).
+ */
+static struct trace_uprobe *
+alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
+{
+ struct trace_uprobe *tu;
+
+ if (!event || !is_good_name(event))
+ return ERR_PTR(-EINVAL);
+
+ if (!group || !is_good_name(group))
+ return ERR_PTR(-EINVAL);
+
+ tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
+ if (!tu)
+ return ERR_PTR(-ENOMEM);
+
+ tu->tp.call.class = &tu->tp.class;
+ tu->tp.call.name = kstrdup(event, GFP_KERNEL);
+ if (!tu->tp.call.name)
+ goto error;
+
+ tu->tp.class.system = kstrdup(group, GFP_KERNEL);
+ if (!tu->tp.class.system)
+ goto error;
+
+ INIT_LIST_HEAD(&tu->list);
+ INIT_LIST_HEAD(&tu->tp.files);
+ tu->consumer.handler = uprobe_dispatcher;
+ if (is_ret)
+ tu->consumer.ret_handler = uretprobe_dispatcher;
+ init_trace_uprobe_filter(&tu->filter);
+ return tu;
+
+error:
+ kfree(tu->tp.call.name);
+ kfree(tu);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+static void free_trace_uprobe(struct trace_uprobe *tu)
+{
+ int i;
+
+ for (i = 0; i < tu->tp.nr_args; i++)
+ traceprobe_free_probe_arg(&tu->tp.args[i]);
+
+ iput(tu->inode);
+ kfree(tu->tp.call.class->system);
+ kfree(tu->tp.call.name);
+ kfree(tu->filename);
+ kfree(tu);
+}
+
+static struct trace_uprobe *find_probe_event(const char *event, const char *group)
+{
+ struct trace_uprobe *tu;
+
+ list_for_each_entry(tu, &uprobe_list, list)
+ if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
+ strcmp(tu->tp.call.class->system, group) == 0)
+ return tu;
+
+ return NULL;
+}
+
+/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
+static int unregister_trace_uprobe(struct trace_uprobe *tu)
+{
+ int ret;
+
+ ret = unregister_uprobe_event(tu);
+ if (ret)
+ return ret;
+
+ list_del(&tu->list);
+ free_trace_uprobe(tu);
+ return 0;
+}
+
+/* Register a trace_uprobe and probe_event */
+static int register_trace_uprobe(struct trace_uprobe *tu)
+{
+ struct trace_uprobe *old_tu;
+ int ret;
+
+ mutex_lock(&uprobe_lock);
+
+ /* register as an event */
+ old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+ tu->tp.call.class->system);
+ if (old_tu) {
+ /* delete old event */
+ ret = unregister_trace_uprobe(old_tu);
+ if (ret)
+ goto end;
+ }
+
+ ret = register_uprobe_event(tu);
+ if (ret) {
+ pr_warning("Failed to register probe event(%d)\n", ret);
+ goto end;
+ }
+
+ list_add_tail(&tu->list, &uprobe_list);
+
+end:
+ mutex_unlock(&uprobe_lock);
+
+ return ret;
+}
+
+/*
+ * Argument syntax:
+ * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
+ *
+ * - Remove uprobe: -:[GRP/]EVENT
+ */
+static int create_trace_uprobe(int argc, char **argv)
+{
+ struct trace_uprobe *tu;
+ struct inode *inode;
+ char *arg, *event, *group, *filename;
+ char buf[MAX_EVENT_NAME_LEN];
+ struct path path;
+ unsigned long offset;
+ bool is_delete, is_return;
+ int i, ret;
+
+ inode = NULL;
+ ret = 0;
+ is_delete = false;
+ is_return = false;
+ event = NULL;
+ group = NULL;
+
+ /* argc must be >= 1 */
+ if (argv[0][0] == '-')
+ is_delete = true;
+ else if (argv[0][0] == 'r')
+ is_return = true;
+ else if (argv[0][0] != 'p') {
+ pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
+ return -EINVAL;
+ }
+
+ if (argv[0][1] == ':') {
+ event = &argv[0][2];
+ arg = strchr(event, '/');
+
+ if (arg) {
+ group = event;
+ event = arg + 1;
+ event[-1] = '\0';
+
+ if (strlen(group) == 0) {
+ pr_info("Group name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (strlen(event) == 0) {
+ pr_info("Event name is not specified\n");
+ return -EINVAL;
+ }
+ }
+ if (!group)
+ group = UPROBE_EVENT_SYSTEM;
+
+ if (is_delete) {
+ int ret;
+
+ if (!event) {
+ pr_info("Delete command needs an event name.\n");
+ return -EINVAL;
+ }
+ mutex_lock(&uprobe_lock);
+ tu = find_probe_event(event, group);
+
+ if (!tu) {
+ mutex_unlock(&uprobe_lock);
+ pr_info("Event %s/%s doesn't exist.\n", group, event);
+ return -ENOENT;
+ }
+ /* delete an event */
+ ret = unregister_trace_uprobe(tu);
+ mutex_unlock(&uprobe_lock);
+ return ret;
+ }
+
+ if (argc < 2) {
+ pr_info("Probe point is not specified.\n");
+ return -EINVAL;
+ }
+ if (isdigit(argv[1][0])) {
+ pr_info("probe point must be have a filename.\n");
+ return -EINVAL;
+ }
+ arg = strchr(argv[1], ':');
+ if (!arg) {
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+
+ *arg++ = '\0';
+ filename = argv[1];
+ ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto fail_address_parse;
+
+ inode = igrab(d_inode(path.dentry));
+ path_put(&path);
+
+ if (!inode || !S_ISREG(inode->i_mode)) {
+ ret = -EINVAL;
+ goto fail_address_parse;
+ }
+
+ ret = kstrtoul(arg, 0, &offset);
+ if (ret)
+ goto fail_address_parse;
+
+ argc -= 2;
+ argv += 2;
+
+ /* setup a probe */
+ if (!event) {
+ char *tail;
+ char *ptr;
+
+ tail = kstrdup(kbasename(filename), GFP_KERNEL);
+ if (!tail) {
+ ret = -ENOMEM;
+ goto fail_address_parse;
+ }
+
+ ptr = strpbrk(tail, ".-_");
+ if (ptr)
+ *ptr = '\0';
+
+ snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
+ event = buf;
+ kfree(tail);
+ }
+
+ tu = alloc_trace_uprobe(group, event, argc, is_return);
+ if (IS_ERR(tu)) {
+ pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
+ ret = PTR_ERR(tu);
+ goto fail_address_parse;
+ }
+ tu->offset = offset;
+ tu->inode = inode;
+ tu->filename = kstrdup(filename, GFP_KERNEL);
+
+ if (!tu->filename) {
+ pr_info("Failed to allocate filename.\n");
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* parse arguments */
+ ret = 0;
+ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ struct probe_arg *parg = &tu->tp.args[i];
+
+ /* Increment count for freeing args in error case */
+ tu->tp.nr_args++;
+
+ /* Parse argument name */
+ arg = strchr(argv[i], '=');
+ if (arg) {
+ *arg++ = '\0';
+ parg->name = kstrdup(argv[i], GFP_KERNEL);
+ } else {
+ arg = argv[i];
+ /* If argument name is omitted, set "argN" */
+ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
+ parg->name = kstrdup(buf, GFP_KERNEL);
+ }
+
+ if (!parg->name) {
+ pr_info("Failed to allocate argument[%d] name.\n", i);
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (!is_good_name(parg->name)) {
+ pr_info("Invalid argument[%d] name: %s\n", i, parg->name);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) {
+ pr_info("Argument[%d] name '%s' conflicts with "
+ "another field.\n", i, argv[i]);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Parse fetch argument */
+ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg,
+ is_return, false,
+ uprobes_fetch_type_table);
+ if (ret) {
+ pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+ goto error;
+ }
+ }
+
+ ret = register_trace_uprobe(tu);
+ if (ret)
+ goto error;
+ return 0;
+
+error:
+ free_trace_uprobe(tu);
+ return ret;
+
+fail_address_parse:
+ iput(inode);
+
+ pr_info("Failed to parse address or file.\n");
+
+ return ret;
+}
+
+static int cleanup_all_probes(void)
+{
+ struct trace_uprobe *tu;
+ int ret = 0;
+
+ mutex_lock(&uprobe_lock);
+ while (!list_empty(&uprobe_list)) {
+ tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
+ ret = unregister_trace_uprobe(tu);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&uprobe_lock);
+ return ret;
+}
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&uprobe_lock);
+ return seq_list_start(&uprobe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &uprobe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&uprobe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_uprobe *tu = v;
+ char c = is_ret_probe(tu) ? 'r' : 'p';
+ int i;
+
+ seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
+ ftrace_event_name(&tu->tp.call));
+ seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
+
+ for (i = 0; i < tu->tp.nr_args; i++)
+ seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_seq_show
+};
+
+static int probes_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = cleanup_all_probes();
+ if (ret)
+ return ret;
+ }
+
+ return seq_open(file, &probes_seq_op);
+}
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+}
+
+static const struct file_operations uprobe_events_ops = {
+ .owner = THIS_MODULE,
+ .open = probes_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = probes_write,
+};
+
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+ struct trace_uprobe *tu = v;
+
+ seq_printf(m, " %s %-44s %15lu\n", tu->filename,
+ ftrace_event_name(&tu->tp.call), tu->nhit);
+ return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+ .start = probes_seq_start,
+ .next = probes_seq_next,
+ .stop = probes_seq_stop,
+ .show = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations uprobe_profile_ops = {
+ .owner = THIS_MODULE,
+ .open = profile_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+struct uprobe_cpu_buffer {
+ struct mutex mutex;
+ void *buf;
+};
+static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
+static int uprobe_buffer_refcnt;
+
+static int uprobe_buffer_init(void)
+{
+ int cpu, err_cpu;
+
+ uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer);
+ if (uprobe_cpu_buffer == NULL)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct page *p = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL, 0);
+ if (p == NULL) {
+ err_cpu = cpu;
+ goto err;
+ }
+ per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p);
+ mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex);
+ }
+
+ return 0;
+
+err:
+ for_each_possible_cpu(cpu) {
+ if (cpu == err_cpu)
+ break;
+ free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf);
+ }
+
+ free_percpu(uprobe_cpu_buffer);
+ return -ENOMEM;
+}
+
+static int uprobe_buffer_enable(void)
+{
+ int ret = 0;
+
+ BUG_ON(!mutex_is_locked(&event_mutex));
+
+ if (uprobe_buffer_refcnt++ == 0) {
+ ret = uprobe_buffer_init();
+ if (ret < 0)
+ uprobe_buffer_refcnt--;
+ }
+
+ return ret;
+}
+
+static void uprobe_buffer_disable(void)
+{
+ int cpu;
+
+ BUG_ON(!mutex_is_locked(&event_mutex));
+
+ if (--uprobe_buffer_refcnt == 0) {
+ for_each_possible_cpu(cpu)
+ free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer,
+ cpu)->buf);
+
+ free_percpu(uprobe_cpu_buffer);
+ uprobe_cpu_buffer = NULL;
+ }
+}
+
+static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
+{
+ struct uprobe_cpu_buffer *ucb;
+ int cpu;
+
+ cpu = raw_smp_processor_id();
+ ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu);
+
+ /*
+ * Use per-cpu buffers for fastest access, but we might migrate
+ * so the mutex makes sure we have sole access to it.
+ */
+ mutex_lock(&ucb->mutex);
+
+ return ucb;
+}
+
+static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
+{
+ mutex_unlock(&ucb->mutex);
+}
+
+static void __uprobe_trace_func(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize,
+ struct ftrace_event_file *ftrace_file)
+{
+ struct uprobe_trace_entry_head *entry;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ void *data;
+ int size, esize;
+ struct ftrace_event_call *call = &tu->tp.call;
+
+ WARN_ON(call != ftrace_file->event_call);
+
+ if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
+ return;
+
+ if (ftrace_trigger_soft_disabled(ftrace_file))
+ return;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ size = esize + tu->tp.size + dsize;
+ event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ call->event.type, size, 0, 0);
+ if (!event)
+ return;
+
+ entry = ring_buffer_event_data(event);
+ if (is_ret_probe(tu)) {
+ entry->vaddr[0] = func;
+ entry->vaddr[1] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ entry->vaddr[0] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
+ memcpy(data, ucb->buf, tu->tp.size + dsize);
+
+ event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
+}
+
+/* uprobe handler */
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ struct event_file_link *link;
+
+ if (is_ret_probe(tu))
+ return 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(link, &tu->tp.files, list)
+ __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ struct event_file_link *link;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(link, &tu->tp.files, list)
+ __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+ rcu_read_unlock();
+}
+
+/* Event entry printers */
+static enum print_line_t
+print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
+{
+ struct uprobe_trace_entry_head *entry;
+ struct trace_seq *s = &iter->seq;
+ struct trace_uprobe *tu;
+ u8 *data;
+ int i;
+
+ entry = (struct uprobe_trace_entry_head *)iter->ent;
+ tu = container_of(event, struct trace_uprobe, tp.call.event);
+
+ if (is_ret_probe(tu)) {
+ trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[1], entry->vaddr[0]);
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[0]);
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
+ for (i = 0; i < tu->tp.nr_args; i++) {
+ struct probe_arg *parg = &tu->tp.args[i];
+
+ if (!parg->type->print(s, parg->name, data + parg->offset, entry))
+ goto out;
+ }
+
+ trace_seq_putc(s, '\n');
+
+ out:
+ return trace_handle_return(s);
+}
+
+typedef bool (*filter_func_t)(struct uprobe_consumer *self,
+ enum uprobe_filter_ctx ctx,
+ struct mm_struct *mm);
+
+static int
+probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
+ filter_func_t filter)
+{
+ bool enabled = trace_probe_is_enabled(&tu->tp);
+ struct event_file_link *link = NULL;
+ int ret;
+
+ if (file) {
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ return -EINTR;
+
+ link = kmalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return -ENOMEM;
+
+ link->file = file;
+ list_add_tail_rcu(&link->list, &tu->tp.files);
+
+ tu->tp.flags |= TP_FLAG_TRACE;
+ } else {
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ return -EINTR;
+
+ tu->tp.flags |= TP_FLAG_PROFILE;
+ }
+
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ if (enabled)
+ return 0;
+
+ ret = uprobe_buffer_enable();
+ if (ret)
+ goto err_flags;
+
+ tu->consumer.filter = filter;
+ ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ if (ret)
+ goto err_buffer;
+
+ return 0;
+
+ err_buffer:
+ uprobe_buffer_disable();
+
+ err_flags:
+ if (file) {
+ list_del(&link->list);
+ kfree(link);
+ tu->tp.flags &= ~TP_FLAG_TRACE;
+ } else {
+ tu->tp.flags &= ~TP_FLAG_PROFILE;
+ }
+ return ret;
+}
+
+static void
+probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
+{
+ if (!trace_probe_is_enabled(&tu->tp))
+ return;
+
+ if (file) {
+ struct event_file_link *link;
+
+ link = find_event_file_link(&tu->tp, file);
+ if (!link)
+ return;
+
+ list_del_rcu(&link->list);
+ /* synchronize with u{,ret}probe_trace_func */
+ synchronize_sched();
+ kfree(link);
+
+ if (!list_empty(&tu->tp.files))
+ return;
+ }
+
+ WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+
+ uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+ tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
+
+ uprobe_buffer_disable();
+}
+
+static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+ int ret, i, size;
+ struct uprobe_trace_entry_head field;
+ struct trace_uprobe *tu = event_call->data;
+
+ if (is_ret_probe(tu)) {
+ DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
+ size = SIZEOF_TRACE_ENTRY(true);
+ } else {
+ DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
+ size = SIZEOF_TRACE_ENTRY(false);
+ }
+ /* Set argument names as fields */
+ for (i = 0; i < tu->tp.nr_args; i++) {
+ struct probe_arg *parg = &tu->tp.args[i];
+
+ ret = trace_define_field(event_call, parg->type->fmttype,
+ parg->name, size + parg->offset,
+ parg->type->size, parg->type->is_signed,
+ FILTER_OTHER);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_PERF_EVENTS
+static bool
+__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
+{
+ struct perf_event *event;
+
+ if (filter->nr_systemwide)
+ return true;
+
+ list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
+ if (event->hw.target->mm == mm)
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool
+uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+{
+ return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
+}
+
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.target) {
+ list_del(&event->hw.tp_list);
+ done = tu->filter.nr_systemwide ||
+ (event->hw.target->flags & PF_EXITING) ||
+ uprobe_filter_event(tu, event);
+ } else {
+ tu->filter.nr_systemwide--;
+ done = tu->filter.nr_systemwide;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ if (!done)
+ return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+
+ return 0;
+}
+
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+{
+ bool done;
+ int err;
+
+ write_lock(&tu->filter.rwlock);
+ if (event->hw.target) {
+ /*
+ * event->parent != NULL means copy_process(), we can avoid
+ * uprobe_apply(). current->mm must be probed and we can rely
+ * on dup_mmap() which preserves the already installed bp's.
+ *
+ * attr.enable_on_exec means that exec/mmap will install the
+ * breakpoints we need.
+ */
+ done = tu->filter.nr_systemwide ||
+ event->parent || event->attr.enable_on_exec ||
+ uprobe_filter_event(tu, event);
+ list_add(&event->hw.tp_list, &tu->filter.perf_events);
+ } else {
+ done = tu->filter.nr_systemwide;
+ tu->filter.nr_systemwide++;
+ }
+ write_unlock(&tu->filter.rwlock);
+
+ err = 0;
+ if (!done) {
+ err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ if (err)
+ uprobe_perf_close(tu, event);
+ }
+ return err;
+}
+
+static bool uprobe_perf_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct trace_uprobe *tu;
+ int ret;
+
+ tu = container_of(uc, struct trace_uprobe, consumer);
+ read_lock(&tu->filter.rwlock);
+ ret = __uprobe_perf_filter(&tu->filter, mm);
+ read_unlock(&tu->filter.rwlock);
+
+ return ret;
+}
+
+static void __uprobe_perf_func(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ struct ftrace_event_call *call = &tu->tp.call;
+ struct uprobe_trace_entry_head *entry;
+ struct hlist_head *head;
+ void *data;
+ int size, esize;
+ int rctx;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ size = esize + tu->tp.size + dsize;
+ size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
+ return;
+
+ preempt_disable();
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ goto out;
+
+ entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+ if (!entry)
+ goto out;
+
+ if (is_ret_probe(tu)) {
+ entry->vaddr[0] = func;
+ entry->vaddr[1] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ entry->vaddr[0] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
+ memcpy(data, ucb->buf, tu->tp.size + dsize);
+
+ if (size - esize > tu->tp.size + dsize) {
+ int len = tu->tp.size + dsize;
+
+ memset(data + len, 0, size - esize - len);
+ }
+
+ perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+ out:
+ preempt_enable();
+}
+
+/* uprobe profile handler */
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ return UPROBE_HANDLER_REMOVE;
+
+ if (!is_ret_probe(tu))
+ __uprobe_perf_func(tu, 0, regs, ucb, dsize);
+ return 0;
+}
+
+static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer *ucb, int dsize)
+{
+ __uprobe_perf_func(tu, func, regs, ucb, dsize);
+}
+#endif /* CONFIG_PERF_EVENTS */
+
+static int
+trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
+ void *data)
+{
+ struct trace_uprobe *tu = event->data;
+ struct ftrace_event_file *file = data;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return probe_event_enable(tu, file, NULL);
+
+ case TRACE_REG_UNREGISTER:
+ probe_event_disable(tu, file);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return probe_event_enable(tu, NULL, uprobe_perf_filter);
+
+ case TRACE_REG_PERF_UNREGISTER:
+ probe_event_disable(tu, NULL);
+ return 0;
+
+ case TRACE_REG_PERF_OPEN:
+ return uprobe_perf_open(tu, data);
+
+ case TRACE_REG_PERF_CLOSE:
+ return uprobe_perf_close(tu, data);
+
+#endif
+ default:
+ return 0;
+ }
+ return 0;
+}
+
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+{
+ struct trace_uprobe *tu;
+ struct uprobe_dispatch_data udd;
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+ int ret = 0;
+
+
+ tu = container_of(con, struct trace_uprobe, consumer);
+ tu->nhit++;
+
+ udd.tu = tu;
+ udd.bp_addr = instruction_pointer(regs);
+
+ current->utask->vaddr = (unsigned long) &udd;
+
+ if (WARN_ON_ONCE(!uprobe_cpu_buffer))
+ return 0;
+
+ dsize = __get_data_size(&tu->tp, regs);
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ ucb = uprobe_buffer_get();
+ store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ ret |= uprobe_trace_func(tu, regs, ucb, dsize);
+
+#ifdef CONFIG_PERF_EVENTS
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ ret |= uprobe_perf_func(tu, regs, ucb, dsize);
+#endif
+ uprobe_buffer_put(ucb);
+ return ret;
+}
+
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+ unsigned long func, struct pt_regs *regs)
+{
+ struct trace_uprobe *tu;
+ struct uprobe_dispatch_data udd;
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+
+ tu = container_of(con, struct trace_uprobe, consumer);
+
+ udd.tu = tu;
+ udd.bp_addr = func;
+
+ current->utask->vaddr = (unsigned long) &udd;
+
+ if (WARN_ON_ONCE(!uprobe_cpu_buffer))
+ return 0;
+
+ dsize = __get_data_size(&tu->tp, regs);
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+
+ ucb = uprobe_buffer_get();
+ store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
+
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ uretprobe_trace_func(tu, func, regs, ucb, dsize);
+
+#ifdef CONFIG_PERF_EVENTS
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ uretprobe_perf_func(tu, func, regs, ucb, dsize);
+#endif
+ uprobe_buffer_put(ucb);
+ return 0;
+}
+
+static struct trace_event_functions uprobe_funcs = {
+ .trace = print_uprobe_event
+};
+
+static int register_uprobe_event(struct trace_uprobe *tu)
+{
+ struct ftrace_event_call *call = &tu->tp.call;
+ int ret;
+
+ /* Initialize ftrace_event_call */
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &uprobe_funcs;
+ call->class->define_fields = uprobe_event_define_fields;
+
+ if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
+ return -ENOMEM;
+
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
+ kfree(call->print_fmt);
+ return -ENODEV;
+ }
+
+ call->class->reg = trace_uprobe_register;
+ call->data = tu;
+ ret = trace_add_event_call(call);
+
+ if (ret) {
+ pr_info("Failed to register uprobe event: %s\n",
+ ftrace_event_name(call));
+ kfree(call->print_fmt);
+ unregister_ftrace_event(&call->event);
+ }
+
+ return ret;
+}
+
+static int unregister_uprobe_event(struct trace_uprobe *tu)
+{
+ int ret;
+
+ /* tu->event is unregistered in trace_remove_event_call() */
+ ret = trace_remove_event_call(&tu->tp.call);
+ if (ret)
+ return ret;
+ kfree(tu->tp.call.print_fmt);
+ tu->tp.call.print_fmt = NULL;
+ return 0;
+}
+
+/* Make a trace interface for controling probe points */
+static __init int init_uprobe_trace(void)
+{
+ struct dentry *d_tracer;
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer))
+ return 0;
+
+ trace_create_file("uprobe_events", 0644, d_tracer,
+ NULL, &uprobe_events_ops);
+ /* Profile interface */
+ trace_create_file("uprobe_profile", 0444, d_tracer,
+ NULL, &uprobe_profile_ops);
+ return 0;
+}
+
+fs_initcall(init_uprobe_trace);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 000000000..3490407dc
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,520 @@
+/*
+ * Copyright (C) 2008-2014 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/tracepoint.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/static_key.h>
+
+extern struct tracepoint * const __start___tracepoints_ptrs[];
+extern struct tracepoint * const __stop___tracepoints_ptrs[];
+
+/* Set to 1 to enable tracepoint debug output */
+static const int tracepoint_debug;
+
+#ifdef CONFIG_MODULES
+/*
+ * Tracepoint module list mutex protects the local module list.
+ */
+static DEFINE_MUTEX(tracepoint_module_list_mutex);
+
+/* Local list of struct tp_module */
+static LIST_HEAD(tracepoint_module_list);
+#endif /* CONFIG_MODULES */
+
+/*
+ * tracepoints_mutex protects the builtin and module tracepoints.
+ * tracepoints_mutex nests inside tracepoint_module_list_mutex.
+ */
+static DEFINE_MUTEX(tracepoints_mutex);
+
+/*
+ * Note about RCU :
+ * It is used to delay the free of multiple probes array until a quiescent
+ * state is reached.
+ */
+struct tp_probes {
+ struct rcu_head rcu;
+ struct tracepoint_func probes[0];
+};
+
+static inline void *allocate_probes(int count)
+{
+ struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
+ + sizeof(struct tp_probes), GFP_KERNEL);
+ return p == NULL ? NULL : p->probes;
+}
+
+static void rcu_free_old_probes(struct rcu_head *head)
+{
+ kfree(container_of(head, struct tp_probes, rcu));
+}
+
+static inline void release_probes(struct tracepoint_func *old)
+{
+ if (old) {
+ struct tp_probes *tp_probes = container_of(old,
+ struct tp_probes, probes[0]);
+ call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
+ }
+}
+
+static void debug_print_probes(struct tracepoint_func *funcs)
+{
+ int i;
+
+ if (!tracepoint_debug || !funcs)
+ return;
+
+ for (i = 0; funcs[i].func; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
+}
+
+static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
+{
+ int nr_probes = 0;
+ struct tracepoint_func *old, *new;
+
+ if (WARN_ON(!tp_func->func))
+ return ERR_PTR(-EINVAL);
+
+ debug_print_probes(*funcs);
+ old = *funcs;
+ if (old) {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
+ return ERR_PTR(-EEXIST);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = allocate_probes(nr_probes + 2);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (old)
+ memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+ new[nr_probes] = *tp_func;
+ new[nr_probes + 1].func = NULL;
+ *funcs = new;
+ debug_print_probes(*funcs);
+ return old;
+}
+
+static void *func_remove(struct tracepoint_func **funcs,
+ struct tracepoint_func *tp_func)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct tracepoint_func *old, *new;
+
+ old = *funcs;
+
+ if (!old)
+ return ERR_PTR(-ENOENT);
+
+ debug_print_probes(*funcs);
+ /* (N -> M), (N > 1, M >= 0) probes */
+ if (tp_func->func) {
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if (old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data)
+ nr_del++;
+ }
+ }
+
+ /*
+ * If probe is NULL, then nr_probes = nr_del = 0, and then the
+ * entire entry will be removed.
+ */
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ *funcs = NULL;
+ debug_print_probes(*funcs);
+ return old;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 0) */
+ /* + 1 for NULL */
+ new = allocate_probes(nr_probes - nr_del + 1);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old[i].func; i++)
+ if (old[i].func != tp_func->func
+ || old[i].data != tp_func->data)
+ new[j++] = old[i];
+ new[nr_probes - nr_del].func = NULL;
+ *funcs = new;
+ }
+ debug_print_probes(*funcs);
+ return old;
+}
+
+/*
+ * Add the probe function to a tracepoint.
+ */
+static int tracepoint_add_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
+{
+ struct tracepoint_func *old, *tp_funcs;
+
+ if (tp->regfunc && !static_key_enabled(&tp->key))
+ tp->regfunc();
+
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_add(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
+ }
+
+ /*
+ * rcu_assign_pointer has a smp_wmb() which makes sure that the new
+ * probe callbacks array is consistent before setting a pointer to it.
+ * This array is referenced by __DO_TRACE from
+ * include/linux/tracepoints.h. A matching smp_read_barrier_depends()
+ * is used.
+ */
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ if (!static_key_enabled(&tp->key))
+ static_key_slow_inc(&tp->key);
+ release_probes(old);
+ return 0;
+}
+
+/*
+ * Remove a probe function from a tracepoint.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function insures that the original callback is not used anymore. This insured
+ * by preempt_disable around the call site.
+ */
+static int tracepoint_remove_func(struct tracepoint *tp,
+ struct tracepoint_func *func)
+{
+ struct tracepoint_func *old, *tp_funcs;
+
+ tp_funcs = rcu_dereference_protected(tp->funcs,
+ lockdep_is_held(&tracepoints_mutex));
+ old = func_remove(&tp_funcs, func);
+ if (IS_ERR(old)) {
+ WARN_ON_ONCE(1);
+ return PTR_ERR(old);
+ }
+
+ if (!tp_funcs) {
+ /* Removed last function */
+ if (tp->unregfunc && static_key_enabled(&tp->key))
+ tp->unregfunc();
+
+ if (static_key_enabled(&tp->key))
+ static_key_slow_dec(&tp->key);
+ }
+ rcu_assign_pointer(tp->funcs, tp_funcs);
+ release_probes(old);
+ return 0;
+}
+
+/**
+ * tracepoint_probe_register - Connect a probe to a tracepoint
+ * @tp: tracepoint
+ * @probe: probe handler
+ * @data: tracepoint data
+ *
+ * Returns 0 if ok, error value on error.
+ * Note: if @tp is within a module, the caller is responsible for
+ * unregistering the probe before the module is gone. This can be
+ * performed either with a tracepoint module going notifier, or from
+ * within module exit functions.
+ */
+int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
+{
+ struct tracepoint_func tp_func;
+ int ret;
+
+ mutex_lock(&tracepoints_mutex);
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_add_func(tp, &tp_func);
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register);
+
+/**
+ * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
+ * @tp: tracepoint
+ * @probe: probe function pointer
+ * @data: tracepoint data
+ *
+ * Returns 0 if ok, error value on error.
+ */
+int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
+{
+ struct tracepoint_func tp_func;
+ int ret;
+
+ mutex_lock(&tracepoints_mutex);
+ tp_func.func = probe;
+ tp_func.data = data;
+ ret = tracepoint_remove_func(tp, &tp_func);
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+
+#ifdef CONFIG_MODULES
+bool trace_module_has_bad_taint(struct module *mod)
+{
+ return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
+ (1 << TAINT_UNSIGNED_MODULE));
+}
+
+static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list);
+
+/**
+ * register_tracepoint_notifier - register tracepoint coming/going notifier
+ * @nb: notifier block
+ *
+ * Notifiers registered with this function are called on module
+ * coming/going with the tracepoint_module_list_mutex held.
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
+ */
+int register_tracepoint_module_notifier(struct notifier_block *nb)
+{
+ struct tp_module *tp_mod;
+ int ret;
+
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod);
+end:
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier);
+
+/**
+ * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier
+ * @nb: notifier block
+ *
+ * The notifier block callback should expect a "struct tp_module" data
+ * pointer.
+ */
+int unregister_tracepoint_module_notifier(struct notifier_block *nb)
+{
+ struct tp_module *tp_mod;
+ int ret;
+
+ mutex_lock(&tracepoint_module_list_mutex);
+ ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb);
+ if (ret)
+ goto end;
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod);
+end:
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
+
+}
+EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier);
+
+/*
+ * Ensure the tracer unregistered the module's probes before the module
+ * teardown is performed. Prevents leaks of probe and data pointers.
+ */
+static void tp_module_going_check_quiescent(struct tracepoint * const *begin,
+ struct tracepoint * const *end)
+{
+ struct tracepoint * const *iter;
+
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ WARN_ON_ONCE((*iter)->funcs);
+}
+
+static int tracepoint_module_coming(struct module *mod)
+{
+ struct tp_module *tp_mod;
+ int ret = 0;
+
+ if (!mod->num_tracepoints)
+ return 0;
+
+ /*
+ * We skip modules that taint the kernel, especially those with different
+ * module headers (for forced load), to make sure we don't cause a crash.
+ * Staging, out-of-tree, and unsigned GPL modules are fine.
+ */
+ if (trace_module_has_bad_taint(mod))
+ return 0;
+ mutex_lock(&tracepoint_module_list_mutex);
+ tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
+ if (!tp_mod) {
+ ret = -ENOMEM;
+ goto end;
+ }
+ tp_mod->mod = mod;
+ list_add_tail(&tp_mod->list, &tracepoint_module_list);
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_COMING, tp_mod);
+end:
+ mutex_unlock(&tracepoint_module_list_mutex);
+ return ret;
+}
+
+static void tracepoint_module_going(struct module *mod)
+{
+ struct tp_module *tp_mod;
+
+ if (!mod->num_tracepoints)
+ return;
+
+ mutex_lock(&tracepoint_module_list_mutex);
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list) {
+ if (tp_mod->mod == mod) {
+ blocking_notifier_call_chain(&tracepoint_notify_list,
+ MODULE_STATE_GOING, tp_mod);
+ list_del(&tp_mod->list);
+ kfree(tp_mod);
+ /*
+ * Called the going notifier before checking for
+ * quiescence.
+ */
+ tp_module_going_check_quiescent(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
+ break;
+ }
+ }
+ /*
+ * In the case of modules that were tainted at "coming", we'll simply
+ * walk through the list without finding it. We cannot use the "tainted"
+ * flag on "going", in case a module taints the kernel only after being
+ * loaded.
+ */
+ mutex_unlock(&tracepoint_module_list_mutex);
+}
+
+static int tracepoint_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ int ret = 0;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ret = tracepoint_module_coming(mod);
+ break;
+ case MODULE_STATE_LIVE:
+ break;
+ case MODULE_STATE_GOING:
+ tracepoint_module_going(mod);
+ break;
+ case MODULE_STATE_UNFORMED:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block tracepoint_module_nb = {
+ .notifier_call = tracepoint_module_notify,
+ .priority = 0,
+};
+
+static __init int init_tracepoints(void)
+{
+ int ret;
+
+ ret = register_module_notifier(&tracepoint_module_nb);
+ if (ret)
+ pr_warning("Failed to register tracepoint module enter notifier\n");
+
+ return ret;
+}
+__initcall(init_tracepoints);
+#endif /* CONFIG_MODULES */
+
+static void for_each_tracepoint_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end,
+ void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ struct tracepoint * const *iter;
+
+ if (!begin)
+ return;
+ for (iter = begin; iter < end; iter++)
+ fct(*iter, priv);
+}
+
+/**
+ * for_each_kernel_tracepoint - iteration on all kernel tracepoints
+ * @fct: callback
+ * @priv: private data
+ */
+void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
+ void *priv)
+{
+ for_each_tracepoint_range(__start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs, fct, priv);
+}
+EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint);
+
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+static int sys_tracepoint_refcount;
+
+void syscall_regfunc(void)
+{
+ struct task_struct *p, *t;
+
+ if (!sys_tracepoint_refcount) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ }
+ read_unlock(&tasklist_lock);
+ }
+ sys_tracepoint_refcount++;
+}
+
+void syscall_unregfunc(void)
+{
+ struct task_struct *p, *t;
+
+ sys_tracepoint_refcount--;
+ if (!sys_tracepoint_refcount) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ }
+ read_unlock(&tasklist_lock);
+ }
+}
+#endif
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 000000000..975cb49e3
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,180 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan, <jlan@sgi.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/acct.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+
+/*
+ * fill in basic accounting fields
+ */
+void bacct_add_tsk(struct user_namespace *user_ns,
+ struct pid_namespace *pid_ns,
+ struct taskstats *stats, struct task_struct *tsk)
+{
+ const struct cred *tcred;
+ cputime_t utime, stime, utimescaled, stimescaled;
+ u64 delta;
+
+ BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
+
+ /* calculate task elapsed time in nsec */
+ delta = ktime_get_ns() - tsk->start_time;
+ /* Convert to micro seconds */
+ do_div(delta, NSEC_PER_USEC);
+ stats->ac_etime = delta;
+ /* Convert to seconds for btime */
+ do_div(delta, USEC_PER_SEC);
+ stats->ac_btime = get_seconds() - delta;
+ if (thread_group_leader(tsk)) {
+ stats->ac_exitcode = tsk->exit_code;
+ if (tsk->flags & PF_FORKNOEXEC)
+ stats->ac_flag |= AFORK;
+ }
+ if (tsk->flags & PF_SUPERPRIV)
+ stats->ac_flag |= ASU;
+ if (tsk->flags & PF_DUMPCORE)
+ stats->ac_flag |= ACORE;
+ if (tsk->flags & PF_SIGNALED)
+ stats->ac_flag |= AXSIG;
+ stats->ac_nice = task_nice(tsk);
+ stats->ac_sched = tsk->policy;
+ stats->ac_pid = task_pid_nr_ns(tsk, pid_ns);
+ rcu_read_lock();
+ tcred = __task_cred(tsk);
+ stats->ac_uid = from_kuid_munged(user_ns, tcred->uid);
+ stats->ac_gid = from_kgid_munged(user_ns, tcred->gid);
+ stats->ac_ppid = pid_alive(tsk) ?
+ task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
+ rcu_read_unlock();
+
+ task_cputime(tsk, &utime, &stime);
+ stats->ac_utime = cputime_to_usecs(utime);
+ stats->ac_stime = cputime_to_usecs(stime);
+
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ stats->ac_utimescaled = cputime_to_usecs(utimescaled);
+ stats->ac_stimescaled = cputime_to_usecs(stimescaled);
+
+ stats->ac_minflt = tsk->min_flt;
+ stats->ac_majflt = tsk->maj_flt;
+
+ strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm));
+}
+
+
+#ifdef CONFIG_TASK_XACCT
+
+#define KB 1024
+#define MB (1024*KB)
+#define KB_MASK (~(KB-1))
+/*
+ * fill in extended accounting fields
+ */
+void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
+{
+ struct mm_struct *mm;
+
+ /* convert pages-usec to Mbyte-usec */
+ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+ stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
+ mm = get_task_mm(p);
+ if (mm) {
+ /* adjust to KB unit */
+ stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
+ stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
+ mmput(mm);
+ }
+ stats->read_char = p->ioac.rchar & KB_MASK;
+ stats->write_char = p->ioac.wchar & KB_MASK;
+ stats->read_syscalls = p->ioac.syscr & KB_MASK;
+ stats->write_syscalls = p->ioac.syscw & KB_MASK;
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ stats->read_bytes = p->ioac.read_bytes & KB_MASK;
+ stats->write_bytes = p->ioac.write_bytes & KB_MASK;
+ stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
+#else
+ stats->read_bytes = 0;
+ stats->write_bytes = 0;
+ stats->cancelled_write_bytes = 0;
+#endif
+}
+#undef KB
+#undef MB
+
+static void __acct_update_integrals(struct task_struct *tsk,
+ cputime_t utime, cputime_t stime)
+{
+ if (likely(tsk->mm)) {
+ cputime_t time, dtime;
+ struct timeval value;
+ unsigned long flags;
+ u64 delta;
+
+ local_irq_save(flags);
+ time = stime + utime;
+ dtime = time - tsk->acct_timexpd;
+ jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+ delta = value.tv_sec;
+ delta = delta * USEC_PER_SEC + value.tv_usec;
+
+ if (delta == 0)
+ goto out;
+ tsk->acct_timexpd = time;
+ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
+ tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+ out:
+ local_irq_restore(flags);
+ }
+}
+
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
+ __acct_update_integrals(tsk, utime, stime);
+}
+
+/**
+ * acct_account_cputime - update mm integral after cputime update
+ * @tsk: task_struct for accounting
+ */
+void acct_account_cputime(struct task_struct *tsk)
+{
+ __acct_update_integrals(tsk, tsk->utime, tsk->stime);
+}
+
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
+ */
+void acct_clear_integrals(struct task_struct *tsk)
+{
+ tsk->acct_timexpd = 0;
+ tsk->acct_rss_mem1 = 0;
+ tsk->acct_vm_mem1 = 0;
+}
+#endif
diff --git a/kernel/uid16.c b/kernel/uid16.c
new file mode 100644
index 000000000..d58cc4d8f
--- /dev/null
+++ b/kernel/uid16.c
@@ -0,0 +1,217 @@
+/*
+ * Wrapper functions for 16bit uid back compatibility. All nicely tied
+ * together in the faint hope we can take the out in five years time.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/prctl.h>
+#include <linux/capability.h>
+#include <linux/init.h>
+#include <linux/highuid.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
+{
+ return sys_chown(filename, low2highuid(user), low2highgid(group));
+}
+
+SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
+{
+ return sys_lchown(filename, low2highuid(user), low2highgid(group));
+}
+
+SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
+{
+ return sys_fchown(fd, low2highuid(user), low2highgid(group));
+}
+
+SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
+{
+ return sys_setregid(low2highgid(rgid), low2highgid(egid));
+}
+
+SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
+{
+ return sys_setgid(low2highgid(gid));
+}
+
+SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
+{
+ return sys_setreuid(low2highuid(ruid), low2highuid(euid));
+}
+
+SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
+{
+ return sys_setuid(low2highuid(uid));
+}
+
+SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
+{
+ return sys_setresuid(low2highuid(ruid), low2highuid(euid),
+ low2highuid(suid));
+}
+
+SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
+{
+ const struct cred *cred = current_cred();
+ int retval;
+ old_uid_t ruid, euid, suid;
+
+ ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid));
+ euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid));
+ suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid));
+
+ if (!(retval = put_user(ruid, ruidp)) &&
+ !(retval = put_user(euid, euidp)))
+ retval = put_user(suid, suidp);
+
+ return retval;
+}
+
+SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
+{
+ return sys_setresgid(low2highgid(rgid), low2highgid(egid),
+ low2highgid(sgid));
+}
+
+
+SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
+{
+ const struct cred *cred = current_cred();
+ int retval;
+ old_gid_t rgid, egid, sgid;
+
+ rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid));
+ egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid));
+ sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid));
+
+ if (!(retval = put_user(rgid, rgidp)) &&
+ !(retval = put_user(egid, egidp)))
+ retval = put_user(sgid, sgidp);
+
+ return retval;
+}
+
+SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
+{
+ return sys_setfsuid(low2highuid(uid));
+}
+
+SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
+{
+ return sys_setfsgid(low2highgid(gid));
+}
+
+static int groups16_to_user(old_gid_t __user *grouplist,
+ struct group_info *group_info)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+ old_gid_t group;
+ kgid_t kgid;
+
+ for (i = 0; i < group_info->ngroups; i++) {
+ kgid = GROUP_AT(group_info, i);
+ group = high2lowgid(from_kgid_munged(user_ns, kgid));
+ if (put_user(group, grouplist+i))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int groups16_from_user(struct group_info *group_info,
+ old_gid_t __user *grouplist)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+ old_gid_t group;
+ kgid_t kgid;
+
+ for (i = 0; i < group_info->ngroups; i++) {
+ if (get_user(group, grouplist+i))
+ return -EFAULT;
+
+ kgid = make_kgid(user_ns, low2highgid(group));
+ if (!gid_valid(kgid))
+ return -EINVAL;
+
+ GROUP_AT(group_info, i) = kgid;
+ }
+
+ return 0;
+}
+
+SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
+{
+ const struct cred *cred = current_cred();
+ int i;
+
+ if (gidsetsize < 0)
+ return -EINVAL;
+
+ i = cred->group_info->ngroups;
+ if (gidsetsize) {
+ if (i > gidsetsize) {
+ i = -EINVAL;
+ goto out;
+ }
+ if (groups16_to_user(grouplist, cred->group_info)) {
+ i = -EFAULT;
+ goto out;
+ }
+ }
+out:
+ return i;
+}
+
+SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
+{
+ struct group_info *group_info;
+ int retval;
+
+ if (!may_setgroups())
+ return -EPERM;
+ if ((unsigned)gidsetsize > NGROUPS_MAX)
+ return -EINVAL;
+
+ group_info = groups_alloc(gidsetsize);
+ if (!group_info)
+ return -ENOMEM;
+ retval = groups16_from_user(group_info, grouplist);
+ if (retval) {
+ put_group_info(group_info);
+ return retval;
+ }
+
+ retval = set_current_groups(group_info);
+ put_group_info(group_info);
+
+ return retval;
+}
+
+SYSCALL_DEFINE0(getuid16)
+{
+ return high2lowuid(from_kuid_munged(current_user_ns(), current_uid()));
+}
+
+SYSCALL_DEFINE0(geteuid16)
+{
+ return high2lowuid(from_kuid_munged(current_user_ns(), current_euid()));
+}
+
+SYSCALL_DEFINE0(getgid16)
+{
+ return high2lowgid(from_kgid_munged(current_user_ns(), current_gid()));
+}
+
+SYSCALL_DEFINE0(getegid16)
+{
+ return high2lowgid(from_kgid_munged(current_user_ns(), current_egid()));
+}
diff --git a/kernel/up.c b/kernel/up.c
new file mode 100644
index 000000000..1760bf3d1
--- /dev/null
+++ b/kernel/up.c
@@ -0,0 +1,84 @@
+/*
+ * Uniprocessor-only support functions. The counterpart to kernel/smp.c
+ */
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/smp.h>
+
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int wait)
+{
+ unsigned long flags;
+
+ WARN_ON(cpu != 0);
+
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
+int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ csd->func(csd->info);
+ local_irq_restore(flags);
+ return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single_async);
+
+int on_each_cpu(smp_call_func_t func, void *info, int wait)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ return 0;
+}
+EXPORT_SYMBOL(on_each_cpu);
+
+/*
+ * Note we still need to test the mask even for UP
+ * because we actually can get an empty mask from
+ * code that on SMP might call us without the local
+ * CPU in the mask.
+ */
+void on_each_cpu_mask(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ unsigned long flags;
+
+ if (cpumask_test_cpu(0, mask)) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL(on_each_cpu_mask);
+
+/*
+ * Preemption is disabled here to make sure the cond_func is called under the
+ * same condtions in UP and SMP.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+ smp_call_func_t func, void *info, bool wait,
+ gfp_t gfp_flags)
+{
+ unsigned long flags;
+
+ preempt_disable();
+ if (cond_func(0, info)) {
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000..9586b670a
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,44 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+/*
+ * Request a notification when the current cpu returns to userspace. Must be
+ * called in atomic context. The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+ set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+ hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list));
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier. Must be called from atomic
+ * context, and from the same cpu registration occurred in.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+ hlist_del(&urn->link);
+ if (hlist_empty(this_cpu_ptr(&return_notifier_list)))
+ clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+ struct user_return_notifier *urn;
+ struct hlist_node *tmp2;
+ struct hlist_head *head;
+
+ head = &get_cpu_var(return_notifier_list);
+ hlist_for_each_entry_safe(urn, tmp2, head, link)
+ urn->on_user_return(urn);
+ put_cpu_var(return_notifier_list);
+}
diff --git a/kernel/user.c b/kernel/user.c
new file mode 100644
index 000000000..b069ccbfb
--- /dev/null
+++ b/kernel/user.c
@@ -0,0 +1,228 @@
+/*
+ * The "user cache".
+ *
+ * (C) Copyright 1991-2000 Linus Torvalds
+ *
+ * We have a per-user structure to keep track of how many
+ * processes, files etc the user has claimed, in order to be
+ * able to have per-user limits for system resources.
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/key.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
+
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
+struct user_namespace init_user_ns = {
+ .uid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
+ .gid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
+ .projid_map = {
+ .nr_extents = 1,
+ .extent[0] = {
+ .first = 0,
+ .lower_first = 0,
+ .count = 4294967295U,
+ },
+ },
+ .count = ATOMIC_INIT(3),
+ .owner = GLOBAL_ROOT_UID,
+ .group = GLOBAL_ROOT_GID,
+ .ns.inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_USER_NS
+ .ns.ops = &userns_operations,
+#endif
+ .flags = USERNS_INIT_FLAGS,
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ .persistent_keyring_register_sem =
+ __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
+#endif
+};
+EXPORT_SYMBOL_GPL(init_user_ns);
+
+/*
+ * UID task count cache, to get fast user lookup in "alloc_uid"
+ * when changing user ID's (ie setuid() and friends).
+ */
+
+#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_SZ (1 << UIDHASH_BITS)
+#define UIDHASH_MASK (UIDHASH_SZ - 1)
+#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
+#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
+
+static struct kmem_cache *uid_cachep;
+struct hlist_head uidhash_table[UIDHASH_SZ];
+
+/*
+ * The uidhash_lock is mostly taken from process context, but it is
+ * occasionally also taken from softirq/tasklet context, when
+ * task-structs get RCU-freed. Hence all locking must be softirq-safe.
+ * But free_uid() is also called with local interrupts disabled, and running
+ * local_bh_enable() with local interrupts disabled is an error - we'll run
+ * softirq callbacks, and they can unconditionally enable interrupts, and
+ * the caller of free_uid() didn't expect that..
+ */
+static DEFINE_SPINLOCK(uidhash_lock);
+
+/* root_user.__count is 1, for init task cred */
+struct user_struct root_user = {
+ .__count = ATOMIC_INIT(1),
+ .processes = ATOMIC_INIT(1),
+ .sigpending = ATOMIC_INIT(0),
+ .locked_shm = 0,
+ .uid = GLOBAL_ROOT_UID,
+};
+
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
+{
+ hlist_add_head(&up->uidhash_node, hashent);
+}
+
+static void uid_hash_remove(struct user_struct *up)
+{
+ hlist_del_init(&up->uidhash_node);
+}
+
+static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
+{
+ struct user_struct *user;
+
+ hlist_for_each_entry(user, hashent, uidhash_node) {
+ if (uid_eq(user->uid, uid)) {
+ atomic_inc(&user->__count);
+ return user;
+ }
+ }
+
+ return NULL;
+}
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static void free_user(struct user_struct *up, unsigned long flags)
+ __releases(&uidhash_lock)
+{
+ uid_hash_remove(up);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ key_put(up->uid_keyring);
+ key_put(up->session_keyring);
+ kmem_cache_free(uid_cachep, up);
+}
+
+/*
+ * Locate the user_struct for the passed UID. If found, take a ref on it. The
+ * caller must undo that ref with free_uid().
+ *
+ * If the user_struct could not be found, return NULL.
+ */
+struct user_struct *find_user(kuid_t uid)
+{
+ struct user_struct *ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&uidhash_lock, flags);
+ ret = uid_hash_find(uid, uidhashentry(uid));
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ return ret;
+}
+
+void free_uid(struct user_struct *up)
+{
+ unsigned long flags;
+
+ if (!up)
+ return;
+
+ local_irq_save(flags);
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+ free_user(up, flags);
+ else
+ local_irq_restore(flags);
+}
+
+struct user_struct *alloc_uid(kuid_t uid)
+{
+ struct hlist_head *hashent = uidhashentry(uid);
+ struct user_struct *up, *new;
+
+ spin_lock_irq(&uidhash_lock);
+ up = uid_hash_find(uid, hashent);
+ spin_unlock_irq(&uidhash_lock);
+
+ if (!up) {
+ new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
+ if (!new)
+ goto out_unlock;
+
+ new->uid = uid;
+ atomic_set(&new->__count, 1);
+
+ /*
+ * Before adding this, check whether we raced
+ * on adding the same user already..
+ */
+ spin_lock_irq(&uidhash_lock);
+ up = uid_hash_find(uid, hashent);
+ if (up) {
+ key_put(new->uid_keyring);
+ key_put(new->session_keyring);
+ kmem_cache_free(uid_cachep, new);
+ } else {
+ uid_hash_insert(new, hashent);
+ up = new;
+ }
+ spin_unlock_irq(&uidhash_lock);
+ }
+
+ return up;
+
+out_unlock:
+ return NULL;
+}
+
+static int __init uid_cache_init(void)
+{
+ int n;
+
+ uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+ for(n = 0; n < UIDHASH_SZ; ++n)
+ INIT_HLIST_HEAD(uidhash_table + n);
+
+ /* Insert the root user immediately (init already runs as root) */
+ spin_lock_irq(&uidhash_lock);
+ uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
+ spin_unlock_irq(&uidhash_lock);
+
+ return 0;
+}
+subsys_initcall(uid_cache_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
new file mode 100644
index 000000000..4109f8320
--- /dev/null
+++ b/kernel/user_namespace.c
@@ -0,0 +1,1012 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/export.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
+#include <linux/highuid.h>
+#include <linux/cred.h>
+#include <linux/securebits.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
+#include <linux/projid.h>
+#include <linux/fs_struct.h>
+
+static struct kmem_cache *user_ns_cachep __read_mostly;
+static DEFINE_MUTEX(userns_state_mutex);
+
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
+ struct uid_gid_map *map);
+
+static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
+{
+ /* Start with the same capabilities as init but useless for doing
+ * anything as the capabilities are bound to the new user namespace.
+ */
+ cred->securebits = SECUREBITS_DEFAULT;
+ cred->cap_inheritable = CAP_EMPTY_SET;
+ cred->cap_permitted = CAP_FULL_SET;
+ cred->cap_effective = CAP_FULL_SET;
+ cred->cap_bset = CAP_FULL_SET;
+#ifdef CONFIG_KEYS
+ key_put(cred->request_key_auth);
+ cred->request_key_auth = NULL;
+#endif
+ /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
+ cred->user_ns = user_ns;
+}
+
+/*
+ * Create a new user namespace, deriving the creator from the user in the
+ * passed credentials, and replacing that user with the new root user for the
+ * new namespace.
+ *
+ * This is called by copy_creds(), which will finish setting the target task's
+ * credentials.
+ */
+int create_user_ns(struct cred *new)
+{
+ struct user_namespace *ns, *parent_ns = new->user_ns;
+ kuid_t owner = new->euid;
+ kgid_t group = new->egid;
+ int ret;
+
+ if (parent_ns->level > 32)
+ return -EUSERS;
+
+ /*
+ * Verify that we can not violate the policy of which files
+ * may be accessed that is specified by the root directory,
+ * by verifing that the root directory is at the root of the
+ * mount namespace which allows all files to be accessed.
+ */
+ if (current_chrooted())
+ return -EPERM;
+
+ /* The creator needs a mapping in the parent user namespace
+ * or else we won't be able to reasonably tell userspace who
+ * created a user_namespace.
+ */
+ if (!kuid_has_mapping(parent_ns, owner) ||
+ !kgid_has_mapping(parent_ns, group))
+ return -EPERM;
+
+ ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
+ if (!ns)
+ return -ENOMEM;
+
+ ret = ns_alloc_inum(&ns->ns);
+ if (ret) {
+ kmem_cache_free(user_ns_cachep, ns);
+ return ret;
+ }
+ ns->ns.ops = &userns_operations;
+
+ atomic_set(&ns->count, 1);
+ /* Leave the new->user_ns reference with the new user namespace. */
+ ns->parent = parent_ns;
+ ns->level = parent_ns->level + 1;
+ ns->owner = owner;
+ ns->group = group;
+
+ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
+ mutex_lock(&userns_state_mutex);
+ ns->flags = parent_ns->flags;
+ mutex_unlock(&userns_state_mutex);
+
+ set_cred_user_ns(new, ns);
+
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ init_rwsem(&ns->persistent_keyring_register_sem);
+#endif
+ return 0;
+}
+
+int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
+{
+ struct cred *cred;
+ int err = -ENOMEM;
+
+ if (!(unshare_flags & CLONE_NEWUSER))
+ return 0;
+
+ cred = prepare_creds();
+ if (cred) {
+ err = create_user_ns(cred);
+ if (err)
+ put_cred(cred);
+ else
+ *new_cred = cred;
+ }
+
+ return err;
+}
+
+void free_user_ns(struct user_namespace *ns)
+{
+ struct user_namespace *parent;
+
+ do {
+ parent = ns->parent;
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+ key_put(ns->persistent_keyring_register);
+#endif
+ ns_free_inum(&ns->ns);
+ kmem_cache_free(user_ns_cachep, ns);
+ ns = parent;
+ } while (atomic_dec_and_test(&parent->count));
+}
+EXPORT_SYMBOL(free_user_ns);
+
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+{
+ unsigned idx, extents;
+ u32 first, last, id2;
+
+ id2 = id + count - 1;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_rmb();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last &&
+ (id2 >= first && id2 <= last))
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].lower_first;
+ else
+ id = (u32) -1;
+
+ return id;
+}
+
+static u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+ unsigned idx, extents;
+ u32 first, last;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_rmb();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last)
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].lower_first;
+ else
+ id = (u32) -1;
+
+ return id;
+}
+
+static u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+ unsigned idx, extents;
+ u32 first, last;
+
+ /* Find the matching extent */
+ extents = map->nr_extents;
+ smp_rmb();
+ for (idx = 0; idx < extents; idx++) {
+ first = map->extent[idx].lower_first;
+ last = first + map->extent[idx].count - 1;
+ if (id >= first && id <= last)
+ break;
+ }
+ /* Map the id or note failure */
+ if (idx < extents)
+ id = (id - first) + map->extent[idx].first;
+ else
+ id = (u32) -1;
+
+ return id;
+}
+
+/**
+ * make_kuid - Map a user-namespace uid pair into a kuid.
+ * @ns: User namespace that the uid is in
+ * @uid: User identifier
+ *
+ * Maps a user-namespace uid pair into a kernel internal kuid,
+ * and returns that kuid.
+ *
+ * When there is no mapping defined for the user-namespace uid
+ * pair INVALID_UID is returned. Callers are expected to test
+ * for and handle INVALID_UID being returned. INVALID_UID
+ * may be tested for using uid_valid().
+ */
+kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
+{
+ /* Map the uid to a global kernel uid */
+ return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
+}
+EXPORT_SYMBOL(make_kuid);
+
+/**
+ * from_kuid - Create a uid from a kuid user-namespace pair.
+ * @targ: The user namespace we want a uid in.
+ * @kuid: The kernel internal uid to start with.
+ *
+ * Map @kuid into the user-namespace specified by @targ and
+ * return the resulting uid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kuid has no mapping in @targ (uid_t)-1 is returned.
+ */
+uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
+{
+ /* Map the uid from a global kernel uid */
+ return map_id_up(&targ->uid_map, __kuid_val(kuid));
+}
+EXPORT_SYMBOL(from_kuid);
+
+/**
+ * from_kuid_munged - Create a uid from a kuid user-namespace pair.
+ * @targ: The user namespace we want a uid in.
+ * @kuid: The kernel internal uid to start with.
+ *
+ * Map @kuid into the user-namespace specified by @targ and
+ * return the resulting uid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kuid from_kuid_munged never fails and always
+ * returns a valid uid. This makes from_kuid_munged appropriate
+ * for use in syscalls like stat and getuid where failing the
+ * system call and failing to provide a valid uid are not an
+ * options.
+ *
+ * If @kuid has no mapping in @targ overflowuid is returned.
+ */
+uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
+{
+ uid_t uid;
+ uid = from_kuid(targ, kuid);
+
+ if (uid == (uid_t) -1)
+ uid = overflowuid;
+ return uid;
+}
+EXPORT_SYMBOL(from_kuid_munged);
+
+/**
+ * make_kgid - Map a user-namespace gid pair into a kgid.
+ * @ns: User namespace that the gid is in
+ * @gid: group identifier
+ *
+ * Maps a user-namespace gid pair into a kernel internal kgid,
+ * and returns that kgid.
+ *
+ * When there is no mapping defined for the user-namespace gid
+ * pair INVALID_GID is returned. Callers are expected to test
+ * for and handle INVALID_GID being returned. INVALID_GID may be
+ * tested for using gid_valid().
+ */
+kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
+{
+ /* Map the gid to a global kernel gid */
+ return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
+}
+EXPORT_SYMBOL(make_kgid);
+
+/**
+ * from_kgid - Create a gid from a kgid user-namespace pair.
+ * @targ: The user namespace we want a gid in.
+ * @kgid: The kernel internal gid to start with.
+ *
+ * Map @kgid into the user-namespace specified by @targ and
+ * return the resulting gid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kgid has no mapping in @targ (gid_t)-1 is returned.
+ */
+gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
+{
+ /* Map the gid from a global kernel gid */
+ return map_id_up(&targ->gid_map, __kgid_val(kgid));
+}
+EXPORT_SYMBOL(from_kgid);
+
+/**
+ * from_kgid_munged - Create a gid from a kgid user-namespace pair.
+ * @targ: The user namespace we want a gid in.
+ * @kgid: The kernel internal gid to start with.
+ *
+ * Map @kgid into the user-namespace specified by @targ and
+ * return the resulting gid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kgid from_kgid_munged never fails and always
+ * returns a valid gid. This makes from_kgid_munged appropriate
+ * for use in syscalls like stat and getgid where failing the
+ * system call and failing to provide a valid gid are not options.
+ *
+ * If @kgid has no mapping in @targ overflowgid is returned.
+ */
+gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
+{
+ gid_t gid;
+ gid = from_kgid(targ, kgid);
+
+ if (gid == (gid_t) -1)
+ gid = overflowgid;
+ return gid;
+}
+EXPORT_SYMBOL(from_kgid_munged);
+
+/**
+ * make_kprojid - Map a user-namespace projid pair into a kprojid.
+ * @ns: User namespace that the projid is in
+ * @projid: Project identifier
+ *
+ * Maps a user-namespace uid pair into a kernel internal kuid,
+ * and returns that kuid.
+ *
+ * When there is no mapping defined for the user-namespace projid
+ * pair INVALID_PROJID is returned. Callers are expected to test
+ * for and handle handle INVALID_PROJID being returned. INVALID_PROJID
+ * may be tested for using projid_valid().
+ */
+kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
+{
+ /* Map the uid to a global kernel uid */
+ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
+}
+EXPORT_SYMBOL(make_kprojid);
+
+/**
+ * from_kprojid - Create a projid from a kprojid user-namespace pair.
+ * @targ: The user namespace we want a projid in.
+ * @kprojid: The kernel internal project identifier to start with.
+ *
+ * Map @kprojid into the user-namespace specified by @targ and
+ * return the resulting projid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
+ */
+projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
+{
+ /* Map the uid from a global kernel uid */
+ return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
+}
+EXPORT_SYMBOL(from_kprojid);
+
+/**
+ * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair.
+ * @targ: The user namespace we want a projid in.
+ * @kprojid: The kernel internal projid to start with.
+ *
+ * Map @kprojid into the user-namespace specified by @targ and
+ * return the resulting projid.
+ *
+ * There is always a mapping into the initial user_namespace.
+ *
+ * Unlike from_kprojid from_kprojid_munged never fails and always
+ * returns a valid projid. This makes from_kprojid_munged
+ * appropriate for use in syscalls like stat and where
+ * failing the system call and failing to provide a valid projid are
+ * not an options.
+ *
+ * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
+ */
+projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
+{
+ projid_t projid;
+ projid = from_kprojid(targ, kprojid);
+
+ if (projid == (projid_t) -1)
+ projid = OVERFLOW_PROJID;
+ return projid;
+}
+EXPORT_SYMBOL(from_kprojid_munged);
+
+
+static int uid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ uid_t lower;
+
+ lower_ns = seq_user_ns(seq);
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
+static int gid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ gid_t lower;
+
+ lower_ns = seq_user_ns(seq);
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
+static int projid_m_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_extent *extent = v;
+ struct user_namespace *lower_ns;
+ projid_t lower;
+
+ lower_ns = seq_user_ns(seq);
+ if ((lower_ns == ns) && lower_ns->parent)
+ lower_ns = lower_ns->parent;
+
+ lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
+
+ seq_printf(seq, "%10u %10u %10u\n",
+ extent->first,
+ lower,
+ extent->count);
+
+ return 0;
+}
+
+static void *m_start(struct seq_file *seq, loff_t *ppos,
+ struct uid_gid_map *map)
+{
+ struct uid_gid_extent *extent = NULL;
+ loff_t pos = *ppos;
+
+ if (pos < map->nr_extents)
+ extent = &map->extent[pos];
+
+ return extent;
+}
+
+static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->uid_map);
+}
+
+static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->gid_map);
+}
+
+static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
+{
+ struct user_namespace *ns = seq->private;
+
+ return m_start(seq, ppos, &ns->projid_map);
+}
+
+static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return seq->op->start(seq, pos);
+}
+
+static void m_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+const struct seq_operations proc_uid_seq_operations = {
+ .start = uid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = uid_m_show,
+};
+
+const struct seq_operations proc_gid_seq_operations = {
+ .start = gid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = gid_m_show,
+};
+
+const struct seq_operations proc_projid_seq_operations = {
+ .start = projid_m_start,
+ .stop = m_stop,
+ .next = m_next,
+ .show = projid_m_show,
+};
+
+static bool mappings_overlap(struct uid_gid_map *new_map,
+ struct uid_gid_extent *extent)
+{
+ u32 upper_first, lower_first, upper_last, lower_last;
+ unsigned idx;
+
+ upper_first = extent->first;
+ lower_first = extent->lower_first;
+ upper_last = upper_first + extent->count - 1;
+ lower_last = lower_first + extent->count - 1;
+
+ for (idx = 0; idx < new_map->nr_extents; idx++) {
+ u32 prev_upper_first, prev_lower_first;
+ u32 prev_upper_last, prev_lower_last;
+ struct uid_gid_extent *prev;
+
+ prev = &new_map->extent[idx];
+
+ prev_upper_first = prev->first;
+ prev_lower_first = prev->lower_first;
+ prev_upper_last = prev_upper_first + prev->count - 1;
+ prev_lower_last = prev_lower_first + prev->count - 1;
+
+ /* Does the upper range intersect a previous extent? */
+ if ((prev_upper_first <= upper_last) &&
+ (prev_upper_last >= upper_first))
+ return true;
+
+ /* Does the lower range intersect a previous extent? */
+ if ((prev_lower_first <= lower_last) &&
+ (prev_lower_last >= lower_first))
+ return true;
+ }
+ return false;
+}
+
+static ssize_t map_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos,
+ int cap_setid,
+ struct uid_gid_map *map,
+ struct uid_gid_map *parent_map)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct uid_gid_map new_map;
+ unsigned idx;
+ struct uid_gid_extent *extent = NULL;
+ unsigned long page = 0;
+ char *kbuf, *pos, *next_line;
+ ssize_t ret = -EINVAL;
+
+ /*
+ * The userns_state_mutex serializes all writes to any given map.
+ *
+ * Any map is only ever written once.
+ *
+ * An id map fits within 1 cache line on most architectures.
+ *
+ * On read nothing needs to be done unless you are on an
+ * architecture with a crazy cache coherency model like alpha.
+ *
+ * There is a one time data dependency between reading the
+ * count of the extents and the values of the extents. The
+ * desired behavior is to see the values of the extents that
+ * were written before the count of the extents.
+ *
+ * To achieve this smp_wmb() is used on guarantee the write
+ * order and smp_rmb() is guaranteed that we don't have crazy
+ * architectures returning stale data.
+ */
+ mutex_lock(&userns_state_mutex);
+
+ ret = -EPERM;
+ /* Only allow one successful write to the map */
+ if (map->nr_extents != 0)
+ goto out;
+
+ /*
+ * Adjusting namespace settings requires capabilities on the target.
+ */
+ if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+ goto out;
+
+ /* Get a buffer */
+ ret = -ENOMEM;
+ page = __get_free_page(GFP_TEMPORARY);
+ kbuf = (char *) page;
+ if (!page)
+ goto out;
+
+ /* Only allow < page size writes at the beginning of the file */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ goto out;
+
+ /* Slurp in the user data */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+
+ /* Parse the user data */
+ ret = -EINVAL;
+ pos = kbuf;
+ new_map.nr_extents = 0;
+ for (; pos; pos = next_line) {
+ extent = &new_map.extent[new_map.nr_extents];
+
+ /* Find the end of line and ensure I don't look past it */
+ next_line = strchr(pos, '\n');
+ if (next_line) {
+ *next_line = '\0';
+ next_line++;
+ if (*next_line == '\0')
+ next_line = NULL;
+ }
+
+ pos = skip_spaces(pos);
+ extent->first = simple_strtoul(pos, &pos, 10);
+ if (!isspace(*pos))
+ goto out;
+
+ pos = skip_spaces(pos);
+ extent->lower_first = simple_strtoul(pos, &pos, 10);
+ if (!isspace(*pos))
+ goto out;
+
+ pos = skip_spaces(pos);
+ extent->count = simple_strtoul(pos, &pos, 10);
+ if (*pos && !isspace(*pos))
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ /* Verify we have been given valid starting values */
+ if ((extent->first == (u32) -1) ||
+ (extent->lower_first == (u32) -1))
+ goto out;
+
+ /* Verify count is not zero and does not cause the
+ * extent to wrap
+ */
+ if ((extent->first + extent->count) <= extent->first)
+ goto out;
+ if ((extent->lower_first + extent->count) <=
+ extent->lower_first)
+ goto out;
+
+ /* Do the ranges in extent overlap any previous extents? */
+ if (mappings_overlap(&new_map, extent))
+ goto out;
+
+ new_map.nr_extents++;
+
+ /* Fail if the file contains too many extents */
+ if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+ (next_line != NULL))
+ goto out;
+ }
+ /* Be very certaint the new map actually exists */
+ if (new_map.nr_extents == 0)
+ goto out;
+
+ ret = -EPERM;
+ /* Validate the user is allowed to use user id's mapped to. */
+ if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+ goto out;
+
+ /* Map the lower ids from the parent user namespace to the
+ * kernel global id space.
+ */
+ for (idx = 0; idx < new_map.nr_extents; idx++) {
+ u32 lower_first;
+ extent = &new_map.extent[idx];
+
+ lower_first = map_id_range_down(parent_map,
+ extent->lower_first,
+ extent->count);
+
+ /* Fail if we can not map the specified extent to
+ * the kernel global id space.
+ */
+ if (lower_first == (u32) -1)
+ goto out;
+
+ extent->lower_first = lower_first;
+ }
+
+ /* Install the map */
+ memcpy(map->extent, new_map.extent,
+ new_map.nr_extents*sizeof(new_map.extent[0]));
+ smp_wmb();
+ map->nr_extents = new_map.nr_extents;
+
+ *ppos = count;
+ ret = count;
+out:
+ mutex_unlock(&userns_state_mutex);
+ if (page)
+ free_page(page);
+ return ret;
+}
+
+ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
+
+ if (!ns->parent)
+ return -EPERM;
+
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
+ return map_write(file, buf, size, ppos, CAP_SETUID,
+ &ns->uid_map, &ns->parent->uid_map);
+}
+
+ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
+
+ if (!ns->parent)
+ return -EPERM;
+
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
+ return map_write(file, buf, size, ppos, CAP_SETGID,
+ &ns->gid_map, &ns->parent->gid_map);
+}
+
+ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ struct user_namespace *seq_ns = seq_user_ns(seq);
+
+ if (!ns->parent)
+ return -EPERM;
+
+ if ((seq_ns != ns) && (seq_ns != ns->parent))
+ return -EPERM;
+
+ /* Anyone can set any valid project id no capability needed */
+ return map_write(file, buf, size, ppos, -1,
+ &ns->projid_map, &ns->parent->projid_map);
+}
+
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
+ struct uid_gid_map *new_map)
+{
+ const struct cred *cred = file->f_cred;
+ /* Don't allow mappings that would allow anything that wouldn't
+ * be allowed without the establishment of unprivileged mappings.
+ */
+ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
+ uid_eq(ns->owner, cred->euid)) {
+ u32 id = new_map->extent[0].lower_first;
+ if (cap_setid == CAP_SETUID) {
+ kuid_t uid = make_kuid(ns->parent, id);
+ if (uid_eq(uid, cred->euid))
+ return true;
+ } else if (cap_setid == CAP_SETGID) {
+ kgid_t gid = make_kgid(ns->parent, id);
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
+ gid_eq(gid, cred->egid))
+ return true;
+ }
+ }
+
+ /* Allow anyone to set a mapping that doesn't require privilege */
+ if (!cap_valid(cap_setid))
+ return true;
+
+ /* Allow the specified ids if we have the appropriate capability
+ * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+ * And the opener of the id file also had the approprpiate capability.
+ */
+ if (ns_capable(ns->parent, cap_setid) &&
+ file_ns_capable(file, ns->parent, cap_setid))
+ return true;
+
+ return false;
+}
+
+int proc_setgroups_show(struct seq_file *seq, void *v)
+{
+ struct user_namespace *ns = seq->private;
+ unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+
+ seq_printf(seq, "%s\n",
+ (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
+ "allow" : "deny");
+ return 0;
+}
+
+ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ char kbuf[8], *pos;
+ bool setgroups_allowed;
+ ssize_t ret;
+
+ /* Only allow a very narrow range of strings to be written */
+ ret = -EINVAL;
+ if ((*ppos != 0) || (count >= sizeof(kbuf)))
+ goto out;
+
+ /* What was written? */
+ ret = -EFAULT;
+ if (copy_from_user(kbuf, buf, count))
+ goto out;
+ kbuf[count] = '\0';
+ pos = kbuf;
+
+ /* What is being requested? */
+ ret = -EINVAL;
+ if (strncmp(pos, "allow", 5) == 0) {
+ pos += 5;
+ setgroups_allowed = true;
+ }
+ else if (strncmp(pos, "deny", 4) == 0) {
+ pos += 4;
+ setgroups_allowed = false;
+ }
+ else
+ goto out;
+
+ /* Verify there is not trailing junk on the line */
+ pos = skip_spaces(pos);
+ if (*pos != '\0')
+ goto out;
+
+ ret = -EPERM;
+ mutex_lock(&userns_state_mutex);
+ if (setgroups_allowed) {
+ /* Enabling setgroups after setgroups has been disabled
+ * is not allowed.
+ */
+ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
+ goto out_unlock;
+ } else {
+ /* Permanently disabling setgroups after setgroups has
+ * been enabled by writing the gid_map is not allowed.
+ */
+ if (ns->gid_map.nr_extents != 0)
+ goto out_unlock;
+ ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
+ }
+ mutex_unlock(&userns_state_mutex);
+
+ /* Report a successful write */
+ *ppos = count;
+ ret = count;
+out:
+ return ret;
+out_unlock:
+ mutex_unlock(&userns_state_mutex);
+ goto out;
+}
+
+bool userns_may_setgroups(const struct user_namespace *ns)
+{
+ bool allowed;
+
+ mutex_lock(&userns_state_mutex);
+ /* It is not safe to use setgroups until a gid mapping in
+ * the user namespace has been established.
+ */
+ allowed = ns->gid_map.nr_extents != 0;
+ /* Is setgroups allowed? */
+ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
+ mutex_unlock(&userns_state_mutex);
+
+ return allowed;
+}
+
+static inline struct user_namespace *to_user_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct user_namespace, ns);
+}
+
+static struct ns_common *userns_get(struct task_struct *task)
+{
+ struct user_namespace *user_ns;
+
+ rcu_read_lock();
+ user_ns = get_user_ns(__task_cred(task)->user_ns);
+ rcu_read_unlock();
+
+ return user_ns ? &user_ns->ns : NULL;
+}
+
+static void userns_put(struct ns_common *ns)
+{
+ put_user_ns(to_user_ns(ns));
+}
+
+static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct user_namespace *user_ns = to_user_ns(ns);
+ struct cred *cred;
+
+ /* Don't allow gaining capabilities by reentering
+ * the same user namespace.
+ */
+ if (user_ns == current_user_ns())
+ return -EINVAL;
+
+ /* Threaded processes may not enter a different user namespace */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+
+ if (current->fs->users != 1)
+ return -EINVAL;
+
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cred = prepare_creds();
+ if (!cred)
+ return -ENOMEM;
+
+ put_user_ns(cred->user_ns);
+ set_cred_user_ns(cred, get_user_ns(user_ns));
+
+ return commit_creds(cred);
+}
+
+const struct proc_ns_operations userns_operations = {
+ .name = "user",
+ .type = CLONE_NEWUSER,
+ .get = userns_get,
+ .put = userns_put,
+ .install = userns_install,
+};
+
+static __init int user_namespaces_init(void)
+{
+ user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+ return 0;
+}
+subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
new file mode 100644
index 000000000..831ea7108
--- /dev/null
+++ b/kernel/utsname.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Author: Serge Hallyn <serue@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/export.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
+
+static struct uts_namespace *create_uts_ns(void)
+{
+ struct uts_namespace *uts_ns;
+
+ uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ if (uts_ns)
+ kref_init(&uts_ns->kref);
+ return uts_ns;
+}
+
+/*
+ * Clone a new ns copying an original utsname, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
+ */
+static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
+ struct uts_namespace *old_ns)
+{
+ struct uts_namespace *ns;
+ int err;
+
+ ns = create_uts_ns();
+ if (!ns)
+ return ERR_PTR(-ENOMEM);
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err) {
+ kfree(ns);
+ return ERR_PTR(err);
+ }
+
+ ns->ns.ops = &utsns_operations;
+
+ down_read(&uts_sem);
+ memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ ns->user_ns = get_user_ns(user_ns);
+ up_read(&uts_sem);
+ return ns;
+}
+
+/*
+ * Copy task tsk's utsname namespace, or clone it if flags
+ * specifies CLONE_NEWUTS. In latter case, changes to the
+ * utsname of this process won't be seen by parent, and vice
+ * versa.
+ */
+struct uts_namespace *copy_utsname(unsigned long flags,
+ struct user_namespace *user_ns, struct uts_namespace *old_ns)
+{
+ struct uts_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+ get_uts_ns(old_ns);
+
+ if (!(flags & CLONE_NEWUTS))
+ return old_ns;
+
+ new_ns = clone_uts_ns(user_ns, old_ns);
+
+ put_uts_ns(old_ns);
+ return new_ns;
+}
+
+void free_uts_ns(struct kref *kref)
+{
+ struct uts_namespace *ns;
+
+ ns = container_of(kref, struct uts_namespace, kref);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+
+static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct uts_namespace, ns);
+}
+
+static struct ns_common *utsns_get(struct task_struct *task)
+{
+ struct uts_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->uts_ns;
+ get_uts_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void utsns_put(struct ns_common *ns)
+{
+ put_uts_ns(to_uts_ns(ns));
+}
+
+static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct uts_namespace *ns = to_uts_ns(new);
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ get_uts_ns(ns);
+ put_uts_ns(nsproxy->uts_ns);
+ nsproxy->uts_ns = ns;
+ return 0;
+}
+
+const struct proc_ns_operations utsns_operations = {
+ .name = "uts",
+ .type = CLONE_NEWUTS,
+ .get = utsns_get,
+ .put = utsns_put,
+ .install = utsns_install,
+};
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
new file mode 100644
index 000000000..c8eac4326
--- /dev/null
+++ b/kernel/utsname_sysctl.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2007
+ *
+ * Author: Eric Biederman <ebiederm@xmision.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/export.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/sysctl.h>
+#include <linux/wait.h>
+
+#ifdef CONFIG_PROC_SYSCTL
+
+static void *get_uts(struct ctl_table *table, int write)
+{
+ char *which = table->data;
+ struct uts_namespace *uts_ns;
+
+ uts_ns = current->nsproxy->uts_ns;
+ which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
+
+ if (!write)
+ down_read(&uts_sem);
+ else
+ down_write(&uts_sem);
+ return which;
+}
+
+static void put_uts(struct ctl_table *table, int write, void *which)
+{
+ if (!write)
+ up_read(&uts_sem);
+ else
+ up_write(&uts_sem);
+}
+
+/*
+ * Special case of dostring for the UTS structure. This has locks
+ * to observe. Should this be in kernel/sys.c ????
+ */
+static int proc_do_uts_string(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table uts_table;
+ int r;
+ memcpy(&uts_table, table, sizeof(uts_table));
+ uts_table.data = get_uts(table, write);
+ r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
+ put_uts(table, write, uts_table.data);
+
+ if (write)
+ proc_sys_poll_notify(table->poll);
+
+ return r;
+}
+#else
+#define proc_do_uts_string NULL
+#endif
+
+static DEFINE_CTL_TABLE_POLL(hostname_poll);
+static DEFINE_CTL_TABLE_POLL(domainname_poll);
+
+static struct ctl_table uts_kern_table[] = {
+ {
+ .procname = "ostype",
+ .data = init_uts_ns.name.sysname,
+ .maxlen = sizeof(init_uts_ns.name.sysname),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ },
+ {
+ .procname = "osrelease",
+ .data = init_uts_ns.name.release,
+ .maxlen = sizeof(init_uts_ns.name.release),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ },
+ {
+ .procname = "version",
+ .data = init_uts_ns.name.version,
+ .maxlen = sizeof(init_uts_ns.name.version),
+ .mode = 0444,
+ .proc_handler = proc_do_uts_string,
+ },
+ {
+ .procname = "hostname",
+ .data = init_uts_ns.name.nodename,
+ .maxlen = sizeof(init_uts_ns.name.nodename),
+ .mode = 0644,
+ .proc_handler = proc_do_uts_string,
+ .poll = &hostname_poll,
+ },
+ {
+ .procname = "domainname",
+ .data = init_uts_ns.name.domainname,
+ .maxlen = sizeof(init_uts_ns.name.domainname),
+ .mode = 0644,
+ .proc_handler = proc_do_uts_string,
+ .poll = &domainname_poll,
+ },
+ {}
+};
+
+static struct ctl_table uts_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = uts_kern_table,
+ },
+ {}
+};
+
+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * Notify userspace about a change in a certain entry of uts_kern_table,
+ * identified by the parameter proc.
+ */
+void uts_proc_notify(enum uts_proc proc)
+{
+ struct ctl_table *table = &uts_kern_table[proc];
+
+ proc_sys_poll_notify(table->poll);
+}
+#endif
+
+static int __init utsname_sysctl_init(void)
+{
+ register_sysctl_table(uts_root_table);
+ return 0;
+}
+
+device_initcall(utsname_sysctl_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
new file mode 100644
index 000000000..581a68a04
--- /dev/null
+++ b/kernel/watchdog.c
@@ -0,0 +1,890 @@
+/*
+ * Detect hard and soft lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
+ * to those contributors as well.
+ */
+
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/smpboot.h>
+#include <linux/sched/rt.h>
+
+#include <asm/irq_regs.h>
+#include <linux/kvm_para.h>
+#include <linux/perf_event.h>
+
+/*
+ * The run state of the lockup detectors is controlled by the content of the
+ * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
+ * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
+ *
+ * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
+ * are variables that are only used as an 'interface' between the parameters
+ * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
+ * 'watchdog_thresh' variable is handled differently because its value is not
+ * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
+ * is equal zero.
+ */
+#define NMI_WATCHDOG_ENABLED_BIT 0
+#define SOFT_WATCHDOG_ENABLED_BIT 1
+#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
+#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
+
+static DEFINE_MUTEX(watchdog_proc_mutex);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#else
+static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+#endif
+int __read_mostly nmi_watchdog_enabled;
+int __read_mostly soft_watchdog_enabled;
+int __read_mostly watchdog_user_enabled;
+int __read_mostly watchdog_thresh = 10;
+
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
+
+static int __read_mostly watchdog_running;
+static u64 __read_mostly sample_period;
+
+static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
+static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(bool, softlockup_touch_sync);
+static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+#endif
+static unsigned long soft_lockup_nmi_warn;
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+static int hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+#endif
+
+unsigned int __read_mostly softlockup_panic =
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+
+static int __init softlockup_panic_setup(char *str)
+{
+ softlockup_panic = simple_strtoul(str, NULL, 0);
+
+ return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
+static int __init nowatchdog_setup(char *str)
+{
+ watchdog_enabled = 0;
+ return 1;
+}
+__setup("nowatchdog", nowatchdog_setup);
+
+static int __init nosoftlockup_setup(char *str)
+{
+ watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_softlockup_all_cpu_backtrace =
+ !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
+
+/*
+ * Hard-lockup warnings should be triggered after just a few seconds. Soft-
+ * lockups can have false positives under extreme conditions. So we generally
+ * want a higher threshold for soft lockups than for hard lockups. So we couple
+ * the thresholds with a factor: we make the soft threshold twice the amount of
+ * time the hard threshold is.
+ */
+static int get_softlockup_thresh(void)
+{
+ return watchdog_thresh * 2;
+}
+
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+ return running_clock() >> 30LL; /* 2^30 ~= 10^9 */
+}
+
+static void set_sample_period(void)
+{
+ /*
+ * convert watchdog_thresh from seconds to ns
+ * the divide by 5 is to give hrtimer several chances (two
+ * or three with the current relation between the soft
+ * and hard thresholds) to increment before the
+ * hardlockup detector generates a warning
+ */
+ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+}
+
+/* Commands for resetting the watchdog */
+static void __touch_watchdog(void)
+{
+ __this_cpu_write(watchdog_touch_ts, get_timestamp());
+}
+
+void touch_softlockup_watchdog(void)
+{
+ /*
+ * Preemption can be enabled. It doesn't matter which CPU's timestamp
+ * gets zeroed here, so use the raw_ operation.
+ */
+ raw_cpu_write(watchdog_touch_ts, 0);
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+void touch_all_softlockup_watchdogs(void)
+{
+ int cpu;
+
+ /*
+ * this is done lockless
+ * do we care if a 0 races with a timestamp?
+ * all it means is the softlock check starts one cycle later
+ */
+ for_each_online_cpu(cpu)
+ per_cpu(watchdog_touch_ts, cpu) = 0;
+}
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void touch_nmi_watchdog(void)
+{
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_nmi_touch, true);
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+#endif
+
+void touch_softlockup_watchdog_sync(void)
+{
+ __this_cpu_write(softlockup_touch_sync, true);
+ __this_cpu_write(watchdog_touch_ts, 0);
+}
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+/* watchdog detector functions */
+static int is_hardlockup(void)
+{
+ unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
+
+ if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+ return 1;
+
+ __this_cpu_write(hrtimer_interrupts_saved, hrint);
+ return 0;
+}
+#endif
+
+static int is_softlockup(unsigned long touch_ts)
+{
+ unsigned long now = get_timestamp();
+
+ if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
+ /* Warn about unreasonable delays. */
+ if (time_after(now, touch_ts + get_softlockup_thresh()))
+ return now - touch_ts;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+
+static struct perf_event_attr wd_hw_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 1,
+};
+
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /* Ensure the watchdog never gets throttled */
+ event->hw.interrupts = 0;
+
+ if (__this_cpu_read(watchdog_nmi_touch) == true) {
+ __this_cpu_write(watchdog_nmi_touch, false);
+ return;
+ }
+
+ /* check for a hardlockup
+ * This is done by making sure our timer interrupt
+ * is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup()) {
+ int this_cpu = smp_processor_id();
+
+ /* only print hardlockups once */
+ if (__this_cpu_read(hard_watchdog_warn) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
+
+ __this_cpu_write(hard_watchdog_warn, true);
+ return;
+ }
+
+ __this_cpu_write(hard_watchdog_warn, false);
+ return;
+}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static void watchdog_interrupt_count(void)
+{
+ __this_cpu_inc(hrtimer_interrupts);
+}
+
+static int watchdog_nmi_enable(unsigned int cpu);
+static void watchdog_nmi_disable(unsigned int cpu);
+
+/* watchdog kicker functions */
+static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
+{
+ unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
+ struct pt_regs *regs = get_irq_regs();
+ int duration;
+ int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+
+ /* kick the hardlockup detector */
+ watchdog_interrupt_count();
+
+ /* kick the softlockup detector */
+ wake_up_process(__this_cpu_read(softlockup_watchdog));
+
+ /* .. and repeat */
+ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+
+ if (touch_ts == 0) {
+ if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
+ /*
+ * If the time stamp was touched atomically
+ * make sure the scheduler tick is up to date.
+ */
+ __this_cpu_write(softlockup_touch_sync, false);
+ sched_clock_tick();
+ }
+
+ /* Clear the guest paused flag on watchdog reset */
+ kvm_check_and_clear_guest_paused();
+ __touch_watchdog();
+ return HRTIMER_RESTART;
+ }
+
+ /* check for a softlockup
+ * This is done by making sure a high priority task is
+ * being scheduled. The task touches the watchdog to
+ * indicate it is getting cpu time. If it hasn't then
+ * this is a good indication some task is hogging the cpu
+ */
+ duration = is_softlockup(touch_ts);
+ if (unlikely(duration)) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like a soft lockup, check to see if the host
+ * stopped the vm before we issue the warning
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return HRTIMER_RESTART;
+
+ /* only warn once */
+ if (__this_cpu_read(soft_watchdog_warn) == true) {
+ /*
+ * When multiple processes are causing softlockups the
+ * softlockup detector only warns on the first one
+ * because the code relies on a full quiet cycle to
+ * re-arm. The second process prevents the quiet cycle
+ * and never gets reported. Use task pointers to detect
+ * this.
+ */
+ if (__this_cpu_read(softlockup_task_ptr_saved) !=
+ current) {
+ __this_cpu_write(soft_watchdog_warn, false);
+ __touch_watchdog();
+ }
+ return HRTIMER_RESTART;
+ }
+
+ if (softlockup_all_cpu_backtrace) {
+ /* Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping cpu back traces
+ */
+ if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+ /* Someone else will report us. Let's give up */
+ __this_cpu_write(soft_watchdog_warn, true);
+ return HRTIMER_RESTART;
+ }
+ }
+
+ pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ smp_processor_id(), duration,
+ current->comm, task_pid_nr(current));
+ __this_cpu_write(softlockup_task_ptr_saved, current);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ if (softlockup_all_cpu_backtrace) {
+ /* Avoid generating two back traces for current
+ * given that one is already made above
+ */
+ trigger_allbutself_cpu_backtrace();
+
+ clear_bit(0, &soft_lockup_nmi_warn);
+ /* Barrier to sync with other cpus */
+ smp_mb__after_atomic();
+ }
+
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
+ if (softlockup_panic)
+ panic("softlockup: hung tasks");
+ __this_cpu_write(soft_watchdog_warn, true);
+ } else
+ __this_cpu_write(soft_watchdog_warn, false);
+
+ return HRTIMER_RESTART;
+}
+
+static void watchdog_set_prio(unsigned int policy, unsigned int prio)
+{
+ struct sched_param param = { .sched_priority = prio };
+
+ sched_setscheduler(current, policy, &param);
+}
+
+static void watchdog_enable(unsigned int cpu)
+{
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+
+ /* kick off the timer for the hardlockup detector */
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
+
+ /* Enable the perf event */
+ watchdog_nmi_enable(cpu);
+
+ /* done here because hrtimer_start can only pin to smp_processor_id() */
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);
+
+ /* initialize timestamp */
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ __touch_watchdog();
+}
+
+static void watchdog_disable(unsigned int cpu)
+{
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+
+ watchdog_set_prio(SCHED_NORMAL, 0);
+ hrtimer_cancel(hrtimer);
+ /* disable the perf event */
+ watchdog_nmi_disable(cpu);
+}
+
+static void watchdog_cleanup(unsigned int cpu, bool online)
+{
+ watchdog_disable(cpu);
+}
+
+static int watchdog_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(hrtimer_interrupts) !=
+ __this_cpu_read(soft_lockup_hrtimer_cnt);
+}
+
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every sample_period seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static void watchdog(unsigned int cpu)
+{
+ __this_cpu_write(soft_lockup_hrtimer_cnt,
+ __this_cpu_read(hrtimer_interrupts));
+ __touch_watchdog();
+
+ /*
+ * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
+ * failure path. Check for failures that can occur asynchronously -
+ * for example, when CPUs are on-lined - and shut down the hardware
+ * perf event on each CPU accordingly.
+ *
+ * The only non-obvious place this bit can be cleared is through
+ * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a
+ * pr_info here would be too noisy as it would result in a message
+ * every few seconds if the hardlockup was disabled but the softlockup
+ * enabled.
+ */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ watchdog_nmi_disable(cpu);
+}
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
+static int watchdog_nmi_enable(unsigned int cpu)
+{
+ struct perf_event_attr *wd_attr;
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
+
+ /* is it already setup and enabled? */
+ if (event && event->state > PERF_EVENT_STATE_OFF)
+ goto out;
+
+ /* it is setup but not enabled */
+ if (event != NULL)
+ goto out_enable;
+
+ wd_attr = &wd_hw_attr;
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+ /* Try to register using hardware perf events */
+ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+ /* save cpu0 error for future comparision */
+ if (cpu == 0 && IS_ERR(event))
+ cpu0_err = PTR_ERR(event);
+
+ if (!IS_ERR(event)) {
+ /* only print for cpu0 or different than cpu0 */
+ if (cpu == 0 || cpu0_err)
+ pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
+ goto out_save;
+ }
+
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to set up
+ * set up the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
+ /* skip displaying the same error again */
+ if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ return PTR_ERR(event);
+
+ /* vary the KERN level based on the returned errno */
+ if (PTR_ERR(event) == -EOPNOTSUPP)
+ pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ else if (PTR_ERR(event) == -ENOENT)
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
+ cpu);
+ else
+ pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
+ return PTR_ERR(event);
+
+ /* success path */
+out_save:
+ per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+ perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+ return 0;
+}
+
+static void watchdog_nmi_disable(unsigned int cpu)
+{
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event) {
+ perf_event_disable(event);
+ per_cpu(watchdog_ev, cpu) = NULL;
+
+ /* should be in cleanup, but blocks oprofile */
+ perf_event_release_kernel(event);
+ }
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
+}
+
+void watchdog_nmi_enable_all(void)
+{
+ int cpu;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto unlock;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ watchdog_nmi_enable(cpu);
+ put_online_cpus();
+
+unlock:
+ mutex_unlock(&watchdog_proc_mutex);
+}
+
+void watchdog_nmi_disable_all(void)
+{
+ int cpu;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ if (!watchdog_running)
+ goto unlock;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ watchdog_nmi_disable(cpu);
+ put_online_cpus();
+
+unlock:
+ mutex_unlock(&watchdog_proc_mutex);
+}
+#else
+static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+static void watchdog_nmi_disable(unsigned int cpu) { return; }
+void watchdog_nmi_enable_all(void) {}
+void watchdog_nmi_disable_all(void) {}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static struct smp_hotplug_thread watchdog_threads = {
+ .store = &softlockup_watchdog,
+ .thread_should_run = watchdog_should_run,
+ .thread_fn = watchdog,
+ .thread_comm = "watchdog/%u",
+ .setup = watchdog_enable,
+ .cleanup = watchdog_cleanup,
+ .park = watchdog_disable,
+ .unpark = watchdog_enable,
+};
+
+static void restart_watchdog_hrtimer(void *info)
+{
+ struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ int ret;
+
+ /*
+ * No need to cancel and restart hrtimer if it is currently executing
+ * because it will reprogram itself with the new period now.
+ * We should never see it unqueued here because we are running per-cpu
+ * with interrupts disabled.
+ */
+ ret = hrtimer_try_to_cancel(hrtimer);
+ if (ret == 1)
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void update_watchdog(int cpu)
+{
+ /*
+ * Make sure that perf event counter will adopt to a new
+ * sampling period. Updating the sampling period directly would
+ * be much nicer but we do not have an API for that now so
+ * let's use a big hammer.
+ * Hrtimer will adopt the new period on the next tick but this
+ * might be late already so we have to restart the timer as well.
+ */
+ watchdog_nmi_disable(cpu);
+ smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
+ watchdog_nmi_enable(cpu);
+}
+
+static void update_watchdog_all_cpus(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ update_watchdog(cpu);
+ put_online_cpus();
+}
+
+static int watchdog_enable_all_cpus(void)
+{
+ int err = 0;
+
+ if (!watchdog_running) {
+ err = smpboot_register_percpu_thread(&watchdog_threads);
+ if (err)
+ pr_err("Failed to create watchdog threads, disabled\n");
+ else
+ watchdog_running = 1;
+ } else {
+ /*
+ * Enable/disable the lockup detectors or
+ * change the sample period 'on the fly'.
+ */
+ update_watchdog_all_cpus();
+ }
+
+ return err;
+}
+
+/* prepare/enable/disable routines */
+/* sysctl functions */
+#ifdef CONFIG_SYSCTL
+static void watchdog_disable_all_cpus(void)
+{
+ if (watchdog_running) {
+ watchdog_running = 0;
+ smpboot_unregister_percpu_thread(&watchdog_threads);
+ }
+}
+
+/*
+ * Update the run state of the lockup detectors.
+ */
+static int proc_watchdog_update(void)
+{
+ int err = 0;
+
+ /*
+ * Watchdog threads won't be started if they are already active.
+ * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
+ * care of this. If those threads are already active, the sample
+ * period will be updated and the lockup detectors will be enabled
+ * or disabled 'on the fly'.
+ */
+ if (watchdog_enabled && watchdog_thresh)
+ err = watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+ return err;
+
+}
+
+/*
+ * common function for watchdog, nmi_watchdog and soft_watchdog parameter
+ *
+ * caller | table->data points to | 'which' contains the flag(s)
+ * -------------------|-----------------------|-----------------------------
+ * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
+ * | | with SOFT_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED
+ * -------------------|-----------------------|-----------------------------
+ * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ */
+static int proc_watchdog_common(int which, struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err, old, new;
+ int *watchdog_param = (int *)table->data;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ /*
+ * If the parameter is being read return the state of the corresponding
+ * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
+ * run state of the lockup detectors.
+ */
+ if (!write) {
+ *watchdog_param = (watchdog_enabled & which) != 0;
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ } else {
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (err)
+ goto out;
+
+ /*
+ * There is a race window between fetching the current value
+ * from 'watchdog_enabled' and storing the new value. During
+ * this race window, watchdog_nmi_enable() can sneak in and
+ * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
+ * The 'cmpxchg' detects this race and the loop retries.
+ */
+ do {
+ old = watchdog_enabled;
+ /*
+ * If the parameter value is not zero set the
+ * corresponding bit(s), else clear it(them).
+ */
+ if (*watchdog_param)
+ new = old | which;
+ else
+ new = old & ~which;
+ } while (cmpxchg(&watchdog_enabled, old, new) != old);
+
+ /*
+ * Update the run state of the lockup detectors.
+ * Restore 'watchdog_enabled' on failure.
+ */
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_enabled = old;
+ }
+out:
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
+
+/*
+ * /proc/sys/kernel/watchdog
+ */
+int proc_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/nmi_watchdog
+ */
+int proc_nmi_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/soft_watchdog
+ */
+int proc_soft_watchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_watchdog_common(SOFT_WATCHDOG_ENABLED,
+ table, write, buffer, lenp, ppos);
+}
+
+/*
+ * /proc/sys/kernel/watchdog_thresh
+ */
+int proc_watchdog_thresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err, old;
+
+ mutex_lock(&watchdog_proc_mutex);
+
+ old = ACCESS_ONCE(watchdog_thresh);
+ err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (err || !write)
+ goto out;
+
+ /*
+ * Update the sample period.
+ * Restore 'watchdog_thresh' on failure.
+ */
+ set_sample_period();
+ err = proc_watchdog_update();
+ if (err)
+ watchdog_thresh = old;
+out:
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
+#endif /* CONFIG_SYSCTL */
+
+void __init lockup_detector_init(void)
+{
+ set_sample_period();
+
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
new file mode 100644
index 000000000..586ad9130
--- /dev/null
+++ b/kernel/workqueue.c
@@ -0,0 +1,5140 @@
+/*
+ * kernel/workqueue.c - generic async execution with shared worker pool
+ *
+ * Copyright (C) 2002 Ingo Molnar
+ *
+ * Derived from the taskqueue/keventd code by:
+ * David Woodhouse <dwmw2@infradead.org>
+ * Andrew Morton
+ * Kai Petzke <wpp@marie.physik.tu-berlin.de>
+ * Theodore Ts'o <tytso@mit.edu>
+ *
+ * Made to use alloc_percpu by Christoph Lameter.
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This is the generic async execution mechanism. Work items as are
+ * executed in process context. The worker pool is shared and
+ * automatically managed. There are two worker pools for each CPU (one for
+ * normal work items and the other for high priority ones) and some extra
+ * pools for workqueues which are not bound to any specific CPU - the
+ * number of these backing pools is dynamic.
+ *
+ * Please read Documentation/workqueue.txt for details.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/signal.h>
+#include <linux/completion.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/hardirq.h>
+#include <linux/mempolicy.h>
+#include <linux/freezer.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
+#include <linux/lockdep.h>
+#include <linux/idr.h>
+#include <linux/jhash.h>
+#include <linux/hashtable.h>
+#include <linux/rculist.h>
+#include <linux/nodemask.h>
+#include <linux/moduleparam.h>
+#include <linux/uaccess.h>
+
+#include "workqueue_internal.h"
+
+enum {
+ /*
+ * worker_pool flags
+ *
+ * A bound pool is either associated or disassociated with its CPU.
+ * While associated (!DISASSOCIATED), all workers are bound to the
+ * CPU and none has %WORKER_UNBOUND set and concurrency management
+ * is in effect.
+ *
+ * While DISASSOCIATED, the cpu may be offline and all workers have
+ * %WORKER_UNBOUND set and concurrency management disabled, and may
+ * be executing on any CPU. The pool behaves as an unbound one.
+ *
+ * Note that DISASSOCIATED should be flipped only while holding
+ * attach_mutex to avoid changing binding state while
+ * worker_attach_to_pool() is in progress.
+ */
+ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
+
+ /* worker flags */
+ WORKER_DIE = 1 << 1, /* die die die */
+ WORKER_IDLE = 1 << 2, /* is idle */
+ WORKER_PREP = 1 << 3, /* preparing to run works */
+ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
+ WORKER_UNBOUND = 1 << 7, /* worker is unbound */
+ WORKER_REBOUND = 1 << 8, /* worker was rebound */
+
+ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
+ WORKER_UNBOUND | WORKER_REBOUND,
+
+ NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
+
+ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
+ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
+
+ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
+ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
+
+ MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
+ /* call for help after 10ms
+ (min two ticks) */
+ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
+ CREATE_COOLDOWN = HZ, /* time to breath after fail */
+
+ /*
+ * Rescue workers are used only on emergencies and shared by
+ * all cpus. Give MIN_NICE.
+ */
+ RESCUER_NICE_LEVEL = MIN_NICE,
+ HIGHPRI_NICE_LEVEL = MIN_NICE,
+
+ WQ_NAME_LEN = 24,
+};
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only for
+ * everyone else.
+ *
+ * P: Preemption protected. Disabling preemption is enough and should
+ * only be modified and accessed from the local cpu.
+ *
+ * L: pool->lock protected. Access with pool->lock held.
+ *
+ * X: During normal operation, modification requires pool->lock and should
+ * be done only from local cpu. Either disabling preemption on local
+ * cpu or grabbing pool->lock is enough for read access. If
+ * POOL_DISASSOCIATED is set, it's identical to L.
+ *
+ * A: pool->attach_mutex protected.
+ *
+ * PL: wq_pool_mutex protected.
+ *
+ * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
+ *
+ * WQ: wq->mutex protected.
+ *
+ * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
+ *
+ * MD: wq_mayday_lock protected.
+ */
+
+/* struct worker is defined in workqueue_internal.h */
+
+struct worker_pool {
+ spinlock_t lock; /* the pool lock */
+ int cpu; /* I: the associated cpu */
+ int node; /* I: the associated node ID */
+ int id; /* I: pool ID */
+ unsigned int flags; /* X: flags */
+
+ struct list_head worklist; /* L: list of pending works */
+ int nr_workers; /* L: total number of workers */
+
+ /* nr_idle includes the ones off idle_list for rebinding */
+ int nr_idle; /* L: currently idle ones */
+
+ struct list_head idle_list; /* X: list of idle workers */
+ struct timer_list idle_timer; /* L: worker idle timeout */
+ struct timer_list mayday_timer; /* L: SOS timer for workers */
+
+ /* a workers is either on busy_hash or idle_list, or the manager */
+ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
+ /* L: hash of busy workers */
+
+ /* see manage_workers() for details on the two manager mutexes */
+ struct mutex manager_arb; /* manager arbitration */
+ struct worker *manager; /* L: purely informational */
+ struct mutex attach_mutex; /* attach/detach exclusion */
+ struct list_head workers; /* A: attached workers */
+ struct completion *detach_completion; /* all workers detached */
+
+ struct ida worker_ida; /* worker IDs for task name */
+
+ struct workqueue_attrs *attrs; /* I: worker attributes */
+ struct hlist_node hash_node; /* PL: unbound_pool_hash node */
+ int refcnt; /* PL: refcnt for unbound pools */
+
+ /*
+ * The current concurrency level. As it's likely to be accessed
+ * from other CPUs during try_to_wake_up(), put it in a separate
+ * cacheline.
+ */
+ atomic_t nr_running ____cacheline_aligned_in_smp;
+
+ /*
+ * Destruction of pool is sched-RCU protected to allow dereferences
+ * from get_work_pool().
+ */
+ struct rcu_head rcu;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
+ * of work_struct->data are used for flags and the remaining high bits
+ * point to the pwq; thus, pwqs need to be aligned at two's power of the
+ * number of flag bits.
+ */
+struct pool_workqueue {
+ struct worker_pool *pool; /* I: the associated pool */
+ struct workqueue_struct *wq; /* I: the owning workqueue */
+ int work_color; /* L: current color */
+ int flush_color; /* L: flushing color */
+ int refcnt; /* L: reference count */
+ int nr_in_flight[WORK_NR_COLORS];
+ /* L: nr of in_flight works */
+ int nr_active; /* L: nr of active works */
+ int max_active; /* L: max active works */
+ struct list_head delayed_works; /* L: delayed works */
+ struct list_head pwqs_node; /* WR: node on wq->pwqs */
+ struct list_head mayday_node; /* MD: node on wq->maydays */
+
+ /*
+ * Release of unbound pwq is punted to system_wq. See put_pwq()
+ * and pwq_unbound_release_workfn() for details. pool_workqueue
+ * itself is also sched-RCU protected so that the first pwq can be
+ * determined without grabbing wq->mutex.
+ */
+ struct work_struct unbound_release_work;
+ struct rcu_head rcu;
+} __aligned(1 << WORK_STRUCT_FLAG_BITS);
+
+/*
+ * Structure used to wait for workqueue flush.
+ */
+struct wq_flusher {
+ struct list_head list; /* WQ: list of flushers */
+ int flush_color; /* WQ: flush color waiting for */
+ struct completion done; /* flush completion */
+};
+
+struct wq_device;
+
+/*
+ * The externally visible workqueue. It relays the issued work items to
+ * the appropriate worker_pool through its pool_workqueues.
+ */
+struct workqueue_struct {
+ struct list_head pwqs; /* WR: all pwqs of this wq */
+ struct list_head list; /* PR: list of all workqueues */
+
+ struct mutex mutex; /* protects this wq */
+ int work_color; /* WQ: current work color */
+ int flush_color; /* WQ: current flush color */
+ atomic_t nr_pwqs_to_flush; /* flush in progress */
+ struct wq_flusher *first_flusher; /* WQ: first flusher */
+ struct list_head flusher_queue; /* WQ: flush waiters */
+ struct list_head flusher_overflow; /* WQ: flush overflow list */
+
+ struct list_head maydays; /* MD: pwqs requesting rescue */
+ struct worker *rescuer; /* I: rescue worker */
+
+ int nr_drainers; /* WQ: drain in progress */
+ int saved_max_active; /* WQ: saved pwq max_active */
+
+ struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
+ struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
+
+#ifdef CONFIG_SYSFS
+ struct wq_device *wq_dev; /* I: for sysfs interface */
+#endif
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map lockdep_map;
+#endif
+ char name[WQ_NAME_LEN]; /* I: workqueue name */
+
+ /*
+ * Destruction of workqueue_struct is sched-RCU protected to allow
+ * walking the workqueues list without grabbing wq_pool_mutex.
+ * This is used to dump all workqueues from sysrq.
+ */
+ struct rcu_head rcu;
+
+ /* hot fields used during command issue, aligned to cacheline */
+ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
+ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
+ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
+};
+
+static struct kmem_cache *pwq_cache;
+
+static cpumask_var_t *wq_numa_possible_cpumask;
+ /* possible CPUs of each node */
+
+static bool wq_disable_numa;
+module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+
+/* see the comment above the definition of WQ_POWER_EFFICIENT */
+#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
+static bool wq_power_efficient = true;
+#else
+static bool wq_power_efficient;
+#endif
+
+module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+
+static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
+
+/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+
+static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
+static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+
+static LIST_HEAD(workqueues); /* PR: list of all workqueues */
+static bool workqueue_freezing; /* PL: have wqs started freezing? */
+
+/* the per-cpu worker pools */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
+ cpu_worker_pools);
+
+static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
+
+/* PL: hash of all unbound pools keyed by pool->attrs */
+static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
+
+/* I: attributes used when instantiating standard unbound pools on demand */
+static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
+
+/* I: attributes used when instantiating ordered pools on demand */
+static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+
+struct workqueue_struct *system_wq __read_mostly;
+EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_highpri_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_highpri_wq);
+struct workqueue_struct *system_long_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_long_wq);
+struct workqueue_struct *system_unbound_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_freezable_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_power_efficient_wq);
+struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
+
+static int worker_thread(void *__worker);
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+ const struct workqueue_attrs *from);
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
+
+#define assert_rcu_or_pool_mutex() \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU or wq_pool_mutex should be held")
+
+#define assert_rcu_or_wq_mutex(wq) \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&wq->mutex), \
+ "sched RCU or wq->mutex should be held")
+
+#define for_each_cpu_worker_pool(pool, cpu) \
+ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
+ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
+ (pool)++)
+
+/**
+ * for_each_pool - iterate through all worker_pools in the system
+ * @pool: iteration cursor
+ * @pi: integer used for iteration
+ *
+ * This must be called either with wq_pool_mutex held or sched RCU read
+ * locked. If the pool needs to be used beyond the locking in effect, the
+ * caller is responsible for guaranteeing that the pool stays online.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_pool(pool, pi) \
+ idr_for_each_entry(&worker_pool_idr, pool, pi) \
+ if (({ assert_rcu_or_pool_mutex(); false; })) { } \
+ else
+
+/**
+ * for_each_pool_worker - iterate through all workers of a worker_pool
+ * @worker: iteration cursor
+ * @pool: worker_pool to iterate workers of
+ *
+ * This must be called with @pool->attach_mutex.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_pool_worker(worker, pool) \
+ list_for_each_entry((worker), &(pool)->workers, node) \
+ if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
+ else
+
+/**
+ * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
+ * @pwq: iteration cursor
+ * @wq: the target workqueue
+ *
+ * This must be called either with wq->mutex held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_pwq(pwq, wq) \
+ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
+ if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
+ else
+
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+
+static struct debug_obj_descr work_debug_descr;
+
+static void *work_debug_hint(void *addr)
+{
+ return ((struct work_struct *) addr)->func;
+}
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int work_fixup_init(void *addr, enum debug_obj_state state)
+{
+ struct work_struct *work = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ cancel_work_sync(work);
+ debug_object_init(work, &work_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int work_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ struct work_struct *work = addr;
+
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ /*
+ * This is not really a fixup. The work struct was
+ * statically initialized. We just make sure that it
+ * is tracked in the object tracker.
+ */
+ if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
+ debug_object_init(work, &work_debug_descr);
+ debug_object_activate(work, &work_debug_descr);
+ return 0;
+ }
+ WARN_ON_ONCE(1);
+ return 0;
+
+ case ODEBUG_STATE_ACTIVE:
+ WARN_ON(1);
+
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int work_fixup_free(void *addr, enum debug_obj_state state)
+{
+ struct work_struct *work = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ cancel_work_sync(work);
+ debug_object_free(work, &work_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static struct debug_obj_descr work_debug_descr = {
+ .name = "work_struct",
+ .debug_hint = work_debug_hint,
+ .fixup_init = work_fixup_init,
+ .fixup_activate = work_fixup_activate,
+ .fixup_free = work_fixup_free,
+};
+
+static inline void debug_work_activate(struct work_struct *work)
+{
+ debug_object_activate(work, &work_debug_descr);
+}
+
+static inline void debug_work_deactivate(struct work_struct *work)
+{
+ debug_object_deactivate(work, &work_debug_descr);
+}
+
+void __init_work(struct work_struct *work, int onstack)
+{
+ if (onstack)
+ debug_object_init_on_stack(work, &work_debug_descr);
+ else
+ debug_object_init(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(__init_work);
+
+void destroy_work_on_stack(struct work_struct *work)
+{
+ debug_object_free(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+
+void destroy_delayed_work_on_stack(struct delayed_work *work)
+{
+ destroy_timer_on_stack(&work->timer);
+ debug_object_free(&work->work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
+
+#else
+static inline void debug_work_activate(struct work_struct *work) { }
+static inline void debug_work_deactivate(struct work_struct *work) { }
+#endif
+
+/**
+ * worker_pool_assign_id - allocate ID and assing it to @pool
+ * @pool: the pool pointer of interest
+ *
+ * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
+ * successfully, -errno on failure.
+ */
+static int worker_pool_assign_id(struct worker_pool *pool)
+{
+ int ret;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
+ GFP_KERNEL);
+ if (ret >= 0) {
+ pool->id = ret;
+ return 0;
+ }
+ return ret;
+}
+
+/**
+ * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
+ * @wq: the target workqueue
+ * @node: the node ID
+ *
+ * This must be called either with pwq_lock held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
+ *
+ * Return: The unbound pool_workqueue for @node.
+ */
+static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
+ int node)
+{
+ assert_rcu_or_wq_mutex(wq);
+ return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
+}
+
+static unsigned int work_color_to_flags(int color)
+{
+ return color << WORK_STRUCT_COLOR_SHIFT;
+}
+
+static int get_work_color(struct work_struct *work)
+{
+ return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
+ ((1 << WORK_STRUCT_COLOR_BITS) - 1);
+}
+
+static int work_next_color(int color)
+{
+ return (color + 1) % WORK_NR_COLORS;
+}
+
+/*
+ * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
+ * contain the pointer to the queued pwq. Once execution starts, the flag
+ * is cleared and the high bits contain OFFQ flags and pool ID.
+ *
+ * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
+ * and clear_work_data() can be used to set the pwq, pool or clear
+ * work->data. These functions should only be called while the work is
+ * owned - ie. while the PENDING bit is set.
+ *
+ * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
+ * corresponding to a work. Pool is available once the work has been
+ * queued anywhere after initialization until it is sync canceled. pwq is
+ * available only while the work item is queued.
+ *
+ * %WORK_OFFQ_CANCELING is used to mark a work item which is being
+ * canceled. While being canceled, a work item may have its PENDING set
+ * but stay off timer and worklist for arbitrarily long and nobody should
+ * try to steal the PENDING bit.
+ */
+static inline void set_work_data(struct work_struct *work, unsigned long data,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(!work_pending(work));
+ atomic_long_set(&work->data, data | flags | work_static(work));
+}
+
+static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
+ unsigned long extra_flags)
+{
+ set_work_data(work, (unsigned long)pwq,
+ WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
+}
+
+static void set_work_pool_and_keep_pending(struct work_struct *work,
+ int pool_id)
+{
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
+ WORK_STRUCT_PENDING);
+}
+
+static void set_work_pool_and_clear_pending(struct work_struct *work,
+ int pool_id)
+{
+ /*
+ * The following wmb is paired with the implied mb in
+ * test_and_set_bit(PENDING) and ensures all updates to @work made
+ * here are visible to and precede any updates by the next PENDING
+ * owner.
+ */
+ smp_wmb();
+ set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+}
+
+static void clear_work_data(struct work_struct *work)
+{
+ smp_wmb(); /* see set_work_pool_and_clear_pending() */
+ set_work_data(work, WORK_STRUCT_NO_POOL, 0);
+}
+
+static struct pool_workqueue *get_work_pwq(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ if (data & WORK_STRUCT_PWQ)
+ return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ else
+ return NULL;
+}
+
+/**
+ * get_work_pool - return the worker_pool a given work was associated with
+ * @work: the work item of interest
+ *
+ * Pools are created and destroyed under wq_pool_mutex, and allows read
+ * access under sched-RCU read lock. As such, this function should be
+ * called under wq_pool_mutex or with preemption disabled.
+ *
+ * All fields of the returned pool are accessible as long as the above
+ * mentioned locking is in effect. If the returned pool needs to be used
+ * beyond the critical section, the caller is responsible for ensuring the
+ * returned pool is and stays online.
+ *
+ * Return: The worker_pool @work was last associated with. %NULL if none.
+ */
+static struct worker_pool *get_work_pool(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+ int pool_id;
+
+ assert_rcu_or_pool_mutex();
+
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
+
+ pool_id = data >> WORK_OFFQ_POOL_SHIFT;
+ if (pool_id == WORK_OFFQ_POOL_NONE)
+ return NULL;
+
+ return idr_find(&worker_pool_idr, pool_id);
+}
+
+/**
+ * get_work_pool_id - return the worker pool ID a given work is associated with
+ * @work: the work item of interest
+ *
+ * Return: The worker_pool ID @work was last associated with.
+ * %WORK_OFFQ_POOL_NONE if none.
+ */
+static int get_work_pool_id(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ if (data & WORK_STRUCT_PWQ)
+ return ((struct pool_workqueue *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+
+ return data >> WORK_OFFQ_POOL_SHIFT;
+}
+
+static void mark_work_canceling(struct work_struct *work)
+{
+ unsigned long pool_id = get_work_pool_id(work);
+
+ pool_id <<= WORK_OFFQ_POOL_SHIFT;
+ set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
+}
+
+static bool work_is_canceling(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
+}
+
+/*
+ * Policy functions. These define the policies on how the global worker
+ * pools are managed. Unless noted otherwise, these functions assume that
+ * they're being called with pool->lock held.
+ */
+
+static bool __need_more_worker(struct worker_pool *pool)
+{
+ return !atomic_read(&pool->nr_running);
+}
+
+/*
+ * Need to wake up a worker? Called from anything but currently
+ * running workers.
+ *
+ * Note that, because unbound workers never contribute to nr_running, this
+ * function will always return %true for unbound pools as long as the
+ * worklist isn't empty.
+ */
+static bool need_more_worker(struct worker_pool *pool)
+{
+ return !list_empty(&pool->worklist) && __need_more_worker(pool);
+}
+
+/* Can I start working? Called from busy but !running workers. */
+static bool may_start_working(struct worker_pool *pool)
+{
+ return pool->nr_idle;
+}
+
+/* Do I need to keep working? Called from currently running workers. */
+static bool keep_working(struct worker_pool *pool)
+{
+ return !list_empty(&pool->worklist) &&
+ atomic_read(&pool->nr_running) <= 1;
+}
+
+/* Do we need a new worker? Called from manager. */
+static bool need_to_create_worker(struct worker_pool *pool)
+{
+ return need_more_worker(pool) && !may_start_working(pool);
+}
+
+/* Do we have too many workers and should some go away? */
+static bool too_many_workers(struct worker_pool *pool)
+{
+ bool managing = mutex_is_locked(&pool->manager_arb);
+ int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
+ int nr_busy = pool->nr_workers - nr_idle;
+
+ return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
+}
+
+/*
+ * Wake up functions.
+ */
+
+/* Return the first idle worker. Safe with preemption disabled */
+static struct worker *first_idle_worker(struct worker_pool *pool)
+{
+ if (unlikely(list_empty(&pool->idle_list)))
+ return NULL;
+
+ return list_first_entry(&pool->idle_list, struct worker, entry);
+}
+
+/**
+ * wake_up_worker - wake up an idle worker
+ * @pool: worker pool to wake worker from
+ *
+ * Wake up the first idle worker of @pool.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
+ */
+static void wake_up_worker(struct worker_pool *pool)
+{
+ struct worker *worker = first_idle_worker(pool);
+
+ if (likely(worker))
+ wake_up_process(worker->task);
+}
+
+/**
+ * wq_worker_waking_up - a worker is waking up
+ * @task: task waking up
+ * @cpu: CPU @task is waking up to
+ *
+ * This function is called during try_to_wake_up() when a worker is
+ * being awoken.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ */
+void wq_worker_waking_up(struct task_struct *task, int cpu)
+{
+ struct worker *worker = kthread_data(task);
+
+ if (!(worker->flags & WORKER_NOT_RUNNING)) {
+ WARN_ON_ONCE(worker->pool->cpu != cpu);
+ atomic_inc(&worker->pool->nr_running);
+ }
+}
+
+/**
+ * wq_worker_sleeping - a worker is going to sleep
+ * @task: task going to sleep
+ * @cpu: CPU in question, must be the current CPU number
+ *
+ * This function is called during schedule() when a busy worker is
+ * going to sleep. Worker on the same cpu can be woken up by
+ * returning pointer to its task.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ *
+ * Return:
+ * Worker task on @cpu to wake up, %NULL if none.
+ */
+struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
+{
+ struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+ struct worker_pool *pool;
+
+ /*
+ * Rescuers, which may not have all the fields set up like normal
+ * workers, also reach here, let's not access anything before
+ * checking NOT_RUNNING.
+ */
+ if (worker->flags & WORKER_NOT_RUNNING)
+ return NULL;
+
+ pool = worker->pool;
+
+ /* this can only happen on the local cpu */
+ if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
+ return NULL;
+
+ /*
+ * The counterpart of the following dec_and_test, implied mb,
+ * worklist not empty test sequence is in insert_work().
+ * Please read comment there.
+ *
+ * NOT_RUNNING is clear. This means that we're bound to and
+ * running on the local cpu w/ rq lock held and preemption
+ * disabled, which in turn means that none else could be
+ * manipulating idle_list, so dereferencing idle_list without pool
+ * lock is safe.
+ */
+ if (atomic_dec_and_test(&pool->nr_running) &&
+ !list_empty(&pool->worklist))
+ to_wakeup = first_idle_worker(pool);
+ return to_wakeup ? to_wakeup->task : NULL;
+}
+
+/**
+ * worker_set_flags - set worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to set
+ *
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock)
+ */
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
+{
+ struct worker_pool *pool = worker->pool;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ /* If transitioning into NOT_RUNNING, adjust nr_running. */
+ if ((flags & WORKER_NOT_RUNNING) &&
+ !(worker->flags & WORKER_NOT_RUNNING)) {
+ atomic_dec(&pool->nr_running);
+ }
+
+ worker->flags |= flags;
+}
+
+/**
+ * worker_clr_flags - clear worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to clear
+ *
+ * Clear @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock)
+ */
+static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
+{
+ struct worker_pool *pool = worker->pool;
+ unsigned int oflags = worker->flags;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ worker->flags &= ~flags;
+
+ /*
+ * If transitioning out of NOT_RUNNING, increment nr_running. Note
+ * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
+ * of multiple flags, not a single flag.
+ */
+ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
+ if (!(worker->flags & WORKER_NOT_RUNNING))
+ atomic_inc(&pool->nr_running);
+}
+
+/**
+ * find_worker_executing_work - find worker which is executing a work
+ * @pool: pool of interest
+ * @work: work to find worker for
+ *
+ * Find a worker which is executing @work on @pool by searching
+ * @pool->busy_hash which is keyed by the address of @work. For a worker
+ * to match, its current execution should match the address of @work and
+ * its work function. This is to avoid unwanted dependency between
+ * unrelated work executions through a work item being recycled while still
+ * being executed.
+ *
+ * This is a bit tricky. A work item may be freed once its execution
+ * starts and nothing prevents the freed area from being recycled for
+ * another work item. If the same work item address ends up being reused
+ * before the original execution finishes, workqueue will identify the
+ * recycled work item as currently executing and make it wait until the
+ * current execution finishes, introducing an unwanted dependency.
+ *
+ * This function checks the work item address and work function to avoid
+ * false positives. Note that this isn't complete as one may construct a
+ * work function which can introduce dependency onto itself through a
+ * recycled work item. Well, if somebody wants to shoot oneself in the
+ * foot that badly, there's only so much we can do, and if such deadlock
+ * actually occurs, it should be easy to locate the culprit work function.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
+ *
+ * Return:
+ * Pointer to worker which is executing @work if found, %NULL
+ * otherwise.
+ */
+static struct worker *find_worker_executing_work(struct worker_pool *pool,
+ struct work_struct *work)
+{
+ struct worker *worker;
+
+ hash_for_each_possible(pool->busy_hash, worker, hentry,
+ (unsigned long)work)
+ if (worker->current_work == work &&
+ worker->current_func == work->func)
+ return worker;
+
+ return NULL;
+}
+
+/**
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out paramter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head. Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work. This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+ struct work_struct **nextp)
+{
+ struct work_struct *n;
+
+ /*
+ * Linked worklist will always end before the end of the list,
+ * use NULL for list head.
+ */
+ list_for_each_entry_safe_from(work, n, NULL, entry) {
+ list_move_tail(&work->entry, head);
+ if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+ break;
+ }
+
+ /*
+ * If we're already inside safe list traversal and have moved
+ * multiple works to the scheduled queue, the next position
+ * needs to be updated.
+ */
+ if (nextp)
+ *nextp = n;
+}
+
+/**
+ * get_pwq - get an extra reference on the specified pool_workqueue
+ * @pwq: pool_workqueue to get
+ *
+ * Obtain an extra reference on @pwq. The caller should guarantee that
+ * @pwq has positive refcnt and be holding the matching pool->lock.
+ */
+static void get_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&pwq->pool->lock);
+ WARN_ON_ONCE(pwq->refcnt <= 0);
+ pwq->refcnt++;
+}
+
+/**
+ * put_pwq - put a pool_workqueue reference
+ * @pwq: pool_workqueue to put
+ *
+ * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
+ * destruction. The caller should be holding the matching pool->lock.
+ */
+static void put_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&pwq->pool->lock);
+ if (likely(--pwq->refcnt))
+ return;
+ if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
+ return;
+ /*
+ * @pwq can't be released under pool->lock, bounce to
+ * pwq_unbound_release_workfn(). This never recurses on the same
+ * pool->lock as this path is taken only for unbound workqueues and
+ * the release work item is scheduled on a per-cpu workqueue. To
+ * avoid lockdep warning, unbound pool->locks are given lockdep
+ * subclass of 1 in get_unbound_pool().
+ */
+ schedule_work(&pwq->unbound_release_work);
+}
+
+/**
+ * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
+ * @pwq: pool_workqueue to put (can be %NULL)
+ *
+ * put_pwq() with locking. This function also allows %NULL @pwq.
+ */
+static void put_pwq_unlocked(struct pool_workqueue *pwq)
+{
+ if (pwq) {
+ /*
+ * As both pwqs and pools are sched-RCU protected, the
+ * following lock operations are safe.
+ */
+ spin_lock_irq(&pwq->pool->lock);
+ put_pwq(pwq);
+ spin_unlock_irq(&pwq->pool->lock);
+ }
+}
+
+static void pwq_activate_delayed_work(struct work_struct *work)
+{
+ struct pool_workqueue *pwq = get_work_pwq(work);
+
+ trace_workqueue_activate_work(work);
+ move_linked_works(work, &pwq->pool->worklist, NULL);
+ __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+ pwq->nr_active++;
+}
+
+static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
+{
+ struct work_struct *work = list_first_entry(&pwq->delayed_works,
+ struct work_struct, entry);
+
+ pwq_activate_delayed_work(work);
+}
+
+/**
+ * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
+ * @pwq: pwq of interest
+ * @color: color of work which left the queue
+ *
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its pwq and handle workqueue flushing.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
+ */
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
+{
+ /* uncolored work items don't participate in flushing or nr_active */
+ if (color == WORK_NO_COLOR)
+ goto out_put;
+
+ pwq->nr_in_flight[color]--;
+
+ pwq->nr_active--;
+ if (!list_empty(&pwq->delayed_works)) {
+ /* one down, submit a delayed one */
+ if (pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
+ }
+
+ /* is flush in progress and are we at the flushing tip? */
+ if (likely(pwq->flush_color != color))
+ goto out_put;
+
+ /* are there still in-flight works? */
+ if (pwq->nr_in_flight[color])
+ goto out_put;
+
+ /* this pwq is done, clear flush_color */
+ pwq->flush_color = -1;
+
+ /*
+ * If this was the last pwq, wake up the first flusher. It
+ * will handle the rest.
+ */
+ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
+ complete(&pwq->wq->first_flusher->done);
+out_put:
+ put_pwq(pwq);
+}
+
+/**
+ * try_to_grab_pending - steal work item from worklist and disable irq
+ * @work: work item to steal
+ * @is_dwork: @work is a delayed_work
+ * @flags: place to store irq state
+ *
+ * Try to grab PENDING bit of @work. This function can handle @work in any
+ * stable state - idle, on timer or on worklist.
+ *
+ * Return:
+ * 1 if @work was pending and we successfully stole PENDING
+ * 0 if @work was idle and we claimed PENDING
+ * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
+ * -ENOENT if someone else is canceling @work, this state may persist
+ * for arbitrarily long
+ *
+ * Note:
+ * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
+ * interrupted while holding PENDING and @work off queue, irq must be
+ * disabled on entry. This, combined with delayed_work->timer being
+ * irqsafe, ensures that we return -EAGAIN for finite short period of time.
+ *
+ * On successful return, >= 0, irq is disabled and the caller is
+ * responsible for releasing it using local_irq_restore(*@flags).
+ *
+ * This function is safe to call from any context including IRQ handler.
+ */
+static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
+ unsigned long *flags)
+{
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+
+ local_irq_save(*flags);
+
+ /* try to steal the timer if it exists */
+ if (is_dwork) {
+ struct delayed_work *dwork = to_delayed_work(work);
+
+ /*
+ * dwork->timer is irqsafe. If del_timer() fails, it's
+ * guaranteed that the timer is not queued anywhere and not
+ * running on the local CPU.
+ */
+ if (likely(del_timer(&dwork->timer)))
+ return 1;
+ }
+
+ /* try to claim PENDING the normal way */
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
+ return 0;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+ pool = get_work_pool(work);
+ if (!pool)
+ goto fail;
+
+ spin_lock(&pool->lock);
+ /*
+ * work->data is guaranteed to point to pwq only while the work
+ * item is queued on pwq->wq, and both updating work->data to point
+ * to pwq on queueing and to pool on dequeueing are done under
+ * pwq->pool->lock. This in turn guarantees that, if work->data
+ * points to pwq which is associated with a locked pool, the work
+ * item is currently queued on that pool.
+ */
+ pwq = get_work_pwq(work);
+ if (pwq && pwq->pool == pool) {
+ debug_work_deactivate(work);
+
+ /*
+ * A delayed work item cannot be grabbed directly because
+ * it might have linked NO_COLOR work items which, if left
+ * on the delayed_list, will confuse pwq->nr_active
+ * management later on and cause stall. Make sure the work
+ * item is activated before grabbing.
+ */
+ if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
+ pwq_activate_delayed_work(work);
+
+ list_del_init(&work->entry);
+ pwq_dec_nr_in_flight(pwq, get_work_color(work));
+
+ /* work->data points to pwq iff queued, point to pool */
+ set_work_pool_and_keep_pending(work, pool->id);
+
+ spin_unlock(&pool->lock);
+ return 1;
+ }
+ spin_unlock(&pool->lock);
+fail:
+ local_irq_restore(*flags);
+ if (work_is_canceling(work))
+ return -ENOENT;
+ cpu_relax();
+ return -EAGAIN;
+}
+
+/**
+ * insert_work - insert a work into a pool
+ * @pwq: pwq @work belongs to
+ * @work: work to insert
+ * @head: insertion point
+ * @extra_flags: extra WORK_STRUCT_* flags to set
+ *
+ * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
+ * work_struct flags.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
+ */
+static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
+ struct list_head *head, unsigned int extra_flags)
+{
+ struct worker_pool *pool = pwq->pool;
+
+ /* we own @work, set data and link */
+ set_work_pwq(work, pwq, extra_flags);
+ list_add_tail(&work->entry, head);
+ get_pwq(pwq);
+
+ /*
+ * Ensure either wq_worker_sleeping() sees the above
+ * list_add_tail() or we see zero nr_running to avoid workers lying
+ * around lazily while there are works to be processed.
+ */
+ smp_mb();
+
+ if (__need_more_worker(pool))
+ wake_up_worker(pool);
+}
+
+/*
+ * Test whether @work is being queued from another work executing on the
+ * same workqueue.
+ */
+static bool is_chained_work(struct workqueue_struct *wq)
+{
+ struct worker *worker;
+
+ worker = current_wq_worker();
+ /*
+ * Return %true iff I'm a worker execuing a work item on @wq. If
+ * I'm @worker, it's safe to dereference it without locking.
+ */
+ return worker && worker->current_pwq->wq == wq;
+}
+
+static void __queue_work(int cpu, struct workqueue_struct *wq,
+ struct work_struct *work)
+{
+ struct pool_workqueue *pwq;
+ struct worker_pool *last_pool;
+ struct list_head *worklist;
+ unsigned int work_flags;
+ unsigned int req_cpu = cpu;
+
+ /*
+ * While a work item is PENDING && off queue, a task trying to
+ * steal the PENDING will busy-loop waiting for it to either get
+ * queued or lose PENDING. Grabbing PENDING and queueing should
+ * happen with IRQ disabled.
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+
+ debug_work_activate(work);
+
+ /* if draining, only works from the same workqueue are allowed */
+ if (unlikely(wq->flags & __WQ_DRAINING) &&
+ WARN_ON_ONCE(!is_chained_work(wq)))
+ return;
+retry:
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
+
+ /* pwq which will be used unless @work is executing elsewhere */
+ if (!(wq->flags & WQ_UNBOUND))
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ else
+ pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+
+ /*
+ * If @work was previously on a different pool, it might still be
+ * running there, in which case the work needs to be queued on that
+ * pool to guarantee non-reentrancy.
+ */
+ last_pool = get_work_pool(work);
+ if (last_pool && last_pool != pwq->pool) {
+ struct worker *worker;
+
+ spin_lock(&last_pool->lock);
+
+ worker = find_worker_executing_work(last_pool, work);
+
+ if (worker && worker->current_pwq->wq == wq) {
+ pwq = worker->current_pwq;
+ } else {
+ /* meh... not running there, queue here */
+ spin_unlock(&last_pool->lock);
+ spin_lock(&pwq->pool->lock);
+ }
+ } else {
+ spin_lock(&pwq->pool->lock);
+ }
+
+ /*
+ * pwq is determined and locked. For unbound pools, we could have
+ * raced with pwq release and it could already be dead. If its
+ * refcnt is zero, repeat pwq selection. Note that pwqs never die
+ * without another pwq replacing it in the numa_pwq_tbl or while
+ * work items are executing on it, so the retrying is guaranteed to
+ * make forward-progress.
+ */
+ if (unlikely(!pwq->refcnt)) {
+ if (wq->flags & WQ_UNBOUND) {
+ spin_unlock(&pwq->pool->lock);
+ cpu_relax();
+ goto retry;
+ }
+ /* oops */
+ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
+ wq->name, cpu);
+ }
+
+ /* pwq determined, queue */
+ trace_workqueue_queue_work(req_cpu, pwq, work);
+
+ if (WARN_ON(!list_empty(&work->entry))) {
+ spin_unlock(&pwq->pool->lock);
+ return;
+ }
+
+ pwq->nr_in_flight[pwq->work_color]++;
+ work_flags = work_color_to_flags(pwq->work_color);
+
+ if (likely(pwq->nr_active < pwq->max_active)) {
+ trace_workqueue_activate_work(work);
+ pwq->nr_active++;
+ worklist = &pwq->pool->worklist;
+ } else {
+ work_flags |= WORK_STRUCT_DELAYED;
+ worklist = &pwq->delayed_works;
+ }
+
+ insert_work(pwq, work, worklist, work_flags);
+
+ spin_unlock(&pwq->pool->lock);
+}
+
+/**
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
+ *
+ * Return: %false if @work was already on a queue, %true otherwise.
+ */
+bool queue_work_on(int cpu, struct workqueue_struct *wq,
+ struct work_struct *work)
+{
+ bool ret = false;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ __queue_work(cpu, wq, work);
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL(queue_work_on);
+
+void delayed_work_timer_fn(unsigned long __data)
+{
+ struct delayed_work *dwork = (struct delayed_work *)__data;
+
+ /* should have been called from irqsafe timer with irq already off */
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
+}
+EXPORT_SYMBOL(delayed_work_timer_fn);
+
+static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ struct timer_list *timer = &dwork->timer;
+ struct work_struct *work = &dwork->work;
+
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
+ timer->data != (unsigned long)dwork);
+ WARN_ON_ONCE(timer_pending(timer));
+ WARN_ON_ONCE(!list_empty(&work->entry));
+
+ /*
+ * If @delay is 0, queue @dwork->work immediately. This is for
+ * both optimization and correctness. The earliest @timer can
+ * expire is on the closest next tick and delayed_work users depend
+ * on that there's no such delay when @delay is 0.
+ */
+ if (!delay) {
+ __queue_work(cpu, wq, &dwork->work);
+ return;
+ }
+
+ timer_stats_timer_set_start_info(&dwork->timer);
+
+ dwork->wq = wq;
+ dwork->cpu = cpu;
+ timer->expires = jiffies + delay;
+
+ if (unlikely(cpu != WORK_CPU_UNBOUND))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
+}
+
+/**
+ * queue_delayed_work_on - queue work on specific CPU after delay
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Return: %false if @work was already on a queue, %true otherwise. If
+ * @delay is zero and @dwork is idle, it will be scheduled for immediate
+ * execution.
+ */
+bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ struct work_struct *work = &dwork->work;
+ bool ret = false;
+ unsigned long flags;
+
+ /* read the comment in __queue_work() */
+ local_irq_save(flags);
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ __queue_delayed_work(cpu, wq, dwork, delay);
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL(queue_delayed_work_on);
+
+/**
+ * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @dwork: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
+ * modify @dwork's timer so that it expires after @delay. If @delay is
+ * zero, @work is guaranteed to be scheduled immediately regardless of its
+ * current state.
+ *
+ * Return: %false if @dwork was idle and queued, %true if @dwork was
+ * pending and its timer was modified.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ * See try_to_grab_pending() for details.
+ */
+bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct delayed_work *dwork, unsigned long delay)
+{
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(&dwork->work, true, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (likely(ret >= 0)) {
+ __queue_delayed_work(cpu, wq, dwork, delay);
+ local_irq_restore(flags);
+ }
+
+ /* -ENOENT from try_to_grab_pending() becomes %true */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mod_delayed_work_on);
+
+/**
+ * worker_enter_idle - enter idle state
+ * @worker: worker which is entering idle state
+ *
+ * @worker is entering idle state. Update stats and idle timer if
+ * necessary.
+ *
+ * LOCKING:
+ * spin_lock_irq(pool->lock).
+ */
+static void worker_enter_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
+ WARN_ON_ONCE(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev)))
+ return;
+
+ /* can't use worker_set_flags(), also called from create_worker() */
+ worker->flags |= WORKER_IDLE;
+ pool->nr_idle++;
+ worker->last_active = jiffies;
+
+ /* idle_list is LIFO */
+ list_add(&worker->entry, &pool->idle_list);
+
+ if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
+ mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
+
+ /*
+ * Sanity check nr_running. Because wq_unbind_fn() releases
+ * pool->lock between setting %WORKER_UNBOUND and zapping
+ * nr_running, the warning may trigger spuriously. Check iff
+ * unbind is not in progress.
+ */
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
+ pool->nr_workers == pool->nr_idle &&
+ atomic_read(&pool->nr_running));
+}
+
+/**
+ * worker_leave_idle - leave idle state
+ * @worker: worker which is leaving idle state
+ *
+ * @worker is leaving idle state. Update stats.
+ *
+ * LOCKING:
+ * spin_lock_irq(pool->lock).
+ */
+static void worker_leave_idle(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
+ return;
+ worker_clr_flags(worker, WORKER_IDLE);
+ pool->nr_idle--;
+ list_del_init(&worker->entry);
+}
+
+static struct worker *alloc_worker(int node)
+{
+ struct worker *worker;
+
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
+ if (worker) {
+ INIT_LIST_HEAD(&worker->entry);
+ INIT_LIST_HEAD(&worker->scheduled);
+ INIT_LIST_HEAD(&worker->node);
+ /* on creation a worker is in !idle && prep state */
+ worker->flags = WORKER_PREP;
+ }
+ return worker;
+}
+
+/**
+ * worker_attach_to_pool() - attach a worker to a pool
+ * @worker: worker to be attached
+ * @pool: the target pool
+ *
+ * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
+ * cpu-binding of @worker are kept coordinated with the pool across
+ * cpu-[un]hotplugs.
+ */
+static void worker_attach_to_pool(struct worker *worker,
+ struct worker_pool *pool)
+{
+ mutex_lock(&pool->attach_mutex);
+
+ /*
+ * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
+ * online CPUs. It'll be re-applied when any of the CPUs come up.
+ */
+ set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+
+ /*
+ * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
+ * stable across this function. See the comments above the
+ * flag definition for details.
+ */
+ if (pool->flags & POOL_DISASSOCIATED)
+ worker->flags |= WORKER_UNBOUND;
+
+ list_add_tail(&worker->node, &pool->workers);
+
+ mutex_unlock(&pool->attach_mutex);
+}
+
+/**
+ * worker_detach_from_pool() - detach a worker from its pool
+ * @worker: worker which is attached to its pool
+ * @pool: the pool @worker is attached to
+ *
+ * Undo the attaching which had been done in worker_attach_to_pool(). The
+ * caller worker shouldn't access to the pool after detached except it has
+ * other reference to the pool.
+ */
+static void worker_detach_from_pool(struct worker *worker,
+ struct worker_pool *pool)
+{
+ struct completion *detach_completion = NULL;
+
+ mutex_lock(&pool->attach_mutex);
+ list_del(&worker->node);
+ if (list_empty(&pool->workers))
+ detach_completion = pool->detach_completion;
+ mutex_unlock(&pool->attach_mutex);
+
+ /* clear leftover flags without pool->lock after it is detached */
+ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
+
+ if (detach_completion)
+ complete(detach_completion);
+}
+
+/**
+ * create_worker - create a new workqueue worker
+ * @pool: pool the new worker will belong to
+ *
+ * Create and start a new worker which is attached to @pool.
+ *
+ * CONTEXT:
+ * Might sleep. Does GFP_KERNEL allocations.
+ *
+ * Return:
+ * Pointer to the newly created worker.
+ */
+static struct worker *create_worker(struct worker_pool *pool)
+{
+ struct worker *worker = NULL;
+ int id = -1;
+ char id_buf[16];
+
+ /* ID is needed to determine kthread name */
+ id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ goto fail;
+
+ worker = alloc_worker(pool->node);
+ if (!worker)
+ goto fail;
+
+ worker->pool = pool;
+ worker->id = id;
+
+ if (pool->cpu >= 0)
+ snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
+ pool->attrs->nice < 0 ? "H" : "");
+ else
+ snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
+
+ worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
+ "kworker/%s", id_buf);
+ if (IS_ERR(worker->task))
+ goto fail;
+
+ set_user_nice(worker->task, pool->attrs->nice);
+
+ /* prevent userland from meddling with cpumask of workqueue workers */
+ worker->task->flags |= PF_NO_SETAFFINITY;
+
+ /* successful, attach the worker to the pool */
+ worker_attach_to_pool(worker, pool);
+
+ /* start the newly created worker */
+ spin_lock_irq(&pool->lock);
+ worker->pool->nr_workers++;
+ worker_enter_idle(worker);
+ wake_up_process(worker->task);
+ spin_unlock_irq(&pool->lock);
+
+ return worker;
+
+fail:
+ if (id >= 0)
+ ida_simple_remove(&pool->worker_ida, id);
+ kfree(worker);
+ return NULL;
+}
+
+/**
+ * destroy_worker - destroy a workqueue worker
+ * @worker: worker to be destroyed
+ *
+ * Destroy @worker and adjust @pool stats accordingly. The worker should
+ * be idle.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
+ */
+static void destroy_worker(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ lockdep_assert_held(&pool->lock);
+
+ /* sanity check frenzy */
+ if (WARN_ON(worker->current_work) ||
+ WARN_ON(!list_empty(&worker->scheduled)) ||
+ WARN_ON(!(worker->flags & WORKER_IDLE)))
+ return;
+
+ pool->nr_workers--;
+ pool->nr_idle--;
+
+ list_del_init(&worker->entry);
+ worker->flags |= WORKER_DIE;
+ wake_up_process(worker->task);
+}
+
+static void idle_worker_timeout(unsigned long __pool)
+{
+ struct worker_pool *pool = (void *)__pool;
+
+ spin_lock_irq(&pool->lock);
+
+ while (too_many_workers(pool)) {
+ struct worker *worker;
+ unsigned long expires;
+
+ /* idle_list is kept in LIFO order, check the last one */
+ worker = list_entry(pool->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+
+ if (time_before(jiffies, expires)) {
+ mod_timer(&pool->idle_timer, expires);
+ break;
+ }
+
+ destroy_worker(worker);
+ }
+
+ spin_unlock_irq(&pool->lock);
+}
+
+static void send_mayday(struct work_struct *work)
+{
+ struct pool_workqueue *pwq = get_work_pwq(work);
+ struct workqueue_struct *wq = pwq->wq;
+
+ lockdep_assert_held(&wq_mayday_lock);
+
+ if (!wq->rescuer)
+ return;
+
+ /* mayday mayday mayday */
+ if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
+ list_add_tail(&pwq->mayday_node, &wq->maydays);
+ wake_up_process(wq->rescuer->task);
+ }
+}
+
+static void pool_mayday_timeout(unsigned long __pool)
+{
+ struct worker_pool *pool = (void *)__pool;
+ struct work_struct *work;
+
+ spin_lock_irq(&pool->lock);
+ spin_lock(&wq_mayday_lock); /* for wq->maydays */
+
+ if (need_to_create_worker(pool)) {
+ /*
+ * We've been trying to create a new worker but
+ * haven't been successful. We might be hitting an
+ * allocation deadlock. Send distress signals to
+ * rescuers.
+ */
+ list_for_each_entry(work, &pool->worklist, entry)
+ send_mayday(work);
+ }
+
+ spin_unlock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
+
+ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
+}
+
+/**
+ * maybe_create_worker - create a new worker if necessary
+ * @pool: pool to create a new worker for
+ *
+ * Create a new worker for @pool if necessary. @pool is guaranteed to
+ * have at least one idle worker on return from this function. If
+ * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
+ * sent to all rescuers with works scheduled on @pool to resolve
+ * possible allocation deadlock.
+ *
+ * On return, need_to_create_worker() is guaranteed to be %false and
+ * may_start_working() %true.
+ *
+ * LOCKING:
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * multiple times. Does GFP_KERNEL allocations. Called only from
+ * manager.
+ */
+static void maybe_create_worker(struct worker_pool *pool)
+__releases(&pool->lock)
+__acquires(&pool->lock)
+{
+restart:
+ spin_unlock_irq(&pool->lock);
+
+ /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
+ mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+
+ while (true) {
+ if (create_worker(pool) || !need_to_create_worker(pool))
+ break;
+
+ schedule_timeout_interruptible(CREATE_COOLDOWN);
+
+ if (!need_to_create_worker(pool))
+ break;
+ }
+
+ del_timer_sync(&pool->mayday_timer);
+ spin_lock_irq(&pool->lock);
+ /*
+ * This is necessary even after a new worker was just successfully
+ * created as @pool->lock was dropped and the new worker might have
+ * already become busy.
+ */
+ if (need_to_create_worker(pool))
+ goto restart;
+}
+
+/**
+ * manage_workers - manage worker pool
+ * @worker: self
+ *
+ * Assume the manager role and manage the worker pool @worker belongs
+ * to. At any given time, there can be only zero or one manager per
+ * pool. The exclusion is handled automatically by this function.
+ *
+ * The caller can safely start processing works on false return. On
+ * true return, it's guaranteed that need_to_create_worker() is false
+ * and may_start_working() is true.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * multiple times. Does GFP_KERNEL allocations.
+ *
+ * Return:
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
+ */
+static bool manage_workers(struct worker *worker)
+{
+ struct worker_pool *pool = worker->pool;
+
+ /*
+ * Anyone who successfully grabs manager_arb wins the arbitration
+ * and becomes the manager. mutex_trylock() on pool->manager_arb
+ * failure while holding pool->lock reliably indicates that someone
+ * else is managing the pool and the worker which failed trylock
+ * can proceed to executing work items. This means that anyone
+ * grabbing manager_arb is responsible for actually performing
+ * manager duties. If manager_arb is grabbed and released without
+ * actual management, the pool may stall indefinitely.
+ */
+ if (!mutex_trylock(&pool->manager_arb))
+ return false;
+ pool->manager = worker;
+
+ maybe_create_worker(pool);
+
+ pool->manager = NULL;
+ mutex_unlock(&pool->manager_arb);
+ return true;
+}
+
+/**
+ * process_one_work - process single work
+ * @worker: self
+ * @work: work to process
+ *
+ * Process @work. This function contains all the logics necessary to
+ * process a single work including synchronization against and
+ * interaction with other workers on the same cpu, queueing and
+ * flushing. As long as context requirement is met, any worker can
+ * call this function to process a work.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
+ */
+static void process_one_work(struct worker *worker, struct work_struct *work)
+__releases(&pool->lock)
+__acquires(&pool->lock)
+{
+ struct pool_workqueue *pwq = get_work_pwq(work);
+ struct worker_pool *pool = worker->pool;
+ bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
+ int work_color;
+ struct worker *collision;
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It is permissible to free the struct work_struct from
+ * inside the function that is called from it, this we need to
+ * take into account for lockdep too. To avoid bogus "held
+ * lock freed" warnings as well as problems when looking into
+ * work->lockdep_map, make a copy and use that here.
+ */
+ struct lockdep_map lockdep_map;
+
+ lockdep_copy_map(&lockdep_map, &work->lockdep_map);
+#endif
+ /* ensure we're on the correct CPU */
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
+ raw_smp_processor_id() != pool->cpu);
+
+ /*
+ * A single work shouldn't be executed concurrently by
+ * multiple workers on a single cpu. Check whether anyone is
+ * already processing the work. If so, defer the work to the
+ * currently executing one.
+ */
+ collision = find_worker_executing_work(pool, work);
+ if (unlikely(collision)) {
+ move_linked_works(work, &collision->scheduled, NULL);
+ return;
+ }
+
+ /* claim and dequeue */
+ debug_work_deactivate(work);
+ hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
+ worker->current_work = work;
+ worker->current_func = work->func;
+ worker->current_pwq = pwq;
+ work_color = get_work_color(work);
+
+ list_del_init(&work->entry);
+
+ /*
+ * CPU intensive works don't participate in concurrency management.
+ * They're the scheduler's responsibility. This takes @worker out
+ * of concurrency management and the next code block will chain
+ * execution of the pending work items.
+ */
+ if (unlikely(cpu_intensive))
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+
+ /*
+ * Wake up another worker if necessary. The condition is always
+ * false for normal per-cpu workers since nr_running would always
+ * be >= 1 at this point. This is used to chain execution of the
+ * pending work items for WORKER_NOT_RUNNING workers such as the
+ * UNBOUND and CPU_INTENSIVE ones.
+ */
+ if (need_more_worker(pool))
+ wake_up_worker(pool);
+
+ /*
+ * Record the last pool and clear PENDING which should be the last
+ * update to @work. Also, do this inside @pool->lock so that
+ * PENDING and queued state changes happen together while IRQ is
+ * disabled.
+ */
+ set_work_pool_and_clear_pending(work, pool->id);
+
+ spin_unlock_irq(&pool->lock);
+
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_acquire(&lockdep_map);
+ trace_workqueue_execute_start(work);
+ worker->current_func(work);
+ /*
+ * While we must be careful to not use "work" after this, the trace
+ * point will only record its address.
+ */
+ trace_workqueue_execute_end(work);
+ lock_map_release(&lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
+
+ if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+ pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
+ " last function: %pf\n",
+ current->comm, preempt_count(), task_pid_nr(current),
+ worker->current_func);
+ debug_show_held_locks(current);
+ dump_stack();
+ }
+
+ /*
+ * The following prevents a kworker from hogging CPU on !PREEMPT
+ * kernels, where a requeueing work item waiting for something to
+ * happen could deadlock with stop_machine as such work item could
+ * indefinitely requeue itself while all other CPUs are trapped in
+ * stop_machine. At the same time, report a quiescent RCU state so
+ * the same condition doesn't freeze RCU.
+ */
+ cond_resched_rcu_qs();
+
+ spin_lock_irq(&pool->lock);
+
+ /* clear cpu intensive status */
+ if (unlikely(cpu_intensive))
+ worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+
+ /* we're done with it, release */
+ hash_del(&worker->hentry);
+ worker->current_work = NULL;
+ worker->current_func = NULL;
+ worker->current_pwq = NULL;
+ worker->desc_valid = false;
+ pwq_dec_nr_in_flight(pwq, work_color);
+}
+
+/**
+ * process_scheduled_works - process scheduled works
+ * @worker: self
+ *
+ * Process all scheduled works. Please note that the scheduled list
+ * may change while processing a work, so this function repeatedly
+ * fetches a work from the top and executes it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * multiple times.
+ */
+static void process_scheduled_works(struct worker *worker)
+{
+ while (!list_empty(&worker->scheduled)) {
+ struct work_struct *work = list_first_entry(&worker->scheduled,
+ struct work_struct, entry);
+ process_one_work(worker, work);
+ }
+}
+
+/**
+ * worker_thread - the worker thread function
+ * @__worker: self
+ *
+ * The worker thread function. All workers belong to a worker_pool -
+ * either a per-cpu one or dynamic unbound one. These workers process all
+ * work items regardless of their specific target workqueue. The only
+ * exception is work items which belong to workqueues with a rescuer which
+ * will be explained in rescuer_thread().
+ *
+ * Return: 0
+ */
+static int worker_thread(void *__worker)
+{
+ struct worker *worker = __worker;
+ struct worker_pool *pool = worker->pool;
+
+ /* tell the scheduler that this is a workqueue worker */
+ worker->task->flags |= PF_WQ_WORKER;
+woke_up:
+ spin_lock_irq(&pool->lock);
+
+ /* am I supposed to die? */
+ if (unlikely(worker->flags & WORKER_DIE)) {
+ spin_unlock_irq(&pool->lock);
+ WARN_ON_ONCE(!list_empty(&worker->entry));
+ worker->task->flags &= ~PF_WQ_WORKER;
+
+ set_task_comm(worker->task, "kworker/dying");
+ ida_simple_remove(&pool->worker_ida, worker->id);
+ worker_detach_from_pool(worker, pool);
+ kfree(worker);
+ return 0;
+ }
+
+ worker_leave_idle(worker);
+recheck:
+ /* no more worker necessary? */
+ if (!need_more_worker(pool))
+ goto sleep;
+
+ /* do we need to manage? */
+ if (unlikely(!may_start_working(pool)) && manage_workers(worker))
+ goto recheck;
+
+ /*
+ * ->scheduled list can only be filled while a worker is
+ * preparing to process a work or actually processing it.
+ * Make sure nobody diddled with it while I was sleeping.
+ */
+ WARN_ON_ONCE(!list_empty(&worker->scheduled));
+
+ /*
+ * Finish PREP stage. We're guaranteed to have at least one idle
+ * worker or that someone else has already assumed the manager
+ * role. This is where @worker starts participating in concurrency
+ * management if applicable and concurrency management is restored
+ * after being rebound. See rebind_workers() for details.
+ */
+ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
+
+ do {
+ struct work_struct *work =
+ list_first_entry(&pool->worklist,
+ struct work_struct, entry);
+
+ if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
+ /* optimization path, not strictly necessary */
+ process_one_work(worker, work);
+ if (unlikely(!list_empty(&worker->scheduled)))
+ process_scheduled_works(worker);
+ } else {
+ move_linked_works(work, &worker->scheduled, NULL);
+ process_scheduled_works(worker);
+ }
+ } while (keep_working(pool));
+
+ worker_set_flags(worker, WORKER_PREP);
+sleep:
+ /*
+ * pool->lock is held and there's no work to process and no need to
+ * manage, sleep. Workers are woken up only while holding
+ * pool->lock or from local cpu, so setting the current state
+ * before releasing pool->lock is enough to prevent losing any
+ * event.
+ */
+ worker_enter_idle(worker);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&pool->lock);
+ schedule();
+ goto woke_up;
+}
+
+/**
+ * rescuer_thread - the rescuer thread function
+ * @__rescuer: self
+ *
+ * Workqueue rescuer thread function. There's one rescuer for each
+ * workqueue which has WQ_MEM_RECLAIM set.
+ *
+ * Regular work processing on a pool may block trying to create a new
+ * worker which uses GFP_KERNEL allocation which has slight chance of
+ * developing into deadlock if some works currently on the same queue
+ * need to be processed to satisfy the GFP_KERNEL allocation. This is
+ * the problem rescuer solves.
+ *
+ * When such condition is possible, the pool summons rescuers of all
+ * workqueues which have works queued on the pool and let them process
+ * those works so that forward progress can be guaranteed.
+ *
+ * This should happen rarely.
+ *
+ * Return: 0
+ */
+static int rescuer_thread(void *__rescuer)
+{
+ struct worker *rescuer = __rescuer;
+ struct workqueue_struct *wq = rescuer->rescue_wq;
+ struct list_head *scheduled = &rescuer->scheduled;
+ bool should_stop;
+
+ set_user_nice(current, RESCUER_NICE_LEVEL);
+
+ /*
+ * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
+ * doesn't participate in concurrency management.
+ */
+ rescuer->task->flags |= PF_WQ_WORKER;
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
+ */
+ should_stop = kthread_should_stop();
+
+ /* see whether any pwq is asking for help */
+ spin_lock_irq(&wq_mayday_lock);
+
+ while (!list_empty(&wq->maydays)) {
+ struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
+ struct pool_workqueue, mayday_node);
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work, *n;
+
+ __set_current_state(TASK_RUNNING);
+ list_del_init(&pwq->mayday_node);
+
+ spin_unlock_irq(&wq_mayday_lock);
+
+ worker_attach_to_pool(rescuer, pool);
+
+ spin_lock_irq(&pool->lock);
+ rescuer->pool = pool;
+
+ /*
+ * Slurp in all works issued via this workqueue and
+ * process'em.
+ */
+ WARN_ON_ONCE(!list_empty(scheduled));
+ list_for_each_entry_safe(work, n, &pool->worklist, entry)
+ if (get_work_pwq(work) == pwq)
+ move_linked_works(work, scheduled, &n);
+
+ if (!list_empty(scheduled)) {
+ process_scheduled_works(rescuer);
+
+ /*
+ * The above execution of rescued work items could
+ * have created more to rescue through
+ * pwq_activate_first_delayed() or chained
+ * queueing. Let's put @pwq back on mayday list so
+ * that such back-to-back work items, which may be
+ * being used to relieve memory pressure, don't
+ * incur MAYDAY_INTERVAL delay inbetween.
+ */
+ if (need_to_create_worker(pool)) {
+ spin_lock(&wq_mayday_lock);
+ get_pwq(pwq);
+ list_move_tail(&pwq->mayday_node, &wq->maydays);
+ spin_unlock(&wq_mayday_lock);
+ }
+ }
+
+ /*
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're still attached to it.
+ */
+ put_pwq(pwq);
+
+ /*
+ * Leave this pool. If need_more_worker() is %true, notify a
+ * regular worker; otherwise, we end up with 0 concurrency
+ * and stalling the execution.
+ */
+ if (need_more_worker(pool))
+ wake_up_worker(pool);
+
+ rescuer->pool = NULL;
+ spin_unlock_irq(&pool->lock);
+
+ worker_detach_from_pool(rescuer, pool);
+
+ spin_lock_irq(&wq_mayday_lock);
+ }
+
+ spin_unlock_irq(&wq_mayday_lock);
+
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
+ /* rescuers should never participate in concurrency management */
+ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
+ schedule();
+ goto repeat;
+}
+
+struct wq_barrier {
+ struct work_struct work;
+ struct completion done;
+ struct task_struct *task; /* purely informational */
+};
+
+static void wq_barrier_func(struct work_struct *work)
+{
+ struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+ complete(&barr->done);
+}
+
+/**
+ * insert_wq_barrier - insert a barrier work
+ * @pwq: pwq to insert barrier into
+ * @barr: wq_barrier to insert
+ * @target: target work to attach @barr to
+ * @worker: worker currently executing @target, NULL if @target is not executing
+ *
+ * @barr is linked to @target such that @barr is completed only after
+ * @target finishes execution. Please note that the ordering
+ * guarantee is observed only with respect to @target and on the local
+ * cpu.
+ *
+ * Currently, a queued barrier can't be canceled. This is because
+ * try_to_grab_pending() can't determine whether the work to be
+ * grabbed is at the head of the queue and thus can't clear LINKED
+ * flag of the previous work while there must be a valid next work
+ * after a work with LINKED flag set.
+ *
+ * Note that when @worker is non-NULL, @target may be modified
+ * underneath us, so we can't reliably determine pwq from @target.
+ *
+ * CONTEXT:
+ * spin_lock_irq(pool->lock).
+ */
+static void insert_wq_barrier(struct pool_workqueue *pwq,
+ struct wq_barrier *barr,
+ struct work_struct *target, struct worker *worker)
+{
+ struct list_head *head;
+ unsigned int linked = 0;
+
+ /*
+ * debugobject calls are safe here even with pool->lock locked
+ * as we know for sure that this will not trigger any of the
+ * checks and call back into the fixup functions where we
+ * might deadlock.
+ */
+ INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
+ __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
+ init_completion(&barr->done);
+ barr->task = current;
+
+ /*
+ * If @target is currently being executed, schedule the
+ * barrier to the worker; otherwise, put it after @target.
+ */
+ if (worker)
+ head = worker->scheduled.next;
+ else {
+ unsigned long *bits = work_data_bits(target);
+
+ head = target->entry.next;
+ /* there can already be other linked works, inherit and set */
+ linked = *bits & WORK_STRUCT_LINKED;
+ __set_bit(WORK_STRUCT_LINKED_BIT, bits);
+ }
+
+ debug_work_activate(&barr->work);
+ insert_work(pwq, &barr->work, head,
+ work_color_to_flags(WORK_NO_COLOR) | linked);
+}
+
+/**
+ * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
+ * @wq: workqueue being flushed
+ * @flush_color: new flush color, < 0 for no-op
+ * @work_color: new work color, < 0 for no-op
+ *
+ * Prepare pwqs for workqueue flushing.
+ *
+ * If @flush_color is non-negative, flush_color on all pwqs should be
+ * -1. If no pwq has in-flight commands at the specified color, all
+ * pwq->flush_color's stay at -1 and %false is returned. If any pwq
+ * has in flight commands, its pwq->flush_color is set to
+ * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
+ * wakeup logic is armed and %true is returned.
+ *
+ * The caller should have initialized @wq->first_flusher prior to
+ * calling this function with non-negative @flush_color. If
+ * @flush_color is negative, no flush color update is done and %false
+ * is returned.
+ *
+ * If @work_color is non-negative, all pwqs should have the same
+ * work_color which is previous to @work_color and all will be
+ * advanced to @work_color.
+ *
+ * CONTEXT:
+ * mutex_lock(wq->mutex).
+ *
+ * Return:
+ * %true if @flush_color >= 0 and there's something to flush. %false
+ * otherwise.
+ */
+static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
+ int flush_color, int work_color)
+{
+ bool wait = false;
+ struct pool_workqueue *pwq;
+
+ if (flush_color >= 0) {
+ WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
+ atomic_set(&wq->nr_pwqs_to_flush, 1);
+ }
+
+ for_each_pwq(pwq, wq) {
+ struct worker_pool *pool = pwq->pool;
+
+ spin_lock_irq(&pool->lock);
+
+ if (flush_color >= 0) {
+ WARN_ON_ONCE(pwq->flush_color != -1);
+
+ if (pwq->nr_in_flight[flush_color]) {
+ pwq->flush_color = flush_color;
+ atomic_inc(&wq->nr_pwqs_to_flush);
+ wait = true;
+ }
+ }
+
+ if (work_color >= 0) {
+ WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
+ pwq->work_color = work_color;
+ }
+
+ spin_unlock_irq(&pool->lock);
+ }
+
+ if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
+ complete(&wq->first_flusher->done);
+
+ return wait;
+}
+
+/**
+ * flush_workqueue - ensure that any scheduled work has run to completion.
+ * @wq: workqueue to flush
+ *
+ * This function sleeps until all work items which were queued on entry
+ * have finished execution, but it is not livelocked by new incoming ones.
+ */
+void flush_workqueue(struct workqueue_struct *wq)
+{
+ struct wq_flusher this_flusher = {
+ .list = LIST_HEAD_INIT(this_flusher.list),
+ .flush_color = -1,
+ .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+ };
+ int next_color;
+
+ lock_map_acquire(&wq->lockdep_map);
+ lock_map_release(&wq->lockdep_map);
+
+ mutex_lock(&wq->mutex);
+
+ /*
+ * Start-to-wait phase
+ */
+ next_color = work_next_color(wq->work_color);
+
+ if (next_color != wq->flush_color) {
+ /*
+ * Color space is not full. The current work_color
+ * becomes our flush_color and work_color is advanced
+ * by one.
+ */
+ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
+ this_flusher.flush_color = wq->work_color;
+ wq->work_color = next_color;
+
+ if (!wq->first_flusher) {
+ /* no flush in progress, become the first flusher */
+ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
+
+ wq->first_flusher = &this_flusher;
+
+ if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
+ wq->work_color)) {
+ /* nothing to flush, done */
+ wq->flush_color = next_color;
+ wq->first_flusher = NULL;
+ goto out_unlock;
+ }
+ } else {
+ /* wait in queue */
+ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
+ list_add_tail(&this_flusher.list, &wq->flusher_queue);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
+ }
+ } else {
+ /*
+ * Oops, color space is full, wait on overflow queue.
+ * The next flush completion will assign us
+ * flush_color and transfer to flusher_queue.
+ */
+ list_add_tail(&this_flusher.list, &wq->flusher_overflow);
+ }
+
+ mutex_unlock(&wq->mutex);
+
+ wait_for_completion(&this_flusher.done);
+
+ /*
+ * Wake-up-and-cascade phase
+ *
+ * First flushers are responsible for cascading flushes and
+ * handling overflow. Non-first flushers can simply return.
+ */
+ if (wq->first_flusher != &this_flusher)
+ return;
+
+ mutex_lock(&wq->mutex);
+
+ /* we might have raced, check again with mutex held */
+ if (wq->first_flusher != &this_flusher)
+ goto out_unlock;
+
+ wq->first_flusher = NULL;
+
+ WARN_ON_ONCE(!list_empty(&this_flusher.list));
+ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
+
+ while (true) {
+ struct wq_flusher *next, *tmp;
+
+ /* complete all the flushers sharing the current flush color */
+ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
+ if (next->flush_color != wq->flush_color)
+ break;
+ list_del_init(&next->list);
+ complete(&next->done);
+ }
+
+ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
+ wq->flush_color != work_next_color(wq->work_color));
+
+ /* this flush_color is finished, advance by one */
+ wq->flush_color = work_next_color(wq->flush_color);
+
+ /* one color has been freed, handle overflow queue */
+ if (!list_empty(&wq->flusher_overflow)) {
+ /*
+ * Assign the same color to all overflowed
+ * flushers, advance work_color and append to
+ * flusher_queue. This is the start-to-wait
+ * phase for these overflowed flushers.
+ */
+ list_for_each_entry(tmp, &wq->flusher_overflow, list)
+ tmp->flush_color = wq->work_color;
+
+ wq->work_color = work_next_color(wq->work_color);
+
+ list_splice_tail_init(&wq->flusher_overflow,
+ &wq->flusher_queue);
+ flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
+ }
+
+ if (list_empty(&wq->flusher_queue)) {
+ WARN_ON_ONCE(wq->flush_color != wq->work_color);
+ break;
+ }
+
+ /*
+ * Need to flush more colors. Make the next flusher
+ * the new first flusher and arm pwqs.
+ */
+ WARN_ON_ONCE(wq->flush_color == wq->work_color);
+ WARN_ON_ONCE(wq->flush_color != next->flush_color);
+
+ list_del_init(&next->list);
+ wq->first_flusher = next;
+
+ if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
+ break;
+
+ /*
+ * Meh... this color is already done, clear first
+ * flusher and repeat cascading.
+ */
+ wq->first_flusher = NULL;
+ }
+
+out_unlock:
+ mutex_unlock(&wq->mutex);
+}
+EXPORT_SYMBOL_GPL(flush_workqueue);
+
+/**
+ * drain_workqueue - drain a workqueue
+ * @wq: workqueue to drain
+ *
+ * Wait until the workqueue becomes empty. While draining is in progress,
+ * only chain queueing is allowed. IOW, only currently pending or running
+ * work items on @wq can queue further work items on it. @wq is flushed
+ * repeatedly until it becomes empty. The number of flushing is detemined
+ * by the depth of chaining and should be relatively short. Whine if it
+ * takes too long.
+ */
+void drain_workqueue(struct workqueue_struct *wq)
+{
+ unsigned int flush_cnt = 0;
+ struct pool_workqueue *pwq;
+
+ /*
+ * __queue_work() needs to test whether there are drainers, is much
+ * hotter than drain_workqueue() and already looks at @wq->flags.
+ * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
+ */
+ mutex_lock(&wq->mutex);
+ if (!wq->nr_drainers++)
+ wq->flags |= __WQ_DRAINING;
+ mutex_unlock(&wq->mutex);
+reflush:
+ flush_workqueue(wq);
+
+ mutex_lock(&wq->mutex);
+
+ for_each_pwq(pwq, wq) {
+ bool drained;
+
+ spin_lock_irq(&pwq->pool->lock);
+ drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
+ spin_unlock_irq(&pwq->pool->lock);
+
+ if (drained)
+ continue;
+
+ if (++flush_cnt == 10 ||
+ (flush_cnt % 100 == 0 && flush_cnt <= 1000))
+ pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
+ wq->name, flush_cnt);
+
+ mutex_unlock(&wq->mutex);
+ goto reflush;
+ }
+
+ if (!--wq->nr_drainers)
+ wq->flags &= ~__WQ_DRAINING;
+ mutex_unlock(&wq->mutex);
+}
+EXPORT_SYMBOL_GPL(drain_workqueue);
+
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
+{
+ struct worker *worker = NULL;
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+
+ might_sleep();
+
+ local_irq_disable();
+ pool = get_work_pool(work);
+ if (!pool) {
+ local_irq_enable();
+ return false;
+ }
+
+ spin_lock(&pool->lock);
+ /* see the comment in try_to_grab_pending() with the same code */
+ pwq = get_work_pwq(work);
+ if (pwq) {
+ if (unlikely(pwq->pool != pool))
+ goto already_gone;
+ } else {
+ worker = find_worker_executing_work(pool, work);
+ if (!worker)
+ goto already_gone;
+ pwq = worker->current_pwq;
+ }
+
+ insert_wq_barrier(pwq, barr, work, worker);
+ spin_unlock_irq(&pool->lock);
+
+ /*
+ * If @max_active is 1 or rescuer is in use, flushing another work
+ * item on the same workqueue may lead to deadlock. Make sure the
+ * flusher is not running on the same workqueue by verifying write
+ * access.
+ */
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+ lock_map_acquire(&pwq->wq->lockdep_map);
+ else
+ lock_map_acquire_read(&pwq->wq->lockdep_map);
+ lock_map_release(&pwq->wq->lockdep_map);
+
+ return true;
+already_gone:
+ spin_unlock_irq(&pool->lock);
+ return false;
+}
+
+/**
+ * flush_work - wait for a work to finish executing the last queueing instance
+ * @work: the work to flush
+ *
+ * Wait until @work has finished execution. @work is guaranteed to be idle
+ * on return if it hasn't been requeued since flush started.
+ *
+ * Return:
+ * %true if flush_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_work(struct work_struct *work)
+{
+ struct wq_barrier barr;
+
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
+
+ if (start_flush_work(work, &barr)) {
+ wait_for_completion(&barr.done);
+ destroy_work_on_stack(&barr.work);
+ return true;
+ } else {
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(flush_work);
+
+struct cwt_wait {
+ wait_queue_t wait;
+ struct work_struct *work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+ if (cwait->work != key)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
+{
+ static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(work, is_dwork, &flags);
+ /*
+ * If someone else is already canceling, wait for it to
+ * finish. flush_work() doesn't work for PREEMPT_NONE
+ * because we may get scheduled between @work's completion
+ * and the other canceling task resuming and clearing
+ * CANCELING - flush_work() will return false immediately
+ * as @work is no longer busy, try_to_grab_pending() will
+ * return -ENOENT as @work is still being canceled and the
+ * other canceling task won't be able to clear CANCELING as
+ * we're hogging the CPU.
+ *
+ * Let's wait for completion using a waitqueue. As this
+ * may lead to the thundering herd problem, use a custom
+ * wake function which matches @work along with exclusive
+ * wait and wakeup.
+ */
+ if (unlikely(ret == -ENOENT)) {
+ struct cwt_wait cwait;
+
+ init_wait(&cwait.wait);
+ cwait.wait.func = cwt_wakefn;
+ cwait.work = work;
+
+ prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+ TASK_UNINTERRUPTIBLE);
+ if (work_is_canceling(work))
+ schedule();
+ finish_wait(&cancel_waitq, &cwait.wait);
+ }
+ } while (unlikely(ret < 0));
+
+ /* tell other tasks trying to grab @work to back off */
+ mark_work_canceling(work);
+ local_irq_restore(flags);
+
+ flush_work(work);
+ clear_work_data(work);
+
+ /*
+ * Paired with prepare_to_wait() above so that either
+ * waitqueue_active() is visible here or !work_is_canceling() is
+ * visible there.
+ */
+ smp_mb();
+ if (waitqueue_active(&cancel_waitq))
+ __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
+ return ret;
+}
+
+/**
+ * cancel_work_sync - cancel a work and wait for it to finish
+ * @work: the work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself or migrates to
+ * another workqueue. On return from this function, @work is
+ * guaranteed to be not pending or executing on any CPU.
+ *
+ * cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the workqueue on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return:
+ * %true if @work was pending, %false otherwise.
+ */
+bool cancel_work_sync(struct work_struct *work)
+{
+ return __cancel_work_timer(work, false);
+}
+EXPORT_SYMBOL_GPL(cancel_work_sync);
+
+/**
+ * flush_delayed_work - wait for a dwork to finish executing the last queueing
+ * @dwork: the delayed work to flush
+ *
+ * Delayed timer is cancelled and the pending work is queued for
+ * immediate execution. Like flush_work(), this function only
+ * considers the last queueing instance of @dwork.
+ *
+ * Return:
+ * %true if flush_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_delayed_work(struct delayed_work *dwork)
+{
+ local_irq_disable();
+ if (del_timer_sync(&dwork->timer))
+ __queue_work(dwork->cpu, dwork->wq, &dwork->work);
+ local_irq_enable();
+ return flush_work(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work);
+
+/**
+ * cancel_delayed_work - cancel a delayed work
+ * @dwork: delayed_work to cancel
+ *
+ * Kill off a pending delayed_work.
+ *
+ * Return: %true if @dwork was pending and canceled; %false if it wasn't
+ * pending.
+ *
+ * Note:
+ * The work callback function may still be running on return, unless
+ * it returns %true and the work doesn't re-arm itself. Explicitly flush or
+ * use cancel_delayed_work_sync() to wait on it.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ */
+bool cancel_delayed_work(struct delayed_work *dwork)
+{
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(&dwork->work, true, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (unlikely(ret < 0))
+ return false;
+
+ set_work_pool_and_clear_pending(&dwork->work,
+ get_work_pool_id(&dwork->work));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL(cancel_delayed_work);
+
+/**
+ * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
+ * @dwork: the delayed work cancel
+ *
+ * This is cancel_work_sync() for delayed works.
+ *
+ * Return:
+ * %true if @dwork was pending, %false otherwise.
+ */
+bool cancel_delayed_work_sync(struct delayed_work *dwork)
+{
+ return __cancel_work_timer(&dwork->work, true);
+}
+EXPORT_SYMBOL(cancel_delayed_work_sync);
+
+/**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * Return:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+ int cpu;
+ struct work_struct __percpu *works;
+
+ works = alloc_percpu(struct work_struct);
+ if (!works)
+ return -ENOMEM;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, func);
+ schedule_work_on(cpu, work);
+ }
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ put_online_cpus();
+ free_percpu(works);
+ return 0;
+}
+
+/**
+ * flush_scheduled_work - ensure that any scheduled work has run to completion.
+ *
+ * Forces execution of the kernel-global workqueue and blocks until its
+ * completion.
+ *
+ * Think twice before calling this function! It's very easy to get into
+ * trouble if you don't take great care. Either of the following situations
+ * will lead to deadlock:
+ *
+ * One of the work items currently on the workqueue needs to acquire
+ * a lock held by your code or its caller.
+ *
+ * Your code is running in the context of a work routine.
+ *
+ * They will be detected by lockdep when they occur, but the first might not
+ * occur very often. It depends on what work items are on the workqueue and
+ * what locks they need, which you have no control over.
+ *
+ * In most situations flushing the entire workqueue is overkill; you merely
+ * need to know that a particular work item isn't queued and isn't running.
+ * In such cases you should use cancel_delayed_work_sync() or
+ * cancel_work_sync() instead.
+ */
+void flush_scheduled_work(void)
+{
+ flush_workqueue(system_wq);
+}
+EXPORT_SYMBOL(flush_scheduled_work);
+
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn: the function to execute
+ * @ew: guaranteed storage for the execute work structure (must
+ * be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ *
+ * Return: 0 - function was executed
+ * 1 - function was scheduled for execution
+ */
+int execute_in_process_context(work_func_t fn, struct execute_work *ew)
+{
+ if (!in_interrupt()) {
+ fn(&ew->work);
+ return 0;
+ }
+
+ INIT_WORK(&ew->work, fn);
+ schedule_work(&ew->work);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(execute_in_process_context);
+
+/**
+ * free_workqueue_attrs - free a workqueue_attrs
+ * @attrs: workqueue_attrs to free
+ *
+ * Undo alloc_workqueue_attrs().
+ */
+void free_workqueue_attrs(struct workqueue_attrs *attrs)
+{
+ if (attrs) {
+ free_cpumask_var(attrs->cpumask);
+ kfree(attrs);
+ }
+}
+
+/**
+ * alloc_workqueue_attrs - allocate a workqueue_attrs
+ * @gfp_mask: allocation mask to use
+ *
+ * Allocate a new workqueue_attrs, initialize with default settings and
+ * return it.
+ *
+ * Return: The allocated new workqueue_attr on success. %NULL on failure.
+ */
+struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = kzalloc(sizeof(*attrs), gfp_mask);
+ if (!attrs)
+ goto fail;
+ if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
+ goto fail;
+
+ cpumask_copy(attrs->cpumask, cpu_possible_mask);
+ return attrs;
+fail:
+ free_workqueue_attrs(attrs);
+ return NULL;
+}
+
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+ const struct workqueue_attrs *from)
+{
+ to->nice = from->nice;
+ cpumask_copy(to->cpumask, from->cpumask);
+ /*
+ * Unlike hash and equality test, this function doesn't ignore
+ * ->no_numa as it is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears ->no_numa after copying.
+ */
+ to->no_numa = from->no_numa;
+}
+
+/* hash value of the content of @attr */
+static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
+{
+ u32 hash = 0;
+
+ hash = jhash_1word(attrs->nice, hash);
+ hash = jhash(cpumask_bits(attrs->cpumask),
+ BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ return hash;
+}
+
+/* content equality test */
+static bool wqattrs_equal(const struct workqueue_attrs *a,
+ const struct workqueue_attrs *b)
+{
+ if (a->nice != b->nice)
+ return false;
+ if (!cpumask_equal(a->cpumask, b->cpumask))
+ return false;
+ return true;
+}
+
+/**
+ * init_worker_pool - initialize a newly zalloc'd worker_pool
+ * @pool: worker_pool to initialize
+ *
+ * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs.
+ *
+ * Return: 0 on success, -errno on failure. Even on failure, all fields
+ * inside @pool proper are initialized and put_unbound_pool() can be called
+ * on @pool safely to release it.
+ */
+static int init_worker_pool(struct worker_pool *pool)
+{
+ spin_lock_init(&pool->lock);
+ pool->id = -1;
+ pool->cpu = -1;
+ pool->node = NUMA_NO_NODE;
+ pool->flags |= POOL_DISASSOCIATED;
+ INIT_LIST_HEAD(&pool->worklist);
+ INIT_LIST_HEAD(&pool->idle_list);
+ hash_init(pool->busy_hash);
+
+ init_timer_deferrable(&pool->idle_timer);
+ pool->idle_timer.function = idle_worker_timeout;
+ pool->idle_timer.data = (unsigned long)pool;
+
+ setup_timer(&pool->mayday_timer, pool_mayday_timeout,
+ (unsigned long)pool);
+
+ mutex_init(&pool->manager_arb);
+ mutex_init(&pool->attach_mutex);
+ INIT_LIST_HEAD(&pool->workers);
+
+ ida_init(&pool->worker_ida);
+ INIT_HLIST_NODE(&pool->hash_node);
+ pool->refcnt = 1;
+
+ /* shouldn't fail above this point */
+ pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!pool->attrs)
+ return -ENOMEM;
+ return 0;
+}
+
+static void rcu_free_wq(struct rcu_head *rcu)
+{
+ struct workqueue_struct *wq =
+ container_of(rcu, struct workqueue_struct, rcu);
+
+ if (!(wq->flags & WQ_UNBOUND))
+ free_percpu(wq->cpu_pwqs);
+ else
+ free_workqueue_attrs(wq->unbound_attrs);
+
+ kfree(wq->rescuer);
+ kfree(wq);
+}
+
+static void rcu_free_pool(struct rcu_head *rcu)
+{
+ struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
+
+ ida_destroy(&pool->worker_ida);
+ free_workqueue_attrs(pool->attrs);
+ kfree(pool);
+}
+
+/**
+ * put_unbound_pool - put a worker_pool
+ * @pool: worker_pool to put
+ *
+ * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
+ * safe manner. get_unbound_pool() calls this function on its failure path
+ * and this function should be able to release pools which went through,
+ * successfully or not, init_worker_pool().
+ *
+ * Should be called with wq_pool_mutex held.
+ */
+static void put_unbound_pool(struct worker_pool *pool)
+{
+ DECLARE_COMPLETION_ONSTACK(detach_completion);
+ struct worker *worker;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (--pool->refcnt)
+ return;
+
+ /* sanity checks */
+ if (WARN_ON(!(pool->cpu < 0)) ||
+ WARN_ON(!list_empty(&pool->worklist)))
+ return;
+
+ /* release id and unhash */
+ if (pool->id >= 0)
+ idr_remove(&worker_pool_idr, pool->id);
+ hash_del(&pool->hash_node);
+
+ /*
+ * Become the manager and destroy all workers. Grabbing
+ * manager_arb prevents @pool's workers from blocking on
+ * attach_mutex.
+ */
+ mutex_lock(&pool->manager_arb);
+
+ spin_lock_irq(&pool->lock);
+ while ((worker = first_idle_worker(pool)))
+ destroy_worker(worker);
+ WARN_ON(pool->nr_workers || pool->nr_idle);
+ spin_unlock_irq(&pool->lock);
+
+ mutex_lock(&pool->attach_mutex);
+ if (!list_empty(&pool->workers))
+ pool->detach_completion = &detach_completion;
+ mutex_unlock(&pool->attach_mutex);
+
+ if (pool->detach_completion)
+ wait_for_completion(pool->detach_completion);
+
+ mutex_unlock(&pool->manager_arb);
+
+ /* shut down the timers */
+ del_timer_sync(&pool->idle_timer);
+ del_timer_sync(&pool->mayday_timer);
+
+ /* sched-RCU protected to allow dereferences from get_work_pool() */
+ call_rcu_sched(&pool->rcu, rcu_free_pool);
+}
+
+/**
+ * get_unbound_pool - get a worker_pool with the specified attributes
+ * @attrs: the attributes of the worker_pool to get
+ *
+ * Obtain a worker_pool which has the same attributes as @attrs, bump the
+ * reference count and return it. If there already is a matching
+ * worker_pool, it will be used; otherwise, this function attempts to
+ * create a new one.
+ *
+ * Should be called with wq_pool_mutex held.
+ *
+ * Return: On success, a worker_pool with the same attributes as @attrs.
+ * On failure, %NULL.
+ */
+static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
+{
+ u32 hash = wqattrs_hash(attrs);
+ struct worker_pool *pool;
+ int node;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ /* do we already have a matching pool? */
+ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
+ if (wqattrs_equal(pool->attrs, attrs)) {
+ pool->refcnt++;
+ return pool;
+ }
+ }
+
+ /* nope, create a new one */
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool || init_worker_pool(pool) < 0)
+ goto fail;
+
+ lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
+ copy_workqueue_attrs(pool->attrs, attrs);
+
+ /*
+ * no_numa isn't a worker_pool attribute, always clear it. See
+ * 'struct workqueue_attrs' comments for detail.
+ */
+ pool->attrs->no_numa = false;
+
+ /* if cpumask is contained inside a NUMA node, we belong to that node */
+ if (wq_numa_enabled) {
+ for_each_node(node) {
+ if (cpumask_subset(pool->attrs->cpumask,
+ wq_numa_possible_cpumask[node])) {
+ pool->node = node;
+ break;
+ }
+ }
+ }
+
+ if (worker_pool_assign_id(pool) < 0)
+ goto fail;
+
+ /* create and start the initial worker */
+ if (!create_worker(pool))
+ goto fail;
+
+ /* install */
+ hash_add(unbound_pool_hash, &pool->hash_node, hash);
+
+ return pool;
+fail:
+ if (pool)
+ put_unbound_pool(pool);
+ return NULL;
+}
+
+static void rcu_free_pwq(struct rcu_head *rcu)
+{
+ kmem_cache_free(pwq_cache,
+ container_of(rcu, struct pool_workqueue, rcu));
+}
+
+/*
+ * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
+ * and needs to be destroyed.
+ */
+static void pwq_unbound_release_workfn(struct work_struct *work)
+{
+ struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
+ unbound_release_work);
+ struct workqueue_struct *wq = pwq->wq;
+ struct worker_pool *pool = pwq->pool;
+ bool is_last;
+
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
+
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+
+ mutex_lock(&wq_pool_mutex);
+ put_unbound_pool(pool);
+ mutex_unlock(&wq_pool_mutex);
+
+ call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+
+ /*
+ * If we're the last pwq going away, @wq is already dead and no one
+ * is gonna access it anymore. Schedule RCU free.
+ */
+ if (is_last)
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
+}
+
+/**
+ * pwq_adjust_max_active - update a pwq's max_active to the current setting
+ * @pwq: target pool_workqueue
+ *
+ * If @pwq isn't freezing, set @pwq->max_active to the associated
+ * workqueue's saved_max_active and activate delayed work items
+ * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
+ */
+static void pwq_adjust_max_active(struct pool_workqueue *pwq)
+{
+ struct workqueue_struct *wq = pwq->wq;
+ bool freezable = wq->flags & WQ_FREEZABLE;
+
+ /* for @wq->saved_max_active */
+ lockdep_assert_held(&wq->mutex);
+
+ /* fast exit for non-freezable wqs */
+ if (!freezable && pwq->max_active == wq->saved_max_active)
+ return;
+
+ spin_lock_irq(&pwq->pool->lock);
+
+ /*
+ * During [un]freezing, the caller is responsible for ensuring that
+ * this function is called at least once after @workqueue_freezing
+ * is updated and visible.
+ */
+ if (!freezable || !workqueue_freezing) {
+ pwq->max_active = wq->saved_max_active;
+
+ while (!list_empty(&pwq->delayed_works) &&
+ pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
+
+ /*
+ * Need to kick a worker after thawed or an unbound wq's
+ * max_active is bumped. It's a slow path. Do it always.
+ */
+ wake_up_worker(pwq->pool);
+ } else {
+ pwq->max_active = 0;
+ }
+
+ spin_unlock_irq(&pwq->pool->lock);
+}
+
+/* initialize newly alloced @pwq which is associated with @wq and @pool */
+static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
+ struct worker_pool *pool)
+{
+ BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
+
+ memset(pwq, 0, sizeof(*pwq));
+
+ pwq->pool = pool;
+ pwq->wq = wq;
+ pwq->flush_color = -1;
+ pwq->refcnt = 1;
+ INIT_LIST_HEAD(&pwq->delayed_works);
+ INIT_LIST_HEAD(&pwq->pwqs_node);
+ INIT_LIST_HEAD(&pwq->mayday_node);
+ INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
+}
+
+/* sync @pwq with the current state of its associated wq and link it */
+static void link_pwq(struct pool_workqueue *pwq)
+{
+ struct workqueue_struct *wq = pwq->wq;
+
+ lockdep_assert_held(&wq->mutex);
+
+ /* may be called multiple times, ignore if already linked */
+ if (!list_empty(&pwq->pwqs_node))
+ return;
+
+ /* set the matching work_color */
+ pwq->work_color = wq->work_color;
+
+ /* sync max_active to the current setting */
+ pwq_adjust_max_active(pwq);
+
+ /* link in @pwq */
+ list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
+}
+
+/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
+static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ pool = get_unbound_pool(attrs);
+ if (!pool)
+ return NULL;
+
+ pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
+ if (!pwq) {
+ put_unbound_pool(pool);
+ return NULL;
+ }
+
+ init_pwq(pwq, wq, pool);
+ return pwq;
+}
+
+/* undo alloc_unbound_pwq(), used only in the error path */
+static void free_unbound_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (pwq) {
+ put_unbound_pool(pwq->pool);
+ kmem_cache_free(pwq_cache, pwq);
+ }
+}
+
+/**
+ * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of interest
+ * @node: the target NUMA node
+ * @cpu_going_down: if >= 0, the CPU to consider as offline
+ * @cpumask: outarg, the resulting cpumask
+ *
+ * Calculate the cpumask a workqueue with @attrs should use on @node. If
+ * @cpu_going_down is >= 0, that cpu is considered offline during
+ * calculation. The result is stored in @cpumask.
+ *
+ * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
+ * enabled and @node has online CPUs requested by @attrs, the returned
+ * cpumask is the intersection of the possible CPUs of @node and
+ * @attrs->cpumask.
+ *
+ * The caller is responsible for ensuring that the cpumask of @node stays
+ * stable.
+ *
+ * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
+ * %false if equal.
+ */
+static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
+ int cpu_going_down, cpumask_t *cpumask)
+{
+ if (!wq_numa_enabled || attrs->no_numa)
+ goto use_dfl;
+
+ /* does @node have any online CPUs @attrs wants? */
+ cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+ if (cpu_going_down >= 0)
+ cpumask_clear_cpu(cpu_going_down, cpumask);
+
+ if (cpumask_empty(cpumask))
+ goto use_dfl;
+
+ /* yeap, return possible CPUs in @node that @attrs wants */
+ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ return !cpumask_equal(cpumask, attrs->cpumask);
+
+use_dfl:
+ cpumask_copy(cpumask, attrs->cpumask);
+ return false;
+}
+
+/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
+static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
+ int node,
+ struct pool_workqueue *pwq)
+{
+ struct pool_workqueue *old_pwq;
+
+ lockdep_assert_held(&wq->mutex);
+
+ /* link_pwq() can handle duplicate calls */
+ link_pwq(pwq);
+
+ old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+ rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+ return old_pwq;
+}
+
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on. Older pwqs are released as in-flight work
+ * items finish. Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct workqueue_attrs *new_attrs, *tmp_attrs;
+ struct pool_workqueue **pwq_tbl, *dfl_pwq;
+ int node, ret;
+
+ /* only unbound workqueues can change attributes */
+ if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+ return -EINVAL;
+
+ /* creating multiple pwqs breaks ordering guarantee */
+ if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+ return -EINVAL;
+
+ pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
+ new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!pwq_tbl || !new_attrs || !tmp_attrs)
+ goto enomem;
+
+ /* make a copy of @attrs and sanitize it */
+ copy_workqueue_attrs(new_attrs, attrs);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+
+ /*
+ * We may create multiple pwqs with differing cpumasks. Make a
+ * copy of @new_attrs which will be modified and used to obtain
+ * pools.
+ */
+ copy_workqueue_attrs(tmp_attrs, new_attrs);
+
+ /*
+ * CPUs should stay stable across pwq creations and installations.
+ * Pin CPUs, determine the target cpumask for each node and create
+ * pwqs accordingly.
+ */
+ get_online_cpus();
+
+ mutex_lock(&wq_pool_mutex);
+
+ /*
+ * If something goes wrong during CPU up/down, we'll fall back to
+ * the default pwq covering whole @attrs->cpumask. Always create
+ * it even if we don't use it immediately.
+ */
+ dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+ if (!dfl_pwq)
+ goto enomem_pwq;
+
+ for_each_node(node) {
+ if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+ pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+ if (!pwq_tbl[node])
+ goto enomem_pwq;
+ } else {
+ dfl_pwq->refcnt++;
+ pwq_tbl[node] = dfl_pwq;
+ }
+ }
+
+ mutex_unlock(&wq_pool_mutex);
+
+ /* all pwqs have been created successfully, let's install'em */
+ mutex_lock(&wq->mutex);
+
+ copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+
+ /* save the previous pwq and install the new one */
+ for_each_node(node)
+ pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+
+ /* @dfl_pwq might not have been used, ensure it's linked */
+ link_pwq(dfl_pwq);
+ swap(wq->dfl_pwq, dfl_pwq);
+
+ mutex_unlock(&wq->mutex);
+
+ /* put the old pwqs */
+ for_each_node(node)
+ put_pwq_unlocked(pwq_tbl[node]);
+ put_pwq_unlocked(dfl_pwq);
+
+ put_online_cpus();
+ ret = 0;
+ /* fall through */
+out_free:
+ free_workqueue_attrs(tmp_attrs);
+ free_workqueue_attrs(new_attrs);
+ kfree(pwq_tbl);
+ return ret;
+
+enomem_pwq:
+ free_unbound_pwq(dfl_pwq);
+ for_each_node(node)
+ if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
+ free_unbound_pwq(pwq_tbl[node]);
+ mutex_unlock(&wq_pool_mutex);
+ put_online_cpus();
+enomem:
+ ret = -ENOMEM;
+ goto out_free;
+}
+
+/**
+ * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * @wq: the target workqueue
+ * @cpu: the CPU coming up or going down
+ * @online: whether @cpu is coming up or going down
+ *
+ * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
+ * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
+ * @wq accordingly.
+ *
+ * If NUMA affinity can't be adjusted due to memory allocation failure, it
+ * falls back to @wq->dfl_pwq which may not be optimal but is always
+ * correct.
+ *
+ * Note that when the last allowed CPU of a NUMA node goes offline for a
+ * workqueue with a cpumask spanning multiple nodes, the workers which were
+ * already executing the work items for the workqueue will lose their CPU
+ * affinity and may execute on any CPU. This is similar to how per-cpu
+ * workqueues behave on CPU_DOWN. If a workqueue user wants strict
+ * affinity, it's the user's responsibility to flush the work item from
+ * CPU_DOWN_PREPARE.
+ */
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
+ bool online)
+{
+ int node = cpu_to_node(cpu);
+ int cpu_off = online ? -1 : cpu;
+ struct pool_workqueue *old_pwq = NULL, *pwq;
+ struct workqueue_attrs *target_attrs;
+ cpumask_t *cpumask;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+ return;
+
+ /*
+ * We don't wanna alloc/free wq_attrs for each wq for each CPU.
+ * Let's use a preallocated one. The following buf is protected by
+ * CPU hotplug exclusion.
+ */
+ target_attrs = wq_update_unbound_numa_attrs_buf;
+ cpumask = target_attrs->cpumask;
+
+ mutex_lock(&wq->mutex);
+ if (wq->unbound_attrs->no_numa)
+ goto out_unlock;
+
+ copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
+ pwq = unbound_pwq_by_node(wq, node);
+
+ /*
+ * Let's determine what needs to be done. If the target cpumask is
+ * different from wq's, we need to compare it to @pwq's and create
+ * a new one if they don't match. If the target cpumask equals
+ * wq's, the default pwq should be used.
+ */
+ if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+ if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+ goto out_unlock;
+ } else {
+ goto use_dfl_pwq;
+ }
+
+ mutex_unlock(&wq->mutex);
+
+ /* create a new pwq */
+ pwq = alloc_unbound_pwq(wq, target_attrs);
+ if (!pwq) {
+ pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+ wq->name);
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
+ }
+
+ /*
+ * Install the new pwq. As this function is called only from CPU
+ * hotplug callbacks and applying a new attrs is wrapped with
+ * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
+ * inbetween.
+ */
+ mutex_lock(&wq->mutex);
+ old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+ goto out_unlock;
+
+use_dfl_pwq:
+ spin_lock_irq(&wq->dfl_pwq->pool->lock);
+ get_pwq(wq->dfl_pwq);
+ spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+ old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+out_unlock:
+ mutex_unlock(&wq->mutex);
+ put_pwq_unlocked(old_pwq);
+}
+
+static int alloc_and_link_pwqs(struct workqueue_struct *wq)
+{
+ bool highpri = wq->flags & WQ_HIGHPRI;
+ int cpu, ret;
+
+ if (!(wq->flags & WQ_UNBOUND)) {
+ wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
+ if (!wq->cpu_pwqs)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct pool_workqueue *pwq =
+ per_cpu_ptr(wq->cpu_pwqs, cpu);
+ struct worker_pool *cpu_pools =
+ per_cpu(cpu_worker_pools, cpu);
+
+ init_pwq(pwq, wq, &cpu_pools[highpri]);
+
+ mutex_lock(&wq->mutex);
+ link_pwq(pwq);
+ mutex_unlock(&wq->mutex);
+ }
+ return 0;
+ } else if (wq->flags & __WQ_ORDERED) {
+ ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ /* there should only be single pwq for ordering guarantee */
+ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ "ordering guarantee broken for workqueue %s\n", wq->name);
+ return ret;
+ } else {
+ return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
+ }
+}
+
+static int wq_clamp_max_active(int max_active, unsigned int flags,
+ const char *name)
+{
+ int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
+
+ if (max_active < 1 || max_active > lim)
+ pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
+ max_active, name, 1, lim);
+
+ return clamp_val(max_active, 1, lim);
+}
+
+struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
+ unsigned int flags,
+ int max_active,
+ struct lock_class_key *key,
+ const char *lock_name, ...)
+{
+ size_t tbl_size = 0;
+ va_list args;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
+
+ /* see the comment above the definition of WQ_POWER_EFFICIENT */
+ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
+ flags |= WQ_UNBOUND;
+
+ /* allocate wq and format name */
+ if (flags & WQ_UNBOUND)
+ tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
+
+ wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
+ if (!wq)
+ return NULL;
+
+ if (flags & WQ_UNBOUND) {
+ wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!wq->unbound_attrs)
+ goto err_free_wq;
+ }
+
+ va_start(args, lock_name);
+ vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ va_end(args);
+
+ max_active = max_active ?: WQ_DFL_ACTIVE;
+ max_active = wq_clamp_max_active(max_active, flags, wq->name);
+
+ /* init wq */
+ wq->flags = flags;
+ wq->saved_max_active = max_active;
+ mutex_init(&wq->mutex);
+ atomic_set(&wq->nr_pwqs_to_flush, 0);
+ INIT_LIST_HEAD(&wq->pwqs);
+ INIT_LIST_HEAD(&wq->flusher_queue);
+ INIT_LIST_HEAD(&wq->flusher_overflow);
+ INIT_LIST_HEAD(&wq->maydays);
+
+ lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
+ INIT_LIST_HEAD(&wq->list);
+
+ if (alloc_and_link_pwqs(wq) < 0)
+ goto err_free_wq;
+
+ /*
+ * Workqueues which may be used during memory reclaim should
+ * have a rescuer to guarantee forward progress.
+ */
+ if (flags & WQ_MEM_RECLAIM) {
+ struct worker *rescuer;
+
+ rescuer = alloc_worker(NUMA_NO_NODE);
+ if (!rescuer)
+ goto err_destroy;
+
+ rescuer->rescue_wq = wq;
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
+ wq->name);
+ if (IS_ERR(rescuer->task)) {
+ kfree(rescuer);
+ goto err_destroy;
+ }
+
+ wq->rescuer = rescuer;
+ rescuer->task->flags |= PF_NO_SETAFFINITY;
+ wake_up_process(rescuer->task);
+ }
+
+ if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
+ goto err_destroy;
+
+ /*
+ * wq_pool_mutex protects global freeze state and workqueues list.
+ * Grab it, adjust max_active and add the new @wq to workqueues
+ * list.
+ */
+ mutex_lock(&wq_pool_mutex);
+
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
+
+ list_add_tail_rcu(&wq->list, &workqueues);
+
+ mutex_unlock(&wq_pool_mutex);
+
+ return wq;
+
+err_free_wq:
+ free_workqueue_attrs(wq->unbound_attrs);
+ kfree(wq);
+ return NULL;
+err_destroy:
+ destroy_workqueue(wq);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
+
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+ struct pool_workqueue *pwq;
+ int node;
+
+ /* drain it before proceeding with destruction */
+ drain_workqueue(wq);
+
+ /* sanity checks */
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq) {
+ int i;
+
+ for (i = 0; i < WORK_NR_COLORS; i++) {
+ if (WARN_ON(pwq->nr_in_flight[i])) {
+ mutex_unlock(&wq->mutex);
+ return;
+ }
+ }
+
+ if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
+ WARN_ON(pwq->nr_active) ||
+ WARN_ON(!list_empty(&pwq->delayed_works))) {
+ mutex_unlock(&wq->mutex);
+ return;
+ }
+ }
+ mutex_unlock(&wq->mutex);
+
+ /*
+ * wq list is used to freeze wq, remove from list after
+ * flushing is complete in case freeze races us.
+ */
+ mutex_lock(&wq_pool_mutex);
+ list_del_rcu(&wq->list);
+ mutex_unlock(&wq_pool_mutex);
+
+ workqueue_sysfs_unregister(wq);
+
+ if (wq->rescuer)
+ kthread_stop(wq->rescuer->task);
+
+ if (!(wq->flags & WQ_UNBOUND)) {
+ /*
+ * The base ref is never dropped on per-cpu pwqs. Directly
+ * schedule RCU free.
+ */
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
+ } else {
+ /*
+ * We're the sole accessor of @wq at this point. Directly
+ * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
+ * @wq will be freed when the last pwq is released.
+ */
+ for_each_node(node) {
+ pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+ RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
+ put_pwq_unlocked(pwq);
+ }
+
+ /*
+ * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
+ * put. Don't access it afterwards.
+ */
+ pwq = wq->dfl_pwq;
+ wq->dfl_pwq = NULL;
+ put_pwq_unlocked(pwq);
+ }
+}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
+
+/**
+ * workqueue_set_max_active - adjust max_active of a workqueue
+ * @wq: target workqueue
+ * @max_active: new max_active value.
+ *
+ * Set max_active of @wq to @max_active.
+ *
+ * CONTEXT:
+ * Don't call from IRQ context.
+ */
+void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
+{
+ struct pool_workqueue *pwq;
+
+ /* disallow meddling with max_active for ordered workqueues */
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return;
+
+ max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
+
+ mutex_lock(&wq->mutex);
+
+ wq->saved_max_active = max_active;
+
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+
+ mutex_unlock(&wq->mutex);
+}
+EXPORT_SYMBOL_GPL(workqueue_set_max_active);
+
+/**
+ * current_is_workqueue_rescuer - is %current workqueue rescuer?
+ *
+ * Determine whether %current is a workqueue rescuer. Can be used from
+ * work functions to determine whether it's being run off the rescuer task.
+ *
+ * Return: %true if %current is a workqueue rescuer. %false otherwise.
+ */
+bool current_is_workqueue_rescuer(void)
+{
+ struct worker *worker = current_wq_worker();
+
+ return worker && worker->rescue_wq;
+}
+
+/**
+ * workqueue_congested - test whether a workqueue is congested
+ * @cpu: CPU in question
+ * @wq: target workqueue
+ *
+ * Test whether @wq's cpu workqueue for @cpu is congested. There is
+ * no synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ *
+ * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
+ * Note that both per-cpu and unbound workqueues may be associated with
+ * multiple pool_workqueues which have separate congested states. A
+ * workqueue being congested on one CPU doesn't mean the workqueue is also
+ * contested on other CPUs / NUMA nodes.
+ *
+ * Return:
+ * %true if congested, %false otherwise.
+ */
+bool workqueue_congested(int cpu, struct workqueue_struct *wq)
+{
+ struct pool_workqueue *pwq;
+ bool ret;
+
+ rcu_read_lock_sched();
+
+ if (cpu == WORK_CPU_UNBOUND)
+ cpu = smp_processor_id();
+
+ if (!(wq->flags & WQ_UNBOUND))
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ else
+ pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+
+ ret = !list_empty(&pwq->delayed_works);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(workqueue_congested);
+
+/**
+ * work_busy - test whether a work is currently pending or running
+ * @work: the work to be tested
+ *
+ * Test whether @work is currently pending or running. There is no
+ * synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ *
+ * Return:
+ * OR'd bitmask of WORK_BUSY_* bits.
+ */
+unsigned int work_busy(struct work_struct *work)
+{
+ struct worker_pool *pool;
+ unsigned long flags;
+ unsigned int ret = 0;
+
+ if (work_pending(work))
+ ret |= WORK_BUSY_PENDING;
+
+ local_irq_save(flags);
+ pool = get_work_pool(work);
+ if (pool) {
+ spin_lock(&pool->lock);
+ if (find_worker_executing_work(pool, work))
+ ret |= WORK_BUSY_RUNNING;
+ spin_unlock(&pool->lock);
+ }
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(work_busy);
+
+/**
+ * set_worker_desc - set description for the current work item
+ * @fmt: printf-style format string
+ * @...: arguments for the format string
+ *
+ * This function can be called by a running work function to describe what
+ * the work item is about. If the worker task gets dumped, this
+ * information will be printed out together to help debugging. The
+ * description can be at most WORKER_DESC_LEN including the trailing '\0'.
+ */
+void set_worker_desc(const char *fmt, ...)
+{
+ struct worker *worker = current_wq_worker();
+ va_list args;
+
+ if (worker) {
+ va_start(args, fmt);
+ vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
+ va_end(args);
+ worker->desc_valid = true;
+ }
+}
+
+/**
+ * print_worker_info - print out worker information and description
+ * @log_lvl: the log level to use when printing
+ * @task: target task
+ *
+ * If @task is a worker and currently executing a work item, print out the
+ * name of the workqueue being serviced and worker description set with
+ * set_worker_desc() by the currently executing work item.
+ *
+ * This function can be safely called on any task as long as the
+ * task_struct itself is accessible. While safe, this function isn't
+ * synchronized and may print out mixups or garbages of limited length.
+ */
+void print_worker_info(const char *log_lvl, struct task_struct *task)
+{
+ work_func_t *fn = NULL;
+ char name[WQ_NAME_LEN] = { };
+ char desc[WORKER_DESC_LEN] = { };
+ struct pool_workqueue *pwq = NULL;
+ struct workqueue_struct *wq = NULL;
+ bool desc_valid = false;
+ struct worker *worker;
+
+ if (!(task->flags & PF_WQ_WORKER))
+ return;
+
+ /*
+ * This function is called without any synchronization and @task
+ * could be in any state. Be careful with dereferences.
+ */
+ worker = probe_kthread_data(task);
+
+ /*
+ * Carefully copy the associated workqueue's workfn and name. Keep
+ * the original last '\0' in case the original contains garbage.
+ */
+ probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
+ probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
+ probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
+ probe_kernel_read(name, wq->name, sizeof(name) - 1);
+
+ /* copy worker description */
+ probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
+ if (desc_valid)
+ probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+
+ if (fn || name[0] || desc[0]) {
+ printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
+ if (desc[0])
+ pr_cont(" (%s)", desc);
+ pr_cont("\n");
+ }
+}
+
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+ pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+ if (pool->node != NUMA_NO_NODE)
+ pr_cont(" node=%d", pool->node);
+ pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+ if (work->func == wq_barrier_func) {
+ struct wq_barrier *barr;
+
+ barr = container_of(work, struct wq_barrier, work);
+
+ pr_cont("%s BAR(%d)", comma ? "," : "",
+ task_pid_nr(barr->task));
+ } else {
+ pr_cont("%s %pf", comma ? "," : "", work->func);
+ }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work;
+ struct worker *worker;
+ bool has_in_flight = false, has_pending = false;
+ int bkt;
+
+ pr_info(" pwq %d:", pool->id);
+ pr_cont_pool_info(pool);
+
+ pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq == pwq) {
+ has_in_flight = true;
+ break;
+ }
+ }
+ if (has_in_flight) {
+ bool comma = false;
+
+ pr_info(" in-flight:");
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq != pwq)
+ continue;
+
+ pr_cont("%s %d%s:%pf", comma ? "," : "",
+ task_pid_nr(worker->task),
+ worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->current_func);
+ list_for_each_entry(work, &worker->scheduled, entry)
+ pr_cont_work(false, work);
+ comma = true;
+ }
+ pr_cont("\n");
+ }
+
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ has_pending = true;
+ break;
+ }
+ }
+ if (has_pending) {
+ bool comma = false;
+
+ pr_info(" pending:");
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) != pwq)
+ continue;
+
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+
+ if (!list_empty(&pwq->delayed_works)) {
+ bool comma = false;
+
+ pr_info(" delayed:");
+ list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ unsigned long flags;
+ int pi;
+
+ rcu_read_lock_sched();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ struct pool_workqueue *pwq;
+ bool idle = true;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle)
+ continue;
+
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ show_pwq(pwq);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ }
+
+ for_each_pool(pool, pi) {
+ struct worker *worker;
+ bool first = true;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" workers=%d", pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ next_pool:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ rcu_read_unlock_sched();
+}
+
+/*
+ * CPU hotplug.
+ *
+ * There are two challenges in supporting CPU hotplug. Firstly, there
+ * are a lot of assumptions on strong associations among work, pwq and
+ * pool which make migrating pending and scheduled works very
+ * difficult to implement without impacting hot paths. Secondly,
+ * worker pools serve mix of short, long and very long running works making
+ * blocked draining impractical.
+ *
+ * This is solved by allowing the pools to be disassociated from the CPU
+ * running as an unbound one and allowing it to be reattached later if the
+ * cpu comes back online.
+ */
+
+static void wq_unbind_fn(struct work_struct *work)
+{
+ int cpu = smp_processor_id();
+ struct worker_pool *pool;
+ struct worker *worker;
+
+ for_each_cpu_worker_pool(pool, cpu) {
+ mutex_lock(&pool->attach_mutex);
+ spin_lock_irq(&pool->lock);
+
+ /*
+ * We've blocked all attach/detach operations. Make all workers
+ * unbound and set DISASSOCIATED. Before this, all workers
+ * except for the ones which are still executing works from
+ * before the last CPU down must be on the cpu. After
+ * this, they may become diasporas.
+ */
+ for_each_pool_worker(worker, pool)
+ worker->flags |= WORKER_UNBOUND;
+
+ pool->flags |= POOL_DISASSOCIATED;
+
+ spin_unlock_irq(&pool->lock);
+ mutex_unlock(&pool->attach_mutex);
+
+ /*
+ * Call schedule() so that we cross rq->lock and thus can
+ * guarantee sched callbacks see the %WORKER_UNBOUND flag.
+ * This is necessary as scheduler callbacks may be invoked
+ * from other cpus.
+ */
+ schedule();
+
+ /*
+ * Sched callbacks are disabled now. Zap nr_running.
+ * After this, nr_running stays zero and need_more_worker()
+ * and keep_working() are always true as long as the
+ * worklist is not empty. This pool now behaves as an
+ * unbound (in terms of concurrency management) pool which
+ * are served by workers tied to the pool.
+ */
+ atomic_set(&pool->nr_running, 0);
+
+ /*
+ * With concurrency management just turned off, a busy
+ * worker blocking could lead to lengthy stalls. Kick off
+ * unbound chain execution of currently pending work items.
+ */
+ spin_lock_irq(&pool->lock);
+ wake_up_worker(pool);
+ spin_unlock_irq(&pool->lock);
+ }
+}
+
+/**
+ * rebind_workers - rebind all workers of a pool to the associated CPU
+ * @pool: pool of interest
+ *
+ * @pool->cpu is coming online. Rebind all workers to the CPU.
+ */
+static void rebind_workers(struct worker_pool *pool)
+{
+ struct worker *worker;
+
+ lockdep_assert_held(&pool->attach_mutex);
+
+ /*
+ * Restore CPU affinity of all workers. As all idle workers should
+ * be on the run-queue of the associated CPU before any local
+ * wake-ups for concurrency management happen, restore CPU affinty
+ * of all workers first and then clear UNBOUND. As we're called
+ * from CPU_ONLINE, the following shouldn't fail.
+ */
+ for_each_pool_worker(worker, pool)
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ pool->attrs->cpumask) < 0);
+
+ spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_DISASSOCIATED;
+
+ for_each_pool_worker(worker, pool) {
+ unsigned int worker_flags = worker->flags;
+
+ /*
+ * A bound idle worker should actually be on the runqueue
+ * of the associated CPU for local wake-ups targeting it to
+ * work. Kick all idle workers so that they migrate to the
+ * associated CPU. Doing this in the same loop as
+ * replacing UNBOUND with REBOUND is safe as no worker will
+ * be bound before @pool->lock is released.
+ */
+ if (worker_flags & WORKER_IDLE)
+ wake_up_process(worker->task);
+
+ /*
+ * We want to clear UNBOUND but can't directly call
+ * worker_clr_flags() or adjust nr_running. Atomically
+ * replace UNBOUND with another NOT_RUNNING flag REBOUND.
+ * @worker will clear REBOUND using worker_clr_flags() when
+ * it initiates the next execution cycle thus restoring
+ * concurrency management. Note that when or whether
+ * @worker clears REBOUND doesn't affect correctness.
+ *
+ * ACCESS_ONCE() is necessary because @worker->flags may be
+ * tested without holding any lock in
+ * wq_worker_waking_up(). Without it, NOT_RUNNING test may
+ * fail incorrectly leading to premature concurrency
+ * management operations.
+ */
+ WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
+ worker_flags |= WORKER_REBOUND;
+ worker_flags &= ~WORKER_UNBOUND;
+ ACCESS_ONCE(worker->flags) = worker_flags;
+ }
+
+ spin_unlock_irq(&pool->lock);
+}
+
+/**
+ * restore_unbound_workers_cpumask - restore cpumask of unbound workers
+ * @pool: unbound pool of interest
+ * @cpu: the CPU which is coming up
+ *
+ * An unbound pool may end up with a cpumask which doesn't have any online
+ * CPUs. When a worker of such pool get scheduled, the scheduler resets
+ * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
+ * online CPU before, cpus_allowed of all its workers should be restored.
+ */
+static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
+{
+ static cpumask_t cpumask;
+ struct worker *worker;
+
+ lockdep_assert_held(&pool->attach_mutex);
+
+ /* is @cpu allowed for @pool? */
+ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
+ return;
+
+ /* is @cpu the only online CPU? */
+ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
+ if (cpumask_weight(&cpumask) != 1)
+ return;
+
+ /* as we're called from CPU_ONLINE, the following shouldn't fail */
+ for_each_pool_worker(worker, pool)
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ pool->attrs->cpumask) < 0);
+}
+
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered high priority CPU notifier.
+ */
+static int workqueue_cpu_up_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ struct worker_pool *pool;
+ struct workqueue_struct *wq;
+ int pi;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ for_each_cpu_worker_pool(pool, cpu) {
+ if (pool->nr_workers)
+ continue;
+ if (!create_worker(pool))
+ return NOTIFY_BAD;
+ }
+ break;
+
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ mutex_lock(&wq_pool_mutex);
+
+ for_each_pool(pool, pi) {
+ mutex_lock(&pool->attach_mutex);
+
+ if (pool->cpu == cpu)
+ rebind_workers(pool);
+ else if (pool->cpu < 0)
+ restore_unbound_workers_cpumask(pool, cpu);
+
+ mutex_unlock(&pool->attach_mutex);
+ }
+
+ /* update NUMA affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, true);
+
+ mutex_unlock(&wq_pool_mutex);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int workqueue_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ struct work_struct unbind_work;
+ struct workqueue_struct *wq;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ /* unbinding per-cpu workers should happen on the local CPU */
+ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+ queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+ /* update NUMA affinity of unbound workqueues */
+ mutex_lock(&wq_pool_mutex);
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, false);
+ mutex_unlock(&wq_pool_mutex);
+
+ /* wait for per-cpu unbinding to finish */
+ flush_work(&unbind_work);
+ destroy_work_on_stack(&unbind_work);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+#ifdef CONFIG_SMP
+
+struct work_for_cpu {
+ struct work_struct work;
+ long (*fn)(void *);
+ void *arg;
+ long ret;
+};
+
+static void work_for_cpu_fn(struct work_struct *work)
+{
+ struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
+ wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * It is up to the caller to ensure that the cpu doesn't go offline.
+ * The caller must not hold any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+{
+ struct work_for_cpu wfc = { .fn = fn, .arg = arg };
+
+ INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
+ destroy_work_on_stack(&wfc.work);
+ return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_FREEZER
+
+/**
+ * freeze_workqueues_begin - begin freezing workqueues
+ *
+ * Start freezing workqueues. After this function returns, all freezable
+ * workqueues will queue new works to their delayed_works list instead of
+ * pool->worklist.
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ */
+void freeze_workqueues_begin(void)
+{
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
+
+ mutex_lock(&wq_pool_mutex);
+
+ WARN_ON_ONCE(workqueue_freezing);
+ workqueue_freezing = true;
+
+ list_for_each_entry(wq, &workqueues, list) {
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
+ }
+
+ mutex_unlock(&wq_pool_mutex);
+}
+
+/**
+ * freeze_workqueues_busy - are freezable workqueues still busy?
+ *
+ * Check whether freezing is complete. This function must be called
+ * between freeze_workqueues_begin() and thaw_workqueues().
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex.
+ *
+ * Return:
+ * %true if some freezable workqueues are still busy. %false if freezing
+ * is complete.
+ */
+bool freeze_workqueues_busy(void)
+{
+ bool busy = false;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
+
+ mutex_lock(&wq_pool_mutex);
+
+ WARN_ON_ONCE(!workqueue_freezing);
+
+ list_for_each_entry(wq, &workqueues, list) {
+ if (!(wq->flags & WQ_FREEZABLE))
+ continue;
+ /*
+ * nr_active is monotonically decreasing. It's safe
+ * to peek without lock.
+ */
+ rcu_read_lock_sched();
+ for_each_pwq(pwq, wq) {
+ WARN_ON_ONCE(pwq->nr_active < 0);
+ if (pwq->nr_active) {
+ busy = true;
+ rcu_read_unlock_sched();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock_sched();
+ }
+out_unlock:
+ mutex_unlock(&wq_pool_mutex);
+ return busy;
+}
+
+/**
+ * thaw_workqueues - thaw workqueues
+ *
+ * Thaw workqueues. Normal queueing is restored and all collected
+ * frozen works are transferred to their respective pool worklists.
+ *
+ * CONTEXT:
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
+ */
+void thaw_workqueues(void)
+{
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
+
+ mutex_lock(&wq_pool_mutex);
+
+ if (!workqueue_freezing)
+ goto out_unlock;
+
+ workqueue_freezing = false;
+
+ /* restore max_active and repopulate worklist */
+ list_for_each_entry(wq, &workqueues, list) {
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
+ }
+
+out_unlock:
+ mutex_unlock(&wq_pool_mutex);
+}
+#endif /* CONFIG_FREEZER */
+
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set is visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ * id RO int : the associated pool ID
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+ struct workqueue_struct *wq;
+ struct device dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int val;
+
+ if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+ return -EINVAL;
+
+ workqueue_set_max_active(wq, val);
+ return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+ &dev_attr_per_cpu.attr,
+ &dev_attr_max_active.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ const char *delim = "";
+ int node, written = 0;
+
+ rcu_read_lock_sched();
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ rcu_read_unlock_sched();
+
+ return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ mutex_lock(&wq->mutex);
+ copy_workqueue_attrs(attrs, wq->unbound_attrs);
+ mutex_unlock(&wq->mutex);
+ return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+ attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+ ret = apply_workqueue_attrs(wq, attrs);
+ else
+ ret = -EINVAL;
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+ cpumask_pr_args(wq->unbound_attrs->cpumask));
+ mutex_unlock(&wq->mutex);
+ return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, attrs->cpumask);
+ if (!ret)
+ ret = apply_workqueue_attrs(wq, attrs);
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs(wq, attrs);
+ }
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+ __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+ __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+ .name = "workqueue",
+ .dev_groups = wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+ return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev;
+ int ret;
+
+ /*
+ * Adjusting max_active or creating new pwqs by applyting
+ * attributes breaks ordering guarantee. Disallow exposing ordered
+ * workqueues.
+ */
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return -EINVAL;
+
+ wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+ if (!wq_dev)
+ return -ENOMEM;
+
+ wq_dev->wq = wq;
+ wq_dev->dev.bus = &wq_subsys;
+ wq_dev->dev.init_name = wq->name;
+ wq_dev->dev.release = wq_device_release;
+
+ /*
+ * unbound_attrs are created separately. Suppress uevent until
+ * everything is ready.
+ */
+ dev_set_uevent_suppress(&wq_dev->dev, true);
+
+ ret = device_register(&wq_dev->dev);
+ if (ret) {
+ kfree(wq_dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+
+ if (wq->flags & WQ_UNBOUND) {
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+ ret = device_create_file(&wq_dev->dev, attr);
+ if (ret) {
+ device_unregister(&wq_dev->dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+ }
+ }
+
+ dev_set_uevent_suppress(&wq_dev->dev, false);
+ kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+ return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev = wq->wq_dev;
+
+ if (!wq->wq_dev)
+ return;
+
+ wq->wq_dev = NULL;
+ device_unregister(&wq_dev->dev);
+}
+#else /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
+static void __init wq_numa_init(void)
+{
+ cpumask_var_t *tbl;
+ int node, cpu;
+
+ if (num_possible_nodes() <= 1)
+ return;
+
+ if (wq_disable_numa) {
+ pr_info("workqueue: NUMA affinity support disabled\n");
+ return;
+ }
+
+ wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+ BUG_ON(!wq_update_unbound_numa_attrs_buf);
+
+ /*
+ * We want masks of possible CPUs of each node which isn't readily
+ * available. Build one from cpu_to_node() which should have been
+ * fully initialized by now.
+ */
+ tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
+ BUG_ON(!tbl);
+
+ for_each_node(node)
+ BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+ node_online(node) ? node : NUMA_NO_NODE));
+
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
+ if (WARN_ON(node == NUMA_NO_NODE)) {
+ pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
+ /* happens iff arch is bonkers, let's just proceed */
+ return;
+ }
+ cpumask_set_cpu(cpu, tbl[node]);
+ }
+
+ wq_numa_possible_cpumask = tbl;
+ wq_numa_enabled = true;
+}
+
+static int __init init_workqueues(void)
+{
+ int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ int i, cpu;
+
+ WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+
+ pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
+
+ cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+ hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+
+ wq_numa_init();
+
+ /* initialize CPU pools */
+ for_each_possible_cpu(cpu) {
+ struct worker_pool *pool;
+
+ i = 0;
+ for_each_cpu_worker_pool(pool, cpu) {
+ BUG_ON(init_worker_pool(pool));
+ pool->cpu = cpu;
+ cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+ pool->attrs->nice = std_nice[i++];
+ pool->node = cpu_to_node(cpu);
+
+ /* alloc pool ID */
+ mutex_lock(&wq_pool_mutex);
+ BUG_ON(worker_pool_assign_id(pool));
+ mutex_unlock(&wq_pool_mutex);
+ }
+ }
+
+ /* create the initial worker */
+ for_each_online_cpu(cpu) {
+ struct worker_pool *pool;
+
+ for_each_cpu_worker_pool(pool, cpu) {
+ pool->flags &= ~POOL_DISASSOCIATED;
+ BUG_ON(!create_worker(pool));
+ }
+ }
+
+ /* create default unbound and ordered wq attrs */
+ for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+ struct workqueue_attrs *attrs;
+
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ unbound_std_wq_attrs[i] = attrs;
+
+ /*
+ * An ordered wq should have only one pwq as ordering is
+ * guaranteed by max_active which is enforced by pwqs.
+ * Turn off NUMA so that dfl_pwq is used for all nodes.
+ */
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ attrs->no_numa = true;
+ ordered_wq_attrs[i] = attrs;
+ }
+
+ system_wq = alloc_workqueue("events", 0, 0);
+ system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
+ system_long_wq = alloc_workqueue("events_long", 0, 0);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
+ WQ_UNBOUND_MAX_ACTIVE);
+ system_freezable_wq = alloc_workqueue("events_freezable",
+ WQ_FREEZABLE, 0);
+ system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+ WQ_POWER_EFFICIENT, 0);
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+ 0);
+ BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq ||
+ !system_power_efficient_wq ||
+ !system_freezable_power_efficient_wq);
+ return 0;
+}
+early_initcall(init_workqueues);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000..45215870a
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,74 @@
+/*
+ * kernel/workqueue_internal.h
+ *
+ * Workqueue internal header file. Only to be included by workqueue and
+ * core kernel subsystems.
+ */
+#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
+#define _KERNEL_WORKQUEUE_INTERNAL_H
+
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+
+struct worker_pool;
+
+/*
+ * The poor guys doing the actual heavy lifting. All on-duty workers are
+ * either serving the manager role, on idle list or on busy hash. For
+ * details on the locking annotation (L, I, X...), refer to workqueue.c.
+ *
+ * Only to be used in workqueue and async.
+ */
+struct worker {
+ /* on idle list while idle, on busy hash table while busy */
+ union {
+ struct list_head entry; /* L: while idle */
+ struct hlist_node hentry; /* L: while busy */
+ };
+
+ struct work_struct *current_work; /* L: work being processed */
+ work_func_t current_func; /* L: current_work's fn */
+ struct pool_workqueue *current_pwq; /* L: current_work's pwq */
+ bool desc_valid; /* ->desc is valid */
+ struct list_head scheduled; /* L: scheduled works */
+
+ /* 64 bytes boundary on 64bit, 32 on 32bit */
+
+ struct task_struct *task; /* I: worker task */
+ struct worker_pool *pool; /* I: the associated pool */
+ /* L: for rescuers */
+ struct list_head node; /* A: anchored at pool->workers */
+ /* A: runs through worker->node */
+
+ unsigned long last_active; /* L: last active timestamp */
+ unsigned int flags; /* X: flags */
+ int id; /* I: worker id */
+
+ /*
+ * Opaque string set with work_set_desc(). Printed out with task
+ * dump for debugging - WARN, BUG, panic or sysrq.
+ */
+ char desc[WORKER_DESC_LEN];
+
+ /* used only by rescuers to point to the target workqueue */
+ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
+};
+
+/**
+ * current_wq_worker - return struct worker if %current is a workqueue worker
+ */
+static inline struct worker *current_wq_worker(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return kthread_data(current);
+ return NULL;
+}
+
+/*
+ * Scheduler hooks for concurrency managed workqueue. Only to be used from
+ * sched/core.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
+
+#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */